17b94a
From 3e8b3a2c2c6f83635486035fc8040c87d89813d2 Mon Sep 17 00:00:00 2001
17b94a
From: Xavi Hernandez <xhernandez@redhat.com>
17b94a
Date: Thu, 2 Jul 2020 18:08:52 +0200
17b94a
Subject: [PATCH 457/465] cluster/ec: Improve detection of new heals
17b94a
17b94a
When EC successfully healed a directory it assumed that maybe other
17b94a
entries inside that directory could have been created, which could
17b94a
require additional heal cycles. For this reason, when the heal happened
17b94a
as part of one index heal iteration, it triggered a new iteration.
17b94a
17b94a
The problem happened when the directory was healthy, so no new entries
17b94a
were added, but its index entry was not removed for some reason. In
17b94a
this case self-heal started and endless loop healing the same directory
17b94a
continuously, cause high CPU utilization.
17b94a
17b94a
This patch improves detection of new files added to the heal index so
17b94a
that a new index heal iteration is only triggered if there is new work
17b94a
to do.
17b94a
17b94a
>Upstream patch: https://review.gluster.org/#/c/glusterfs/+/24665/
17b94a
>Fixes: #1354
17b94a
17b94a
Change-Id: I2355742b85fbfa6de758bccc5d2e1a283c82b53f
17b94a
BUG: 1852736
17b94a
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
17b94a
Reviewed-on: https://code.engineering.redhat.com/gerrit/208041
17b94a
Tested-by: Ashish Pandey <aspandey@redhat.com>
17b94a
Tested-by: RHGS Build Bot <nigelb@redhat.com>
17b94a
Reviewed-by: Ashish Pandey <aspandey@redhat.com>
17b94a
---
17b94a
 xlators/cluster/ec/src/ec-common.c     |  2 +-
17b94a
 xlators/cluster/ec/src/ec-heal.c       | 58 +++++++++++++++++++++++-----------
17b94a
 xlators/cluster/ec/src/ec-heald.c      | 24 ++++++++++----
17b94a
 xlators/cluster/ec/src/ec-inode-read.c | 27 ++++++++++++++--
17b94a
 xlators/cluster/ec/src/ec-types.h      |  4 +--
17b94a
 xlators/cluster/ec/src/ec.h            |  1 +
17b94a
 6 files changed, 86 insertions(+), 30 deletions(-)
17b94a
17b94a
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
17b94a
index e580bfb..e3f8769 100644
17b94a
--- a/xlators/cluster/ec/src/ec-common.c
17b94a
+++ b/xlators/cluster/ec/src/ec-common.c
17b94a
@@ -230,7 +230,7 @@ ec_child_next(ec_t *ec, ec_fop_data_t *fop, uint32_t idx)
17b94a
 int32_t
17b94a
 ec_heal_report(call_frame_t *frame, void *cookie, xlator_t *this,
17b94a
                int32_t op_ret, int32_t op_errno, uintptr_t mask, uintptr_t good,
17b94a
-               uintptr_t bad, dict_t *xdata)
17b94a
+               uintptr_t bad, uint32_t pending, dict_t *xdata)
17b94a
 {
17b94a
     if (op_ret < 0) {
17b94a
         gf_msg(this->name, GF_LOG_DEBUG, op_errno, EC_MSG_HEAL_FAIL,
17b94a
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
17b94a
index 06a7016..e2de879 100644
17b94a
--- a/xlators/cluster/ec/src/ec-heal.c
17b94a
+++ b/xlators/cluster/ec/src/ec-heal.c
17b94a
@@ -72,6 +72,7 @@ struct ec_name_data {
17b94a
     char *name;
17b94a
     inode_t *parent;
17b94a
     default_args_cbk_t *replies;
17b94a
+    uint32_t heal_pending;
17b94a
 };
17b94a
 
17b94a
 static char *ec_ignore_xattrs[] = {GF_SELINUX_XATTR_KEY, QUOTA_SIZE_KEY, NULL};
17b94a
@@ -996,6 +997,7 @@ ec_set_new_entry_dirty(ec_t *ec, loc_t *loc, struct iatt *ia,
17b94a
         ret = -ENOTCONN;
17b94a
         goto out;
17b94a
     }
17b94a
+
17b94a
 out:
17b94a
     if (xattr)
17b94a
         dict_unref(xattr);
17b94a
@@ -1164,6 +1166,7 @@ ec_create_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
17b94a
     dict_t *xdata = NULL;
17b94a
     char *linkname = NULL;
17b94a
     ec_config_t config;
17b94a
+
17b94a
     /* There should be just one gfid key */
17b94a
     EC_REPLIES_ALLOC(replies, ec->nodes);
17b94a
     if (gfid_db->count != 1) {
17b94a
@@ -1408,6 +1411,11 @@ __ec_heal_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
17b94a
 
17b94a
     ret = ec_create_name(frame, ec, parent, name, replies, gfid_db, enoent,
17b94a
                          participants);
17b94a
+    if (ret >= 0) {
17b94a
+        /* If ec_create_name() succeeded we return 1 to indicate that a new
17b94a
+         * file has been created and it will need to be healed. */
17b94a
+        ret = 1;
17b94a
+    }
17b94a
 out:
17b94a
     cluster_replies_wipe(replies, ec->nodes);
17b94a
     loc_wipe(&loc;;
17b94a
@@ -1485,18 +1493,22 @@ ec_name_heal_handler(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
17b94a
     ret = ec_heal_name(name_data->frame, ec, parent->inode, entry->d_name,
17b94a
                        name_on);
17b94a
 
17b94a
-    if (ret < 0)
17b94a
+    if (ret < 0) {
17b94a
         memset(name_on, 0, ec->nodes);
17b94a
+    } else {
17b94a
+        name_data->heal_pending += ret;
17b94a
+    }
17b94a
 
17b94a
     for (i = 0; i < ec->nodes; i++)
17b94a
         if (name_data->participants[i] && !name_on[i])
17b94a
             name_data->failed_on[i] = 1;
17b94a
+
17b94a
     return 0;
17b94a
 }
17b94a
 
17b94a
 int
17b94a
 ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
17b94a
-              unsigned char *participants)
17b94a
+              unsigned char *participants, uint32_t *pending)
17b94a
 {
17b94a
     int i = 0;
17b94a
     int j = 0;
17b94a
@@ -1509,7 +1521,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
17b94a
     name_data.frame = frame;
17b94a
     name_data.participants = participants;
17b94a
     name_data.failed_on = alloca0(ec->nodes);
17b94a
-    ;
17b94a
+    name_data.heal_pending = 0;
17b94a
 
17b94a
     for (i = 0; i < ec->nodes; i++) {
17b94a
         if (!participants[i])
17b94a
@@ -1528,6 +1540,8 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
17b94a
             break;
17b94a
         }
17b94a
     }
17b94a
+    *pending += name_data.heal_pending;
17b94a
+
17b94a
     loc_wipe(&loc;;
17b94a
     return ret;
17b94a
 }
17b94a
@@ -1535,7 +1549,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
17b94a
 int
17b94a
 __ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
17b94a
                 unsigned char *heal_on, unsigned char *sources,
17b94a
-                unsigned char *healed_sinks)
17b94a
+                unsigned char *healed_sinks, uint32_t *pending)
17b94a
 {
17b94a
     unsigned char *locked_on = NULL;
17b94a
     unsigned char *output = NULL;
17b94a
@@ -1580,7 +1594,7 @@ unlock:
17b94a
         if (sources[i] || healed_sinks[i])
17b94a
             participants[i] = 1;
17b94a
     }
17b94a
-    ret = ec_heal_names(frame, ec, inode, participants);
17b94a
+    ret = ec_heal_names(frame, ec, inode, participants, pending);
17b94a
 
17b94a
     if (EC_COUNT(participants, ec->nodes) <= ec->fragments)
17b94a
         goto out;
17b94a
@@ -1601,7 +1615,8 @@ out:
17b94a
 
17b94a
 int
17b94a
 ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
17b94a
-              unsigned char *sources, unsigned char *healed_sinks)
17b94a
+              unsigned char *sources, unsigned char *healed_sinks,
17b94a
+              uint32_t *pending)
17b94a
 {
17b94a
     unsigned char *locked_on = NULL;
17b94a
     unsigned char *up_subvols = NULL;
17b94a
@@ -1632,7 +1647,7 @@ ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
17b94a
             goto unlock;
17b94a
         }
17b94a
         ret = __ec_heal_entry(frame, ec, inode, locked_on, sources,
17b94a
-                              healed_sinks);
17b94a
+                              healed_sinks, pending);
17b94a
     }
17b94a
 unlock:
17b94a
     cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
17b94a
@@ -1953,14 +1968,14 @@ ec_manager_heal_block(ec_fop_data_t *fop, int32_t state)
17b94a
             if (fop->cbks.heal) {
17b94a
                 fop->cbks.heal(fop->req_frame, fop, fop->xl, 0, 0,
17b94a
                                (heal->good | heal->bad), heal->good, heal->bad,
17b94a
-                               NULL);
17b94a
+                               0, NULL);
17b94a
             }
17b94a
 
17b94a
             return EC_STATE_END;
17b94a
         case -EC_STATE_REPORT:
17b94a
             if (fop->cbks.heal) {
17b94a
-                fop->cbks.heal(fop->req_frame, fop, fop->xl, -1, fop->error, 0,
17b94a
-                               0, 0, NULL);
17b94a
+                fop->cbks.heal(fop->req_frame, fop->data, fop->xl, -1,
17b94a
+                               fop->error, 0, 0, 0, 0, NULL);
17b94a
             }
17b94a
 
17b94a
             return EC_STATE_END;
17b94a
@@ -1997,14 +2012,15 @@ out:
17b94a
     if (fop != NULL) {
17b94a
         ec_manager(fop, error);
17b94a
     } else {
17b94a
-        func(frame, NULL, this, -1, error, 0, 0, 0, NULL);
17b94a
+        func(frame, heal, this, -1, error, 0, 0, 0, 0, NULL);
17b94a
     }
17b94a
 }
17b94a
 
17b94a
 int32_t
17b94a
 ec_heal_block_done(call_frame_t *frame, void *cookie, xlator_t *this,
17b94a
                    int32_t op_ret, int32_t op_errno, uintptr_t mask,
17b94a
-                   uintptr_t good, uintptr_t bad, dict_t *xdata)
17b94a
+                   uintptr_t good, uintptr_t bad, uint32_t pending,
17b94a
+                   dict_t *xdata)
17b94a
 {
17b94a
     ec_fop_data_t *fop = cookie;
17b94a
     ec_heal_t *heal = fop->data;
17b94a
@@ -2489,6 +2505,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
17b94a
     intptr_t mbad = 0;
17b94a
     intptr_t good = 0;
17b94a
     intptr_t bad = 0;
17b94a
+    uint32_t pending = 0;
17b94a
     ec_fop_data_t *fop = data;
17b94a
     gf_boolean_t blocking = _gf_false;
17b94a
     ec_heal_need_t need_heal = EC_HEAL_NONEED;
17b94a
@@ -2524,7 +2541,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
17b94a
     if (loc->name && strlen(loc->name)) {
17b94a
         ret = ec_heal_name(frame, ec, loc->parent, (char *)loc->name,
17b94a
                            participants);
17b94a
-        if (ret == 0) {
17b94a
+        if (ret >= 0) {
17b94a
             gf_msg_debug(this->name, 0,
17b94a
                          "%s: name heal "
17b94a
                          "successful on %" PRIXPTR,
17b94a
@@ -2542,7 +2559,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
17b94a
 
17b94a
     /* Mount triggers heal only when it detects that it must need heal, shd
17b94a
      * triggers heals periodically which need not be thorough*/
17b94a
-    if (ec->shd.iamshd) {
17b94a
+    if (ec->shd.iamshd && (ret <= 0)) {
17b94a
         ec_heal_inspect(frame, ec, loc->inode, up_subvols, _gf_false, _gf_false,
17b94a
                         &need_heal);
17b94a
 
17b94a
@@ -2552,13 +2569,15 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
17b94a
             goto out;
17b94a
         }
17b94a
     }
17b94a
+
17b94a
     sources = alloca0(ec->nodes);
17b94a
     healed_sinks = alloca0(ec->nodes);
17b94a
     if (IA_ISREG(loc->inode->ia_type)) {
17b94a
         ret = ec_heal_data(frame, ec, blocking, loc->inode, sources,
17b94a
                            healed_sinks);
17b94a
     } else if (IA_ISDIR(loc->inode->ia_type) && !partial) {
17b94a
-        ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks);
17b94a
+        ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks,
17b94a
+                            &pending);
17b94a
     } else {
17b94a
         ret = 0;
17b94a
         memcpy(sources, participants, ec->nodes);
17b94a
@@ -2588,10 +2607,11 @@ out:
17b94a
     if (fop->cbks.heal) {
17b94a
         fop->cbks.heal(fop->req_frame, fop, fop->xl, op_ret, op_errno,
17b94a
                        ec_char_array_to_mask(participants, ec->nodes),
17b94a
-                       mgood & good, mbad & bad, NULL);
17b94a
+                       mgood & good, mbad & bad, pending, NULL);
17b94a
     }
17b94a
     if (frame)
17b94a
         STACK_DESTROY(frame->root);
17b94a
+
17b94a
     return;
17b94a
 }
17b94a
 
17b94a
@@ -2638,8 +2658,8 @@ void
17b94a
 ec_heal_fail(ec_t *ec, ec_fop_data_t *fop)
17b94a
 {
17b94a
     if (fop->cbks.heal) {
17b94a
-        fop->cbks.heal(fop->req_frame, NULL, ec->xl, -1, fop->error, 0, 0, 0,
17b94a
-                       NULL);
17b94a
+        fop->cbks.heal(fop->req_frame, fop->data, ec->xl, -1, fop->error, 0, 0,
17b94a
+                       0, 0, NULL);
17b94a
     }
17b94a
     ec_fop_data_release(fop);
17b94a
 }
17b94a
@@ -2826,7 +2846,7 @@ fail:
17b94a
     if (fop)
17b94a
         ec_fop_data_release(fop);
17b94a
     if (func)
17b94a
-        func(frame, NULL, this, -1, err, 0, 0, 0, NULL);
17b94a
+        func(frame, data, this, -1, err, 0, 0, 0, 0, NULL);
17b94a
 }
17b94a
 
17b94a
 int
17b94a
diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c
17b94a
index cba111a..4f4b6aa 100644
17b94a
--- a/xlators/cluster/ec/src/ec-heald.c
17b94a
+++ b/xlators/cluster/ec/src/ec-heald.c
17b94a
@@ -156,15 +156,27 @@ int
17b94a
 ec_shd_selfheal(struct subvol_healer *healer, int child, loc_t *loc,
17b94a
                 gf_boolean_t full)
17b94a
 {
17b94a
+    dict_t *xdata = NULL;
17b94a
+    uint32_t count;
17b94a
     int32_t ret;
17b94a
 
17b94a
-    ret = syncop_getxattr(healer->this, loc, NULL, EC_XATTR_HEAL, NULL, NULL);
17b94a
-    if (!full && (ret >= 0) && (loc->inode->ia_type == IA_IFDIR)) {
17b94a
+    ret = syncop_getxattr(healer->this, loc, NULL, EC_XATTR_HEAL, NULL, &xdata);
17b94a
+    if (!full && (loc->inode->ia_type == IA_IFDIR)) {
17b94a
         /* If we have just healed a directory, it's possible that
17b94a
-         * other index entries have appeared to be healed. We put a
17b94a
-         * mark so that we can check it later and restart a scan
17b94a
-         * without delay. */
17b94a
-        healer->rerun = _gf_true;
17b94a
+         * other index entries have appeared to be healed. */
17b94a
+        if ((xdata != NULL) &&
17b94a
+            (dict_get_uint32(xdata, EC_XATTR_HEAL_NEW, &count) == 0) &&
17b94a
+            (count > 0)) {
17b94a
+            /* Force a rerun of the index healer. */
17b94a
+            gf_msg_debug(healer->this->name, 0, "%d more entries to heal",
17b94a
+                         count);
17b94a
+
17b94a
+            healer->rerun = _gf_true;
17b94a
+        }
17b94a
+    }
17b94a
+
17b94a
+    if (xdata != NULL) {
17b94a
+        dict_unref(xdata);
17b94a
     }
17b94a
 
17b94a
     return ret;
17b94a
diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c
17b94a
index f87a94a..e82e8f6 100644
17b94a
--- a/xlators/cluster/ec/src/ec-inode-read.c
17b94a
+++ b/xlators/cluster/ec/src/ec-inode-read.c
17b94a
@@ -393,7 +393,8 @@ ec_manager_getxattr(ec_fop_data_t *fop, int32_t state)
17b94a
 int32_t
17b94a
 ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
17b94a
                      int32_t op_ret, int32_t op_errno, uintptr_t mask,
17b94a
-                     uintptr_t good, uintptr_t bad, dict_t *xdata)
17b94a
+                     uintptr_t good, uintptr_t bad, uint32_t pending,
17b94a
+                     dict_t *xdata)
17b94a
 {
17b94a
     ec_fop_data_t *fop = cookie;
17b94a
     fop_getxattr_cbk_t func = fop->data;
17b94a
@@ -402,6 +403,25 @@ ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
17b94a
     char *str;
17b94a
     char bin1[65], bin2[65];
17b94a
 
17b94a
+    /* We try to return the 'pending' information in xdata, but if this cannot
17b94a
+     * be set, we will ignore it silently. We prefer to report the success or
17b94a
+     * failure of the heal itself. */
17b94a
+    if (xdata == NULL) {
17b94a
+        xdata = dict_new();
17b94a
+    } else {
17b94a
+        dict_ref(xdata);
17b94a
+    }
17b94a
+    if (xdata != NULL) {
17b94a
+        if (dict_set_uint32(xdata, EC_XATTR_HEAL_NEW, pending) != 0) {
17b94a
+            /* dict_set_uint32() is marked as 'warn_unused_result' and gcc
17b94a
+             * enforces to check the result in this case. However we don't
17b94a
+             * really care if it succeeded or not. We'll just do the same.
17b94a
+             *
17b94a
+             * This empty 'if' avoids the warning, and it will be removed by
17b94a
+             * the optimizer. */
17b94a
+        }
17b94a
+    }
17b94a
+
17b94a
     if (op_ret >= 0) {
17b94a
         dict = dict_new();
17b94a
         if (dict == NULL) {
17b94a
@@ -435,11 +455,14 @@ ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
17b94a
     }
17b94a
 
17b94a
 out:
17b94a
-    func(frame, NULL, xl, op_ret, op_errno, dict, NULL);
17b94a
+    func(frame, NULL, xl, op_ret, op_errno, dict, xdata);
17b94a
 
17b94a
     if (dict != NULL) {
17b94a
         dict_unref(dict);
17b94a
     }
17b94a
+    if (xdata != NULL) {
17b94a
+        dict_unref(xdata);
17b94a
+    }
17b94a
 
17b94a
     return 0;
17b94a
 }
17b94a
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
17b94a
index 34a9768..f15429d 100644
17b94a
--- a/xlators/cluster/ec/src/ec-types.h
17b94a
+++ b/xlators/cluster/ec/src/ec-types.h
17b94a
@@ -186,10 +186,10 @@ struct _ec_inode {
17b94a
 
17b94a
 typedef int32_t (*fop_heal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
17b94a
                                   int32_t, uintptr_t, uintptr_t, uintptr_t,
17b94a
-                                  dict_t *);
17b94a
+                                  uint32_t, dict_t *);
17b94a
 typedef int32_t (*fop_fheal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
17b94a
                                    int32_t, uintptr_t, uintptr_t, uintptr_t,
17b94a
-                                   dict_t *);
17b94a
+                                   uint32_t, dict_t *);
17b94a
 
17b94a
 union _ec_cbk {
17b94a
     fop_access_cbk_t access;
17b94a
diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h
17b94a
index 1b210d9..6f6de6d 100644
17b94a
--- a/xlators/cluster/ec/src/ec.h
17b94a
+++ b/xlators/cluster/ec/src/ec.h
17b94a
@@ -18,6 +18,7 @@
17b94a
 #define EC_XATTR_SIZE EC_XATTR_PREFIX "size"
17b94a
 #define EC_XATTR_VERSION EC_XATTR_PREFIX "version"
17b94a
 #define EC_XATTR_HEAL EC_XATTR_PREFIX "heal"
17b94a
+#define EC_XATTR_HEAL_NEW EC_XATTR_PREFIX "heal-new"
17b94a
 #define EC_XATTR_DIRTY EC_XATTR_PREFIX "dirty"
17b94a
 #define EC_STRIPE_CACHE_MAX_SIZE 10
17b94a
 #define EC_VERSION_SIZE 2
17b94a
-- 
17b94a
1.8.3.1
17b94a