14f8ab
From 3e8b3a2c2c6f83635486035fc8040c87d89813d2 Mon Sep 17 00:00:00 2001
14f8ab
From: Xavi Hernandez <xhernandez@redhat.com>
14f8ab
Date: Thu, 2 Jul 2020 18:08:52 +0200
14f8ab
Subject: [PATCH 457/465] cluster/ec: Improve detection of new heals
14f8ab
14f8ab
When EC successfully healed a directory it assumed that maybe other
14f8ab
entries inside that directory could have been created, which could
14f8ab
require additional heal cycles. For this reason, when the heal happened
14f8ab
as part of one index heal iteration, it triggered a new iteration.
14f8ab
14f8ab
The problem happened when the directory was healthy, so no new entries
14f8ab
were added, but its index entry was not removed for some reason. In
14f8ab
this case self-heal started and endless loop healing the same directory
14f8ab
continuously, cause high CPU utilization.
14f8ab
14f8ab
This patch improves detection of new files added to the heal index so
14f8ab
that a new index heal iteration is only triggered if there is new work
14f8ab
to do.
14f8ab
14f8ab
>Upstream patch: https://review.gluster.org/#/c/glusterfs/+/24665/
14f8ab
>Fixes: #1354
14f8ab
14f8ab
Change-Id: I2355742b85fbfa6de758bccc5d2e1a283c82b53f
14f8ab
BUG: 1852736
14f8ab
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
14f8ab
Reviewed-on: https://code.engineering.redhat.com/gerrit/208041
14f8ab
Tested-by: Ashish Pandey <aspandey@redhat.com>
14f8ab
Tested-by: RHGS Build Bot <nigelb@redhat.com>
14f8ab
Reviewed-by: Ashish Pandey <aspandey@redhat.com>
14f8ab
---
14f8ab
 xlators/cluster/ec/src/ec-common.c     |  2 +-
14f8ab
 xlators/cluster/ec/src/ec-heal.c       | 58 +++++++++++++++++++++++-----------
14f8ab
 xlators/cluster/ec/src/ec-heald.c      | 24 ++++++++++----
14f8ab
 xlators/cluster/ec/src/ec-inode-read.c | 27 ++++++++++++++--
14f8ab
 xlators/cluster/ec/src/ec-types.h      |  4 +--
14f8ab
 xlators/cluster/ec/src/ec.h            |  1 +
14f8ab
 6 files changed, 86 insertions(+), 30 deletions(-)
14f8ab
14f8ab
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
14f8ab
index e580bfb..e3f8769 100644
14f8ab
--- a/xlators/cluster/ec/src/ec-common.c
14f8ab
+++ b/xlators/cluster/ec/src/ec-common.c
14f8ab
@@ -230,7 +230,7 @@ ec_child_next(ec_t *ec, ec_fop_data_t *fop, uint32_t idx)
14f8ab
 int32_t
14f8ab
 ec_heal_report(call_frame_t *frame, void *cookie, xlator_t *this,
14f8ab
                int32_t op_ret, int32_t op_errno, uintptr_t mask, uintptr_t good,
14f8ab
-               uintptr_t bad, dict_t *xdata)
14f8ab
+               uintptr_t bad, uint32_t pending, dict_t *xdata)
14f8ab
 {
14f8ab
     if (op_ret < 0) {
14f8ab
         gf_msg(this->name, GF_LOG_DEBUG, op_errno, EC_MSG_HEAL_FAIL,
14f8ab
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
14f8ab
index 06a7016..e2de879 100644
14f8ab
--- a/xlators/cluster/ec/src/ec-heal.c
14f8ab
+++ b/xlators/cluster/ec/src/ec-heal.c
14f8ab
@@ -72,6 +72,7 @@ struct ec_name_data {
14f8ab
     char *name;
14f8ab
     inode_t *parent;
14f8ab
     default_args_cbk_t *replies;
14f8ab
+    uint32_t heal_pending;
14f8ab
 };
14f8ab
 
14f8ab
 static char *ec_ignore_xattrs[] = {GF_SELINUX_XATTR_KEY, QUOTA_SIZE_KEY, NULL};
14f8ab
@@ -996,6 +997,7 @@ ec_set_new_entry_dirty(ec_t *ec, loc_t *loc, struct iatt *ia,
14f8ab
         ret = -ENOTCONN;
14f8ab
         goto out;
14f8ab
     }
14f8ab
+
14f8ab
 out:
14f8ab
     if (xattr)
14f8ab
         dict_unref(xattr);
14f8ab
@@ -1164,6 +1166,7 @@ ec_create_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
14f8ab
     dict_t *xdata = NULL;
14f8ab
     char *linkname = NULL;
14f8ab
     ec_config_t config;
14f8ab
+
14f8ab
     /* There should be just one gfid key */
14f8ab
     EC_REPLIES_ALLOC(replies, ec->nodes);
14f8ab
     if (gfid_db->count != 1) {
14f8ab
@@ -1408,6 +1411,11 @@ __ec_heal_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
14f8ab
 
14f8ab
     ret = ec_create_name(frame, ec, parent, name, replies, gfid_db, enoent,
14f8ab
                          participants);
14f8ab
+    if (ret >= 0) {
14f8ab
+        /* If ec_create_name() succeeded we return 1 to indicate that a new
14f8ab
+         * file has been created and it will need to be healed. */
14f8ab
+        ret = 1;
14f8ab
+    }
14f8ab
 out:
14f8ab
     cluster_replies_wipe(replies, ec->nodes);
14f8ab
     loc_wipe(&loc;;
14f8ab
@@ -1485,18 +1493,22 @@ ec_name_heal_handler(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
14f8ab
     ret = ec_heal_name(name_data->frame, ec, parent->inode, entry->d_name,
14f8ab
                        name_on);
14f8ab
 
14f8ab
-    if (ret < 0)
14f8ab
+    if (ret < 0) {
14f8ab
         memset(name_on, 0, ec->nodes);
14f8ab
+    } else {
14f8ab
+        name_data->heal_pending += ret;
14f8ab
+    }
14f8ab
 
14f8ab
     for (i = 0; i < ec->nodes; i++)
14f8ab
         if (name_data->participants[i] && !name_on[i])
14f8ab
             name_data->failed_on[i] = 1;
14f8ab
+
14f8ab
     return 0;
14f8ab
 }
14f8ab
 
14f8ab
 int
14f8ab
 ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
14f8ab
-              unsigned char *participants)
14f8ab
+              unsigned char *participants, uint32_t *pending)
14f8ab
 {
14f8ab
     int i = 0;
14f8ab
     int j = 0;
14f8ab
@@ -1509,7 +1521,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
14f8ab
     name_data.frame = frame;
14f8ab
     name_data.participants = participants;
14f8ab
     name_data.failed_on = alloca0(ec->nodes);
14f8ab
-    ;
14f8ab
+    name_data.heal_pending = 0;
14f8ab
 
14f8ab
     for (i = 0; i < ec->nodes; i++) {
14f8ab
         if (!participants[i])
14f8ab
@@ -1528,6 +1540,8 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
14f8ab
             break;
14f8ab
         }
14f8ab
     }
14f8ab
+    *pending += name_data.heal_pending;
14f8ab
+
14f8ab
     loc_wipe(&loc;;
14f8ab
     return ret;
14f8ab
 }
14f8ab
@@ -1535,7 +1549,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
14f8ab
 int
14f8ab
 __ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
14f8ab
                 unsigned char *heal_on, unsigned char *sources,
14f8ab
-                unsigned char *healed_sinks)
14f8ab
+                unsigned char *healed_sinks, uint32_t *pending)
14f8ab
 {
14f8ab
     unsigned char *locked_on = NULL;
14f8ab
     unsigned char *output = NULL;
14f8ab
@@ -1580,7 +1594,7 @@ unlock:
14f8ab
         if (sources[i] || healed_sinks[i])
14f8ab
             participants[i] = 1;
14f8ab
     }
14f8ab
-    ret = ec_heal_names(frame, ec, inode, participants);
14f8ab
+    ret = ec_heal_names(frame, ec, inode, participants, pending);
14f8ab
 
14f8ab
     if (EC_COUNT(participants, ec->nodes) <= ec->fragments)
14f8ab
         goto out;
14f8ab
@@ -1601,7 +1615,8 @@ out:
14f8ab
 
14f8ab
 int
14f8ab
 ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
14f8ab
-              unsigned char *sources, unsigned char *healed_sinks)
14f8ab
+              unsigned char *sources, unsigned char *healed_sinks,
14f8ab
+              uint32_t *pending)
14f8ab
 {
14f8ab
     unsigned char *locked_on = NULL;
14f8ab
     unsigned char *up_subvols = NULL;
14f8ab
@@ -1632,7 +1647,7 @@ ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
14f8ab
             goto unlock;
14f8ab
         }
14f8ab
         ret = __ec_heal_entry(frame, ec, inode, locked_on, sources,
14f8ab
-                              healed_sinks);
14f8ab
+                              healed_sinks, pending);
14f8ab
     }
14f8ab
 unlock:
14f8ab
     cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
14f8ab
@@ -1953,14 +1968,14 @@ ec_manager_heal_block(ec_fop_data_t *fop, int32_t state)
14f8ab
             if (fop->cbks.heal) {
14f8ab
                 fop->cbks.heal(fop->req_frame, fop, fop->xl, 0, 0,
14f8ab
                                (heal->good | heal->bad), heal->good, heal->bad,
14f8ab
-                               NULL);
14f8ab
+                               0, NULL);
14f8ab
             }
14f8ab
 
14f8ab
             return EC_STATE_END;
14f8ab
         case -EC_STATE_REPORT:
14f8ab
             if (fop->cbks.heal) {
14f8ab
-                fop->cbks.heal(fop->req_frame, fop, fop->xl, -1, fop->error, 0,
14f8ab
-                               0, 0, NULL);
14f8ab
+                fop->cbks.heal(fop->req_frame, fop->data, fop->xl, -1,
14f8ab
+                               fop->error, 0, 0, 0, 0, NULL);
14f8ab
             }
14f8ab
 
14f8ab
             return EC_STATE_END;
14f8ab
@@ -1997,14 +2012,15 @@ out:
14f8ab
     if (fop != NULL) {
14f8ab
         ec_manager(fop, error);
14f8ab
     } else {
14f8ab
-        func(frame, NULL, this, -1, error, 0, 0, 0, NULL);
14f8ab
+        func(frame, heal, this, -1, error, 0, 0, 0, 0, NULL);
14f8ab
     }
14f8ab
 }
14f8ab
 
14f8ab
 int32_t
14f8ab
 ec_heal_block_done(call_frame_t *frame, void *cookie, xlator_t *this,
14f8ab
                    int32_t op_ret, int32_t op_errno, uintptr_t mask,
14f8ab
-                   uintptr_t good, uintptr_t bad, dict_t *xdata)
14f8ab
+                   uintptr_t good, uintptr_t bad, uint32_t pending,
14f8ab
+                   dict_t *xdata)
14f8ab
 {
14f8ab
     ec_fop_data_t *fop = cookie;
14f8ab
     ec_heal_t *heal = fop->data;
14f8ab
@@ -2489,6 +2505,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
14f8ab
     intptr_t mbad = 0;
14f8ab
     intptr_t good = 0;
14f8ab
     intptr_t bad = 0;
14f8ab
+    uint32_t pending = 0;
14f8ab
     ec_fop_data_t *fop = data;
14f8ab
     gf_boolean_t blocking = _gf_false;
14f8ab
     ec_heal_need_t need_heal = EC_HEAL_NONEED;
14f8ab
@@ -2524,7 +2541,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
14f8ab
     if (loc->name && strlen(loc->name)) {
14f8ab
         ret = ec_heal_name(frame, ec, loc->parent, (char *)loc->name,
14f8ab
                            participants);
14f8ab
-        if (ret == 0) {
14f8ab
+        if (ret >= 0) {
14f8ab
             gf_msg_debug(this->name, 0,
14f8ab
                          "%s: name heal "
14f8ab
                          "successful on %" PRIXPTR,
14f8ab
@@ -2542,7 +2559,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
14f8ab
 
14f8ab
     /* Mount triggers heal only when it detects that it must need heal, shd
14f8ab
      * triggers heals periodically which need not be thorough*/
14f8ab
-    if (ec->shd.iamshd) {
14f8ab
+    if (ec->shd.iamshd && (ret <= 0)) {
14f8ab
         ec_heal_inspect(frame, ec, loc->inode, up_subvols, _gf_false, _gf_false,
14f8ab
                         &need_heal);
14f8ab
 
14f8ab
@@ -2552,13 +2569,15 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
14f8ab
             goto out;
14f8ab
         }
14f8ab
     }
14f8ab
+
14f8ab
     sources = alloca0(ec->nodes);
14f8ab
     healed_sinks = alloca0(ec->nodes);
14f8ab
     if (IA_ISREG(loc->inode->ia_type)) {
14f8ab
         ret = ec_heal_data(frame, ec, blocking, loc->inode, sources,
14f8ab
                            healed_sinks);
14f8ab
     } else if (IA_ISDIR(loc->inode->ia_type) && !partial) {
14f8ab
-        ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks);
14f8ab
+        ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks,
14f8ab
+                            &pending);
14f8ab
     } else {
14f8ab
         ret = 0;
14f8ab
         memcpy(sources, participants, ec->nodes);
14f8ab
@@ -2588,10 +2607,11 @@ out:
14f8ab
     if (fop->cbks.heal) {
14f8ab
         fop->cbks.heal(fop->req_frame, fop, fop->xl, op_ret, op_errno,
14f8ab
                        ec_char_array_to_mask(participants, ec->nodes),
14f8ab
-                       mgood & good, mbad & bad, NULL);
14f8ab
+                       mgood & good, mbad & bad, pending, NULL);
14f8ab
     }
14f8ab
     if (frame)
14f8ab
         STACK_DESTROY(frame->root);
14f8ab
+
14f8ab
     return;
14f8ab
 }
14f8ab
 
14f8ab
@@ -2638,8 +2658,8 @@ void
14f8ab
 ec_heal_fail(ec_t *ec, ec_fop_data_t *fop)
14f8ab
 {
14f8ab
     if (fop->cbks.heal) {
14f8ab
-        fop->cbks.heal(fop->req_frame, NULL, ec->xl, -1, fop->error, 0, 0, 0,
14f8ab
-                       NULL);
14f8ab
+        fop->cbks.heal(fop->req_frame, fop->data, ec->xl, -1, fop->error, 0, 0,
14f8ab
+                       0, 0, NULL);
14f8ab
     }
14f8ab
     ec_fop_data_release(fop);
14f8ab
 }
14f8ab
@@ -2826,7 +2846,7 @@ fail:
14f8ab
     if (fop)
14f8ab
         ec_fop_data_release(fop);
14f8ab
     if (func)
14f8ab
-        func(frame, NULL, this, -1, err, 0, 0, 0, NULL);
14f8ab
+        func(frame, data, this, -1, err, 0, 0, 0, 0, NULL);
14f8ab
 }
14f8ab
 
14f8ab
 int
14f8ab
diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c
14f8ab
index cba111a..4f4b6aa 100644
14f8ab
--- a/xlators/cluster/ec/src/ec-heald.c
14f8ab
+++ b/xlators/cluster/ec/src/ec-heald.c
14f8ab
@@ -156,15 +156,27 @@ int
14f8ab
 ec_shd_selfheal(struct subvol_healer *healer, int child, loc_t *loc,
14f8ab
                 gf_boolean_t full)
14f8ab
 {
14f8ab
+    dict_t *xdata = NULL;
14f8ab
+    uint32_t count;
14f8ab
     int32_t ret;
14f8ab
 
14f8ab
-    ret = syncop_getxattr(healer->this, loc, NULL, EC_XATTR_HEAL, NULL, NULL);
14f8ab
-    if (!full && (ret >= 0) && (loc->inode->ia_type == IA_IFDIR)) {
14f8ab
+    ret = syncop_getxattr(healer->this, loc, NULL, EC_XATTR_HEAL, NULL, &xdata);
14f8ab
+    if (!full && (loc->inode->ia_type == IA_IFDIR)) {
14f8ab
         /* If we have just healed a directory, it's possible that
14f8ab
-         * other index entries have appeared to be healed. We put a
14f8ab
-         * mark so that we can check it later and restart a scan
14f8ab
-         * without delay. */
14f8ab
-        healer->rerun = _gf_true;
14f8ab
+         * other index entries have appeared to be healed. */
14f8ab
+        if ((xdata != NULL) &&
14f8ab
+            (dict_get_uint32(xdata, EC_XATTR_HEAL_NEW, &count) == 0) &&
14f8ab
+            (count > 0)) {
14f8ab
+            /* Force a rerun of the index healer. */
14f8ab
+            gf_msg_debug(healer->this->name, 0, "%d more entries to heal",
14f8ab
+                         count);
14f8ab
+
14f8ab
+            healer->rerun = _gf_true;
14f8ab
+        }
14f8ab
+    }
14f8ab
+
14f8ab
+    if (xdata != NULL) {
14f8ab
+        dict_unref(xdata);
14f8ab
     }
14f8ab
 
14f8ab
     return ret;
14f8ab
diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c
14f8ab
index f87a94a..e82e8f6 100644
14f8ab
--- a/xlators/cluster/ec/src/ec-inode-read.c
14f8ab
+++ b/xlators/cluster/ec/src/ec-inode-read.c
14f8ab
@@ -393,7 +393,8 @@ ec_manager_getxattr(ec_fop_data_t *fop, int32_t state)
14f8ab
 int32_t
14f8ab
 ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
14f8ab
                      int32_t op_ret, int32_t op_errno, uintptr_t mask,
14f8ab
-                     uintptr_t good, uintptr_t bad, dict_t *xdata)
14f8ab
+                     uintptr_t good, uintptr_t bad, uint32_t pending,
14f8ab
+                     dict_t *xdata)
14f8ab
 {
14f8ab
     ec_fop_data_t *fop = cookie;
14f8ab
     fop_getxattr_cbk_t func = fop->data;
14f8ab
@@ -402,6 +403,25 @@ ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
14f8ab
     char *str;
14f8ab
     char bin1[65], bin2[65];
14f8ab
 
14f8ab
+    /* We try to return the 'pending' information in xdata, but if this cannot
14f8ab
+     * be set, we will ignore it silently. We prefer to report the success or
14f8ab
+     * failure of the heal itself. */
14f8ab
+    if (xdata == NULL) {
14f8ab
+        xdata = dict_new();
14f8ab
+    } else {
14f8ab
+        dict_ref(xdata);
14f8ab
+    }
14f8ab
+    if (xdata != NULL) {
14f8ab
+        if (dict_set_uint32(xdata, EC_XATTR_HEAL_NEW, pending) != 0) {
14f8ab
+            /* dict_set_uint32() is marked as 'warn_unused_result' and gcc
14f8ab
+             * enforces to check the result in this case. However we don't
14f8ab
+             * really care if it succeeded or not. We'll just do the same.
14f8ab
+             *
14f8ab
+             * This empty 'if' avoids the warning, and it will be removed by
14f8ab
+             * the optimizer. */
14f8ab
+        }
14f8ab
+    }
14f8ab
+
14f8ab
     if (op_ret >= 0) {
14f8ab
         dict = dict_new();
14f8ab
         if (dict == NULL) {
14f8ab
@@ -435,11 +455,14 @@ ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
14f8ab
     }
14f8ab
 
14f8ab
 out:
14f8ab
-    func(frame, NULL, xl, op_ret, op_errno, dict, NULL);
14f8ab
+    func(frame, NULL, xl, op_ret, op_errno, dict, xdata);
14f8ab
 
14f8ab
     if (dict != NULL) {
14f8ab
         dict_unref(dict);
14f8ab
     }
14f8ab
+    if (xdata != NULL) {
14f8ab
+        dict_unref(xdata);
14f8ab
+    }
14f8ab
 
14f8ab
     return 0;
14f8ab
 }
14f8ab
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
14f8ab
index 34a9768..f15429d 100644
14f8ab
--- a/xlators/cluster/ec/src/ec-types.h
14f8ab
+++ b/xlators/cluster/ec/src/ec-types.h
14f8ab
@@ -186,10 +186,10 @@ struct _ec_inode {
14f8ab
 
14f8ab
 typedef int32_t (*fop_heal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
14f8ab
                                   int32_t, uintptr_t, uintptr_t, uintptr_t,
14f8ab
-                                  dict_t *);
14f8ab
+                                  uint32_t, dict_t *);
14f8ab
 typedef int32_t (*fop_fheal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
14f8ab
                                    int32_t, uintptr_t, uintptr_t, uintptr_t,
14f8ab
-                                   dict_t *);
14f8ab
+                                   uint32_t, dict_t *);
14f8ab
 
14f8ab
 union _ec_cbk {
14f8ab
     fop_access_cbk_t access;
14f8ab
diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h
14f8ab
index 1b210d9..6f6de6d 100644
14f8ab
--- a/xlators/cluster/ec/src/ec.h
14f8ab
+++ b/xlators/cluster/ec/src/ec.h
14f8ab
@@ -18,6 +18,7 @@
14f8ab
 #define EC_XATTR_SIZE EC_XATTR_PREFIX "size"
14f8ab
 #define EC_XATTR_VERSION EC_XATTR_PREFIX "version"
14f8ab
 #define EC_XATTR_HEAL EC_XATTR_PREFIX "heal"
14f8ab
+#define EC_XATTR_HEAL_NEW EC_XATTR_PREFIX "heal-new"
14f8ab
 #define EC_XATTR_DIRTY EC_XATTR_PREFIX "dirty"
14f8ab
 #define EC_STRIPE_CACHE_MAX_SIZE 10
14f8ab
 #define EC_VERSION_SIZE 2
14f8ab
-- 
14f8ab
1.8.3.1
14f8ab