74096c
From 3e8b3a2c2c6f83635486035fc8040c87d89813d2 Mon Sep 17 00:00:00 2001
74096c
From: Xavi Hernandez <xhernandez@redhat.com>
74096c
Date: Thu, 2 Jul 2020 18:08:52 +0200
74096c
Subject: [PATCH 457/465] cluster/ec: Improve detection of new heals
74096c
74096c
When EC successfully healed a directory it assumed that maybe other
74096c
entries inside that directory could have been created, which could
74096c
require additional heal cycles. For this reason, when the heal happened
74096c
as part of one index heal iteration, it triggered a new iteration.
74096c
74096c
The problem happened when the directory was healthy, so no new entries
74096c
were added, but its index entry was not removed for some reason. In
74096c
this case self-heal started and endless loop healing the same directory
74096c
continuously, cause high CPU utilization.
74096c
74096c
This patch improves detection of new files added to the heal index so
74096c
that a new index heal iteration is only triggered if there is new work
74096c
to do.
74096c
74096c
>Upstream patch: https://review.gluster.org/#/c/glusterfs/+/24665/
74096c
>Fixes: #1354
74096c
74096c
Change-Id: I2355742b85fbfa6de758bccc5d2e1a283c82b53f
74096c
BUG: 1852736
74096c
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
74096c
Reviewed-on: https://code.engineering.redhat.com/gerrit/208041
74096c
Tested-by: Ashish Pandey <aspandey@redhat.com>
74096c
Tested-by: RHGS Build Bot <nigelb@redhat.com>
74096c
Reviewed-by: Ashish Pandey <aspandey@redhat.com>
74096c
---
74096c
 xlators/cluster/ec/src/ec-common.c     |  2 +-
74096c
 xlators/cluster/ec/src/ec-heal.c       | 58 +++++++++++++++++++++++-----------
74096c
 xlators/cluster/ec/src/ec-heald.c      | 24 ++++++++++----
74096c
 xlators/cluster/ec/src/ec-inode-read.c | 27 ++++++++++++++--
74096c
 xlators/cluster/ec/src/ec-types.h      |  4 +--
74096c
 xlators/cluster/ec/src/ec.h            |  1 +
74096c
 6 files changed, 86 insertions(+), 30 deletions(-)
74096c
74096c
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
74096c
index e580bfb..e3f8769 100644
74096c
--- a/xlators/cluster/ec/src/ec-common.c
74096c
+++ b/xlators/cluster/ec/src/ec-common.c
74096c
@@ -230,7 +230,7 @@ ec_child_next(ec_t *ec, ec_fop_data_t *fop, uint32_t idx)
74096c
 int32_t
74096c
 ec_heal_report(call_frame_t *frame, void *cookie, xlator_t *this,
74096c
                int32_t op_ret, int32_t op_errno, uintptr_t mask, uintptr_t good,
74096c
-               uintptr_t bad, dict_t *xdata)
74096c
+               uintptr_t bad, uint32_t pending, dict_t *xdata)
74096c
 {
74096c
     if (op_ret < 0) {
74096c
         gf_msg(this->name, GF_LOG_DEBUG, op_errno, EC_MSG_HEAL_FAIL,
74096c
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
74096c
index 06a7016..e2de879 100644
74096c
--- a/xlators/cluster/ec/src/ec-heal.c
74096c
+++ b/xlators/cluster/ec/src/ec-heal.c
74096c
@@ -72,6 +72,7 @@ struct ec_name_data {
74096c
     char *name;
74096c
     inode_t *parent;
74096c
     default_args_cbk_t *replies;
74096c
+    uint32_t heal_pending;
74096c
 };
74096c
 
74096c
 static char *ec_ignore_xattrs[] = {GF_SELINUX_XATTR_KEY, QUOTA_SIZE_KEY, NULL};
74096c
@@ -996,6 +997,7 @@ ec_set_new_entry_dirty(ec_t *ec, loc_t *loc, struct iatt *ia,
74096c
         ret = -ENOTCONN;
74096c
         goto out;
74096c
     }
74096c
+
74096c
 out:
74096c
     if (xattr)
74096c
         dict_unref(xattr);
74096c
@@ -1164,6 +1166,7 @@ ec_create_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
74096c
     dict_t *xdata = NULL;
74096c
     char *linkname = NULL;
74096c
     ec_config_t config;
74096c
+
74096c
     /* There should be just one gfid key */
74096c
     EC_REPLIES_ALLOC(replies, ec->nodes);
74096c
     if (gfid_db->count != 1) {
74096c
@@ -1408,6 +1411,11 @@ __ec_heal_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
74096c
 
74096c
     ret = ec_create_name(frame, ec, parent, name, replies, gfid_db, enoent,
74096c
                          participants);
74096c
+    if (ret >= 0) {
74096c
+        /* If ec_create_name() succeeded we return 1 to indicate that a new
74096c
+         * file has been created and it will need to be healed. */
74096c
+        ret = 1;
74096c
+    }
74096c
 out:
74096c
     cluster_replies_wipe(replies, ec->nodes);
74096c
     loc_wipe(&loc;;
74096c
@@ -1485,18 +1493,22 @@ ec_name_heal_handler(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
74096c
     ret = ec_heal_name(name_data->frame, ec, parent->inode, entry->d_name,
74096c
                        name_on);
74096c
 
74096c
-    if (ret < 0)
74096c
+    if (ret < 0) {
74096c
         memset(name_on, 0, ec->nodes);
74096c
+    } else {
74096c
+        name_data->heal_pending += ret;
74096c
+    }
74096c
 
74096c
     for (i = 0; i < ec->nodes; i++)
74096c
         if (name_data->participants[i] && !name_on[i])
74096c
             name_data->failed_on[i] = 1;
74096c
+
74096c
     return 0;
74096c
 }
74096c
 
74096c
 int
74096c
 ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
74096c
-              unsigned char *participants)
74096c
+              unsigned char *participants, uint32_t *pending)
74096c
 {
74096c
     int i = 0;
74096c
     int j = 0;
74096c
@@ -1509,7 +1521,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
74096c
     name_data.frame = frame;
74096c
     name_data.participants = participants;
74096c
     name_data.failed_on = alloca0(ec->nodes);
74096c
-    ;
74096c
+    name_data.heal_pending = 0;
74096c
 
74096c
     for (i = 0; i < ec->nodes; i++) {
74096c
         if (!participants[i])
74096c
@@ -1528,6 +1540,8 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
74096c
             break;
74096c
         }
74096c
     }
74096c
+    *pending += name_data.heal_pending;
74096c
+
74096c
     loc_wipe(&loc;;
74096c
     return ret;
74096c
 }
74096c
@@ -1535,7 +1549,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
74096c
 int
74096c
 __ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
74096c
                 unsigned char *heal_on, unsigned char *sources,
74096c
-                unsigned char *healed_sinks)
74096c
+                unsigned char *healed_sinks, uint32_t *pending)
74096c
 {
74096c
     unsigned char *locked_on = NULL;
74096c
     unsigned char *output = NULL;
74096c
@@ -1580,7 +1594,7 @@ unlock:
74096c
         if (sources[i] || healed_sinks[i])
74096c
             participants[i] = 1;
74096c
     }
74096c
-    ret = ec_heal_names(frame, ec, inode, participants);
74096c
+    ret = ec_heal_names(frame, ec, inode, participants, pending);
74096c
 
74096c
     if (EC_COUNT(participants, ec->nodes) <= ec->fragments)
74096c
         goto out;
74096c
@@ -1601,7 +1615,8 @@ out:
74096c
 
74096c
 int
74096c
 ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
74096c
-              unsigned char *sources, unsigned char *healed_sinks)
74096c
+              unsigned char *sources, unsigned char *healed_sinks,
74096c
+              uint32_t *pending)
74096c
 {
74096c
     unsigned char *locked_on = NULL;
74096c
     unsigned char *up_subvols = NULL;
74096c
@@ -1632,7 +1647,7 @@ ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
74096c
             goto unlock;
74096c
         }
74096c
         ret = __ec_heal_entry(frame, ec, inode, locked_on, sources,
74096c
-                              healed_sinks);
74096c
+                              healed_sinks, pending);
74096c
     }
74096c
 unlock:
74096c
     cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
74096c
@@ -1953,14 +1968,14 @@ ec_manager_heal_block(ec_fop_data_t *fop, int32_t state)
74096c
             if (fop->cbks.heal) {
74096c
                 fop->cbks.heal(fop->req_frame, fop, fop->xl, 0, 0,
74096c
                                (heal->good | heal->bad), heal->good, heal->bad,
74096c
-                               NULL);
74096c
+                               0, NULL);
74096c
             }
74096c
 
74096c
             return EC_STATE_END;
74096c
         case -EC_STATE_REPORT:
74096c
             if (fop->cbks.heal) {
74096c
-                fop->cbks.heal(fop->req_frame, fop, fop->xl, -1, fop->error, 0,
74096c
-                               0, 0, NULL);
74096c
+                fop->cbks.heal(fop->req_frame, fop->data, fop->xl, -1,
74096c
+                               fop->error, 0, 0, 0, 0, NULL);
74096c
             }
74096c
 
74096c
             return EC_STATE_END;
74096c
@@ -1997,14 +2012,15 @@ out:
74096c
     if (fop != NULL) {
74096c
         ec_manager(fop, error);
74096c
     } else {
74096c
-        func(frame, NULL, this, -1, error, 0, 0, 0, NULL);
74096c
+        func(frame, heal, this, -1, error, 0, 0, 0, 0, NULL);
74096c
     }
74096c
 }
74096c
 
74096c
 int32_t
74096c
 ec_heal_block_done(call_frame_t *frame, void *cookie, xlator_t *this,
74096c
                    int32_t op_ret, int32_t op_errno, uintptr_t mask,
74096c
-                   uintptr_t good, uintptr_t bad, dict_t *xdata)
74096c
+                   uintptr_t good, uintptr_t bad, uint32_t pending,
74096c
+                   dict_t *xdata)
74096c
 {
74096c
     ec_fop_data_t *fop = cookie;
74096c
     ec_heal_t *heal = fop->data;
74096c
@@ -2489,6 +2505,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
74096c
     intptr_t mbad = 0;
74096c
     intptr_t good = 0;
74096c
     intptr_t bad = 0;
74096c
+    uint32_t pending = 0;
74096c
     ec_fop_data_t *fop = data;
74096c
     gf_boolean_t blocking = _gf_false;
74096c
     ec_heal_need_t need_heal = EC_HEAL_NONEED;
74096c
@@ -2524,7 +2541,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
74096c
     if (loc->name && strlen(loc->name)) {
74096c
         ret = ec_heal_name(frame, ec, loc->parent, (char *)loc->name,
74096c
                            participants);
74096c
-        if (ret == 0) {
74096c
+        if (ret >= 0) {
74096c
             gf_msg_debug(this->name, 0,
74096c
                          "%s: name heal "
74096c
                          "successful on %" PRIXPTR,
74096c
@@ -2542,7 +2559,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
74096c
 
74096c
     /* Mount triggers heal only when it detects that it must need heal, shd
74096c
      * triggers heals periodically which need not be thorough*/
74096c
-    if (ec->shd.iamshd) {
74096c
+    if (ec->shd.iamshd && (ret <= 0)) {
74096c
         ec_heal_inspect(frame, ec, loc->inode, up_subvols, _gf_false, _gf_false,
74096c
                         &need_heal);
74096c
 
74096c
@@ -2552,13 +2569,15 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
74096c
             goto out;
74096c
         }
74096c
     }
74096c
+
74096c
     sources = alloca0(ec->nodes);
74096c
     healed_sinks = alloca0(ec->nodes);
74096c
     if (IA_ISREG(loc->inode->ia_type)) {
74096c
         ret = ec_heal_data(frame, ec, blocking, loc->inode, sources,
74096c
                            healed_sinks);
74096c
     } else if (IA_ISDIR(loc->inode->ia_type) && !partial) {
74096c
-        ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks);
74096c
+        ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks,
74096c
+                            &pending);
74096c
     } else {
74096c
         ret = 0;
74096c
         memcpy(sources, participants, ec->nodes);
74096c
@@ -2588,10 +2607,11 @@ out:
74096c
     if (fop->cbks.heal) {
74096c
         fop->cbks.heal(fop->req_frame, fop, fop->xl, op_ret, op_errno,
74096c
                        ec_char_array_to_mask(participants, ec->nodes),
74096c
-                       mgood & good, mbad & bad, NULL);
74096c
+                       mgood & good, mbad & bad, pending, NULL);
74096c
     }
74096c
     if (frame)
74096c
         STACK_DESTROY(frame->root);
74096c
+
74096c
     return;
74096c
 }
74096c
 
74096c
@@ -2638,8 +2658,8 @@ void
74096c
 ec_heal_fail(ec_t *ec, ec_fop_data_t *fop)
74096c
 {
74096c
     if (fop->cbks.heal) {
74096c
-        fop->cbks.heal(fop->req_frame, NULL, ec->xl, -1, fop->error, 0, 0, 0,
74096c
-                       NULL);
74096c
+        fop->cbks.heal(fop->req_frame, fop->data, ec->xl, -1, fop->error, 0, 0,
74096c
+                       0, 0, NULL);
74096c
     }
74096c
     ec_fop_data_release(fop);
74096c
 }
74096c
@@ -2826,7 +2846,7 @@ fail:
74096c
     if (fop)
74096c
         ec_fop_data_release(fop);
74096c
     if (func)
74096c
-        func(frame, NULL, this, -1, err, 0, 0, 0, NULL);
74096c
+        func(frame, data, this, -1, err, 0, 0, 0, 0, NULL);
74096c
 }
74096c
 
74096c
 int
74096c
diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c
74096c
index cba111a..4f4b6aa 100644
74096c
--- a/xlators/cluster/ec/src/ec-heald.c
74096c
+++ b/xlators/cluster/ec/src/ec-heald.c
74096c
@@ -156,15 +156,27 @@ int
74096c
 ec_shd_selfheal(struct subvol_healer *healer, int child, loc_t *loc,
74096c
                 gf_boolean_t full)
74096c
 {
74096c
+    dict_t *xdata = NULL;
74096c
+    uint32_t count;
74096c
     int32_t ret;
74096c
 
74096c
-    ret = syncop_getxattr(healer->this, loc, NULL, EC_XATTR_HEAL, NULL, NULL);
74096c
-    if (!full && (ret >= 0) && (loc->inode->ia_type == IA_IFDIR)) {
74096c
+    ret = syncop_getxattr(healer->this, loc, NULL, EC_XATTR_HEAL, NULL, &xdata);
74096c
+    if (!full && (loc->inode->ia_type == IA_IFDIR)) {
74096c
         /* If we have just healed a directory, it's possible that
74096c
-         * other index entries have appeared to be healed. We put a
74096c
-         * mark so that we can check it later and restart a scan
74096c
-         * without delay. */
74096c
-        healer->rerun = _gf_true;
74096c
+         * other index entries have appeared to be healed. */
74096c
+        if ((xdata != NULL) &&
74096c
+            (dict_get_uint32(xdata, EC_XATTR_HEAL_NEW, &count) == 0) &&
74096c
+            (count > 0)) {
74096c
+            /* Force a rerun of the index healer. */
74096c
+            gf_msg_debug(healer->this->name, 0, "%d more entries to heal",
74096c
+                         count);
74096c
+
74096c
+            healer->rerun = _gf_true;
74096c
+        }
74096c
+    }
74096c
+
74096c
+    if (xdata != NULL) {
74096c
+        dict_unref(xdata);
74096c
     }
74096c
 
74096c
     return ret;
74096c
diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c
74096c
index f87a94a..e82e8f6 100644
74096c
--- a/xlators/cluster/ec/src/ec-inode-read.c
74096c
+++ b/xlators/cluster/ec/src/ec-inode-read.c
74096c
@@ -393,7 +393,8 @@ ec_manager_getxattr(ec_fop_data_t *fop, int32_t state)
74096c
 int32_t
74096c
 ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
74096c
                      int32_t op_ret, int32_t op_errno, uintptr_t mask,
74096c
-                     uintptr_t good, uintptr_t bad, dict_t *xdata)
74096c
+                     uintptr_t good, uintptr_t bad, uint32_t pending,
74096c
+                     dict_t *xdata)
74096c
 {
74096c
     ec_fop_data_t *fop = cookie;
74096c
     fop_getxattr_cbk_t func = fop->data;
74096c
@@ -402,6 +403,25 @@ ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
74096c
     char *str;
74096c
     char bin1[65], bin2[65];
74096c
 
74096c
+    /* We try to return the 'pending' information in xdata, but if this cannot
74096c
+     * be set, we will ignore it silently. We prefer to report the success or
74096c
+     * failure of the heal itself. */
74096c
+    if (xdata == NULL) {
74096c
+        xdata = dict_new();
74096c
+    } else {
74096c
+        dict_ref(xdata);
74096c
+    }
74096c
+    if (xdata != NULL) {
74096c
+        if (dict_set_uint32(xdata, EC_XATTR_HEAL_NEW, pending) != 0) {
74096c
+            /* dict_set_uint32() is marked as 'warn_unused_result' and gcc
74096c
+             * enforces to check the result in this case. However we don't
74096c
+             * really care if it succeeded or not. We'll just do the same.
74096c
+             *
74096c
+             * This empty 'if' avoids the warning, and it will be removed by
74096c
+             * the optimizer. */
74096c
+        }
74096c
+    }
74096c
+
74096c
     if (op_ret >= 0) {
74096c
         dict = dict_new();
74096c
         if (dict == NULL) {
74096c
@@ -435,11 +455,14 @@ ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
74096c
     }
74096c
 
74096c
 out:
74096c
-    func(frame, NULL, xl, op_ret, op_errno, dict, NULL);
74096c
+    func(frame, NULL, xl, op_ret, op_errno, dict, xdata);
74096c
 
74096c
     if (dict != NULL) {
74096c
         dict_unref(dict);
74096c
     }
74096c
+    if (xdata != NULL) {
74096c
+        dict_unref(xdata);
74096c
+    }
74096c
 
74096c
     return 0;
74096c
 }
74096c
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
74096c
index 34a9768..f15429d 100644
74096c
--- a/xlators/cluster/ec/src/ec-types.h
74096c
+++ b/xlators/cluster/ec/src/ec-types.h
74096c
@@ -186,10 +186,10 @@ struct _ec_inode {
74096c
 
74096c
 typedef int32_t (*fop_heal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
74096c
                                   int32_t, uintptr_t, uintptr_t, uintptr_t,
74096c
-                                  dict_t *);
74096c
+                                  uint32_t, dict_t *);
74096c
 typedef int32_t (*fop_fheal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
74096c
                                    int32_t, uintptr_t, uintptr_t, uintptr_t,
74096c
-                                   dict_t *);
74096c
+                                   uint32_t, dict_t *);
74096c
 
74096c
 union _ec_cbk {
74096c
     fop_access_cbk_t access;
74096c
diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h
74096c
index 1b210d9..6f6de6d 100644
74096c
--- a/xlators/cluster/ec/src/ec.h
74096c
+++ b/xlators/cluster/ec/src/ec.h
74096c
@@ -18,6 +18,7 @@
74096c
 #define EC_XATTR_SIZE EC_XATTR_PREFIX "size"
74096c
 #define EC_XATTR_VERSION EC_XATTR_PREFIX "version"
74096c
 #define EC_XATTR_HEAL EC_XATTR_PREFIX "heal"
74096c
+#define EC_XATTR_HEAL_NEW EC_XATTR_PREFIX "heal-new"
74096c
 #define EC_XATTR_DIRTY EC_XATTR_PREFIX "dirty"
74096c
 #define EC_STRIPE_CACHE_MAX_SIZE 10
74096c
 #define EC_VERSION_SIZE 2
74096c
-- 
74096c
1.8.3.1
74096c