190130
From 3e8b3a2c2c6f83635486035fc8040c87d89813d2 Mon Sep 17 00:00:00 2001
190130
From: Xavi Hernandez <xhernandez@redhat.com>
190130
Date: Thu, 2 Jul 2020 18:08:52 +0200
190130
Subject: [PATCH 457/465] cluster/ec: Improve detection of new heals
190130
190130
When EC successfully healed a directory it assumed that maybe other
190130
entries inside that directory could have been created, which could
190130
require additional heal cycles. For this reason, when the heal happened
190130
as part of one index heal iteration, it triggered a new iteration.
190130
190130
The problem happened when the directory was healthy, so no new entries
190130
were added, but its index entry was not removed for some reason. In
190130
this case self-heal started and endless loop healing the same directory
190130
continuously, cause high CPU utilization.
190130
190130
This patch improves detection of new files added to the heal index so
190130
that a new index heal iteration is only triggered if there is new work
190130
to do.
190130
190130
>Upstream patch: https://review.gluster.org/#/c/glusterfs/+/24665/
190130
>Fixes: #1354
190130
190130
Change-Id: I2355742b85fbfa6de758bccc5d2e1a283c82b53f
190130
BUG: 1852736
190130
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
190130
Reviewed-on: https://code.engineering.redhat.com/gerrit/208041
190130
Tested-by: Ashish Pandey <aspandey@redhat.com>
190130
Tested-by: RHGS Build Bot <nigelb@redhat.com>
190130
Reviewed-by: Ashish Pandey <aspandey@redhat.com>
190130
---
190130
 xlators/cluster/ec/src/ec-common.c     |  2 +-
190130
 xlators/cluster/ec/src/ec-heal.c       | 58 +++++++++++++++++++++++-----------
190130
 xlators/cluster/ec/src/ec-heald.c      | 24 ++++++++++----
190130
 xlators/cluster/ec/src/ec-inode-read.c | 27 ++++++++++++++--
190130
 xlators/cluster/ec/src/ec-types.h      |  4 +--
190130
 xlators/cluster/ec/src/ec.h            |  1 +
190130
 6 files changed, 86 insertions(+), 30 deletions(-)
190130
190130
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
190130
index e580bfb..e3f8769 100644
190130
--- a/xlators/cluster/ec/src/ec-common.c
190130
+++ b/xlators/cluster/ec/src/ec-common.c
190130
@@ -230,7 +230,7 @@ ec_child_next(ec_t *ec, ec_fop_data_t *fop, uint32_t idx)
190130
 int32_t
190130
 ec_heal_report(call_frame_t *frame, void *cookie, xlator_t *this,
190130
                int32_t op_ret, int32_t op_errno, uintptr_t mask, uintptr_t good,
190130
-               uintptr_t bad, dict_t *xdata)
190130
+               uintptr_t bad, uint32_t pending, dict_t *xdata)
190130
 {
190130
     if (op_ret < 0) {
190130
         gf_msg(this->name, GF_LOG_DEBUG, op_errno, EC_MSG_HEAL_FAIL,
190130
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
190130
index 06a7016..e2de879 100644
190130
--- a/xlators/cluster/ec/src/ec-heal.c
190130
+++ b/xlators/cluster/ec/src/ec-heal.c
190130
@@ -72,6 +72,7 @@ struct ec_name_data {
190130
     char *name;
190130
     inode_t *parent;
190130
     default_args_cbk_t *replies;
190130
+    uint32_t heal_pending;
190130
 };
190130
 
190130
 static char *ec_ignore_xattrs[] = {GF_SELINUX_XATTR_KEY, QUOTA_SIZE_KEY, NULL};
190130
@@ -996,6 +997,7 @@ ec_set_new_entry_dirty(ec_t *ec, loc_t *loc, struct iatt *ia,
190130
         ret = -ENOTCONN;
190130
         goto out;
190130
     }
190130
+
190130
 out:
190130
     if (xattr)
190130
         dict_unref(xattr);
190130
@@ -1164,6 +1166,7 @@ ec_create_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
190130
     dict_t *xdata = NULL;
190130
     char *linkname = NULL;
190130
     ec_config_t config;
190130
+
190130
     /* There should be just one gfid key */
190130
     EC_REPLIES_ALLOC(replies, ec->nodes);
190130
     if (gfid_db->count != 1) {
190130
@@ -1408,6 +1411,11 @@ __ec_heal_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
190130
 
190130
     ret = ec_create_name(frame, ec, parent, name, replies, gfid_db, enoent,
190130
                          participants);
190130
+    if (ret >= 0) {
190130
+        /* If ec_create_name() succeeded we return 1 to indicate that a new
190130
+         * file has been created and it will need to be healed. */
190130
+        ret = 1;
190130
+    }
190130
 out:
190130
     cluster_replies_wipe(replies, ec->nodes);
190130
     loc_wipe(&loc;;
190130
@@ -1485,18 +1493,22 @@ ec_name_heal_handler(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
190130
     ret = ec_heal_name(name_data->frame, ec, parent->inode, entry->d_name,
190130
                        name_on);
190130
 
190130
-    if (ret < 0)
190130
+    if (ret < 0) {
190130
         memset(name_on, 0, ec->nodes);
190130
+    } else {
190130
+        name_data->heal_pending += ret;
190130
+    }
190130
 
190130
     for (i = 0; i < ec->nodes; i++)
190130
         if (name_data->participants[i] && !name_on[i])
190130
             name_data->failed_on[i] = 1;
190130
+
190130
     return 0;
190130
 }
190130
 
190130
 int
190130
 ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
190130
-              unsigned char *participants)
190130
+              unsigned char *participants, uint32_t *pending)
190130
 {
190130
     int i = 0;
190130
     int j = 0;
190130
@@ -1509,7 +1521,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
190130
     name_data.frame = frame;
190130
     name_data.participants = participants;
190130
     name_data.failed_on = alloca0(ec->nodes);
190130
-    ;
190130
+    name_data.heal_pending = 0;
190130
 
190130
     for (i = 0; i < ec->nodes; i++) {
190130
         if (!participants[i])
190130
@@ -1528,6 +1540,8 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
190130
             break;
190130
         }
190130
     }
190130
+    *pending += name_data.heal_pending;
190130
+
190130
     loc_wipe(&loc;;
190130
     return ret;
190130
 }
190130
@@ -1535,7 +1549,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
190130
 int
190130
 __ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
190130
                 unsigned char *heal_on, unsigned char *sources,
190130
-                unsigned char *healed_sinks)
190130
+                unsigned char *healed_sinks, uint32_t *pending)
190130
 {
190130
     unsigned char *locked_on = NULL;
190130
     unsigned char *output = NULL;
190130
@@ -1580,7 +1594,7 @@ unlock:
190130
         if (sources[i] || healed_sinks[i])
190130
             participants[i] = 1;
190130
     }
190130
-    ret = ec_heal_names(frame, ec, inode, participants);
190130
+    ret = ec_heal_names(frame, ec, inode, participants, pending);
190130
 
190130
     if (EC_COUNT(participants, ec->nodes) <= ec->fragments)
190130
         goto out;
190130
@@ -1601,7 +1615,8 @@ out:
190130
 
190130
 int
190130
 ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
190130
-              unsigned char *sources, unsigned char *healed_sinks)
190130
+              unsigned char *sources, unsigned char *healed_sinks,
190130
+              uint32_t *pending)
190130
 {
190130
     unsigned char *locked_on = NULL;
190130
     unsigned char *up_subvols = NULL;
190130
@@ -1632,7 +1647,7 @@ ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
190130
             goto unlock;
190130
         }
190130
         ret = __ec_heal_entry(frame, ec, inode, locked_on, sources,
190130
-                              healed_sinks);
190130
+                              healed_sinks, pending);
190130
     }
190130
 unlock:
190130
     cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
190130
@@ -1953,14 +1968,14 @@ ec_manager_heal_block(ec_fop_data_t *fop, int32_t state)
190130
             if (fop->cbks.heal) {
190130
                 fop->cbks.heal(fop->req_frame, fop, fop->xl, 0, 0,
190130
                                (heal->good | heal->bad), heal->good, heal->bad,
190130
-                               NULL);
190130
+                               0, NULL);
190130
             }
190130
 
190130
             return EC_STATE_END;
190130
         case -EC_STATE_REPORT:
190130
             if (fop->cbks.heal) {
190130
-                fop->cbks.heal(fop->req_frame, fop, fop->xl, -1, fop->error, 0,
190130
-                               0, 0, NULL);
190130
+                fop->cbks.heal(fop->req_frame, fop->data, fop->xl, -1,
190130
+                               fop->error, 0, 0, 0, 0, NULL);
190130
             }
190130
 
190130
             return EC_STATE_END;
190130
@@ -1997,14 +2012,15 @@ out:
190130
     if (fop != NULL) {
190130
         ec_manager(fop, error);
190130
     } else {
190130
-        func(frame, NULL, this, -1, error, 0, 0, 0, NULL);
190130
+        func(frame, heal, this, -1, error, 0, 0, 0, 0, NULL);
190130
     }
190130
 }
190130
 
190130
 int32_t
190130
 ec_heal_block_done(call_frame_t *frame, void *cookie, xlator_t *this,
190130
                    int32_t op_ret, int32_t op_errno, uintptr_t mask,
190130
-                   uintptr_t good, uintptr_t bad, dict_t *xdata)
190130
+                   uintptr_t good, uintptr_t bad, uint32_t pending,
190130
+                   dict_t *xdata)
190130
 {
190130
     ec_fop_data_t *fop = cookie;
190130
     ec_heal_t *heal = fop->data;
190130
@@ -2489,6 +2505,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
190130
     intptr_t mbad = 0;
190130
     intptr_t good = 0;
190130
     intptr_t bad = 0;
190130
+    uint32_t pending = 0;
190130
     ec_fop_data_t *fop = data;
190130
     gf_boolean_t blocking = _gf_false;
190130
     ec_heal_need_t need_heal = EC_HEAL_NONEED;
190130
@@ -2524,7 +2541,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
190130
     if (loc->name && strlen(loc->name)) {
190130
         ret = ec_heal_name(frame, ec, loc->parent, (char *)loc->name,
190130
                            participants);
190130
-        if (ret == 0) {
190130
+        if (ret >= 0) {
190130
             gf_msg_debug(this->name, 0,
190130
                          "%s: name heal "
190130
                          "successful on %" PRIXPTR,
190130
@@ -2542,7 +2559,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
190130
 
190130
     /* Mount triggers heal only when it detects that it must need heal, shd
190130
      * triggers heals periodically which need not be thorough*/
190130
-    if (ec->shd.iamshd) {
190130
+    if (ec->shd.iamshd && (ret <= 0)) {
190130
         ec_heal_inspect(frame, ec, loc->inode, up_subvols, _gf_false, _gf_false,
190130
                         &need_heal);
190130
 
190130
@@ -2552,13 +2569,15 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
190130
             goto out;
190130
         }
190130
     }
190130
+
190130
     sources = alloca0(ec->nodes);
190130
     healed_sinks = alloca0(ec->nodes);
190130
     if (IA_ISREG(loc->inode->ia_type)) {
190130
         ret = ec_heal_data(frame, ec, blocking, loc->inode, sources,
190130
                            healed_sinks);
190130
     } else if (IA_ISDIR(loc->inode->ia_type) && !partial) {
190130
-        ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks);
190130
+        ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks,
190130
+                            &pending);
190130
     } else {
190130
         ret = 0;
190130
         memcpy(sources, participants, ec->nodes);
190130
@@ -2588,10 +2607,11 @@ out:
190130
     if (fop->cbks.heal) {
190130
         fop->cbks.heal(fop->req_frame, fop, fop->xl, op_ret, op_errno,
190130
                        ec_char_array_to_mask(participants, ec->nodes),
190130
-                       mgood & good, mbad & bad, NULL);
190130
+                       mgood & good, mbad & bad, pending, NULL);
190130
     }
190130
     if (frame)
190130
         STACK_DESTROY(frame->root);
190130
+
190130
     return;
190130
 }
190130
 
190130
@@ -2638,8 +2658,8 @@ void
190130
 ec_heal_fail(ec_t *ec, ec_fop_data_t *fop)
190130
 {
190130
     if (fop->cbks.heal) {
190130
-        fop->cbks.heal(fop->req_frame, NULL, ec->xl, -1, fop->error, 0, 0, 0,
190130
-                       NULL);
190130
+        fop->cbks.heal(fop->req_frame, fop->data, ec->xl, -1, fop->error, 0, 0,
190130
+                       0, 0, NULL);
190130
     }
190130
     ec_fop_data_release(fop);
190130
 }
190130
@@ -2826,7 +2846,7 @@ fail:
190130
     if (fop)
190130
         ec_fop_data_release(fop);
190130
     if (func)
190130
-        func(frame, NULL, this, -1, err, 0, 0, 0, NULL);
190130
+        func(frame, data, this, -1, err, 0, 0, 0, 0, NULL);
190130
 }
190130
 
190130
 int
190130
diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c
190130
index cba111a..4f4b6aa 100644
190130
--- a/xlators/cluster/ec/src/ec-heald.c
190130
+++ b/xlators/cluster/ec/src/ec-heald.c
190130
@@ -156,15 +156,27 @@ int
190130
 ec_shd_selfheal(struct subvol_healer *healer, int child, loc_t *loc,
190130
                 gf_boolean_t full)
190130
 {
190130
+    dict_t *xdata = NULL;
190130
+    uint32_t count;
190130
     int32_t ret;
190130
 
190130
-    ret = syncop_getxattr(healer->this, loc, NULL, EC_XATTR_HEAL, NULL, NULL);
190130
-    if (!full && (ret >= 0) && (loc->inode->ia_type == IA_IFDIR)) {
190130
+    ret = syncop_getxattr(healer->this, loc, NULL, EC_XATTR_HEAL, NULL, &xdata);
190130
+    if (!full && (loc->inode->ia_type == IA_IFDIR)) {
190130
         /* If we have just healed a directory, it's possible that
190130
-         * other index entries have appeared to be healed. We put a
190130
-         * mark so that we can check it later and restart a scan
190130
-         * without delay. */
190130
-        healer->rerun = _gf_true;
190130
+         * other index entries have appeared to be healed. */
190130
+        if ((xdata != NULL) &&
190130
+            (dict_get_uint32(xdata, EC_XATTR_HEAL_NEW, &count) == 0) &&
190130
+            (count > 0)) {
190130
+            /* Force a rerun of the index healer. */
190130
+            gf_msg_debug(healer->this->name, 0, "%d more entries to heal",
190130
+                         count);
190130
+
190130
+            healer->rerun = _gf_true;
190130
+        }
190130
+    }
190130
+
190130
+    if (xdata != NULL) {
190130
+        dict_unref(xdata);
190130
     }
190130
 
190130
     return ret;
190130
diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c
190130
index f87a94a..e82e8f6 100644
190130
--- a/xlators/cluster/ec/src/ec-inode-read.c
190130
+++ b/xlators/cluster/ec/src/ec-inode-read.c
190130
@@ -393,7 +393,8 @@ ec_manager_getxattr(ec_fop_data_t *fop, int32_t state)
190130
 int32_t
190130
 ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
190130
                      int32_t op_ret, int32_t op_errno, uintptr_t mask,
190130
-                     uintptr_t good, uintptr_t bad, dict_t *xdata)
190130
+                     uintptr_t good, uintptr_t bad, uint32_t pending,
190130
+                     dict_t *xdata)
190130
 {
190130
     ec_fop_data_t *fop = cookie;
190130
     fop_getxattr_cbk_t func = fop->data;
190130
@@ -402,6 +403,25 @@ ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
190130
     char *str;
190130
     char bin1[65], bin2[65];
190130
 
190130
+    /* We try to return the 'pending' information in xdata, but if this cannot
190130
+     * be set, we will ignore it silently. We prefer to report the success or
190130
+     * failure of the heal itself. */
190130
+    if (xdata == NULL) {
190130
+        xdata = dict_new();
190130
+    } else {
190130
+        dict_ref(xdata);
190130
+    }
190130
+    if (xdata != NULL) {
190130
+        if (dict_set_uint32(xdata, EC_XATTR_HEAL_NEW, pending) != 0) {
190130
+            /* dict_set_uint32() is marked as 'warn_unused_result' and gcc
190130
+             * enforces to check the result in this case. However we don't
190130
+             * really care if it succeeded or not. We'll just do the same.
190130
+             *
190130
+             * This empty 'if' avoids the warning, and it will be removed by
190130
+             * the optimizer. */
190130
+        }
190130
+    }
190130
+
190130
     if (op_ret >= 0) {
190130
         dict = dict_new();
190130
         if (dict == NULL) {
190130
@@ -435,11 +455,14 @@ ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
190130
     }
190130
 
190130
 out:
190130
-    func(frame, NULL, xl, op_ret, op_errno, dict, NULL);
190130
+    func(frame, NULL, xl, op_ret, op_errno, dict, xdata);
190130
 
190130
     if (dict != NULL) {
190130
         dict_unref(dict);
190130
     }
190130
+    if (xdata != NULL) {
190130
+        dict_unref(xdata);
190130
+    }
190130
 
190130
     return 0;
190130
 }
190130
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
190130
index 34a9768..f15429d 100644
190130
--- a/xlators/cluster/ec/src/ec-types.h
190130
+++ b/xlators/cluster/ec/src/ec-types.h
190130
@@ -186,10 +186,10 @@ struct _ec_inode {
190130
 
190130
 typedef int32_t (*fop_heal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
190130
                                   int32_t, uintptr_t, uintptr_t, uintptr_t,
190130
-                                  dict_t *);
190130
+                                  uint32_t, dict_t *);
190130
 typedef int32_t (*fop_fheal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
190130
                                    int32_t, uintptr_t, uintptr_t, uintptr_t,
190130
-                                   dict_t *);
190130
+                                   uint32_t, dict_t *);
190130
 
190130
 union _ec_cbk {
190130
     fop_access_cbk_t access;
190130
diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h
190130
index 1b210d9..6f6de6d 100644
190130
--- a/xlators/cluster/ec/src/ec.h
190130
+++ b/xlators/cluster/ec/src/ec.h
190130
@@ -18,6 +18,7 @@
190130
 #define EC_XATTR_SIZE EC_XATTR_PREFIX "size"
190130
 #define EC_XATTR_VERSION EC_XATTR_PREFIX "version"
190130
 #define EC_XATTR_HEAL EC_XATTR_PREFIX "heal"
190130
+#define EC_XATTR_HEAL_NEW EC_XATTR_PREFIX "heal-new"
190130
 #define EC_XATTR_DIRTY EC_XATTR_PREFIX "dirty"
190130
 #define EC_STRIPE_CACHE_MAX_SIZE 10
190130
 #define EC_VERSION_SIZE 2
190130
-- 
190130
1.8.3.1
190130