233933
From bc6588890ce94101a63b861178cf38db5549d8a8 Mon Sep 17 00:00:00 2001
233933
From: Ashish Pandey <aspandey@redhat.com>
233933
Date: Wed, 28 Nov 2018 11:22:52 +0530
233933
Subject: [PATCH 44/52] cluster/ec: Don't enqueue an entry if it is already
233933
 healing
233933
233933
Problem:
233933
1 - heal-wait-qlength is by default 128. If shd is disabled
233933
and we need to heal files, client side heal is needed.
233933
If we access these files that will trigger the heal.
233933
However, it has been observed that a file will be enqueued
233933
multiple times in the heal wait queue, which in turn causes
233933
queue to be filled and prevent other files to be enqueued.
233933
233933
2 - While a file is going through healing and a write fop from
233933
mount comes on that file, it sends write on all the bricks including
233933
healing one. At the end it updates version and size on all the
233933
bricks. However, it does not unset dirty flag on all the bricks,
233933
even if this write fop was successful on all the bricks.
233933
After healing completion this dirty flag remain set and never
233933
gets cleaned up if SHD is disabled.
233933
233933
Solution:
233933
1 - If an entry is already in queue or going through heal process,
233933
don't enqueue next client side request to heal the same file.
233933
233933
2 - Unset dirty on all the bricks at the end if fop has succeeded on
233933
all the bricks even if some of the bricks are going through heal.
233933
233933
backport of : https://review.gluster.org/#/c/glusterfs/+/21744/
233933
233933
Change-Id: Ia61ffe230c6502ce6cb934425d55e2f40dd1a727
233933
BUG: 1600918
233933
Signed-off-by: Ashish Pandey <aspandey@redhat.com>
233933
Reviewed-on: https://code.engineering.redhat.com/gerrit/166296
233933
Tested-by: RHGS Build Bot <nigelb@redhat.com>
233933
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
233933
---
233933
 tests/bugs/ec/bug-1236065.t         |   1 -
233933
 xlators/cluster/ec/src/ec-common.c  |  43 +++++++++------
233933
 xlators/cluster/ec/src/ec-common.h  |   8 +++
233933
 xlators/cluster/ec/src/ec-heal.c    | 104 +++++++++++++++++++++++++++++++-----
233933
 xlators/cluster/ec/src/ec-helpers.c |   1 +
233933
 xlators/cluster/ec/src/ec-types.h   |   1 +
233933
 6 files changed, 127 insertions(+), 31 deletions(-)
233933
233933
diff --git a/tests/bugs/ec/bug-1236065.t b/tests/bugs/ec/bug-1236065.t
233933
index 76d25d7..9181e73 100644
233933
--- a/tests/bugs/ec/bug-1236065.t
233933
+++ b/tests/bugs/ec/bug-1236065.t
233933
@@ -85,7 +85,6 @@ TEST pidof glusterd
233933
 EXPECT "$V0" volinfo_field $V0 'Volume Name'
233933
 EXPECT 'Started' volinfo_field $V0 'Status'
233933
 EXPECT '7' online_brick_count
233933
-
233933
 ## cleanup
233933
 cd
233933
 EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
233933
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
233933
index 8d65670..5183680 100644
233933
--- a/xlators/cluster/ec/src/ec-common.c
233933
+++ b/xlators/cluster/ec/src/ec-common.c
233933
@@ -313,14 +313,15 @@ ec_check_status(ec_fop_data_t *fop)
233933
 
233933
     gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS,
233933
            "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, "
233933
-           "remaining=%s, good=%s, bad=%s)",
233933
+           "remaining=%s, good=%s, bad=%s, %s)",
233933
            gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes,
233933
            ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
233933
            ec_bin(str2, sizeof(str2), fop->mask, ec->nodes),
233933
            ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes),
233933
            ec_bin(str4, sizeof(str4), fop->good, ec->nodes),
233933
            ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good),
233933
-                  ec->nodes));
233933
+                  ec->nodes),
233933
+           ec_msg_str(fop));
233933
     if (fop->use_fd) {
233933
         if (fop->fd != NULL) {
233933
             ec_fheal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL,
233933
@@ -2371,37 +2372,47 @@ ec_update_info(ec_lock_link_t *link)
233933
     uint64_t dirty[2] = {0, 0};
233933
     uint64_t size;
233933
     ec_t *ec = NULL;
233933
+    uintptr_t mask;
233933
 
233933
     lock = link->lock;
233933
     ctx = lock->ctx;
233933
     ec = link->fop->xl->private;
233933
 
233933
     /* pre_version[*] will be 0 if have_version is false */
233933
-    version[0] = ctx->post_version[0] - ctx->pre_version[0];
233933
-    version[1] = ctx->post_version[1] - ctx->pre_version[1];
233933
+    version[EC_DATA_TXN] = ctx->post_version[EC_DATA_TXN] -
233933
+                           ctx->pre_version[EC_DATA_TXN];
233933
+    version[EC_METADATA_TXN] = ctx->post_version[EC_METADATA_TXN] -
233933
+                               ctx->pre_version[EC_METADATA_TXN];
233933
 
233933
     size = ctx->post_size - ctx->pre_size;
233933
     /* If we set the dirty flag for update fop, we have to unset it.
233933
      * If fop has failed on some bricks, leave the dirty as marked. */
233933
+
233933
     if (lock->unlock_now) {
233933
+        if (version[EC_DATA_TXN]) {
233933
+            /*A data fop will have difference in post and pre version
233933
+             *and for data fop we send writes on healing bricks also */
233933
+            mask = lock->good_mask | lock->healing;
233933
+        } else {
233933
+            mask = lock->good_mask;
233933
+        }
233933
         /* Ensure that nodes are up while doing final
233933
          * metadata update.*/
233933
-        if (!(ec->node_mask & ~lock->good_mask) &&
233933
-            !(ec->node_mask & ~ec->xl_up)) {
233933
-            if (ctx->dirty[0] != 0) {
233933
-                dirty[0] = -1;
233933
+        if (!(ec->node_mask & ~(mask)) && !(ec->node_mask & ~ec->xl_up)) {
233933
+            if (ctx->dirty[EC_DATA_TXN] != 0) {
233933
+                dirty[EC_DATA_TXN] = -1;
233933
             }
233933
-            if (ctx->dirty[1] != 0) {
233933
-                dirty[1] = -1;
233933
+            if (ctx->dirty[EC_METADATA_TXN] != 0) {
233933
+                dirty[EC_METADATA_TXN] = -1;
233933
             }
233933
             /*If everything is fine and we already
233933
              *have version xattr set on entry, there
233933
              *is no need to update version again*/
233933
-            if (ctx->pre_version[0]) {
233933
-                version[0] = 0;
233933
+            if (ctx->pre_version[EC_DATA_TXN]) {
233933
+                version[EC_DATA_TXN] = 0;
233933
             }
233933
-            if (ctx->pre_version[1]) {
233933
-                version[1] = 0;
233933
+            if (ctx->pre_version[EC_METADATA_TXN]) {
233933
+                version[EC_METADATA_TXN] = 0;
233933
             }
233933
         } else {
233933
             link->optimistic_changelog = _gf_false;
233933
@@ -2410,8 +2421,8 @@ ec_update_info(ec_lock_link_t *link)
233933
         memset(ctx->dirty, 0, sizeof(ctx->dirty));
233933
     }
233933
 
233933
-    if ((version[0] != 0) || (version[1] != 0) || (dirty[0] != 0) ||
233933
-        (dirty[1] != 0)) {
233933
+    if ((version[EC_DATA_TXN] != 0) || (version[EC_METADATA_TXN] != 0) ||
233933
+        (dirty[EC_DATA_TXN] != 0) || (dirty[EC_METADATA_TXN] != 0)) {
233933
         ec_update_size_version(link, version, size, dirty);
233933
         return _gf_true;
233933
     }
233933
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
233933
index 115e147..54aaa77 100644
233933
--- a/xlators/cluster/ec/src/ec-common.h
233933
+++ b/xlators/cluster/ec/src/ec-common.h
233933
@@ -190,4 +190,12 @@ ec_lock_unlocked(call_frame_t *frame, void *cookie, xlator_t *this,
233933
 void
233933
 ec_update_fd_status(fd_t *fd, xlator_t *xl, int child_index,
233933
                     int32_t ret_status);
233933
+gf_boolean_t
233933
+ec_is_entry_healing(ec_fop_data_t *fop);
233933
+void
233933
+ec_set_entry_healing(ec_fop_data_t *fop);
233933
+void
233933
+ec_reset_entry_healing(ec_fop_data_t *fop);
233933
+char *
233933
+ec_msg_str(ec_fop_data_t *fop);
233933
 #endif /* __EC_COMMON_H__ */
233933
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
233933
index eaf80e0..1ca12c1 100644
233933
--- a/xlators/cluster/ec/src/ec-heal.c
233933
+++ b/xlators/cluster/ec/src/ec-heal.c
233933
@@ -103,6 +103,48 @@ ec_sh_key_match(dict_t *dict, char *key, data_t *val, void *mdata)
233933
 }
233933
 /* FOP: heal */
233933
 
233933
+void
233933
+ec_set_entry_healing(ec_fop_data_t *fop)
233933
+{
233933
+    ec_inode_t *ctx = NULL;
233933
+    loc_t *loc = NULL;
233933
+
233933
+    if (!fop)
233933
+        return;
233933
+
233933
+    loc = &fop->loc[0];
233933
+    LOCK(&loc->inode->lock);
233933
+    {
233933
+        ctx = __ec_inode_get(loc->inode, fop->xl);
233933
+        if (ctx) {
233933
+            ctx->heal_count += 1;
233933
+        }
233933
+    }
233933
+    UNLOCK(&loc->inode->lock);
233933
+}
233933
+
233933
+void
233933
+ec_reset_entry_healing(ec_fop_data_t *fop)
233933
+{
233933
+    ec_inode_t *ctx = NULL;
233933
+    loc_t *loc = NULL;
233933
+    int32_t heal_count = 0;
233933
+    if (!fop)
233933
+        return;
233933
+
233933
+    loc = &fop->loc[0];
233933
+    LOCK(&loc->inode->lock);
233933
+    {
233933
+        ctx = __ec_inode_get(loc->inode, fop->xl);
233933
+        if (ctx) {
233933
+            ctx->heal_count += -1;
233933
+            heal_count = ctx->heal_count;
233933
+        }
233933
+    }
233933
+    UNLOCK(&loc->inode->lock);
233933
+    GF_ASSERT(heal_count >= 0);
233933
+}
233933
+
233933
 uintptr_t
233933
 ec_heal_check(ec_fop_data_t *fop, uintptr_t *pgood)
233933
 {
233933
@@ -2507,17 +2549,6 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
233933
                "Heal is not required for : %s ", uuid_utoa(loc->gfid));
233933
         goto out;
233933
     }
233933
-
233933
-    msources = alloca0(ec->nodes);
233933
-    mhealed_sinks = alloca0(ec->nodes);
233933
-    ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks);
233933
-    if (ret == 0) {
233933
-        mgood = ec_char_array_to_mask(msources, ec->nodes);
233933
-        mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes);
233933
-    } else {
233933
-        op_ret = -1;
233933
-        op_errno = -ret;
233933
-    }
233933
     sources = alloca0(ec->nodes);
233933
     healed_sinks = alloca0(ec->nodes);
233933
     if (IA_ISREG(loc->inode->ia_type)) {
233933
@@ -2538,8 +2569,19 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
233933
         op_ret = -1;
233933
         op_errno = -ret;
233933
     }
233933
+    msources = alloca0(ec->nodes);
233933
+    mhealed_sinks = alloca0(ec->nodes);
233933
+    ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks);
233933
+    if (ret == 0) {
233933
+        mgood = ec_char_array_to_mask(msources, ec->nodes);
233933
+        mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes);
233933
+    } else {
233933
+        op_ret = -1;
233933
+        op_errno = -ret;
233933
+    }
233933
 
233933
 out:
233933
+    ec_reset_entry_healing(fop);
233933
     if (fop->cbks.heal) {
233933
         fop->cbks.heal(fop->req_frame, fop, fop->xl, op_ret, op_errno,
233933
                        ec_char_array_to_mask(participants, ec->nodes),
233933
@@ -2650,11 +2692,33 @@ ec_handle_healers_done(ec_fop_data_t *fop)
233933
         ec_launch_heal(ec, heal_fop);
233933
 }
233933
 
233933
+gf_boolean_t
233933
+ec_is_entry_healing(ec_fop_data_t *fop)
233933
+{
233933
+    ec_inode_t *ctx = NULL;
233933
+    int32_t heal_count = 0;
233933
+    loc_t *loc = NULL;
233933
+
233933
+    loc = &fop->loc[0];
233933
+
233933
+    LOCK(&loc->inode->lock);
233933
+    {
233933
+        ctx = __ec_inode_get(loc->inode, fop->xl);
233933
+        if (ctx) {
233933
+            heal_count = ctx->heal_count;
233933
+        }
233933
+    }
233933
+    UNLOCK(&loc->inode->lock);
233933
+    GF_ASSERT(heal_count >= 0);
233933
+    return heal_count;
233933
+}
233933
+
233933
 void
233933
 ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
233933
 {
233933
     gf_boolean_t can_heal = _gf_true;
233933
     ec_t *ec = this->private;
233933
+    ec_fop_data_t *fop_rel = NULL;
233933
 
233933
     if (fop->req_frame == NULL) {
233933
         LOCK(&ec->lock);
233933
@@ -2662,8 +2726,13 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
233933
             if ((ec->background_heals > 0) &&
233933
                 (ec->heal_wait_qlen + ec->background_heals) >
233933
                     (ec->heal_waiters + ec->healers)) {
233933
-                list_add_tail(&fop->healer, &ec->heal_waiting);
233933
-                ec->heal_waiters++;
233933
+                if (!ec_is_entry_healing(fop)) {
233933
+                    list_add_tail(&fop->healer, &ec->heal_waiting);
233933
+                    ec->heal_waiters++;
233933
+                    ec_set_entry_healing(fop);
233933
+                } else {
233933
+                    fop_rel = fop;
233933
+                }
233933
                 fop = __ec_dequeue_heals(ec);
233933
             } else {
233933
                 can_heal = _gf_false;
233933
@@ -2673,8 +2742,12 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
233933
     }
233933
 
233933
     if (can_heal) {
233933
-        if (fop)
233933
+        if (fop) {
233933
+            if (fop->req_frame != NULL) {
233933
+                ec_set_entry_healing(fop);
233933
+            }
233933
             ec_launch_heal(ec, fop);
233933
+        }
233933
     } else {
233933
         gf_msg_debug(this->name, 0,
233933
                      "Max number of heals are "
233933
@@ -2682,6 +2755,9 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
233933
         ec_fop_set_error(fop, EBUSY);
233933
         ec_heal_fail(ec, fop);
233933
     }
233933
+    if (fop_rel) {
233933
+        ec_heal_done(0, NULL, fop_rel);
233933
+    }
233933
 }
233933
 
233933
 void
233933
diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c
233933
index e6b0359..43f6e3b 100644
233933
--- a/xlators/cluster/ec/src/ec-helpers.c
233933
+++ b/xlators/cluster/ec/src/ec-helpers.c
233933
@@ -717,6 +717,7 @@ __ec_inode_get(inode_t *inode, xlator_t *xl)
233933
             memset(ctx, 0, sizeof(*ctx));
233933
             INIT_LIST_HEAD(&ctx->heal);
233933
             INIT_LIST_HEAD(&ctx->stripe_cache.lru);
233933
+            ctx->heal_count = 0;
233933
             value = (uint64_t)(uintptr_t)ctx;
233933
             if (__inode_ctx_set(inode, xl, &value) != 0) {
233933
                 GF_FREE(ctx);
233933
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
233933
index f3d63ca..6ae4a2b 100644
233933
--- a/xlators/cluster/ec/src/ec-types.h
233933
+++ b/xlators/cluster/ec/src/ec-types.h
233933
@@ -171,6 +171,7 @@ struct _ec_inode {
233933
     gf_boolean_t have_config;
233933
     gf_boolean_t have_version;
233933
     gf_boolean_t have_size;
233933
+    int32_t heal_count;
233933
     ec_config_t config;
233933
     uint64_t pre_version[2];
233933
     uint64_t post_version[2];
233933
-- 
233933
1.8.3.1
233933