256ebe
From bc6588890ce94101a63b861178cf38db5549d8a8 Mon Sep 17 00:00:00 2001
256ebe
From: Ashish Pandey <aspandey@redhat.com>
256ebe
Date: Wed, 28 Nov 2018 11:22:52 +0530
256ebe
Subject: [PATCH 44/52] cluster/ec: Don't enqueue an entry if it is already
256ebe
 healing
256ebe
256ebe
Problem:
256ebe
1 - heal-wait-qlength is by default 128. If shd is disabled
256ebe
and we need to heal files, client side heal is needed.
256ebe
If we access these files that will trigger the heal.
256ebe
However, it has been observed that a file will be enqueued
256ebe
multiple times in the heal wait queue, which in turn causes
256ebe
queue to be filled and prevent other files to be enqueued.
256ebe
256ebe
2 - While a file is going through healing and a write fop from
256ebe
mount comes on that file, it sends write on all the bricks including
256ebe
healing one. At the end it updates version and size on all the
256ebe
bricks. However, it does not unset dirty flag on all the bricks,
256ebe
even if this write fop was successful on all the bricks.
256ebe
After healing completion this dirty flag remain set and never
256ebe
gets cleaned up if SHD is disabled.
256ebe
256ebe
Solution:
256ebe
1 - If an entry is already in queue or going through heal process,
256ebe
don't enqueue next client side request to heal the same file.
256ebe
256ebe
2 - Unset dirty on all the bricks at the end if fop has succeeded on
256ebe
all the bricks even if some of the bricks are going through heal.
256ebe
256ebe
backport of : https://review.gluster.org/#/c/glusterfs/+/21744/
256ebe
256ebe
Change-Id: Ia61ffe230c6502ce6cb934425d55e2f40dd1a727
256ebe
BUG: 1600918
256ebe
Signed-off-by: Ashish Pandey <aspandey@redhat.com>
256ebe
Reviewed-on: https://code.engineering.redhat.com/gerrit/166296
256ebe
Tested-by: RHGS Build Bot <nigelb@redhat.com>
256ebe
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
256ebe
---
256ebe
 tests/bugs/ec/bug-1236065.t         |   1 -
256ebe
 xlators/cluster/ec/src/ec-common.c  |  43 +++++++++------
256ebe
 xlators/cluster/ec/src/ec-common.h  |   8 +++
256ebe
 xlators/cluster/ec/src/ec-heal.c    | 104 +++++++++++++++++++++++++++++++-----
256ebe
 xlators/cluster/ec/src/ec-helpers.c |   1 +
256ebe
 xlators/cluster/ec/src/ec-types.h   |   1 +
256ebe
 6 files changed, 127 insertions(+), 31 deletions(-)
256ebe
256ebe
diff --git a/tests/bugs/ec/bug-1236065.t b/tests/bugs/ec/bug-1236065.t
256ebe
index 76d25d7..9181e73 100644
256ebe
--- a/tests/bugs/ec/bug-1236065.t
256ebe
+++ b/tests/bugs/ec/bug-1236065.t
256ebe
@@ -85,7 +85,6 @@ TEST pidof glusterd
256ebe
 EXPECT "$V0" volinfo_field $V0 'Volume Name'
256ebe
 EXPECT 'Started' volinfo_field $V0 'Status'
256ebe
 EXPECT '7' online_brick_count
256ebe
-
256ebe
 ## cleanup
256ebe
 cd
256ebe
 EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
256ebe
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
256ebe
index 8d65670..5183680 100644
256ebe
--- a/xlators/cluster/ec/src/ec-common.c
256ebe
+++ b/xlators/cluster/ec/src/ec-common.c
256ebe
@@ -313,14 +313,15 @@ ec_check_status(ec_fop_data_t *fop)
256ebe
 
256ebe
     gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS,
256ebe
            "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, "
256ebe
-           "remaining=%s, good=%s, bad=%s)",
256ebe
+           "remaining=%s, good=%s, bad=%s, %s)",
256ebe
            gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes,
256ebe
            ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
256ebe
            ec_bin(str2, sizeof(str2), fop->mask, ec->nodes),
256ebe
            ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes),
256ebe
            ec_bin(str4, sizeof(str4), fop->good, ec->nodes),
256ebe
            ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good),
256ebe
-                  ec->nodes));
256ebe
+                  ec->nodes),
256ebe
+           ec_msg_str(fop));
256ebe
     if (fop->use_fd) {
256ebe
         if (fop->fd != NULL) {
256ebe
             ec_fheal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL,
256ebe
@@ -2371,37 +2372,47 @@ ec_update_info(ec_lock_link_t *link)
256ebe
     uint64_t dirty[2] = {0, 0};
256ebe
     uint64_t size;
256ebe
     ec_t *ec = NULL;
256ebe
+    uintptr_t mask;
256ebe
 
256ebe
     lock = link->lock;
256ebe
     ctx = lock->ctx;
256ebe
     ec = link->fop->xl->private;
256ebe
 
256ebe
     /* pre_version[*] will be 0 if have_version is false */
256ebe
-    version[0] = ctx->post_version[0] - ctx->pre_version[0];
256ebe
-    version[1] = ctx->post_version[1] - ctx->pre_version[1];
256ebe
+    version[EC_DATA_TXN] = ctx->post_version[EC_DATA_TXN] -
256ebe
+                           ctx->pre_version[EC_DATA_TXN];
256ebe
+    version[EC_METADATA_TXN] = ctx->post_version[EC_METADATA_TXN] -
256ebe
+                               ctx->pre_version[EC_METADATA_TXN];
256ebe
 
256ebe
     size = ctx->post_size - ctx->pre_size;
256ebe
     /* If we set the dirty flag for update fop, we have to unset it.
256ebe
      * If fop has failed on some bricks, leave the dirty as marked. */
256ebe
+
256ebe
     if (lock->unlock_now) {
256ebe
+        if (version[EC_DATA_TXN]) {
256ebe
+            /*A data fop will have difference in post and pre version
256ebe
+             *and for data fop we send writes on healing bricks also */
256ebe
+            mask = lock->good_mask | lock->healing;
256ebe
+        } else {
256ebe
+            mask = lock->good_mask;
256ebe
+        }
256ebe
         /* Ensure that nodes are up while doing final
256ebe
          * metadata update.*/
256ebe
-        if (!(ec->node_mask & ~lock->good_mask) &&
256ebe
-            !(ec->node_mask & ~ec->xl_up)) {
256ebe
-            if (ctx->dirty[0] != 0) {
256ebe
-                dirty[0] = -1;
256ebe
+        if (!(ec->node_mask & ~(mask)) && !(ec->node_mask & ~ec->xl_up)) {
256ebe
+            if (ctx->dirty[EC_DATA_TXN] != 0) {
256ebe
+                dirty[EC_DATA_TXN] = -1;
256ebe
             }
256ebe
-            if (ctx->dirty[1] != 0) {
256ebe
-                dirty[1] = -1;
256ebe
+            if (ctx->dirty[EC_METADATA_TXN] != 0) {
256ebe
+                dirty[EC_METADATA_TXN] = -1;
256ebe
             }
256ebe
             /*If everything is fine and we already
256ebe
              *have version xattr set on entry, there
256ebe
              *is no need to update version again*/
256ebe
-            if (ctx->pre_version[0]) {
256ebe
-                version[0] = 0;
256ebe
+            if (ctx->pre_version[EC_DATA_TXN]) {
256ebe
+                version[EC_DATA_TXN] = 0;
256ebe
             }
256ebe
-            if (ctx->pre_version[1]) {
256ebe
-                version[1] = 0;
256ebe
+            if (ctx->pre_version[EC_METADATA_TXN]) {
256ebe
+                version[EC_METADATA_TXN] = 0;
256ebe
             }
256ebe
         } else {
256ebe
             link->optimistic_changelog = _gf_false;
256ebe
@@ -2410,8 +2421,8 @@ ec_update_info(ec_lock_link_t *link)
256ebe
         memset(ctx->dirty, 0, sizeof(ctx->dirty));
256ebe
     }
256ebe
 
256ebe
-    if ((version[0] != 0) || (version[1] != 0) || (dirty[0] != 0) ||
256ebe
-        (dirty[1] != 0)) {
256ebe
+    if ((version[EC_DATA_TXN] != 0) || (version[EC_METADATA_TXN] != 0) ||
256ebe
+        (dirty[EC_DATA_TXN] != 0) || (dirty[EC_METADATA_TXN] != 0)) {
256ebe
         ec_update_size_version(link, version, size, dirty);
256ebe
         return _gf_true;
256ebe
     }
256ebe
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
256ebe
index 115e147..54aaa77 100644
256ebe
--- a/xlators/cluster/ec/src/ec-common.h
256ebe
+++ b/xlators/cluster/ec/src/ec-common.h
256ebe
@@ -190,4 +190,12 @@ ec_lock_unlocked(call_frame_t *frame, void *cookie, xlator_t *this,
256ebe
 void
256ebe
 ec_update_fd_status(fd_t *fd, xlator_t *xl, int child_index,
256ebe
                     int32_t ret_status);
256ebe
+gf_boolean_t
256ebe
+ec_is_entry_healing(ec_fop_data_t *fop);
256ebe
+void
256ebe
+ec_set_entry_healing(ec_fop_data_t *fop);
256ebe
+void
256ebe
+ec_reset_entry_healing(ec_fop_data_t *fop);
256ebe
+char *
256ebe
+ec_msg_str(ec_fop_data_t *fop);
256ebe
 #endif /* __EC_COMMON_H__ */
256ebe
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
256ebe
index eaf80e0..1ca12c1 100644
256ebe
--- a/xlators/cluster/ec/src/ec-heal.c
256ebe
+++ b/xlators/cluster/ec/src/ec-heal.c
256ebe
@@ -103,6 +103,48 @@ ec_sh_key_match(dict_t *dict, char *key, data_t *val, void *mdata)
256ebe
 }
256ebe
 /* FOP: heal */
256ebe
 
256ebe
+void
256ebe
+ec_set_entry_healing(ec_fop_data_t *fop)
256ebe
+{
256ebe
+    ec_inode_t *ctx = NULL;
256ebe
+    loc_t *loc = NULL;
256ebe
+
256ebe
+    if (!fop)
256ebe
+        return;
256ebe
+
256ebe
+    loc = &fop->loc[0];
256ebe
+    LOCK(&loc->inode->lock);
256ebe
+    {
256ebe
+        ctx = __ec_inode_get(loc->inode, fop->xl);
256ebe
+        if (ctx) {
256ebe
+            ctx->heal_count += 1;
256ebe
+        }
256ebe
+    }
256ebe
+    UNLOCK(&loc->inode->lock);
256ebe
+}
256ebe
+
256ebe
+void
256ebe
+ec_reset_entry_healing(ec_fop_data_t *fop)
256ebe
+{
256ebe
+    ec_inode_t *ctx = NULL;
256ebe
+    loc_t *loc = NULL;
256ebe
+    int32_t heal_count = 0;
256ebe
+    if (!fop)
256ebe
+        return;
256ebe
+
256ebe
+    loc = &fop->loc[0];
256ebe
+    LOCK(&loc->inode->lock);
256ebe
+    {
256ebe
+        ctx = __ec_inode_get(loc->inode, fop->xl);
256ebe
+        if (ctx) {
256ebe
+            ctx->heal_count += -1;
256ebe
+            heal_count = ctx->heal_count;
256ebe
+        }
256ebe
+    }
256ebe
+    UNLOCK(&loc->inode->lock);
256ebe
+    GF_ASSERT(heal_count >= 0);
256ebe
+}
256ebe
+
256ebe
 uintptr_t
256ebe
 ec_heal_check(ec_fop_data_t *fop, uintptr_t *pgood)
256ebe
 {
256ebe
@@ -2507,17 +2549,6 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
256ebe
                "Heal is not required for : %s ", uuid_utoa(loc->gfid));
256ebe
         goto out;
256ebe
     }
256ebe
-
256ebe
-    msources = alloca0(ec->nodes);
256ebe
-    mhealed_sinks = alloca0(ec->nodes);
256ebe
-    ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks);
256ebe
-    if (ret == 0) {
256ebe
-        mgood = ec_char_array_to_mask(msources, ec->nodes);
256ebe
-        mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes);
256ebe
-    } else {
256ebe
-        op_ret = -1;
256ebe
-        op_errno = -ret;
256ebe
-    }
256ebe
     sources = alloca0(ec->nodes);
256ebe
     healed_sinks = alloca0(ec->nodes);
256ebe
     if (IA_ISREG(loc->inode->ia_type)) {
256ebe
@@ -2538,8 +2569,19 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
256ebe
         op_ret = -1;
256ebe
         op_errno = -ret;
256ebe
     }
256ebe
+    msources = alloca0(ec->nodes);
256ebe
+    mhealed_sinks = alloca0(ec->nodes);
256ebe
+    ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks);
256ebe
+    if (ret == 0) {
256ebe
+        mgood = ec_char_array_to_mask(msources, ec->nodes);
256ebe
+        mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes);
256ebe
+    } else {
256ebe
+        op_ret = -1;
256ebe
+        op_errno = -ret;
256ebe
+    }
256ebe
 
256ebe
 out:
256ebe
+    ec_reset_entry_healing(fop);
256ebe
     if (fop->cbks.heal) {
256ebe
         fop->cbks.heal(fop->req_frame, fop, fop->xl, op_ret, op_errno,
256ebe
                        ec_char_array_to_mask(participants, ec->nodes),
256ebe
@@ -2650,11 +2692,33 @@ ec_handle_healers_done(ec_fop_data_t *fop)
256ebe
         ec_launch_heal(ec, heal_fop);
256ebe
 }
256ebe
 
256ebe
+gf_boolean_t
256ebe
+ec_is_entry_healing(ec_fop_data_t *fop)
256ebe
+{
256ebe
+    ec_inode_t *ctx = NULL;
256ebe
+    int32_t heal_count = 0;
256ebe
+    loc_t *loc = NULL;
256ebe
+
256ebe
+    loc = &fop->loc[0];
256ebe
+
256ebe
+    LOCK(&loc->inode->lock);
256ebe
+    {
256ebe
+        ctx = __ec_inode_get(loc->inode, fop->xl);
256ebe
+        if (ctx) {
256ebe
+            heal_count = ctx->heal_count;
256ebe
+        }
256ebe
+    }
256ebe
+    UNLOCK(&loc->inode->lock);
256ebe
+    GF_ASSERT(heal_count >= 0);
256ebe
+    return heal_count;
256ebe
+}
256ebe
+
256ebe
 void
256ebe
 ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
256ebe
 {
256ebe
     gf_boolean_t can_heal = _gf_true;
256ebe
     ec_t *ec = this->private;
256ebe
+    ec_fop_data_t *fop_rel = NULL;
256ebe
 
256ebe
     if (fop->req_frame == NULL) {
256ebe
         LOCK(&ec->lock);
256ebe
@@ -2662,8 +2726,13 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
256ebe
             if ((ec->background_heals > 0) &&
256ebe
                 (ec->heal_wait_qlen + ec->background_heals) >
256ebe
                     (ec->heal_waiters + ec->healers)) {
256ebe
-                list_add_tail(&fop->healer, &ec->heal_waiting);
256ebe
-                ec->heal_waiters++;
256ebe
+                if (!ec_is_entry_healing(fop)) {
256ebe
+                    list_add_tail(&fop->healer, &ec->heal_waiting);
256ebe
+                    ec->heal_waiters++;
256ebe
+                    ec_set_entry_healing(fop);
256ebe
+                } else {
256ebe
+                    fop_rel = fop;
256ebe
+                }
256ebe
                 fop = __ec_dequeue_heals(ec);
256ebe
             } else {
256ebe
                 can_heal = _gf_false;
256ebe
@@ -2673,8 +2742,12 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
256ebe
     }
256ebe
 
256ebe
     if (can_heal) {
256ebe
-        if (fop)
256ebe
+        if (fop) {
256ebe
+            if (fop->req_frame != NULL) {
256ebe
+                ec_set_entry_healing(fop);
256ebe
+            }
256ebe
             ec_launch_heal(ec, fop);
256ebe
+        }
256ebe
     } else {
256ebe
         gf_msg_debug(this->name, 0,
256ebe
                      "Max number of heals are "
256ebe
@@ -2682,6 +2755,9 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
256ebe
         ec_fop_set_error(fop, EBUSY);
256ebe
         ec_heal_fail(ec, fop);
256ebe
     }
256ebe
+    if (fop_rel) {
256ebe
+        ec_heal_done(0, NULL, fop_rel);
256ebe
+    }
256ebe
 }
256ebe
 
256ebe
 void
256ebe
diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c
256ebe
index e6b0359..43f6e3b 100644
256ebe
--- a/xlators/cluster/ec/src/ec-helpers.c
256ebe
+++ b/xlators/cluster/ec/src/ec-helpers.c
256ebe
@@ -717,6 +717,7 @@ __ec_inode_get(inode_t *inode, xlator_t *xl)
256ebe
             memset(ctx, 0, sizeof(*ctx));
256ebe
             INIT_LIST_HEAD(&ctx->heal);
256ebe
             INIT_LIST_HEAD(&ctx->stripe_cache.lru);
256ebe
+            ctx->heal_count = 0;
256ebe
             value = (uint64_t)(uintptr_t)ctx;
256ebe
             if (__inode_ctx_set(inode, xl, &value) != 0) {
256ebe
                 GF_FREE(ctx);
256ebe
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
256ebe
index f3d63ca..6ae4a2b 100644
256ebe
--- a/xlators/cluster/ec/src/ec-types.h
256ebe
+++ b/xlators/cluster/ec/src/ec-types.h
256ebe
@@ -171,6 +171,7 @@ struct _ec_inode {
256ebe
     gf_boolean_t have_config;
256ebe
     gf_boolean_t have_version;
256ebe
     gf_boolean_t have_size;
256ebe
+    int32_t heal_count;
256ebe
     ec_config_t config;
256ebe
     uint64_t pre_version[2];
256ebe
     uint64_t post_version[2];
256ebe
-- 
256ebe
1.8.3.1
256ebe