14f8ab
From bc6588890ce94101a63b861178cf38db5549d8a8 Mon Sep 17 00:00:00 2001
14f8ab
From: Ashish Pandey <aspandey@redhat.com>
14f8ab
Date: Wed, 28 Nov 2018 11:22:52 +0530
14f8ab
Subject: [PATCH 44/52] cluster/ec: Don't enqueue an entry if it is already
14f8ab
 healing
14f8ab
14f8ab
Problem:
14f8ab
1 - heal-wait-qlength is by default 128. If shd is disabled
14f8ab
and we need to heal files, client side heal is needed.
14f8ab
If we access these files that will trigger the heal.
14f8ab
However, it has been observed that a file will be enqueued
14f8ab
multiple times in the heal wait queue, which in turn causes
14f8ab
queue to be filled and prevent other files to be enqueued.
14f8ab
14f8ab
2 - While a file is going through healing and a write fop from
14f8ab
mount comes on that file, it sends write on all the bricks including
14f8ab
healing one. At the end it updates version and size on all the
14f8ab
bricks. However, it does not unset dirty flag on all the bricks,
14f8ab
even if this write fop was successful on all the bricks.
14f8ab
After healing completion this dirty flag remain set and never
14f8ab
gets cleaned up if SHD is disabled.
14f8ab
14f8ab
Solution:
14f8ab
1 - If an entry is already in queue or going through heal process,
14f8ab
don't enqueue next client side request to heal the same file.
14f8ab
14f8ab
2 - Unset dirty on all the bricks at the end if fop has succeeded on
14f8ab
all the bricks even if some of the bricks are going through heal.
14f8ab
14f8ab
backport of : https://review.gluster.org/#/c/glusterfs/+/21744/
14f8ab
14f8ab
Change-Id: Ia61ffe230c6502ce6cb934425d55e2f40dd1a727
14f8ab
BUG: 1600918
14f8ab
Signed-off-by: Ashish Pandey <aspandey@redhat.com>
14f8ab
Reviewed-on: https://code.engineering.redhat.com/gerrit/166296
14f8ab
Tested-by: RHGS Build Bot <nigelb@redhat.com>
14f8ab
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
14f8ab
---
14f8ab
 tests/bugs/ec/bug-1236065.t         |   1 -
14f8ab
 xlators/cluster/ec/src/ec-common.c  |  43 +++++++++------
14f8ab
 xlators/cluster/ec/src/ec-common.h  |   8 +++
14f8ab
 xlators/cluster/ec/src/ec-heal.c    | 104 +++++++++++++++++++++++++++++++-----
14f8ab
 xlators/cluster/ec/src/ec-helpers.c |   1 +
14f8ab
 xlators/cluster/ec/src/ec-types.h   |   1 +
14f8ab
 6 files changed, 127 insertions(+), 31 deletions(-)
14f8ab
14f8ab
diff --git a/tests/bugs/ec/bug-1236065.t b/tests/bugs/ec/bug-1236065.t
14f8ab
index 76d25d7..9181e73 100644
14f8ab
--- a/tests/bugs/ec/bug-1236065.t
14f8ab
+++ b/tests/bugs/ec/bug-1236065.t
14f8ab
@@ -85,7 +85,6 @@ TEST pidof glusterd
14f8ab
 EXPECT "$V0" volinfo_field $V0 'Volume Name'
14f8ab
 EXPECT 'Started' volinfo_field $V0 'Status'
14f8ab
 EXPECT '7' online_brick_count
14f8ab
-
14f8ab
 ## cleanup
14f8ab
 cd
14f8ab
 EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
14f8ab
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
14f8ab
index 8d65670..5183680 100644
14f8ab
--- a/xlators/cluster/ec/src/ec-common.c
14f8ab
+++ b/xlators/cluster/ec/src/ec-common.c
14f8ab
@@ -313,14 +313,15 @@ ec_check_status(ec_fop_data_t *fop)
14f8ab
 
14f8ab
     gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS,
14f8ab
            "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, "
14f8ab
-           "remaining=%s, good=%s, bad=%s)",
14f8ab
+           "remaining=%s, good=%s, bad=%s, %s)",
14f8ab
            gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes,
14f8ab
            ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
14f8ab
            ec_bin(str2, sizeof(str2), fop->mask, ec->nodes),
14f8ab
            ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes),
14f8ab
            ec_bin(str4, sizeof(str4), fop->good, ec->nodes),
14f8ab
            ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good),
14f8ab
-                  ec->nodes));
14f8ab
+                  ec->nodes),
14f8ab
+           ec_msg_str(fop));
14f8ab
     if (fop->use_fd) {
14f8ab
         if (fop->fd != NULL) {
14f8ab
             ec_fheal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL,
14f8ab
@@ -2371,37 +2372,47 @@ ec_update_info(ec_lock_link_t *link)
14f8ab
     uint64_t dirty[2] = {0, 0};
14f8ab
     uint64_t size;
14f8ab
     ec_t *ec = NULL;
14f8ab
+    uintptr_t mask;
14f8ab
 
14f8ab
     lock = link->lock;
14f8ab
     ctx = lock->ctx;
14f8ab
     ec = link->fop->xl->private;
14f8ab
 
14f8ab
     /* pre_version[*] will be 0 if have_version is false */
14f8ab
-    version[0] = ctx->post_version[0] - ctx->pre_version[0];
14f8ab
-    version[1] = ctx->post_version[1] - ctx->pre_version[1];
14f8ab
+    version[EC_DATA_TXN] = ctx->post_version[EC_DATA_TXN] -
14f8ab
+                           ctx->pre_version[EC_DATA_TXN];
14f8ab
+    version[EC_METADATA_TXN] = ctx->post_version[EC_METADATA_TXN] -
14f8ab
+                               ctx->pre_version[EC_METADATA_TXN];
14f8ab
 
14f8ab
     size = ctx->post_size - ctx->pre_size;
14f8ab
     /* If we set the dirty flag for update fop, we have to unset it.
14f8ab
      * If fop has failed on some bricks, leave the dirty as marked. */
14f8ab
+
14f8ab
     if (lock->unlock_now) {
14f8ab
+        if (version[EC_DATA_TXN]) {
14f8ab
+            /*A data fop will have difference in post and pre version
14f8ab
+             *and for data fop we send writes on healing bricks also */
14f8ab
+            mask = lock->good_mask | lock->healing;
14f8ab
+        } else {
14f8ab
+            mask = lock->good_mask;
14f8ab
+        }
14f8ab
         /* Ensure that nodes are up while doing final
14f8ab
          * metadata update.*/
14f8ab
-        if (!(ec->node_mask & ~lock->good_mask) &&
14f8ab
-            !(ec->node_mask & ~ec->xl_up)) {
14f8ab
-            if (ctx->dirty[0] != 0) {
14f8ab
-                dirty[0] = -1;
14f8ab
+        if (!(ec->node_mask & ~(mask)) && !(ec->node_mask & ~ec->xl_up)) {
14f8ab
+            if (ctx->dirty[EC_DATA_TXN] != 0) {
14f8ab
+                dirty[EC_DATA_TXN] = -1;
14f8ab
             }
14f8ab
-            if (ctx->dirty[1] != 0) {
14f8ab
-                dirty[1] = -1;
14f8ab
+            if (ctx->dirty[EC_METADATA_TXN] != 0) {
14f8ab
+                dirty[EC_METADATA_TXN] = -1;
14f8ab
             }
14f8ab
             /*If everything is fine and we already
14f8ab
              *have version xattr set on entry, there
14f8ab
              *is no need to update version again*/
14f8ab
-            if (ctx->pre_version[0]) {
14f8ab
-                version[0] = 0;
14f8ab
+            if (ctx->pre_version[EC_DATA_TXN]) {
14f8ab
+                version[EC_DATA_TXN] = 0;
14f8ab
             }
14f8ab
-            if (ctx->pre_version[1]) {
14f8ab
-                version[1] = 0;
14f8ab
+            if (ctx->pre_version[EC_METADATA_TXN]) {
14f8ab
+                version[EC_METADATA_TXN] = 0;
14f8ab
             }
14f8ab
         } else {
14f8ab
             link->optimistic_changelog = _gf_false;
14f8ab
@@ -2410,8 +2421,8 @@ ec_update_info(ec_lock_link_t *link)
14f8ab
         memset(ctx->dirty, 0, sizeof(ctx->dirty));
14f8ab
     }
14f8ab
 
14f8ab
-    if ((version[0] != 0) || (version[1] != 0) || (dirty[0] != 0) ||
14f8ab
-        (dirty[1] != 0)) {
14f8ab
+    if ((version[EC_DATA_TXN] != 0) || (version[EC_METADATA_TXN] != 0) ||
14f8ab
+        (dirty[EC_DATA_TXN] != 0) || (dirty[EC_METADATA_TXN] != 0)) {
14f8ab
         ec_update_size_version(link, version, size, dirty);
14f8ab
         return _gf_true;
14f8ab
     }
14f8ab
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
14f8ab
index 115e147..54aaa77 100644
14f8ab
--- a/xlators/cluster/ec/src/ec-common.h
14f8ab
+++ b/xlators/cluster/ec/src/ec-common.h
14f8ab
@@ -190,4 +190,12 @@ ec_lock_unlocked(call_frame_t *frame, void *cookie, xlator_t *this,
14f8ab
 void
14f8ab
 ec_update_fd_status(fd_t *fd, xlator_t *xl, int child_index,
14f8ab
                     int32_t ret_status);
14f8ab
+gf_boolean_t
14f8ab
+ec_is_entry_healing(ec_fop_data_t *fop);
14f8ab
+void
14f8ab
+ec_set_entry_healing(ec_fop_data_t *fop);
14f8ab
+void
14f8ab
+ec_reset_entry_healing(ec_fop_data_t *fop);
14f8ab
+char *
14f8ab
+ec_msg_str(ec_fop_data_t *fop);
14f8ab
 #endif /* __EC_COMMON_H__ */
14f8ab
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
14f8ab
index eaf80e0..1ca12c1 100644
14f8ab
--- a/xlators/cluster/ec/src/ec-heal.c
14f8ab
+++ b/xlators/cluster/ec/src/ec-heal.c
14f8ab
@@ -103,6 +103,48 @@ ec_sh_key_match(dict_t *dict, char *key, data_t *val, void *mdata)
14f8ab
 }
14f8ab
 /* FOP: heal */
14f8ab
 
14f8ab
+void
14f8ab
+ec_set_entry_healing(ec_fop_data_t *fop)
14f8ab
+{
14f8ab
+    ec_inode_t *ctx = NULL;
14f8ab
+    loc_t *loc = NULL;
14f8ab
+
14f8ab
+    if (!fop)
14f8ab
+        return;
14f8ab
+
14f8ab
+    loc = &fop->loc[0];
14f8ab
+    LOCK(&loc->inode->lock);
14f8ab
+    {
14f8ab
+        ctx = __ec_inode_get(loc->inode, fop->xl);
14f8ab
+        if (ctx) {
14f8ab
+            ctx->heal_count += 1;
14f8ab
+        }
14f8ab
+    }
14f8ab
+    UNLOCK(&loc->inode->lock);
14f8ab
+}
14f8ab
+
14f8ab
+void
14f8ab
+ec_reset_entry_healing(ec_fop_data_t *fop)
14f8ab
+{
14f8ab
+    ec_inode_t *ctx = NULL;
14f8ab
+    loc_t *loc = NULL;
14f8ab
+    int32_t heal_count = 0;
14f8ab
+    if (!fop)
14f8ab
+        return;
14f8ab
+
14f8ab
+    loc = &fop->loc[0];
14f8ab
+    LOCK(&loc->inode->lock);
14f8ab
+    {
14f8ab
+        ctx = __ec_inode_get(loc->inode, fop->xl);
14f8ab
+        if (ctx) {
14f8ab
+            ctx->heal_count += -1;
14f8ab
+            heal_count = ctx->heal_count;
14f8ab
+        }
14f8ab
+    }
14f8ab
+    UNLOCK(&loc->inode->lock);
14f8ab
+    GF_ASSERT(heal_count >= 0);
14f8ab
+}
14f8ab
+
14f8ab
 uintptr_t
14f8ab
 ec_heal_check(ec_fop_data_t *fop, uintptr_t *pgood)
14f8ab
 {
14f8ab
@@ -2507,17 +2549,6 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
14f8ab
                "Heal is not required for : %s ", uuid_utoa(loc->gfid));
14f8ab
         goto out;
14f8ab
     }
14f8ab
-
14f8ab
-    msources = alloca0(ec->nodes);
14f8ab
-    mhealed_sinks = alloca0(ec->nodes);
14f8ab
-    ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks);
14f8ab
-    if (ret == 0) {
14f8ab
-        mgood = ec_char_array_to_mask(msources, ec->nodes);
14f8ab
-        mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes);
14f8ab
-    } else {
14f8ab
-        op_ret = -1;
14f8ab
-        op_errno = -ret;
14f8ab
-    }
14f8ab
     sources = alloca0(ec->nodes);
14f8ab
     healed_sinks = alloca0(ec->nodes);
14f8ab
     if (IA_ISREG(loc->inode->ia_type)) {
14f8ab
@@ -2538,8 +2569,19 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
14f8ab
         op_ret = -1;
14f8ab
         op_errno = -ret;
14f8ab
     }
14f8ab
+    msources = alloca0(ec->nodes);
14f8ab
+    mhealed_sinks = alloca0(ec->nodes);
14f8ab
+    ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks);
14f8ab
+    if (ret == 0) {
14f8ab
+        mgood = ec_char_array_to_mask(msources, ec->nodes);
14f8ab
+        mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes);
14f8ab
+    } else {
14f8ab
+        op_ret = -1;
14f8ab
+        op_errno = -ret;
14f8ab
+    }
14f8ab
 
14f8ab
 out:
14f8ab
+    ec_reset_entry_healing(fop);
14f8ab
     if (fop->cbks.heal) {
14f8ab
         fop->cbks.heal(fop->req_frame, fop, fop->xl, op_ret, op_errno,
14f8ab
                        ec_char_array_to_mask(participants, ec->nodes),
14f8ab
@@ -2650,11 +2692,33 @@ ec_handle_healers_done(ec_fop_data_t *fop)
14f8ab
         ec_launch_heal(ec, heal_fop);
14f8ab
 }
14f8ab
 
14f8ab
+gf_boolean_t
14f8ab
+ec_is_entry_healing(ec_fop_data_t *fop)
14f8ab
+{
14f8ab
+    ec_inode_t *ctx = NULL;
14f8ab
+    int32_t heal_count = 0;
14f8ab
+    loc_t *loc = NULL;
14f8ab
+
14f8ab
+    loc = &fop->loc[0];
14f8ab
+
14f8ab
+    LOCK(&loc->inode->lock);
14f8ab
+    {
14f8ab
+        ctx = __ec_inode_get(loc->inode, fop->xl);
14f8ab
+        if (ctx) {
14f8ab
+            heal_count = ctx->heal_count;
14f8ab
+        }
14f8ab
+    }
14f8ab
+    UNLOCK(&loc->inode->lock);
14f8ab
+    GF_ASSERT(heal_count >= 0);
14f8ab
+    return heal_count;
14f8ab
+}
14f8ab
+
14f8ab
 void
14f8ab
 ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
14f8ab
 {
14f8ab
     gf_boolean_t can_heal = _gf_true;
14f8ab
     ec_t *ec = this->private;
14f8ab
+    ec_fop_data_t *fop_rel = NULL;
14f8ab
 
14f8ab
     if (fop->req_frame == NULL) {
14f8ab
         LOCK(&ec->lock);
14f8ab
@@ -2662,8 +2726,13 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
14f8ab
             if ((ec->background_heals > 0) &&
14f8ab
                 (ec->heal_wait_qlen + ec->background_heals) >
14f8ab
                     (ec->heal_waiters + ec->healers)) {
14f8ab
-                list_add_tail(&fop->healer, &ec->heal_waiting);
14f8ab
-                ec->heal_waiters++;
14f8ab
+                if (!ec_is_entry_healing(fop)) {
14f8ab
+                    list_add_tail(&fop->healer, &ec->heal_waiting);
14f8ab
+                    ec->heal_waiters++;
14f8ab
+                    ec_set_entry_healing(fop);
14f8ab
+                } else {
14f8ab
+                    fop_rel = fop;
14f8ab
+                }
14f8ab
                 fop = __ec_dequeue_heals(ec);
14f8ab
             } else {
14f8ab
                 can_heal = _gf_false;
14f8ab
@@ -2673,8 +2742,12 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
14f8ab
     }
14f8ab
 
14f8ab
     if (can_heal) {
14f8ab
-        if (fop)
14f8ab
+        if (fop) {
14f8ab
+            if (fop->req_frame != NULL) {
14f8ab
+                ec_set_entry_healing(fop);
14f8ab
+            }
14f8ab
             ec_launch_heal(ec, fop);
14f8ab
+        }
14f8ab
     } else {
14f8ab
         gf_msg_debug(this->name, 0,
14f8ab
                      "Max number of heals are "
14f8ab
@@ -2682,6 +2755,9 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
14f8ab
         ec_fop_set_error(fop, EBUSY);
14f8ab
         ec_heal_fail(ec, fop);
14f8ab
     }
14f8ab
+    if (fop_rel) {
14f8ab
+        ec_heal_done(0, NULL, fop_rel);
14f8ab
+    }
14f8ab
 }
14f8ab
 
14f8ab
 void
14f8ab
diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c
14f8ab
index e6b0359..43f6e3b 100644
14f8ab
--- a/xlators/cluster/ec/src/ec-helpers.c
14f8ab
+++ b/xlators/cluster/ec/src/ec-helpers.c
14f8ab
@@ -717,6 +717,7 @@ __ec_inode_get(inode_t *inode, xlator_t *xl)
14f8ab
             memset(ctx, 0, sizeof(*ctx));
14f8ab
             INIT_LIST_HEAD(&ctx->heal);
14f8ab
             INIT_LIST_HEAD(&ctx->stripe_cache.lru);
14f8ab
+            ctx->heal_count = 0;
14f8ab
             value = (uint64_t)(uintptr_t)ctx;
14f8ab
             if (__inode_ctx_set(inode, xl, &value) != 0) {
14f8ab
                 GF_FREE(ctx);
14f8ab
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
14f8ab
index f3d63ca..6ae4a2b 100644
14f8ab
--- a/xlators/cluster/ec/src/ec-types.h
14f8ab
+++ b/xlators/cluster/ec/src/ec-types.h
14f8ab
@@ -171,6 +171,7 @@ struct _ec_inode {
14f8ab
     gf_boolean_t have_config;
14f8ab
     gf_boolean_t have_version;
14f8ab
     gf_boolean_t have_size;
14f8ab
+    int32_t heal_count;
14f8ab
     ec_config_t config;
14f8ab
     uint64_t pre_version[2];
14f8ab
     uint64_t post_version[2];
14f8ab
-- 
14f8ab
1.8.3.1
14f8ab