e3c68b
From bc6588890ce94101a63b861178cf38db5549d8a8 Mon Sep 17 00:00:00 2001
e3c68b
From: Ashish Pandey <aspandey@redhat.com>
e3c68b
Date: Wed, 28 Nov 2018 11:22:52 +0530
e3c68b
Subject: [PATCH 44/52] cluster/ec: Don't enqueue an entry if it is already
e3c68b
 healing
e3c68b
e3c68b
Problem:
e3c68b
1 - heal-wait-qlength is by default 128. If shd is disabled
e3c68b
and we need to heal files, client side heal is needed.
e3c68b
If we access these files that will trigger the heal.
e3c68b
However, it has been observed that a file will be enqueued
e3c68b
multiple times in the heal wait queue, which in turn causes
e3c68b
queue to be filled and prevent other files to be enqueued.
e3c68b
e3c68b
2 - While a file is going through healing and a write fop from
e3c68b
mount comes on that file, it sends write on all the bricks including
e3c68b
healing one. At the end it updates version and size on all the
e3c68b
bricks. However, it does not unset dirty flag on all the bricks,
e3c68b
even if this write fop was successful on all the bricks.
e3c68b
After healing completion this dirty flag remain set and never
e3c68b
gets cleaned up if SHD is disabled.
e3c68b
e3c68b
Solution:
e3c68b
1 - If an entry is already in queue or going through heal process,
e3c68b
don't enqueue next client side request to heal the same file.
e3c68b
e3c68b
2 - Unset dirty on all the bricks at the end if fop has succeeded on
e3c68b
all the bricks even if some of the bricks are going through heal.
e3c68b
e3c68b
backport of : https://review.gluster.org/#/c/glusterfs/+/21744/
e3c68b
e3c68b
Change-Id: Ia61ffe230c6502ce6cb934425d55e2f40dd1a727
e3c68b
BUG: 1600918
e3c68b
Signed-off-by: Ashish Pandey <aspandey@redhat.com>
e3c68b
Reviewed-on: https://code.engineering.redhat.com/gerrit/166296
e3c68b
Tested-by: RHGS Build Bot <nigelb@redhat.com>
e3c68b
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
e3c68b
---
e3c68b
 tests/bugs/ec/bug-1236065.t         |   1 -
e3c68b
 xlators/cluster/ec/src/ec-common.c  |  43 +++++++++------
e3c68b
 xlators/cluster/ec/src/ec-common.h  |   8 +++
e3c68b
 xlators/cluster/ec/src/ec-heal.c    | 104 +++++++++++++++++++++++++++++++-----
e3c68b
 xlators/cluster/ec/src/ec-helpers.c |   1 +
e3c68b
 xlators/cluster/ec/src/ec-types.h   |   1 +
e3c68b
 6 files changed, 127 insertions(+), 31 deletions(-)
e3c68b
e3c68b
diff --git a/tests/bugs/ec/bug-1236065.t b/tests/bugs/ec/bug-1236065.t
e3c68b
index 76d25d7..9181e73 100644
e3c68b
--- a/tests/bugs/ec/bug-1236065.t
e3c68b
+++ b/tests/bugs/ec/bug-1236065.t
e3c68b
@@ -85,7 +85,6 @@ TEST pidof glusterd
e3c68b
 EXPECT "$V0" volinfo_field $V0 'Volume Name'
e3c68b
 EXPECT 'Started' volinfo_field $V0 'Status'
e3c68b
 EXPECT '7' online_brick_count
e3c68b
-
e3c68b
 ## cleanup
e3c68b
 cd
e3c68b
 EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
e3c68b
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
e3c68b
index 8d65670..5183680 100644
e3c68b
--- a/xlators/cluster/ec/src/ec-common.c
e3c68b
+++ b/xlators/cluster/ec/src/ec-common.c
e3c68b
@@ -313,14 +313,15 @@ ec_check_status(ec_fop_data_t *fop)
e3c68b
 
e3c68b
     gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS,
e3c68b
            "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, "
e3c68b
-           "remaining=%s, good=%s, bad=%s)",
e3c68b
+           "remaining=%s, good=%s, bad=%s, %s)",
e3c68b
            gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes,
e3c68b
            ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
e3c68b
            ec_bin(str2, sizeof(str2), fop->mask, ec->nodes),
e3c68b
            ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes),
e3c68b
            ec_bin(str4, sizeof(str4), fop->good, ec->nodes),
e3c68b
            ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good),
e3c68b
-                  ec->nodes));
e3c68b
+                  ec->nodes),
e3c68b
+           ec_msg_str(fop));
e3c68b
     if (fop->use_fd) {
e3c68b
         if (fop->fd != NULL) {
e3c68b
             ec_fheal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL,
e3c68b
@@ -2371,37 +2372,47 @@ ec_update_info(ec_lock_link_t *link)
e3c68b
     uint64_t dirty[2] = {0, 0};
e3c68b
     uint64_t size;
e3c68b
     ec_t *ec = NULL;
e3c68b
+    uintptr_t mask;
e3c68b
 
e3c68b
     lock = link->lock;
e3c68b
     ctx = lock->ctx;
e3c68b
     ec = link->fop->xl->private;
e3c68b
 
e3c68b
     /* pre_version[*] will be 0 if have_version is false */
e3c68b
-    version[0] = ctx->post_version[0] - ctx->pre_version[0];
e3c68b
-    version[1] = ctx->post_version[1] - ctx->pre_version[1];
e3c68b
+    version[EC_DATA_TXN] = ctx->post_version[EC_DATA_TXN] -
e3c68b
+                           ctx->pre_version[EC_DATA_TXN];
e3c68b
+    version[EC_METADATA_TXN] = ctx->post_version[EC_METADATA_TXN] -
e3c68b
+                               ctx->pre_version[EC_METADATA_TXN];
e3c68b
 
e3c68b
     size = ctx->post_size - ctx->pre_size;
e3c68b
     /* If we set the dirty flag for update fop, we have to unset it.
e3c68b
      * If fop has failed on some bricks, leave the dirty as marked. */
e3c68b
+
e3c68b
     if (lock->unlock_now) {
e3c68b
+        if (version[EC_DATA_TXN]) {
e3c68b
+            /*A data fop will have difference in post and pre version
e3c68b
+             *and for data fop we send writes on healing bricks also */
e3c68b
+            mask = lock->good_mask | lock->healing;
e3c68b
+        } else {
e3c68b
+            mask = lock->good_mask;
e3c68b
+        }
e3c68b
         /* Ensure that nodes are up while doing final
e3c68b
          * metadata update.*/
e3c68b
-        if (!(ec->node_mask & ~lock->good_mask) &&
e3c68b
-            !(ec->node_mask & ~ec->xl_up)) {
e3c68b
-            if (ctx->dirty[0] != 0) {
e3c68b
-                dirty[0] = -1;
e3c68b
+        if (!(ec->node_mask & ~(mask)) && !(ec->node_mask & ~ec->xl_up)) {
e3c68b
+            if (ctx->dirty[EC_DATA_TXN] != 0) {
e3c68b
+                dirty[EC_DATA_TXN] = -1;
e3c68b
             }
e3c68b
-            if (ctx->dirty[1] != 0) {
e3c68b
-                dirty[1] = -1;
e3c68b
+            if (ctx->dirty[EC_METADATA_TXN] != 0) {
e3c68b
+                dirty[EC_METADATA_TXN] = -1;
e3c68b
             }
e3c68b
             /*If everything is fine and we already
e3c68b
              *have version xattr set on entry, there
e3c68b
              *is no need to update version again*/
e3c68b
-            if (ctx->pre_version[0]) {
e3c68b
-                version[0] = 0;
e3c68b
+            if (ctx->pre_version[EC_DATA_TXN]) {
e3c68b
+                version[EC_DATA_TXN] = 0;
e3c68b
             }
e3c68b
-            if (ctx->pre_version[1]) {
e3c68b
-                version[1] = 0;
e3c68b
+            if (ctx->pre_version[EC_METADATA_TXN]) {
e3c68b
+                version[EC_METADATA_TXN] = 0;
e3c68b
             }
e3c68b
         } else {
e3c68b
             link->optimistic_changelog = _gf_false;
e3c68b
@@ -2410,8 +2421,8 @@ ec_update_info(ec_lock_link_t *link)
e3c68b
         memset(ctx->dirty, 0, sizeof(ctx->dirty));
e3c68b
     }
e3c68b
 
e3c68b
-    if ((version[0] != 0) || (version[1] != 0) || (dirty[0] != 0) ||
e3c68b
-        (dirty[1] != 0)) {
e3c68b
+    if ((version[EC_DATA_TXN] != 0) || (version[EC_METADATA_TXN] != 0) ||
e3c68b
+        (dirty[EC_DATA_TXN] != 0) || (dirty[EC_METADATA_TXN] != 0)) {
e3c68b
         ec_update_size_version(link, version, size, dirty);
e3c68b
         return _gf_true;
e3c68b
     }
e3c68b
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
e3c68b
index 115e147..54aaa77 100644
e3c68b
--- a/xlators/cluster/ec/src/ec-common.h
e3c68b
+++ b/xlators/cluster/ec/src/ec-common.h
e3c68b
@@ -190,4 +190,12 @@ ec_lock_unlocked(call_frame_t *frame, void *cookie, xlator_t *this,
e3c68b
 void
e3c68b
 ec_update_fd_status(fd_t *fd, xlator_t *xl, int child_index,
e3c68b
                     int32_t ret_status);
e3c68b
+gf_boolean_t
e3c68b
+ec_is_entry_healing(ec_fop_data_t *fop);
e3c68b
+void
e3c68b
+ec_set_entry_healing(ec_fop_data_t *fop);
e3c68b
+void
e3c68b
+ec_reset_entry_healing(ec_fop_data_t *fop);
e3c68b
+char *
e3c68b
+ec_msg_str(ec_fop_data_t *fop);
e3c68b
 #endif /* __EC_COMMON_H__ */
e3c68b
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
e3c68b
index eaf80e0..1ca12c1 100644
e3c68b
--- a/xlators/cluster/ec/src/ec-heal.c
e3c68b
+++ b/xlators/cluster/ec/src/ec-heal.c
e3c68b
@@ -103,6 +103,48 @@ ec_sh_key_match(dict_t *dict, char *key, data_t *val, void *mdata)
e3c68b
 }
e3c68b
 /* FOP: heal */
e3c68b
 
e3c68b
+void
e3c68b
+ec_set_entry_healing(ec_fop_data_t *fop)
e3c68b
+{
e3c68b
+    ec_inode_t *ctx = NULL;
e3c68b
+    loc_t *loc = NULL;
e3c68b
+
e3c68b
+    if (!fop)
e3c68b
+        return;
e3c68b
+
e3c68b
+    loc = &fop->loc[0];
e3c68b
+    LOCK(&loc->inode->lock);
e3c68b
+    {
e3c68b
+        ctx = __ec_inode_get(loc->inode, fop->xl);
e3c68b
+        if (ctx) {
e3c68b
+            ctx->heal_count += 1;
e3c68b
+        }
e3c68b
+    }
e3c68b
+    UNLOCK(&loc->inode->lock);
e3c68b
+}
e3c68b
+
e3c68b
+void
e3c68b
+ec_reset_entry_healing(ec_fop_data_t *fop)
e3c68b
+{
e3c68b
+    ec_inode_t *ctx = NULL;
e3c68b
+    loc_t *loc = NULL;
e3c68b
+    int32_t heal_count = 0;
e3c68b
+    if (!fop)
e3c68b
+        return;
e3c68b
+
e3c68b
+    loc = &fop->loc[0];
e3c68b
+    LOCK(&loc->inode->lock);
e3c68b
+    {
e3c68b
+        ctx = __ec_inode_get(loc->inode, fop->xl);
e3c68b
+        if (ctx) {
e3c68b
+            ctx->heal_count += -1;
e3c68b
+            heal_count = ctx->heal_count;
e3c68b
+        }
e3c68b
+    }
e3c68b
+    UNLOCK(&loc->inode->lock);
e3c68b
+    GF_ASSERT(heal_count >= 0);
e3c68b
+}
e3c68b
+
e3c68b
 uintptr_t
e3c68b
 ec_heal_check(ec_fop_data_t *fop, uintptr_t *pgood)
e3c68b
 {
e3c68b
@@ -2507,17 +2549,6 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
e3c68b
                "Heal is not required for : %s ", uuid_utoa(loc->gfid));
e3c68b
         goto out;
e3c68b
     }
e3c68b
-
e3c68b
-    msources = alloca0(ec->nodes);
e3c68b
-    mhealed_sinks = alloca0(ec->nodes);
e3c68b
-    ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks);
e3c68b
-    if (ret == 0) {
e3c68b
-        mgood = ec_char_array_to_mask(msources, ec->nodes);
e3c68b
-        mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes);
e3c68b
-    } else {
e3c68b
-        op_ret = -1;
e3c68b
-        op_errno = -ret;
e3c68b
-    }
e3c68b
     sources = alloca0(ec->nodes);
e3c68b
     healed_sinks = alloca0(ec->nodes);
e3c68b
     if (IA_ISREG(loc->inode->ia_type)) {
e3c68b
@@ -2538,8 +2569,19 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
e3c68b
         op_ret = -1;
e3c68b
         op_errno = -ret;
e3c68b
     }
e3c68b
+    msources = alloca0(ec->nodes);
e3c68b
+    mhealed_sinks = alloca0(ec->nodes);
e3c68b
+    ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks);
e3c68b
+    if (ret == 0) {
e3c68b
+        mgood = ec_char_array_to_mask(msources, ec->nodes);
e3c68b
+        mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes);
e3c68b
+    } else {
e3c68b
+        op_ret = -1;
e3c68b
+        op_errno = -ret;
e3c68b
+    }
e3c68b
 
e3c68b
 out:
e3c68b
+    ec_reset_entry_healing(fop);
e3c68b
     if (fop->cbks.heal) {
e3c68b
         fop->cbks.heal(fop->req_frame, fop, fop->xl, op_ret, op_errno,
e3c68b
                        ec_char_array_to_mask(participants, ec->nodes),
e3c68b
@@ -2650,11 +2692,33 @@ ec_handle_healers_done(ec_fop_data_t *fop)
e3c68b
         ec_launch_heal(ec, heal_fop);
e3c68b
 }
e3c68b
 
e3c68b
+gf_boolean_t
e3c68b
+ec_is_entry_healing(ec_fop_data_t *fop)
e3c68b
+{
e3c68b
+    ec_inode_t *ctx = NULL;
e3c68b
+    int32_t heal_count = 0;
e3c68b
+    loc_t *loc = NULL;
e3c68b
+
e3c68b
+    loc = &fop->loc[0];
e3c68b
+
e3c68b
+    LOCK(&loc->inode->lock);
e3c68b
+    {
e3c68b
+        ctx = __ec_inode_get(loc->inode, fop->xl);
e3c68b
+        if (ctx) {
e3c68b
+            heal_count = ctx->heal_count;
e3c68b
+        }
e3c68b
+    }
e3c68b
+    UNLOCK(&loc->inode->lock);
e3c68b
+    GF_ASSERT(heal_count >= 0);
e3c68b
+    return heal_count;
e3c68b
+}
e3c68b
+
e3c68b
 void
e3c68b
 ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
e3c68b
 {
e3c68b
     gf_boolean_t can_heal = _gf_true;
e3c68b
     ec_t *ec = this->private;
e3c68b
+    ec_fop_data_t *fop_rel = NULL;
e3c68b
 
e3c68b
     if (fop->req_frame == NULL) {
e3c68b
         LOCK(&ec->lock);
e3c68b
@@ -2662,8 +2726,13 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
e3c68b
             if ((ec->background_heals > 0) &&
e3c68b
                 (ec->heal_wait_qlen + ec->background_heals) >
e3c68b
                     (ec->heal_waiters + ec->healers)) {
e3c68b
-                list_add_tail(&fop->healer, &ec->heal_waiting);
e3c68b
-                ec->heal_waiters++;
e3c68b
+                if (!ec_is_entry_healing(fop)) {
e3c68b
+                    list_add_tail(&fop->healer, &ec->heal_waiting);
e3c68b
+                    ec->heal_waiters++;
e3c68b
+                    ec_set_entry_healing(fop);
e3c68b
+                } else {
e3c68b
+                    fop_rel = fop;
e3c68b
+                }
e3c68b
                 fop = __ec_dequeue_heals(ec);
e3c68b
             } else {
e3c68b
                 can_heal = _gf_false;
e3c68b
@@ -2673,8 +2742,12 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
e3c68b
     }
e3c68b
 
e3c68b
     if (can_heal) {
e3c68b
-        if (fop)
e3c68b
+        if (fop) {
e3c68b
+            if (fop->req_frame != NULL) {
e3c68b
+                ec_set_entry_healing(fop);
e3c68b
+            }
e3c68b
             ec_launch_heal(ec, fop);
e3c68b
+        }
e3c68b
     } else {
e3c68b
         gf_msg_debug(this->name, 0,
e3c68b
                      "Max number of heals are "
e3c68b
@@ -2682,6 +2755,9 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
e3c68b
         ec_fop_set_error(fop, EBUSY);
e3c68b
         ec_heal_fail(ec, fop);
e3c68b
     }
e3c68b
+    if (fop_rel) {
e3c68b
+        ec_heal_done(0, NULL, fop_rel);
e3c68b
+    }
e3c68b
 }
e3c68b
 
e3c68b
 void
e3c68b
diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c
e3c68b
index e6b0359..43f6e3b 100644
e3c68b
--- a/xlators/cluster/ec/src/ec-helpers.c
e3c68b
+++ b/xlators/cluster/ec/src/ec-helpers.c
e3c68b
@@ -717,6 +717,7 @@ __ec_inode_get(inode_t *inode, xlator_t *xl)
e3c68b
             memset(ctx, 0, sizeof(*ctx));
e3c68b
             INIT_LIST_HEAD(&ctx->heal);
e3c68b
             INIT_LIST_HEAD(&ctx->stripe_cache.lru);
e3c68b
+            ctx->heal_count = 0;
e3c68b
             value = (uint64_t)(uintptr_t)ctx;
e3c68b
             if (__inode_ctx_set(inode, xl, &value) != 0) {
e3c68b
                 GF_FREE(ctx);
e3c68b
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
e3c68b
index f3d63ca..6ae4a2b 100644
e3c68b
--- a/xlators/cluster/ec/src/ec-types.h
e3c68b
+++ b/xlators/cluster/ec/src/ec-types.h
e3c68b
@@ -171,6 +171,7 @@ struct _ec_inode {
e3c68b
     gf_boolean_t have_config;
e3c68b
     gf_boolean_t have_version;
e3c68b
     gf_boolean_t have_size;
e3c68b
+    int32_t heal_count;
e3c68b
     ec_config_t config;
e3c68b
     uint64_t pre_version[2];
e3c68b
     uint64_t post_version[2];
e3c68b
-- 
e3c68b
1.8.3.1
e3c68b