74b1de
From bc6588890ce94101a63b861178cf38db5549d8a8 Mon Sep 17 00:00:00 2001
74b1de
From: Ashish Pandey <aspandey@redhat.com>
74b1de
Date: Wed, 28 Nov 2018 11:22:52 +0530
74b1de
Subject: [PATCH 44/52] cluster/ec: Don't enqueue an entry if it is already
74b1de
 healing
74b1de
74b1de
Problem:
74b1de
1 - heal-wait-qlength is by default 128. If shd is disabled
74b1de
and we need to heal files, client side heal is needed.
74b1de
If we access these files that will trigger the heal.
74b1de
However, it has been observed that a file will be enqueued
74b1de
multiple times in the heal wait queue, which in turn causes
74b1de
queue to be filled and prevent other files to be enqueued.
74b1de
74b1de
2 - While a file is going through healing and a write fop from
74b1de
mount comes on that file, it sends write on all the bricks including
74b1de
healing one. At the end it updates version and size on all the
74b1de
bricks. However, it does not unset dirty flag on all the bricks,
74b1de
even if this write fop was successful on all the bricks.
74b1de
After healing completion this dirty flag remain set and never
74b1de
gets cleaned up if SHD is disabled.
74b1de
74b1de
Solution:
74b1de
1 - If an entry is already in queue or going through heal process,
74b1de
don't enqueue next client side request to heal the same file.
74b1de
74b1de
2 - Unset dirty on all the bricks at the end if fop has succeeded on
74b1de
all the bricks even if some of the bricks are going through heal.
74b1de
74b1de
backport of : https://review.gluster.org/#/c/glusterfs/+/21744/
74b1de
74b1de
Change-Id: Ia61ffe230c6502ce6cb934425d55e2f40dd1a727
74b1de
BUG: 1600918
74b1de
Signed-off-by: Ashish Pandey <aspandey@redhat.com>
74b1de
Reviewed-on: https://code.engineering.redhat.com/gerrit/166296
74b1de
Tested-by: RHGS Build Bot <nigelb@redhat.com>
74b1de
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
74b1de
---
74b1de
 tests/bugs/ec/bug-1236065.t         |   1 -
74b1de
 xlators/cluster/ec/src/ec-common.c  |  43 +++++++++------
74b1de
 xlators/cluster/ec/src/ec-common.h  |   8 +++
74b1de
 xlators/cluster/ec/src/ec-heal.c    | 104 +++++++++++++++++++++++++++++++-----
74b1de
 xlators/cluster/ec/src/ec-helpers.c |   1 +
74b1de
 xlators/cluster/ec/src/ec-types.h   |   1 +
74b1de
 6 files changed, 127 insertions(+), 31 deletions(-)
74b1de
74b1de
diff --git a/tests/bugs/ec/bug-1236065.t b/tests/bugs/ec/bug-1236065.t
74b1de
index 76d25d7..9181e73 100644
74b1de
--- a/tests/bugs/ec/bug-1236065.t
74b1de
+++ b/tests/bugs/ec/bug-1236065.t
74b1de
@@ -85,7 +85,6 @@ TEST pidof glusterd
74b1de
 EXPECT "$V0" volinfo_field $V0 'Volume Name'
74b1de
 EXPECT 'Started' volinfo_field $V0 'Status'
74b1de
 EXPECT '7' online_brick_count
74b1de
-
74b1de
 ## cleanup
74b1de
 cd
74b1de
 EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
74b1de
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
74b1de
index 8d65670..5183680 100644
74b1de
--- a/xlators/cluster/ec/src/ec-common.c
74b1de
+++ b/xlators/cluster/ec/src/ec-common.c
74b1de
@@ -313,14 +313,15 @@ ec_check_status(ec_fop_data_t *fop)
74b1de
 
74b1de
     gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS,
74b1de
            "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, "
74b1de
-           "remaining=%s, good=%s, bad=%s)",
74b1de
+           "remaining=%s, good=%s, bad=%s, %s)",
74b1de
            gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes,
74b1de
            ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
74b1de
            ec_bin(str2, sizeof(str2), fop->mask, ec->nodes),
74b1de
            ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes),
74b1de
            ec_bin(str4, sizeof(str4), fop->good, ec->nodes),
74b1de
            ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good),
74b1de
-                  ec->nodes));
74b1de
+                  ec->nodes),
74b1de
+           ec_msg_str(fop));
74b1de
     if (fop->use_fd) {
74b1de
         if (fop->fd != NULL) {
74b1de
             ec_fheal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL,
74b1de
@@ -2371,37 +2372,47 @@ ec_update_info(ec_lock_link_t *link)
74b1de
     uint64_t dirty[2] = {0, 0};
74b1de
     uint64_t size;
74b1de
     ec_t *ec = NULL;
74b1de
+    uintptr_t mask;
74b1de
 
74b1de
     lock = link->lock;
74b1de
     ctx = lock->ctx;
74b1de
     ec = link->fop->xl->private;
74b1de
 
74b1de
     /* pre_version[*] will be 0 if have_version is false */
74b1de
-    version[0] = ctx->post_version[0] - ctx->pre_version[0];
74b1de
-    version[1] = ctx->post_version[1] - ctx->pre_version[1];
74b1de
+    version[EC_DATA_TXN] = ctx->post_version[EC_DATA_TXN] -
74b1de
+                           ctx->pre_version[EC_DATA_TXN];
74b1de
+    version[EC_METADATA_TXN] = ctx->post_version[EC_METADATA_TXN] -
74b1de
+                               ctx->pre_version[EC_METADATA_TXN];
74b1de
 
74b1de
     size = ctx->post_size - ctx->pre_size;
74b1de
     /* If we set the dirty flag for update fop, we have to unset it.
74b1de
      * If fop has failed on some bricks, leave the dirty as marked. */
74b1de
+
74b1de
     if (lock->unlock_now) {
74b1de
+        if (version[EC_DATA_TXN]) {
74b1de
+            /*A data fop will have difference in post and pre version
74b1de
+             *and for data fop we send writes on healing bricks also */
74b1de
+            mask = lock->good_mask | lock->healing;
74b1de
+        } else {
74b1de
+            mask = lock->good_mask;
74b1de
+        }
74b1de
         /* Ensure that nodes are up while doing final
74b1de
          * metadata update.*/
74b1de
-        if (!(ec->node_mask & ~lock->good_mask) &&
74b1de
-            !(ec->node_mask & ~ec->xl_up)) {
74b1de
-            if (ctx->dirty[0] != 0) {
74b1de
-                dirty[0] = -1;
74b1de
+        if (!(ec->node_mask & ~(mask)) && !(ec->node_mask & ~ec->xl_up)) {
74b1de
+            if (ctx->dirty[EC_DATA_TXN] != 0) {
74b1de
+                dirty[EC_DATA_TXN] = -1;
74b1de
             }
74b1de
-            if (ctx->dirty[1] != 0) {
74b1de
-                dirty[1] = -1;
74b1de
+            if (ctx->dirty[EC_METADATA_TXN] != 0) {
74b1de
+                dirty[EC_METADATA_TXN] = -1;
74b1de
             }
74b1de
             /*If everything is fine and we already
74b1de
              *have version xattr set on entry, there
74b1de
              *is no need to update version again*/
74b1de
-            if (ctx->pre_version[0]) {
74b1de
-                version[0] = 0;
74b1de
+            if (ctx->pre_version[EC_DATA_TXN]) {
74b1de
+                version[EC_DATA_TXN] = 0;
74b1de
             }
74b1de
-            if (ctx->pre_version[1]) {
74b1de
-                version[1] = 0;
74b1de
+            if (ctx->pre_version[EC_METADATA_TXN]) {
74b1de
+                version[EC_METADATA_TXN] = 0;
74b1de
             }
74b1de
         } else {
74b1de
             link->optimistic_changelog = _gf_false;
74b1de
@@ -2410,8 +2421,8 @@ ec_update_info(ec_lock_link_t *link)
74b1de
         memset(ctx->dirty, 0, sizeof(ctx->dirty));
74b1de
     }
74b1de
 
74b1de
-    if ((version[0] != 0) || (version[1] != 0) || (dirty[0] != 0) ||
74b1de
-        (dirty[1] != 0)) {
74b1de
+    if ((version[EC_DATA_TXN] != 0) || (version[EC_METADATA_TXN] != 0) ||
74b1de
+        (dirty[EC_DATA_TXN] != 0) || (dirty[EC_METADATA_TXN] != 0)) {
74b1de
         ec_update_size_version(link, version, size, dirty);
74b1de
         return _gf_true;
74b1de
     }
74b1de
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
74b1de
index 115e147..54aaa77 100644
74b1de
--- a/xlators/cluster/ec/src/ec-common.h
74b1de
+++ b/xlators/cluster/ec/src/ec-common.h
74b1de
@@ -190,4 +190,12 @@ ec_lock_unlocked(call_frame_t *frame, void *cookie, xlator_t *this,
74b1de
 void
74b1de
 ec_update_fd_status(fd_t *fd, xlator_t *xl, int child_index,
74b1de
                     int32_t ret_status);
74b1de
+gf_boolean_t
74b1de
+ec_is_entry_healing(ec_fop_data_t *fop);
74b1de
+void
74b1de
+ec_set_entry_healing(ec_fop_data_t *fop);
74b1de
+void
74b1de
+ec_reset_entry_healing(ec_fop_data_t *fop);
74b1de
+char *
74b1de
+ec_msg_str(ec_fop_data_t *fop);
74b1de
 #endif /* __EC_COMMON_H__ */
74b1de
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
74b1de
index eaf80e0..1ca12c1 100644
74b1de
--- a/xlators/cluster/ec/src/ec-heal.c
74b1de
+++ b/xlators/cluster/ec/src/ec-heal.c
74b1de
@@ -103,6 +103,48 @@ ec_sh_key_match(dict_t *dict, char *key, data_t *val, void *mdata)
74b1de
 }
74b1de
 /* FOP: heal */
74b1de
 
74b1de
+void
74b1de
+ec_set_entry_healing(ec_fop_data_t *fop)
74b1de
+{
74b1de
+    ec_inode_t *ctx = NULL;
74b1de
+    loc_t *loc = NULL;
74b1de
+
74b1de
+    if (!fop)
74b1de
+        return;
74b1de
+
74b1de
+    loc = &fop->loc[0];
74b1de
+    LOCK(&loc->inode->lock);
74b1de
+    {
74b1de
+        ctx = __ec_inode_get(loc->inode, fop->xl);
74b1de
+        if (ctx) {
74b1de
+            ctx->heal_count += 1;
74b1de
+        }
74b1de
+    }
74b1de
+    UNLOCK(&loc->inode->lock);
74b1de
+}
74b1de
+
74b1de
+void
74b1de
+ec_reset_entry_healing(ec_fop_data_t *fop)
74b1de
+{
74b1de
+    ec_inode_t *ctx = NULL;
74b1de
+    loc_t *loc = NULL;
74b1de
+    int32_t heal_count = 0;
74b1de
+    if (!fop)
74b1de
+        return;
74b1de
+
74b1de
+    loc = &fop->loc[0];
74b1de
+    LOCK(&loc->inode->lock);
74b1de
+    {
74b1de
+        ctx = __ec_inode_get(loc->inode, fop->xl);
74b1de
+        if (ctx) {
74b1de
+            ctx->heal_count += -1;
74b1de
+            heal_count = ctx->heal_count;
74b1de
+        }
74b1de
+    }
74b1de
+    UNLOCK(&loc->inode->lock);
74b1de
+    GF_ASSERT(heal_count >= 0);
74b1de
+}
74b1de
+
74b1de
 uintptr_t
74b1de
 ec_heal_check(ec_fop_data_t *fop, uintptr_t *pgood)
74b1de
 {
74b1de
@@ -2507,17 +2549,6 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
74b1de
                "Heal is not required for : %s ", uuid_utoa(loc->gfid));
74b1de
         goto out;
74b1de
     }
74b1de
-
74b1de
-    msources = alloca0(ec->nodes);
74b1de
-    mhealed_sinks = alloca0(ec->nodes);
74b1de
-    ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks);
74b1de
-    if (ret == 0) {
74b1de
-        mgood = ec_char_array_to_mask(msources, ec->nodes);
74b1de
-        mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes);
74b1de
-    } else {
74b1de
-        op_ret = -1;
74b1de
-        op_errno = -ret;
74b1de
-    }
74b1de
     sources = alloca0(ec->nodes);
74b1de
     healed_sinks = alloca0(ec->nodes);
74b1de
     if (IA_ISREG(loc->inode->ia_type)) {
74b1de
@@ -2538,8 +2569,19 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
74b1de
         op_ret = -1;
74b1de
         op_errno = -ret;
74b1de
     }
74b1de
+    msources = alloca0(ec->nodes);
74b1de
+    mhealed_sinks = alloca0(ec->nodes);
74b1de
+    ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks);
74b1de
+    if (ret == 0) {
74b1de
+        mgood = ec_char_array_to_mask(msources, ec->nodes);
74b1de
+        mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes);
74b1de
+    } else {
74b1de
+        op_ret = -1;
74b1de
+        op_errno = -ret;
74b1de
+    }
74b1de
 
74b1de
 out:
74b1de
+    ec_reset_entry_healing(fop);
74b1de
     if (fop->cbks.heal) {
74b1de
         fop->cbks.heal(fop->req_frame, fop, fop->xl, op_ret, op_errno,
74b1de
                        ec_char_array_to_mask(participants, ec->nodes),
74b1de
@@ -2650,11 +2692,33 @@ ec_handle_healers_done(ec_fop_data_t *fop)
74b1de
         ec_launch_heal(ec, heal_fop);
74b1de
 }
74b1de
 
74b1de
+gf_boolean_t
74b1de
+ec_is_entry_healing(ec_fop_data_t *fop)
74b1de
+{
74b1de
+    ec_inode_t *ctx = NULL;
74b1de
+    int32_t heal_count = 0;
74b1de
+    loc_t *loc = NULL;
74b1de
+
74b1de
+    loc = &fop->loc[0];
74b1de
+
74b1de
+    LOCK(&loc->inode->lock);
74b1de
+    {
74b1de
+        ctx = __ec_inode_get(loc->inode, fop->xl);
74b1de
+        if (ctx) {
74b1de
+            heal_count = ctx->heal_count;
74b1de
+        }
74b1de
+    }
74b1de
+    UNLOCK(&loc->inode->lock);
74b1de
+    GF_ASSERT(heal_count >= 0);
74b1de
+    return heal_count;
74b1de
+}
74b1de
+
74b1de
 void
74b1de
 ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
74b1de
 {
74b1de
     gf_boolean_t can_heal = _gf_true;
74b1de
     ec_t *ec = this->private;
74b1de
+    ec_fop_data_t *fop_rel = NULL;
74b1de
 
74b1de
     if (fop->req_frame == NULL) {
74b1de
         LOCK(&ec->lock);
74b1de
@@ -2662,8 +2726,13 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
74b1de
             if ((ec->background_heals > 0) &&
74b1de
                 (ec->heal_wait_qlen + ec->background_heals) >
74b1de
                     (ec->heal_waiters + ec->healers)) {
74b1de
-                list_add_tail(&fop->healer, &ec->heal_waiting);
74b1de
-                ec->heal_waiters++;
74b1de
+                if (!ec_is_entry_healing(fop)) {
74b1de
+                    list_add_tail(&fop->healer, &ec->heal_waiting);
74b1de
+                    ec->heal_waiters++;
74b1de
+                    ec_set_entry_healing(fop);
74b1de
+                } else {
74b1de
+                    fop_rel = fop;
74b1de
+                }
74b1de
                 fop = __ec_dequeue_heals(ec);
74b1de
             } else {
74b1de
                 can_heal = _gf_false;
74b1de
@@ -2673,8 +2742,12 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
74b1de
     }
74b1de
 
74b1de
     if (can_heal) {
74b1de
-        if (fop)
74b1de
+        if (fop) {
74b1de
+            if (fop->req_frame != NULL) {
74b1de
+                ec_set_entry_healing(fop);
74b1de
+            }
74b1de
             ec_launch_heal(ec, fop);
74b1de
+        }
74b1de
     } else {
74b1de
         gf_msg_debug(this->name, 0,
74b1de
                      "Max number of heals are "
74b1de
@@ -2682,6 +2755,9 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
74b1de
         ec_fop_set_error(fop, EBUSY);
74b1de
         ec_heal_fail(ec, fop);
74b1de
     }
74b1de
+    if (fop_rel) {
74b1de
+        ec_heal_done(0, NULL, fop_rel);
74b1de
+    }
74b1de
 }
74b1de
 
74b1de
 void
74b1de
diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c
74b1de
index e6b0359..43f6e3b 100644
74b1de
--- a/xlators/cluster/ec/src/ec-helpers.c
74b1de
+++ b/xlators/cluster/ec/src/ec-helpers.c
74b1de
@@ -717,6 +717,7 @@ __ec_inode_get(inode_t *inode, xlator_t *xl)
74b1de
             memset(ctx, 0, sizeof(*ctx));
74b1de
             INIT_LIST_HEAD(&ctx->heal);
74b1de
             INIT_LIST_HEAD(&ctx->stripe_cache.lru);
74b1de
+            ctx->heal_count = 0;
74b1de
             value = (uint64_t)(uintptr_t)ctx;
74b1de
             if (__inode_ctx_set(inode, xl, &value) != 0) {
74b1de
                 GF_FREE(ctx);
74b1de
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
74b1de
index f3d63ca..6ae4a2b 100644
74b1de
--- a/xlators/cluster/ec/src/ec-types.h
74b1de
+++ b/xlators/cluster/ec/src/ec-types.h
74b1de
@@ -171,6 +171,7 @@ struct _ec_inode {
74b1de
     gf_boolean_t have_config;
74b1de
     gf_boolean_t have_version;
74b1de
     gf_boolean_t have_size;
74b1de
+    int32_t heal_count;
74b1de
     ec_config_t config;
74b1de
     uint64_t pre_version[2];
74b1de
     uint64_t post_version[2];
74b1de
-- 
74b1de
1.8.3.1
74b1de