50dc83
From bc6588890ce94101a63b861178cf38db5549d8a8 Mon Sep 17 00:00:00 2001
50dc83
From: Ashish Pandey <aspandey@redhat.com>
50dc83
Date: Wed, 28 Nov 2018 11:22:52 +0530
50dc83
Subject: [PATCH 44/52] cluster/ec: Don't enqueue an entry if it is already
50dc83
 healing
50dc83
50dc83
Problem:
50dc83
1 - heal-wait-qlength is by default 128. If shd is disabled
50dc83
and we need to heal files, client side heal is needed.
50dc83
If we access these files that will trigger the heal.
50dc83
However, it has been observed that a file will be enqueued
50dc83
multiple times in the heal wait queue, which in turn causes
50dc83
queue to be filled and prevent other files to be enqueued.
50dc83
50dc83
2 - While a file is going through healing and a write fop from
50dc83
mount comes on that file, it sends write on all the bricks including
50dc83
healing one. At the end it updates version and size on all the
50dc83
bricks. However, it does not unset dirty flag on all the bricks,
50dc83
even if this write fop was successful on all the bricks.
50dc83
After healing completion this dirty flag remain set and never
50dc83
gets cleaned up if SHD is disabled.
50dc83
50dc83
Solution:
50dc83
1 - If an entry is already in queue or going through heal process,
50dc83
don't enqueue next client side request to heal the same file.
50dc83
50dc83
2 - Unset dirty on all the bricks at the end if fop has succeeded on
50dc83
all the bricks even if some of the bricks are going through heal.
50dc83
50dc83
backport of : https://review.gluster.org/#/c/glusterfs/+/21744/
50dc83
50dc83
Change-Id: Ia61ffe230c6502ce6cb934425d55e2f40dd1a727
50dc83
BUG: 1600918
50dc83
Signed-off-by: Ashish Pandey <aspandey@redhat.com>
50dc83
Reviewed-on: https://code.engineering.redhat.com/gerrit/166296
50dc83
Tested-by: RHGS Build Bot <nigelb@redhat.com>
50dc83
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
50dc83
---
50dc83
 tests/bugs/ec/bug-1236065.t         |   1 -
50dc83
 xlators/cluster/ec/src/ec-common.c  |  43 +++++++++------
50dc83
 xlators/cluster/ec/src/ec-common.h  |   8 +++
50dc83
 xlators/cluster/ec/src/ec-heal.c    | 104 +++++++++++++++++++++++++++++++-----
50dc83
 xlators/cluster/ec/src/ec-helpers.c |   1 +
50dc83
 xlators/cluster/ec/src/ec-types.h   |   1 +
50dc83
 6 files changed, 127 insertions(+), 31 deletions(-)
50dc83
50dc83
diff --git a/tests/bugs/ec/bug-1236065.t b/tests/bugs/ec/bug-1236065.t
50dc83
index 76d25d7..9181e73 100644
50dc83
--- a/tests/bugs/ec/bug-1236065.t
50dc83
+++ b/tests/bugs/ec/bug-1236065.t
50dc83
@@ -85,7 +85,6 @@ TEST pidof glusterd
50dc83
 EXPECT "$V0" volinfo_field $V0 'Volume Name'
50dc83
 EXPECT 'Started' volinfo_field $V0 'Status'
50dc83
 EXPECT '7' online_brick_count
50dc83
-
50dc83
 ## cleanup
50dc83
 cd
50dc83
 EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
50dc83
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
50dc83
index 8d65670..5183680 100644
50dc83
--- a/xlators/cluster/ec/src/ec-common.c
50dc83
+++ b/xlators/cluster/ec/src/ec-common.c
50dc83
@@ -313,14 +313,15 @@ ec_check_status(ec_fop_data_t *fop)
50dc83
 
50dc83
     gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS,
50dc83
            "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, "
50dc83
-           "remaining=%s, good=%s, bad=%s)",
50dc83
+           "remaining=%s, good=%s, bad=%s, %s)",
50dc83
            gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes,
50dc83
            ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
50dc83
            ec_bin(str2, sizeof(str2), fop->mask, ec->nodes),
50dc83
            ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes),
50dc83
            ec_bin(str4, sizeof(str4), fop->good, ec->nodes),
50dc83
            ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good),
50dc83
-                  ec->nodes));
50dc83
+                  ec->nodes),
50dc83
+           ec_msg_str(fop));
50dc83
     if (fop->use_fd) {
50dc83
         if (fop->fd != NULL) {
50dc83
             ec_fheal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL,
50dc83
@@ -2371,37 +2372,47 @@ ec_update_info(ec_lock_link_t *link)
50dc83
     uint64_t dirty[2] = {0, 0};
50dc83
     uint64_t size;
50dc83
     ec_t *ec = NULL;
50dc83
+    uintptr_t mask;
50dc83
 
50dc83
     lock = link->lock;
50dc83
     ctx = lock->ctx;
50dc83
     ec = link->fop->xl->private;
50dc83
 
50dc83
     /* pre_version[*] will be 0 if have_version is false */
50dc83
-    version[0] = ctx->post_version[0] - ctx->pre_version[0];
50dc83
-    version[1] = ctx->post_version[1] - ctx->pre_version[1];
50dc83
+    version[EC_DATA_TXN] = ctx->post_version[EC_DATA_TXN] -
50dc83
+                           ctx->pre_version[EC_DATA_TXN];
50dc83
+    version[EC_METADATA_TXN] = ctx->post_version[EC_METADATA_TXN] -
50dc83
+                               ctx->pre_version[EC_METADATA_TXN];
50dc83
 
50dc83
     size = ctx->post_size - ctx->pre_size;
50dc83
     /* If we set the dirty flag for update fop, we have to unset it.
50dc83
      * If fop has failed on some bricks, leave the dirty as marked. */
50dc83
+
50dc83
     if (lock->unlock_now) {
50dc83
+        if (version[EC_DATA_TXN]) {
50dc83
+            /*A data fop will have difference in post and pre version
50dc83
+             *and for data fop we send writes on healing bricks also */
50dc83
+            mask = lock->good_mask | lock->healing;
50dc83
+        } else {
50dc83
+            mask = lock->good_mask;
50dc83
+        }
50dc83
         /* Ensure that nodes are up while doing final
50dc83
          * metadata update.*/
50dc83
-        if (!(ec->node_mask & ~lock->good_mask) &&
50dc83
-            !(ec->node_mask & ~ec->xl_up)) {
50dc83
-            if (ctx->dirty[0] != 0) {
50dc83
-                dirty[0] = -1;
50dc83
+        if (!(ec->node_mask & ~(mask)) && !(ec->node_mask & ~ec->xl_up)) {
50dc83
+            if (ctx->dirty[EC_DATA_TXN] != 0) {
50dc83
+                dirty[EC_DATA_TXN] = -1;
50dc83
             }
50dc83
-            if (ctx->dirty[1] != 0) {
50dc83
-                dirty[1] = -1;
50dc83
+            if (ctx->dirty[EC_METADATA_TXN] != 0) {
50dc83
+                dirty[EC_METADATA_TXN] = -1;
50dc83
             }
50dc83
             /*If everything is fine and we already
50dc83
              *have version xattr set on entry, there
50dc83
              *is no need to update version again*/
50dc83
-            if (ctx->pre_version[0]) {
50dc83
-                version[0] = 0;
50dc83
+            if (ctx->pre_version[EC_DATA_TXN]) {
50dc83
+                version[EC_DATA_TXN] = 0;
50dc83
             }
50dc83
-            if (ctx->pre_version[1]) {
50dc83
-                version[1] = 0;
50dc83
+            if (ctx->pre_version[EC_METADATA_TXN]) {
50dc83
+                version[EC_METADATA_TXN] = 0;
50dc83
             }
50dc83
         } else {
50dc83
             link->optimistic_changelog = _gf_false;
50dc83
@@ -2410,8 +2421,8 @@ ec_update_info(ec_lock_link_t *link)
50dc83
         memset(ctx->dirty, 0, sizeof(ctx->dirty));
50dc83
     }
50dc83
 
50dc83
-    if ((version[0] != 0) || (version[1] != 0) || (dirty[0] != 0) ||
50dc83
-        (dirty[1] != 0)) {
50dc83
+    if ((version[EC_DATA_TXN] != 0) || (version[EC_METADATA_TXN] != 0) ||
50dc83
+        (dirty[EC_DATA_TXN] != 0) || (dirty[EC_METADATA_TXN] != 0)) {
50dc83
         ec_update_size_version(link, version, size, dirty);
50dc83
         return _gf_true;
50dc83
     }
50dc83
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
50dc83
index 115e147..54aaa77 100644
50dc83
--- a/xlators/cluster/ec/src/ec-common.h
50dc83
+++ b/xlators/cluster/ec/src/ec-common.h
50dc83
@@ -190,4 +190,12 @@ ec_lock_unlocked(call_frame_t *frame, void *cookie, xlator_t *this,
50dc83
 void
50dc83
 ec_update_fd_status(fd_t *fd, xlator_t *xl, int child_index,
50dc83
                     int32_t ret_status);
50dc83
+gf_boolean_t
50dc83
+ec_is_entry_healing(ec_fop_data_t *fop);
50dc83
+void
50dc83
+ec_set_entry_healing(ec_fop_data_t *fop);
50dc83
+void
50dc83
+ec_reset_entry_healing(ec_fop_data_t *fop);
50dc83
+char *
50dc83
+ec_msg_str(ec_fop_data_t *fop);
50dc83
 #endif /* __EC_COMMON_H__ */
50dc83
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
50dc83
index eaf80e0..1ca12c1 100644
50dc83
--- a/xlators/cluster/ec/src/ec-heal.c
50dc83
+++ b/xlators/cluster/ec/src/ec-heal.c
50dc83
@@ -103,6 +103,48 @@ ec_sh_key_match(dict_t *dict, char *key, data_t *val, void *mdata)
50dc83
 }
50dc83
 /* FOP: heal */
50dc83
 
50dc83
+void
50dc83
+ec_set_entry_healing(ec_fop_data_t *fop)
50dc83
+{
50dc83
+    ec_inode_t *ctx = NULL;
50dc83
+    loc_t *loc = NULL;
50dc83
+
50dc83
+    if (!fop)
50dc83
+        return;
50dc83
+
50dc83
+    loc = &fop->loc[0];
50dc83
+    LOCK(&loc->inode->lock);
50dc83
+    {
50dc83
+        ctx = __ec_inode_get(loc->inode, fop->xl);
50dc83
+        if (ctx) {
50dc83
+            ctx->heal_count += 1;
50dc83
+        }
50dc83
+    }
50dc83
+    UNLOCK(&loc->inode->lock);
50dc83
+}
50dc83
+
50dc83
+void
50dc83
+ec_reset_entry_healing(ec_fop_data_t *fop)
50dc83
+{
50dc83
+    ec_inode_t *ctx = NULL;
50dc83
+    loc_t *loc = NULL;
50dc83
+    int32_t heal_count = 0;
50dc83
+    if (!fop)
50dc83
+        return;
50dc83
+
50dc83
+    loc = &fop->loc[0];
50dc83
+    LOCK(&loc->inode->lock);
50dc83
+    {
50dc83
+        ctx = __ec_inode_get(loc->inode, fop->xl);
50dc83
+        if (ctx) {
50dc83
+            ctx->heal_count += -1;
50dc83
+            heal_count = ctx->heal_count;
50dc83
+        }
50dc83
+    }
50dc83
+    UNLOCK(&loc->inode->lock);
50dc83
+    GF_ASSERT(heal_count >= 0);
50dc83
+}
50dc83
+
50dc83
 uintptr_t
50dc83
 ec_heal_check(ec_fop_data_t *fop, uintptr_t *pgood)
50dc83
 {
50dc83
@@ -2507,17 +2549,6 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
50dc83
                "Heal is not required for : %s ", uuid_utoa(loc->gfid));
50dc83
         goto out;
50dc83
     }
50dc83
-
50dc83
-    msources = alloca0(ec->nodes);
50dc83
-    mhealed_sinks = alloca0(ec->nodes);
50dc83
-    ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks);
50dc83
-    if (ret == 0) {
50dc83
-        mgood = ec_char_array_to_mask(msources, ec->nodes);
50dc83
-        mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes);
50dc83
-    } else {
50dc83
-        op_ret = -1;
50dc83
-        op_errno = -ret;
50dc83
-    }
50dc83
     sources = alloca0(ec->nodes);
50dc83
     healed_sinks = alloca0(ec->nodes);
50dc83
     if (IA_ISREG(loc->inode->ia_type)) {
50dc83
@@ -2538,8 +2569,19 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
50dc83
         op_ret = -1;
50dc83
         op_errno = -ret;
50dc83
     }
50dc83
+    msources = alloca0(ec->nodes);
50dc83
+    mhealed_sinks = alloca0(ec->nodes);
50dc83
+    ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks);
50dc83
+    if (ret == 0) {
50dc83
+        mgood = ec_char_array_to_mask(msources, ec->nodes);
50dc83
+        mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes);
50dc83
+    } else {
50dc83
+        op_ret = -1;
50dc83
+        op_errno = -ret;
50dc83
+    }
50dc83
 
50dc83
 out:
50dc83
+    ec_reset_entry_healing(fop);
50dc83
     if (fop->cbks.heal) {
50dc83
         fop->cbks.heal(fop->req_frame, fop, fop->xl, op_ret, op_errno,
50dc83
                        ec_char_array_to_mask(participants, ec->nodes),
50dc83
@@ -2650,11 +2692,33 @@ ec_handle_healers_done(ec_fop_data_t *fop)
50dc83
         ec_launch_heal(ec, heal_fop);
50dc83
 }
50dc83
 
50dc83
+gf_boolean_t
50dc83
+ec_is_entry_healing(ec_fop_data_t *fop)
50dc83
+{
50dc83
+    ec_inode_t *ctx = NULL;
50dc83
+    int32_t heal_count = 0;
50dc83
+    loc_t *loc = NULL;
50dc83
+
50dc83
+    loc = &fop->loc[0];
50dc83
+
50dc83
+    LOCK(&loc->inode->lock);
50dc83
+    {
50dc83
+        ctx = __ec_inode_get(loc->inode, fop->xl);
50dc83
+        if (ctx) {
50dc83
+            heal_count = ctx->heal_count;
50dc83
+        }
50dc83
+    }
50dc83
+    UNLOCK(&loc->inode->lock);
50dc83
+    GF_ASSERT(heal_count >= 0);
50dc83
+    return heal_count;
50dc83
+}
50dc83
+
50dc83
 void
50dc83
 ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
50dc83
 {
50dc83
     gf_boolean_t can_heal = _gf_true;
50dc83
     ec_t *ec = this->private;
50dc83
+    ec_fop_data_t *fop_rel = NULL;
50dc83
 
50dc83
     if (fop->req_frame == NULL) {
50dc83
         LOCK(&ec->lock);
50dc83
@@ -2662,8 +2726,13 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
50dc83
             if ((ec->background_heals > 0) &&
50dc83
                 (ec->heal_wait_qlen + ec->background_heals) >
50dc83
                     (ec->heal_waiters + ec->healers)) {
50dc83
-                list_add_tail(&fop->healer, &ec->heal_waiting);
50dc83
-                ec->heal_waiters++;
50dc83
+                if (!ec_is_entry_healing(fop)) {
50dc83
+                    list_add_tail(&fop->healer, &ec->heal_waiting);
50dc83
+                    ec->heal_waiters++;
50dc83
+                    ec_set_entry_healing(fop);
50dc83
+                } else {
50dc83
+                    fop_rel = fop;
50dc83
+                }
50dc83
                 fop = __ec_dequeue_heals(ec);
50dc83
             } else {
50dc83
                 can_heal = _gf_false;
50dc83
@@ -2673,8 +2742,12 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
50dc83
     }
50dc83
 
50dc83
     if (can_heal) {
50dc83
-        if (fop)
50dc83
+        if (fop) {
50dc83
+            if (fop->req_frame != NULL) {
50dc83
+                ec_set_entry_healing(fop);
50dc83
+            }
50dc83
             ec_launch_heal(ec, fop);
50dc83
+        }
50dc83
     } else {
50dc83
         gf_msg_debug(this->name, 0,
50dc83
                      "Max number of heals are "
50dc83
@@ -2682,6 +2755,9 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
50dc83
         ec_fop_set_error(fop, EBUSY);
50dc83
         ec_heal_fail(ec, fop);
50dc83
     }
50dc83
+    if (fop_rel) {
50dc83
+        ec_heal_done(0, NULL, fop_rel);
50dc83
+    }
50dc83
 }
50dc83
 
50dc83
 void
50dc83
diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c
50dc83
index e6b0359..43f6e3b 100644
50dc83
--- a/xlators/cluster/ec/src/ec-helpers.c
50dc83
+++ b/xlators/cluster/ec/src/ec-helpers.c
50dc83
@@ -717,6 +717,7 @@ __ec_inode_get(inode_t *inode, xlator_t *xl)
50dc83
             memset(ctx, 0, sizeof(*ctx));
50dc83
             INIT_LIST_HEAD(&ctx->heal);
50dc83
             INIT_LIST_HEAD(&ctx->stripe_cache.lru);
50dc83
+            ctx->heal_count = 0;
50dc83
             value = (uint64_t)(uintptr_t)ctx;
50dc83
             if (__inode_ctx_set(inode, xl, &value) != 0) {
50dc83
                 GF_FREE(ctx);
50dc83
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
50dc83
index f3d63ca..6ae4a2b 100644
50dc83
--- a/xlators/cluster/ec/src/ec-types.h
50dc83
+++ b/xlators/cluster/ec/src/ec-types.h
50dc83
@@ -171,6 +171,7 @@ struct _ec_inode {
50dc83
     gf_boolean_t have_config;
50dc83
     gf_boolean_t have_version;
50dc83
     gf_boolean_t have_size;
50dc83
+    int32_t heal_count;
50dc83
     ec_config_t config;
50dc83
     uint64_t pre_version[2];
50dc83
     uint64_t post_version[2];
50dc83
-- 
50dc83
1.8.3.1
50dc83