Blob Blame History Raw
From 0d9aeed1092e7ce206a43e9fa29134de73731306 Mon Sep 17 00:00:00 2001
From: Ashish Pandey <aspandey@redhat.com>
Date: Thu, 13 Oct 2016 14:13:51 +0530
Subject: [PATCH 098/141] cluster/ec: set/unset dirty flag for data/metadata update

Currently, for all the update operations, metadata or data,
we set the dirty flag at the end of the operation only if
a brick is down. This leads to delay in healing and in some
cases not at all.
In this patch we set (+1) the dirty flag
at the start of the metadata or data update operations and
after successfull completion of the fop, we unset (-1) it again.

>Reviewed-on: http://review.gluster.org/13733
>Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
>Smoke: Gluster Build System <jenkins@build.gluster.org>
>NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
>CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
>Reviewed-by: Xavier Hernandez <xhernandez@datalab.es>
>Signed-off-by: Ashish Pandey <aspandey@redhat.com>

Change-Id: Ide5668bdec7b937a61c5c840cdc79a967598e1e9
BUG: 1361513
Signed-off-by: Ashish Pandey <aspandey@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/87008
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
---
 tests/basic/ec/ec-background-heals.t |    1 +
 tests/basic/ec/ec-new-entry.t        |   14 ++
 xlators/cluster/ec/src/ec-common.c   |  284 +++++++++++++++++++---------------
 xlators/cluster/ec/src/ec-common.h   |    3 +-
 xlators/cluster/ec/src/ec-data.h     |    3 +-
 5 files changed, 181 insertions(+), 124 deletions(-)

diff --git a/tests/basic/ec/ec-background-heals.t b/tests/basic/ec/ec-background-heals.t
index 726e60d..7ac6c0e 100644
--- a/tests/basic/ec/ec-background-heals.t
+++ b/tests/basic/ec/ec-background-heals.t
@@ -23,6 +23,7 @@ EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
 EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "0" mount_get_option_value $M0 $V0-disperse-0 background-heals
 EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "0" mount_get_option_value $M0 $V0-disperse-0 heal-wait-qlength
 TEST touch $M0/a
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "0" get_pending_heal_count $V0 #One for each active brick
 TEST kill_brick $V0 $H0 $B0/${V0}2
 echo abc > $M0/a
 EXPECT 2 get_pending_heal_count $V0 #One for each active brick
diff --git a/tests/basic/ec/ec-new-entry.t b/tests/basic/ec/ec-new-entry.t
index 3a5c2ee..2ba2bf5 100644
--- a/tests/basic/ec/ec-new-entry.t
+++ b/tests/basic/ec/ec-new-entry.t
@@ -12,6 +12,17 @@ function get_md5sum {
         md5sum $1 | awk '{print $1}'
 }
 
+#after replace-brick immediately trusted.ec.version will be absent, so if it
+#is present we can assume that heal attempted on root
+function root_heal_attempted {
+        if [ -z $(get_hex_xattr trusted.ec.version $1) ];
+        then
+                echo "N";
+        else
+                echo "Y";
+        fi
+}
+
 TEST glusterd
 TEST pidof glusterd
 TEST $CLI volume create $V0 disperse 6 redundancy 2 $H0:$B0/${V0}{0..5}
@@ -23,6 +34,9 @@ touch $M0/11
 for i in {1..10}; do dd if=/dev/zero of=$M0/$i bs=1M count=1; done
 TEST $CLI volume replace-brick $V0 $H0:$B0/${V0}5 $H0:$B0/${V0}6 commit force
 EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count_shd $V0 0
+EXPECT_WITHIN $HEAL_TIMEOUT "Y" root_heal_attempted $B0/${V0}6
 EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
 #ls -l gives "Total" line so number of lines will be 1 more
 EXPECT "^12$" num_entries $B0/${V0}6
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index 2e6759a..6ff87ad 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -886,6 +886,27 @@ ec_config_check (ec_fop_data_t *fop, ec_config_t *config)
     return _gf_true;
 }
 
+gf_boolean_t
+ec_set_dirty_flag (ec_lock_link_t *link, ec_inode_t *ctx, uint64_t *dirty)
+{
+
+    gf_boolean_t set_dirty = _gf_false;
+
+    if (link->update[EC_DATA_TXN] && !ctx->dirty[EC_DATA_TXN]) {
+                dirty[EC_DATA_TXN] = 1;
+    }
+
+    if (link->update[EC_METADATA_TXN] && !ctx->dirty[EC_METADATA_TXN]) {
+                dirty[EC_METADATA_TXN] = 1;
+    }
+
+    if (dirty[EC_METADATA_TXN] || dirty[EC_DATA_TXN]) {
+        set_dirty = _gf_true;
+    }
+
+    return set_dirty;
+}
+
 int32_t
 ec_prepare_update_cbk (call_frame_t *frame, void *cookie,
                        xlator_t *this, int32_t op_ret, int32_t op_errno,
@@ -906,8 +927,8 @@ ec_prepare_update_cbk (call_frame_t *frame, void *cookie,
     LOCK(&lock->loc.inode->lock);
 
     list_for_each_entry(link, &lock->owners, owner_list) {
-        if ((link->fop->flags & EC_FLAG_WAITING_SIZE) != 0) {
-            link->fop->flags ^= EC_FLAG_WAITING_SIZE;
+        if ((link->fop->flags & EC_FLAG_WAITING_XATTROP) != 0) {
+            link->fop->flags ^= EC_FLAG_WAITING_XATTROP;
 
             list_add_tail(&link->fop->cbk_list, &list);
         }
@@ -921,68 +942,70 @@ ec_prepare_update_cbk (call_frame_t *frame, void *cookie,
         goto unlock;
     }
 
-    op_errno = -ec_dict_del_array(dict, EC_XATTR_VERSION, ctx->pre_version,
-                                  EC_VERSION_SIZE);
-    if (op_errno != 0) {
-        gf_msg (this->name, GF_LOG_ERROR, op_errno,
-                EC_MSG_VER_XATTR_GET_FAIL,
-                "Unable to get version xattr");
-
-        goto unlock;
-    }
-    ctx->post_version[0] += ctx->pre_version[0];
-    ctx->post_version[1] += ctx->pre_version[1];
-
-    ctx->have_version = _gf_true;
-
-    if (lock->loc.inode->ia_type == IA_IFREG ||
-        lock->loc.inode->ia_type == IA_INVAL) {
-        op_errno = -ec_dict_del_number(dict, EC_XATTR_SIZE, &ctx->pre_size);
-        if (op_errno != 0) {
-            if (lock->loc.inode->ia_type == IA_IFREG) {
+    if (parent->flags & EC_FLAG_QUERY_METADATA) {
+            parent->flags ^= EC_FLAG_QUERY_METADATA;
+            op_errno = -ec_dict_del_array(dict, EC_XATTR_VERSION,
+                                          ctx->pre_version,
+                                          EC_VERSION_SIZE);
+            if (op_errno != 0) {
                 gf_msg (this->name, GF_LOG_ERROR, op_errno,
-                        EC_MSG_SIZE_XATTR_GET_FAIL,
-                        "Unable to get size xattr");
-
+                        EC_MSG_VER_XATTR_GET_FAIL,
+                        "Unable to get version xattr");
                 goto unlock;
             }
-        } else {
-            ctx->post_size = ctx->pre_size;
+            ctx->post_version[0] += ctx->pre_version[0];
+            ctx->post_version[1] += ctx->pre_version[1];
 
-            ctx->have_size = _gf_true;
-        }
+            ctx->have_version = _gf_true;
 
-        op_errno = -ec_dict_del_config(dict, EC_XATTR_CONFIG, &ctx->config);
-        if (op_errno != 0) {
-            if ((lock->loc.inode->ia_type == IA_IFREG) ||
-                (op_errno != ENODATA)) {
-                gf_msg (this->name, GF_LOG_ERROR, op_errno,
-                        EC_MSG_CONFIG_XATTR_GET_FAIL,
-                        "Unable to get config xattr");
+            if (lock->loc.inode->ia_type == IA_IFREG ||
+                lock->loc.inode->ia_type == IA_INVAL) {
+                op_errno = -ec_dict_del_number(dict, EC_XATTR_SIZE,
+                                               &ctx->pre_size);
+                if (op_errno != 0) {
+                    if (lock->loc.inode->ia_type == IA_IFREG) {
+                        gf_msg (this->name, GF_LOG_ERROR, op_errno,
+                                EC_MSG_SIZE_XATTR_GET_FAIL,
+                                "Unable to get size xattr");
+                        goto unlock;
+                    }
+                } else {
+                    ctx->post_size = ctx->pre_size;
 
-                goto unlock;
-            }
-        } else {
-            if (!ec_config_check(parent, &ctx->config)) {
-                gf_msg (this->name, GF_LOG_ERROR, EINVAL,
-                        EC_MSG_CONFIG_XATTR_INVALID,
-                        "Invalid config xattr");
+                    ctx->have_size = _gf_true;
+                }
 
-                op_errno = EINVAL;
+                op_errno = -ec_dict_del_config(dict, EC_XATTR_CONFIG,
+                                               &ctx->config);
+                if (op_errno != 0) {
+                    if ((lock->loc.inode->ia_type == IA_IFREG) ||
+                        (op_errno != ENODATA)) {
+                        gf_msg (this->name, GF_LOG_ERROR, op_errno,
+                                EC_MSG_CONFIG_XATTR_GET_FAIL,
+                                "Unable to get config xattr");
+
+                        goto unlock;
+                    }
+                } else {
+                    if (!ec_config_check(parent, &ctx->config)) {
+                        gf_msg (this->name, GF_LOG_ERROR, EINVAL,
+                                EC_MSG_CONFIG_XATTR_INVALID,
+                                "Invalid config xattr");
 
-                goto unlock;
-            }
+                        op_errno = EINVAL;
 
-            ctx->have_config = _gf_true;
-        }
+                        goto unlock;
+                    }
+                    ctx->have_config = _gf_true;
+                }
+            }
+            ctx->have_info = _gf_true;
     }
 
-    ctx->have_info = _gf_true;
-
+    ec_set_dirty_flag (fop->data, ctx, ctx->dirty);
     op_errno = 0;
-
 unlock:
-    lock->getting_size = _gf_false;
+    lock->getting_xattr = _gf_false;
 
     UNLOCK(&lock->loc.inode->lock);
 
@@ -1029,16 +1052,19 @@ void ec_get_size_version(ec_lock_link_t *link)
     ec_inode_t *ctx;
     ec_fop_data_t *fop;
     dict_t *dict = NULL;
-    int32_t error = -ENOMEM;
-    gf_boolean_t getting_size;
+    int32_t error = 0;
+    gf_boolean_t getting_xattr;
+    gf_boolean_t set_dirty = _gf_false;
     uint64_t allzero[EC_VERSION_SIZE] = {0, 0};
-
+    uint64_t dirty[EC_VERSION_SIZE] = {0, 0};
     lock = link->lock;
     ctx = lock->ctx;
     fop = link->fop;
 
+    set_dirty = ec_set_dirty_flag (link, ctx, dirty);
+
     /* If ec metadata has already been retrieved, do not try again. */
-    if (ctx->have_info) {
+    if (ctx->have_info && (!set_dirty)) {
         if (ec_is_data_fop (fop->id)) {
             fop->healing |= lock->healing;
         }
@@ -1047,58 +1073,63 @@ void ec_get_size_version(ec_lock_link_t *link)
 
     /* Determine if there's something we need to retrieve for the current
      * operation. */
-    if (!lock->query &&
+    if (!set_dirty && !lock->query &&
         (lock->loc.inode->ia_type != IA_IFREG) &&
         (lock->loc.inode->ia_type != IA_INVAL)) {
-        return;
+            return;
     }
 
     memset(&loc, 0, sizeof(loc));
 
     LOCK(&lock->loc.inode->lock);
 
-    getting_size = lock->getting_size;
-    lock->getting_size = _gf_true;
-    if (getting_size) {
-        fop->flags |= EC_FLAG_WAITING_SIZE;
+    getting_xattr = lock->getting_xattr;
+    lock->getting_xattr = _gf_true;
+    if (getting_xattr) {
+        fop->flags |= EC_FLAG_WAITING_XATTROP;
 
         ec_sleep(fop);
     }
 
     UNLOCK(&lock->loc.inode->lock);
 
-    if (getting_size) {
-        error = 0;
-
+    if (getting_xattr) {
         goto out;
     }
 
     dict = dict_new();
     if (dict == NULL) {
+        error = -ENOMEM;
         goto out;
     }
+    if (lock->query && !ctx->have_info) {
+            fop->flags |= EC_FLAG_QUERY_METADATA;
+            /* Once we know that an xattrop will be needed,
+             * we try to get all available information in a
+             * single call. */
+            error = ec_dict_set_array(dict, EC_XATTR_VERSION, allzero,
+                                      EC_VERSION_SIZE);
+            if (error != 0) {
+                goto out;
+            }
 
-    /* Once we know that an xattrop will be needed, we try to get all available
-     * information in a single call. */
-    error = ec_dict_set_array(dict, EC_XATTR_VERSION, allzero,
-                              EC_VERSION_SIZE);
-    if (error == 0) {
-        error = ec_dict_set_array(dict, EC_XATTR_DIRTY, allzero,
-                                  EC_VERSION_SIZE);
-    }
-    if (error != 0) {
-        goto out;
+            if (lock->loc.inode->ia_type == IA_IFREG ||
+                lock->loc.inode->ia_type == IA_INVAL) {
+                error = ec_dict_set_number(dict, EC_XATTR_SIZE, 0);
+                if (error == 0) {
+                    error = ec_dict_set_number(dict, EC_XATTR_CONFIG, 0);
+                }
+                if (error != 0) {
+                    goto out;
+                }
+            }
     }
-
-    if (lock->loc.inode->ia_type == IA_IFREG ||
-        lock->loc.inode->ia_type == IA_INVAL) {
-        error = ec_dict_set_number(dict, EC_XATTR_SIZE, 0);
-        if (error == 0) {
-            error = ec_dict_set_number(dict, EC_XATTR_CONFIG, 0);
-        }
-        if (error != 0) {
-            goto out;
-        }
+    if (set_dirty) {
+            error = ec_dict_set_array(dict, EC_XATTR_DIRTY, dirty,
+                                      EC_VERSION_SIZE);
+            if (error != 0) {
+                goto out;
+            }
     }
 
     fop->frame->root->uid = 0;
@@ -1634,15 +1665,9 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk,
     if ((fop->error == 0) && (cbk != NULL) && (cbk->op_ret >= 0)) {
         if (link->update[0]) {
             ctx->post_version[0]++;
-            if (ec->node_mask & ~fop->good) {
-                ctx->dirty[0]++;
-            }
         }
         if (link->update[1]) {
             ctx->post_version[1]++;
-            if (ec->node_mask & ~fop->good) {
-                ctx->dirty[1]++;
-            }
         }
     }
 
@@ -1764,11 +1789,11 @@ void ec_unlock_lock(ec_lock_link_t *link)
     lock = link->lock;
     fop = link->fop;
 
+    lock->unlock_now = _gf_false;
     ec_clear_inode_info(fop, lock->loc.inode);
 
     if ((lock->mask != 0) && lock->acquired) {
         ec_owner_set(fop->frame, lock);
-
         lock->flock.l_type = F_UNLCK;
         ec_trace("UNLOCK_INODELK", fop, "lock=%p, inode=%p", lock,
                  lock->loc.inode);
@@ -1791,15 +1816,16 @@ int32_t ec_update_size_version_done(call_frame_t * frame, void * cookie,
     ec_lock_t *lock;
     ec_inode_t *ctx;
 
+    link = fop->data;
+    lock = link->lock;
+    ctx = lock->ctx;
+
     if (op_ret < 0) {
         gf_msg(fop->xl->name, fop_log_level (fop->id, op_errno), op_errno,
                EC_MSG_SIZE_VERS_UPDATE_FAIL,
                "Failed to update version and size");
     } else {
         fop->parent->good &= fop->good;
-        link = fop->data;
-        lock = link->lock;
-        ctx = lock->ctx;
 
         ec_lock_update_good(lock, fop);
 
@@ -1822,10 +1848,11 @@ int32_t ec_update_size_version_done(call_frame_t * frame, void * cookie,
 
         ctx->have_info = _gf_true;
     }
-
-    if ((fop->parent->id != GF_FOP_FLUSH) &&
-        (fop->parent->id != GF_FOP_FSYNC) &&
-        (fop->parent->id != GF_FOP_FSYNCDIR)) {
+    /* If we are here because of fop's and other than unlock request,
+     * that means we are still holding a lock. That make sure
+     * lock->unlock_now can not be modified.
+     */
+    if (lock->unlock_now) {
         ec_unlock_lock(fop->data);
     }
 
@@ -1843,6 +1870,9 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
     int32_t err = -ENOMEM;
 
     fop = link->fop;
+    ec_t *ec = fop->xl->private;
+    lock = link->lock;
+    ctx = lock->ctx;
 
     ec_trace("UPDATE", fop, "version=%ld/%ld, size=%ld, dirty=%ld/%ld",
              version[0], version[1], size, dirty[0], dirty[1]);
@@ -1852,9 +1882,6 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
         goto out;
     }
 
-    lock = link->lock;
-    ctx = lock->ctx;
-
     /* If we don't have version information or it has been modified, we
      * update it. */
     if (!ctx->have_version || (version[0] != 0) || (version[1] != 0)) {
@@ -1866,8 +1893,8 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
     }
 
     if (size != 0) {
-        /* If size has been changed, we should already know the previous size
-         * of the file. */
+        /* If size has been changed, we should already
+         * know the previous size of the file. */
         GF_ASSERT(ctx->have_size);
 
         err = ec_dict_set_number(dict, EC_XATTR_SIZE, size);
@@ -1876,13 +1903,12 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
         }
     }
 
-    /* If we don't have dirty information or it has been modified, we update
-     * it. */
-    if ((dirty[0] != 0) || (dirty[1] != 0)) {
-        err = ec_dict_set_array(dict, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE);
-        if (err != 0) {
-            goto out;
-        }
+    if (dirty[0] || dirty[1]) {
+            err = ec_dict_set_array(dict, EC_XATTR_DIRTY,
+                                    dirty, EC_VERSION_SIZE);
+            if (err != 0) {
+                goto out;
+            }
     }
 
     /* If config information is not known, we request it now. */
@@ -1922,9 +1948,7 @@ out:
     gf_msg (fop->xl->name, GF_LOG_ERROR, -err, EC_MSG_SIZE_VERS_UPDATE_FAIL,
             "Unable to update version and size");
 
-    if ((fop->parent->id != GF_FOP_FLUSH) &&
-        (fop->parent->id != GF_FOP_FSYNC) &&
-        (fop->parent->id != GF_FOP_FSYNCDIR)) {
+    if (lock->unlock_now) {
         ec_unlock_lock(fop->data);
     }
 
@@ -1935,28 +1959,36 @@ ec_update_info(ec_lock_link_t *link)
 {
     ec_lock_t *lock;
     ec_inode_t *ctx;
-    uint64_t version[2];
-    uint64_t dirty[2];
+    uint64_t version[2] = {0, 0};
+    uint64_t dirty[2] = {0, 0};
     uint64_t size;
+    ec_t *ec = NULL;
 
     lock = link->lock;
     ctx = lock->ctx;
+    ec = link->fop->xl->private;
 
     /* pre_version[*] will be 0 if have_version is false */
     version[0] = ctx->post_version[0] - ctx->pre_version[0];
     version[1] = ctx->post_version[1] - ctx->pre_version[1];
 
     size = ctx->post_size - ctx->pre_size;
-
-    dirty[0] = ctx->dirty[0];
-    dirty[1] = ctx->dirty[1];
-    /*Dirty is not combined so just reset it right here*/
-    memset(ctx->dirty, 0, sizeof(ctx->dirty));
-
+    /* If we set the dirty flag for update fop, we have to unset it.
+     * If fop has failed on some bricks, leave the dirty as marked. */
+    if (lock->unlock_now) {
+            if (!(ec->node_mask & ~lock->good_mask)) {
+                    if (ctx->dirty[0] != 0) {
+                        dirty[0] = -1;
+                    }
+                    if (ctx->dirty[1] != 0) {
+                        dirty[1] = -1;
+                    }
+            }
+            memset(ctx->dirty, 0, sizeof(ctx->dirty));
+    }
     if ((version[0] != 0) || (version[1] != 0) ||
         (dirty[0] != 0) || (dirty[1] != 0)) {
         ec_update_size_version(link, version, size, dirty);
-
         return _gf_true;
     }
 
@@ -1966,7 +1998,15 @@ ec_update_info(ec_lock_link_t *link)
 void
 ec_unlock_now(ec_lock_link_t *link)
 {
+    ec_lock_t *lock;
+    lock = link->lock;
+
     ec_trace("UNLOCK_NOW", link->fop, "lock=%p", link->lock);
+    /*At this point, lock is not being used by any fop and
+     *can not be reused by any fop as it is going to be released.
+     *lock->unlock_now can not be modified at any other place.
+     */
+    lock->unlock_now = _gf_true;
 
     if (!ec_update_info(link)) {
         ec_unlock_lock(link);
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
index 8e724a8..d720d24 100644
--- a/xlators/cluster/ec/src/ec-common.h
+++ b/xlators/cluster/ec/src/ec-common.h
@@ -28,7 +28,8 @@ typedef enum {
 #define EC_CONFIG_ALGORITHM 0
 
 #define EC_FLAG_LOCK_SHARED       0x0001
-#define EC_FLAG_WAITING_SIZE      0x0002
+#define EC_FLAG_WAITING_XATTROP   0x0002
+#define EC_FLAG_QUERY_METADATA    0x0004
 
 #define EC_SELFHEAL_BIT 62
 
diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h
index 4a2a11f..a207b11 100644
--- a/xlators/cluster/ec/src/ec-data.h
+++ b/xlators/cluster/ec/src/ec-data.h
@@ -163,7 +163,8 @@ struct _ec_lock
     uint32_t           refs_owners;  /* Refs for fops owning the lock */
     uint32_t           refs_pending; /* Refs assigned to fops being prepared */
     gf_boolean_t       acquired;
-    gf_boolean_t       getting_size;
+    gf_boolean_t       getting_xattr;
+    gf_boolean_t       unlock_now;
     gf_boolean_t       release;
     gf_boolean_t       query;
     fd_t              *fd;
-- 
1.7.1