From 0d9aeed1092e7ce206a43e9fa29134de73731306 Mon Sep 17 00:00:00 2001
From: Ashish Pandey <aspandey@redhat.com>
Date: Thu, 13 Oct 2016 14:13:51 +0530
Subject: [PATCH 098/141] cluster/ec: set/unset dirty flag for data/metadata update
Currently, for all the update operations, metadata or data,
we set the dirty flag at the end of the operation only if
a brick is down. This leads to delay in healing and in some
cases not at all.
In this patch we set (+1) the dirty flag
at the start of the metadata or data update operations and
after successfull completion of the fop, we unset (-1) it again.
>Reviewed-on: http://review.gluster.org/13733
>Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
>Smoke: Gluster Build System <jenkins@build.gluster.org>
>NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
>CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
>Reviewed-by: Xavier Hernandez <xhernandez@datalab.es>
>Signed-off-by: Ashish Pandey <aspandey@redhat.com>
Change-Id: Ide5668bdec7b937a61c5c840cdc79a967598e1e9
BUG: 1361513
Signed-off-by: Ashish Pandey <aspandey@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/87008
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
---
tests/basic/ec/ec-background-heals.t | 1 +
tests/basic/ec/ec-new-entry.t | 14 ++
xlators/cluster/ec/src/ec-common.c | 284 +++++++++++++++++++---------------
xlators/cluster/ec/src/ec-common.h | 3 +-
xlators/cluster/ec/src/ec-data.h | 3 +-
5 files changed, 181 insertions(+), 124 deletions(-)
diff --git a/tests/basic/ec/ec-background-heals.t b/tests/basic/ec/ec-background-heals.t
index 726e60d..7ac6c0e 100644
--- a/tests/basic/ec/ec-background-heals.t
+++ b/tests/basic/ec/ec-background-heals.t
@@ -23,6 +23,7 @@ EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "0" mount_get_option_value $M0 $V0-disperse-0 background-heals
EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "0" mount_get_option_value $M0 $V0-disperse-0 heal-wait-qlength
TEST touch $M0/a
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "0" get_pending_heal_count $V0 #One for each active brick
TEST kill_brick $V0 $H0 $B0/${V0}2
echo abc > $M0/a
EXPECT 2 get_pending_heal_count $V0 #One for each active brick
diff --git a/tests/basic/ec/ec-new-entry.t b/tests/basic/ec/ec-new-entry.t
index 3a5c2ee..2ba2bf5 100644
--- a/tests/basic/ec/ec-new-entry.t
+++ b/tests/basic/ec/ec-new-entry.t
@@ -12,6 +12,17 @@ function get_md5sum {
md5sum $1 | awk '{print $1}'
}
+#after replace-brick immediately trusted.ec.version will be absent, so if it
+#is present we can assume that heal attempted on root
+function root_heal_attempted {
+ if [ -z $(get_hex_xattr trusted.ec.version $1) ];
+ then
+ echo "N";
+ else
+ echo "Y";
+ fi
+}
+
TEST glusterd
TEST pidof glusterd
TEST $CLI volume create $V0 disperse 6 redundancy 2 $H0:$B0/${V0}{0..5}
@@ -23,6 +34,9 @@ touch $M0/11
for i in {1..10}; do dd if=/dev/zero of=$M0/$i bs=1M count=1; done
TEST $CLI volume replace-brick $V0 $H0:$B0/${V0}5 $H0:$B0/${V0}6 commit force
EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count_shd $V0 0
+EXPECT_WITHIN $HEAL_TIMEOUT "Y" root_heal_attempted $B0/${V0}6
EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
#ls -l gives "Total" line so number of lines will be 1 more
EXPECT "^12$" num_entries $B0/${V0}6
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index 2e6759a..6ff87ad 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -886,6 +886,27 @@ ec_config_check (ec_fop_data_t *fop, ec_config_t *config)
return _gf_true;
}
+gf_boolean_t
+ec_set_dirty_flag (ec_lock_link_t *link, ec_inode_t *ctx, uint64_t *dirty)
+{
+
+ gf_boolean_t set_dirty = _gf_false;
+
+ if (link->update[EC_DATA_TXN] && !ctx->dirty[EC_DATA_TXN]) {
+ dirty[EC_DATA_TXN] = 1;
+ }
+
+ if (link->update[EC_METADATA_TXN] && !ctx->dirty[EC_METADATA_TXN]) {
+ dirty[EC_METADATA_TXN] = 1;
+ }
+
+ if (dirty[EC_METADATA_TXN] || dirty[EC_DATA_TXN]) {
+ set_dirty = _gf_true;
+ }
+
+ return set_dirty;
+}
+
int32_t
ec_prepare_update_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
@@ -906,8 +927,8 @@ ec_prepare_update_cbk (call_frame_t *frame, void *cookie,
LOCK(&lock->loc.inode->lock);
list_for_each_entry(link, &lock->owners, owner_list) {
- if ((link->fop->flags & EC_FLAG_WAITING_SIZE) != 0) {
- link->fop->flags ^= EC_FLAG_WAITING_SIZE;
+ if ((link->fop->flags & EC_FLAG_WAITING_XATTROP) != 0) {
+ link->fop->flags ^= EC_FLAG_WAITING_XATTROP;
list_add_tail(&link->fop->cbk_list, &list);
}
@@ -921,68 +942,70 @@ ec_prepare_update_cbk (call_frame_t *frame, void *cookie,
goto unlock;
}
- op_errno = -ec_dict_del_array(dict, EC_XATTR_VERSION, ctx->pre_version,
- EC_VERSION_SIZE);
- if (op_errno != 0) {
- gf_msg (this->name, GF_LOG_ERROR, op_errno,
- EC_MSG_VER_XATTR_GET_FAIL,
- "Unable to get version xattr");
-
- goto unlock;
- }
- ctx->post_version[0] += ctx->pre_version[0];
- ctx->post_version[1] += ctx->pre_version[1];
-
- ctx->have_version = _gf_true;
-
- if (lock->loc.inode->ia_type == IA_IFREG ||
- lock->loc.inode->ia_type == IA_INVAL) {
- op_errno = -ec_dict_del_number(dict, EC_XATTR_SIZE, &ctx->pre_size);
- if (op_errno != 0) {
- if (lock->loc.inode->ia_type == IA_IFREG) {
+ if (parent->flags & EC_FLAG_QUERY_METADATA) {
+ parent->flags ^= EC_FLAG_QUERY_METADATA;
+ op_errno = -ec_dict_del_array(dict, EC_XATTR_VERSION,
+ ctx->pre_version,
+ EC_VERSION_SIZE);
+ if (op_errno != 0) {
gf_msg (this->name, GF_LOG_ERROR, op_errno,
- EC_MSG_SIZE_XATTR_GET_FAIL,
- "Unable to get size xattr");
-
+ EC_MSG_VER_XATTR_GET_FAIL,
+ "Unable to get version xattr");
goto unlock;
}
- } else {
- ctx->post_size = ctx->pre_size;
+ ctx->post_version[0] += ctx->pre_version[0];
+ ctx->post_version[1] += ctx->pre_version[1];
- ctx->have_size = _gf_true;
- }
+ ctx->have_version = _gf_true;
- op_errno = -ec_dict_del_config(dict, EC_XATTR_CONFIG, &ctx->config);
- if (op_errno != 0) {
- if ((lock->loc.inode->ia_type == IA_IFREG) ||
- (op_errno != ENODATA)) {
- gf_msg (this->name, GF_LOG_ERROR, op_errno,
- EC_MSG_CONFIG_XATTR_GET_FAIL,
- "Unable to get config xattr");
+ if (lock->loc.inode->ia_type == IA_IFREG ||
+ lock->loc.inode->ia_type == IA_INVAL) {
+ op_errno = -ec_dict_del_number(dict, EC_XATTR_SIZE,
+ &ctx->pre_size);
+ if (op_errno != 0) {
+ if (lock->loc.inode->ia_type == IA_IFREG) {
+ gf_msg (this->name, GF_LOG_ERROR, op_errno,
+ EC_MSG_SIZE_XATTR_GET_FAIL,
+ "Unable to get size xattr");
+ goto unlock;
+ }
+ } else {
+ ctx->post_size = ctx->pre_size;
- goto unlock;
- }
- } else {
- if (!ec_config_check(parent, &ctx->config)) {
- gf_msg (this->name, GF_LOG_ERROR, EINVAL,
- EC_MSG_CONFIG_XATTR_INVALID,
- "Invalid config xattr");
+ ctx->have_size = _gf_true;
+ }
- op_errno = EINVAL;
+ op_errno = -ec_dict_del_config(dict, EC_XATTR_CONFIG,
+ &ctx->config);
+ if (op_errno != 0) {
+ if ((lock->loc.inode->ia_type == IA_IFREG) ||
+ (op_errno != ENODATA)) {
+ gf_msg (this->name, GF_LOG_ERROR, op_errno,
+ EC_MSG_CONFIG_XATTR_GET_FAIL,
+ "Unable to get config xattr");
+
+ goto unlock;
+ }
+ } else {
+ if (!ec_config_check(parent, &ctx->config)) {
+ gf_msg (this->name, GF_LOG_ERROR, EINVAL,
+ EC_MSG_CONFIG_XATTR_INVALID,
+ "Invalid config xattr");
- goto unlock;
- }
+ op_errno = EINVAL;
- ctx->have_config = _gf_true;
- }
+ goto unlock;
+ }
+ ctx->have_config = _gf_true;
+ }
+ }
+ ctx->have_info = _gf_true;
}
- ctx->have_info = _gf_true;
-
+ ec_set_dirty_flag (fop->data, ctx, ctx->dirty);
op_errno = 0;
-
unlock:
- lock->getting_size = _gf_false;
+ lock->getting_xattr = _gf_false;
UNLOCK(&lock->loc.inode->lock);
@@ -1029,16 +1052,19 @@ void ec_get_size_version(ec_lock_link_t *link)
ec_inode_t *ctx;
ec_fop_data_t *fop;
dict_t *dict = NULL;
- int32_t error = -ENOMEM;
- gf_boolean_t getting_size;
+ int32_t error = 0;
+ gf_boolean_t getting_xattr;
+ gf_boolean_t set_dirty = _gf_false;
uint64_t allzero[EC_VERSION_SIZE] = {0, 0};
-
+ uint64_t dirty[EC_VERSION_SIZE] = {0, 0};
lock = link->lock;
ctx = lock->ctx;
fop = link->fop;
+ set_dirty = ec_set_dirty_flag (link, ctx, dirty);
+
/* If ec metadata has already been retrieved, do not try again. */
- if (ctx->have_info) {
+ if (ctx->have_info && (!set_dirty)) {
if (ec_is_data_fop (fop->id)) {
fop->healing |= lock->healing;
}
@@ -1047,58 +1073,63 @@ void ec_get_size_version(ec_lock_link_t *link)
/* Determine if there's something we need to retrieve for the current
* operation. */
- if (!lock->query &&
+ if (!set_dirty && !lock->query &&
(lock->loc.inode->ia_type != IA_IFREG) &&
(lock->loc.inode->ia_type != IA_INVAL)) {
- return;
+ return;
}
memset(&loc, 0, sizeof(loc));
LOCK(&lock->loc.inode->lock);
- getting_size = lock->getting_size;
- lock->getting_size = _gf_true;
- if (getting_size) {
- fop->flags |= EC_FLAG_WAITING_SIZE;
+ getting_xattr = lock->getting_xattr;
+ lock->getting_xattr = _gf_true;
+ if (getting_xattr) {
+ fop->flags |= EC_FLAG_WAITING_XATTROP;
ec_sleep(fop);
}
UNLOCK(&lock->loc.inode->lock);
- if (getting_size) {
- error = 0;
-
+ if (getting_xattr) {
goto out;
}
dict = dict_new();
if (dict == NULL) {
+ error = -ENOMEM;
goto out;
}
+ if (lock->query && !ctx->have_info) {
+ fop->flags |= EC_FLAG_QUERY_METADATA;
+ /* Once we know that an xattrop will be needed,
+ * we try to get all available information in a
+ * single call. */
+ error = ec_dict_set_array(dict, EC_XATTR_VERSION, allzero,
+ EC_VERSION_SIZE);
+ if (error != 0) {
+ goto out;
+ }
- /* Once we know that an xattrop will be needed, we try to get all available
- * information in a single call. */
- error = ec_dict_set_array(dict, EC_XATTR_VERSION, allzero,
- EC_VERSION_SIZE);
- if (error == 0) {
- error = ec_dict_set_array(dict, EC_XATTR_DIRTY, allzero,
- EC_VERSION_SIZE);
- }
- if (error != 0) {
- goto out;
+ if (lock->loc.inode->ia_type == IA_IFREG ||
+ lock->loc.inode->ia_type == IA_INVAL) {
+ error = ec_dict_set_number(dict, EC_XATTR_SIZE, 0);
+ if (error == 0) {
+ error = ec_dict_set_number(dict, EC_XATTR_CONFIG, 0);
+ }
+ if (error != 0) {
+ goto out;
+ }
+ }
}
-
- if (lock->loc.inode->ia_type == IA_IFREG ||
- lock->loc.inode->ia_type == IA_INVAL) {
- error = ec_dict_set_number(dict, EC_XATTR_SIZE, 0);
- if (error == 0) {
- error = ec_dict_set_number(dict, EC_XATTR_CONFIG, 0);
- }
- if (error != 0) {
- goto out;
- }
+ if (set_dirty) {
+ error = ec_dict_set_array(dict, EC_XATTR_DIRTY, dirty,
+ EC_VERSION_SIZE);
+ if (error != 0) {
+ goto out;
+ }
}
fop->frame->root->uid = 0;
@@ -1634,15 +1665,9 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk,
if ((fop->error == 0) && (cbk != NULL) && (cbk->op_ret >= 0)) {
if (link->update[0]) {
ctx->post_version[0]++;
- if (ec->node_mask & ~fop->good) {
- ctx->dirty[0]++;
- }
}
if (link->update[1]) {
ctx->post_version[1]++;
- if (ec->node_mask & ~fop->good) {
- ctx->dirty[1]++;
- }
}
}
@@ -1764,11 +1789,11 @@ void ec_unlock_lock(ec_lock_link_t *link)
lock = link->lock;
fop = link->fop;
+ lock->unlock_now = _gf_false;
ec_clear_inode_info(fop, lock->loc.inode);
if ((lock->mask != 0) && lock->acquired) {
ec_owner_set(fop->frame, lock);
-
lock->flock.l_type = F_UNLCK;
ec_trace("UNLOCK_INODELK", fop, "lock=%p, inode=%p", lock,
lock->loc.inode);
@@ -1791,15 +1816,16 @@ int32_t ec_update_size_version_done(call_frame_t * frame, void * cookie,
ec_lock_t *lock;
ec_inode_t *ctx;
+ link = fop->data;
+ lock = link->lock;
+ ctx = lock->ctx;
+
if (op_ret < 0) {
gf_msg(fop->xl->name, fop_log_level (fop->id, op_errno), op_errno,
EC_MSG_SIZE_VERS_UPDATE_FAIL,
"Failed to update version and size");
} else {
fop->parent->good &= fop->good;
- link = fop->data;
- lock = link->lock;
- ctx = lock->ctx;
ec_lock_update_good(lock, fop);
@@ -1822,10 +1848,11 @@ int32_t ec_update_size_version_done(call_frame_t * frame, void * cookie,
ctx->have_info = _gf_true;
}
-
- if ((fop->parent->id != GF_FOP_FLUSH) &&
- (fop->parent->id != GF_FOP_FSYNC) &&
- (fop->parent->id != GF_FOP_FSYNCDIR)) {
+ /* If we are here because of fop's and other than unlock request,
+ * that means we are still holding a lock. That make sure
+ * lock->unlock_now can not be modified.
+ */
+ if (lock->unlock_now) {
ec_unlock_lock(fop->data);
}
@@ -1843,6 +1870,9 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
int32_t err = -ENOMEM;
fop = link->fop;
+ ec_t *ec = fop->xl->private;
+ lock = link->lock;
+ ctx = lock->ctx;
ec_trace("UPDATE", fop, "version=%ld/%ld, size=%ld, dirty=%ld/%ld",
version[0], version[1], size, dirty[0], dirty[1]);
@@ -1852,9 +1882,6 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
goto out;
}
- lock = link->lock;
- ctx = lock->ctx;
-
/* If we don't have version information or it has been modified, we
* update it. */
if (!ctx->have_version || (version[0] != 0) || (version[1] != 0)) {
@@ -1866,8 +1893,8 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
}
if (size != 0) {
- /* If size has been changed, we should already know the previous size
- * of the file. */
+ /* If size has been changed, we should already
+ * know the previous size of the file. */
GF_ASSERT(ctx->have_size);
err = ec_dict_set_number(dict, EC_XATTR_SIZE, size);
@@ -1876,13 +1903,12 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
}
}
- /* If we don't have dirty information or it has been modified, we update
- * it. */
- if ((dirty[0] != 0) || (dirty[1] != 0)) {
- err = ec_dict_set_array(dict, EC_XATTR_DIRTY, dirty, EC_VERSION_SIZE);
- if (err != 0) {
- goto out;
- }
+ if (dirty[0] || dirty[1]) {
+ err = ec_dict_set_array(dict, EC_XATTR_DIRTY,
+ dirty, EC_VERSION_SIZE);
+ if (err != 0) {
+ goto out;
+ }
}
/* If config information is not known, we request it now. */
@@ -1922,9 +1948,7 @@ out:
gf_msg (fop->xl->name, GF_LOG_ERROR, -err, EC_MSG_SIZE_VERS_UPDATE_FAIL,
"Unable to update version and size");
- if ((fop->parent->id != GF_FOP_FLUSH) &&
- (fop->parent->id != GF_FOP_FSYNC) &&
- (fop->parent->id != GF_FOP_FSYNCDIR)) {
+ if (lock->unlock_now) {
ec_unlock_lock(fop->data);
}
@@ -1935,28 +1959,36 @@ ec_update_info(ec_lock_link_t *link)
{
ec_lock_t *lock;
ec_inode_t *ctx;
- uint64_t version[2];
- uint64_t dirty[2];
+ uint64_t version[2] = {0, 0};
+ uint64_t dirty[2] = {0, 0};
uint64_t size;
+ ec_t *ec = NULL;
lock = link->lock;
ctx = lock->ctx;
+ ec = link->fop->xl->private;
/* pre_version[*] will be 0 if have_version is false */
version[0] = ctx->post_version[0] - ctx->pre_version[0];
version[1] = ctx->post_version[1] - ctx->pre_version[1];
size = ctx->post_size - ctx->pre_size;
-
- dirty[0] = ctx->dirty[0];
- dirty[1] = ctx->dirty[1];
- /*Dirty is not combined so just reset it right here*/
- memset(ctx->dirty, 0, sizeof(ctx->dirty));
-
+ /* If we set the dirty flag for update fop, we have to unset it.
+ * If fop has failed on some bricks, leave the dirty as marked. */
+ if (lock->unlock_now) {
+ if (!(ec->node_mask & ~lock->good_mask)) {
+ if (ctx->dirty[0] != 0) {
+ dirty[0] = -1;
+ }
+ if (ctx->dirty[1] != 0) {
+ dirty[1] = -1;
+ }
+ }
+ memset(ctx->dirty, 0, sizeof(ctx->dirty));
+ }
if ((version[0] != 0) || (version[1] != 0) ||
(dirty[0] != 0) || (dirty[1] != 0)) {
ec_update_size_version(link, version, size, dirty);
-
return _gf_true;
}
@@ -1966,7 +1998,15 @@ ec_update_info(ec_lock_link_t *link)
void
ec_unlock_now(ec_lock_link_t *link)
{
+ ec_lock_t *lock;
+ lock = link->lock;
+
ec_trace("UNLOCK_NOW", link->fop, "lock=%p", link->lock);
+ /*At this point, lock is not being used by any fop and
+ *can not be reused by any fop as it is going to be released.
+ *lock->unlock_now can not be modified at any other place.
+ */
+ lock->unlock_now = _gf_true;
if (!ec_update_info(link)) {
ec_unlock_lock(link);
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
index 8e724a8..d720d24 100644
--- a/xlators/cluster/ec/src/ec-common.h
+++ b/xlators/cluster/ec/src/ec-common.h
@@ -28,7 +28,8 @@ typedef enum {
#define EC_CONFIG_ALGORITHM 0
#define EC_FLAG_LOCK_SHARED 0x0001
-#define EC_FLAG_WAITING_SIZE 0x0002
+#define EC_FLAG_WAITING_XATTROP 0x0002
+#define EC_FLAG_QUERY_METADATA 0x0004
#define EC_SELFHEAL_BIT 62
diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h
index 4a2a11f..a207b11 100644
--- a/xlators/cluster/ec/src/ec-data.h
+++ b/xlators/cluster/ec/src/ec-data.h
@@ -163,7 +163,8 @@ struct _ec_lock
uint32_t refs_owners; /* Refs for fops owning the lock */
uint32_t refs_pending; /* Refs assigned to fops being prepared */
gf_boolean_t acquired;
- gf_boolean_t getting_size;
+ gf_boolean_t getting_xattr;
+ gf_boolean_t unlock_now;
gf_boolean_t release;
gf_boolean_t query;
fd_t *fd;
--
1.7.1