|
|
21ab4e |
From d544399c921895b81240d2efd9371d2c8139faac Mon Sep 17 00:00:00 2001
|
|
|
21ab4e |
From: Ashish Pandey <aspandey@redhat.com>
|
|
|
21ab4e |
Date: Mon, 3 Apr 2017 12:46:29 +0530
|
|
|
21ab4e |
Subject: [PATCH 491/509] cluster/ec: Update xattr and heal size properly
|
|
|
21ab4e |
|
|
|
21ab4e |
Problem-1 : Recursive healing of same file is happening
|
|
|
21ab4e |
when IO is going on even after data heal completes.
|
|
|
21ab4e |
|
|
|
21ab4e |
Solution:
|
|
|
21ab4e |
RCA: At the end of the write, when ec_update_size_version
|
|
|
21ab4e |
gets called, we send it only on good bricks and not
|
|
|
21ab4e |
on healing brick. Due to this, xattr on healing brick
|
|
|
21ab4e |
will always remain out of sync and when the background
|
|
|
21ab4e |
heal check source and sink, it finds this brick to be
|
|
|
21ab4e |
healed and start healing from scratch. That involve
|
|
|
21ab4e |
ftruncate and writing all of the data again.
|
|
|
21ab4e |
|
|
|
21ab4e |
To solve this, send xattrop on all the good bricks as
|
|
|
21ab4e |
well as healing bricks.
|
|
|
21ab4e |
|
|
|
21ab4e |
Problem-2: The above fix exposes the data corruption
|
|
|
21ab4e |
during heal. If the write on a file is going on and
|
|
|
21ab4e |
heal finishes, we find that the file gets corrupted.
|
|
|
21ab4e |
|
|
|
21ab4e |
RCA:
|
|
|
21ab4e |
The real problem happens in ec_rebuild_data(). Here we receive the
|
|
|
21ab4e |
'size' argument which contains the real file size at the time of
|
|
|
21ab4e |
starting self-heal and it's assigned to heal->total_size.
|
|
|
21ab4e |
|
|
|
21ab4e |
After that, a sequence of calls to ec_sync_heal_block() are done. Each
|
|
|
21ab4e |
call ends up calling ec_manager_heal_block(), which does the actual work
|
|
|
21ab4e |
of healing a block.
|
|
|
21ab4e |
|
|
|
21ab4e |
First a lock on the inode is taken in state EC_STATE_INIT using
|
|
|
21ab4e |
ec_heal_inodelk(). When the lock is acquired, ec_heal_lock_cbk() is
|
|
|
21ab4e |
called. This function calls ec_set_inode_size() to store the real size
|
|
|
21ab4e |
of the inode (it uses heal->total_size).
|
|
|
21ab4e |
|
|
|
21ab4e |
The next step is to read the block to be healed. This is done using a
|
|
|
21ab4e |
regular ec_readv(). One of the things this call does is to trim the
|
|
|
21ab4e |
returned size if the file is smaller than the requested size.
|
|
|
21ab4e |
|
|
|
21ab4e |
In our case, when we read the last block of a file whose size was = 512
|
|
|
21ab4e |
mod 1024 at the time of starting self-heal, ec_readv() will return only
|
|
|
21ab4e |
the first 512 bytes, not the whole 1024 bytes.
|
|
|
21ab4e |
|
|
|
21ab4e |
This isn't a problem since the following ec_writev() sent from the heal
|
|
|
21ab4e |
code only attempts to write the amount of data read, so it shouldn't
|
|
|
21ab4e |
modify the remaining 512 bytes.
|
|
|
21ab4e |
|
|
|
21ab4e |
However ec_writev() also checks the file size. If we are writing the
|
|
|
21ab4e |
last block of the file (determined by the size stored on the inode that
|
|
|
21ab4e |
we have set to heal->total_size), any data beyond the (imposed) end of
|
|
|
21ab4e |
file will be cleared with 0's. This causes the 512 bytes after the
|
|
|
21ab4e |
heal->total_size to be cleared. Since the file was written after heal
|
|
|
21ab4e |
started, the these bytes contained data, so the block written to the
|
|
|
21ab4e |
damaged brick will be incorrect.
|
|
|
21ab4e |
|
|
|
21ab4e |
Solution:
|
|
|
21ab4e |
Align heal->total_size to a multiple of the stripe size.
|
|
|
21ab4e |
|
|
|
21ab4e |
Thanks "Xavier Hernandez" <xhernandez@datalab.es>
|
|
|
21ab4e |
to find out the root cause and to fix the issue.
|
|
|
21ab4e |
|
|
|
21ab4e |
>Change-Id: I6c9f37b3ff9dd7f5dc1858ad6f9845c05b4e204e
|
|
|
21ab4e |
>BUG: 1428673
|
|
|
21ab4e |
>Signed-off-by: Ashish Pandey <aspandey@redhat.com>
|
|
|
21ab4e |
>Reviewed-on: https://review.gluster.org/16985
|
|
|
21ab4e |
>Smoke: Gluster Build System <jenkins@build.gluster.org>
|
|
|
21ab4e |
>NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
|
|
|
21ab4e |
>CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
|
|
|
21ab4e |
>Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
|
|
|
21ab4e |
>Reviewed-by: Xavier Hernandez <xhernandez@datalab.es>
|
|
|
21ab4e |
>Signed-off-by: Ashish Pandey <aspandey@redhat.com>
|
|
|
21ab4e |
|
|
|
21ab4e |
Change-Id: I6c9f37b3ff9dd7f5dc1858ad6f9845c05b4e204e
|
|
|
21ab4e |
BUG: 1427159
|
|
|
21ab4e |
Signed-off-by: Ashish Pandey <aspandey@redhat.com>
|
|
|
21ab4e |
Reviewed-on: https://code.engineering.redhat.com/gerrit/108397
|
|
|
21ab4e |
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
|
|
|
21ab4e |
---
|
|
|
21ab4e |
tests/basic/ec/ec-data-heal.t | 87 ++++++++++++++++++++++++++++++++++++++
|
|
|
21ab4e |
tests/include.rc | 1 +
|
|
|
21ab4e |
xlators/cluster/ec/src/ec-common.c | 11 ++++-
|
|
|
21ab4e |
xlators/cluster/ec/src/ec-heal.c | 17 +++++---
|
|
|
21ab4e |
4 files changed, 107 insertions(+), 9 deletions(-)
|
|
|
21ab4e |
create mode 100755 tests/basic/ec/ec-data-heal.t
|
|
|
21ab4e |
|
|
|
21ab4e |
diff --git a/tests/basic/ec/ec-data-heal.t b/tests/basic/ec/ec-data-heal.t
|
|
|
21ab4e |
new file mode 100755
|
|
|
21ab4e |
index 0000000..4599c8a
|
|
|
21ab4e |
--- /dev/null
|
|
|
21ab4e |
+++ b/tests/basic/ec/ec-data-heal.t
|
|
|
21ab4e |
@@ -0,0 +1,87 @@
|
|
|
21ab4e |
+#!/bin/bash
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+. $(dirname $0)/../../include.rc
|
|
|
21ab4e |
+. $(dirname $0)/../../volume.rc
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+#This test checks data corruption after heal while IO is going on
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+cleanup
|
|
|
21ab4e |
+TEST glusterd
|
|
|
21ab4e |
+TEST pidof glusterd
|
|
|
21ab4e |
+TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/${V0}{0..2}
|
|
|
21ab4e |
+TEST $CLI volume start $V0
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
|
|
|
21ab4e |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+############ Start IO ###########
|
|
|
21ab4e |
+TEST touch $M0/file
|
|
|
21ab4e |
+#start background IO on file
|
|
|
21ab4e |
+dd if=/dev/urandom of=$M0/file conv=fdatasync &
|
|
|
21ab4e |
+iopid=$(echo $!)
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+############ Kill and start brick0 for heal ###########
|
|
|
21ab4e |
+brick0=$(ps -p $(get_brick_pid $V0 $H0 $B0/${V0}0) -o args)
|
|
|
21ab4e |
+WORDTOREMOVE=COMMAND
|
|
|
21ab4e |
+brick0=${brick0//$WORDTOREMOVE/}
|
|
|
21ab4e |
+TEST kill_brick $V0 $H0 $B0/${V0}0
|
|
|
21ab4e |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0
|
|
|
21ab4e |
+#sleep so that data can be written which will be healed later
|
|
|
21ab4e |
+sleep 10
|
|
|
21ab4e |
+TEST eval $brick0
|
|
|
21ab4e |
+##wait for heal info to become 0 and kill IO
|
|
|
21ab4e |
+EXPECT_WITHIN $IO_HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
|
|
|
21ab4e |
+kill $iopid
|
|
|
21ab4e |
+EXPECT_WITHIN $IO_HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+############### Check md5sum #########################
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+## unmount and mount get md5sum after killing brick0
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+brick0=$(ps -p $(get_brick_pid $V0 $H0 $B0/${V0}0) -o args)
|
|
|
21ab4e |
+WORDTOREMOVE=COMMAND
|
|
|
21ab4e |
+brick0=${brick0//$WORDTOREMOVE/}
|
|
|
21ab4e |
+TEST kill_brick $V0 $H0 $B0/${V0}0
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
|
|
|
21ab4e |
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
|
|
|
21ab4e |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0
|
|
|
21ab4e |
+mdsum0=`md5sum $M0/file | awk '{print $1}'`
|
|
|
21ab4e |
+TEST eval $brick0
|
|
|
21ab4e |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+## unmount and mount get md5sum after killing brick1
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+brick1=$(ps -p $(get_brick_pid $V0 $H0 $B0/${V0}1) -o args)
|
|
|
21ab4e |
+WORDTOREMOVE=COMMAND
|
|
|
21ab4e |
+brick1=${brick1//$WORDTOREMOVE/}
|
|
|
21ab4e |
+TEST kill_brick $V0 $H0 $B0/${V0}1
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
|
|
|
21ab4e |
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
|
|
|
21ab4e |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0
|
|
|
21ab4e |
+mdsum1=`md5sum $M0/file | awk '{print $1}'`
|
|
|
21ab4e |
+TEST eval $brick1
|
|
|
21ab4e |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+## unmount and mount get md5sum after killing brick2
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+brick2=$(ps -p $(get_brick_pid $V0 $H0 $B0/${V0}2) -o args)
|
|
|
21ab4e |
+WORDTOREMOVE=COMMAND
|
|
|
21ab4e |
+brick2=${brick2//$WORDTOREMOVE/}
|
|
|
21ab4e |
+TEST kill_brick $V0 $H0 $B0/${V0}2
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
|
|
|
21ab4e |
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
|
|
|
21ab4e |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0
|
|
|
21ab4e |
+mdsum2=`md5sum $M0/file | awk '{print $1}'`
|
|
|
21ab4e |
+TEST eval $brick2
|
|
|
21ab4e |
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+# compare all the three md5sums
|
|
|
21ab4e |
+EXPECT "$mdsum0" echo $mdsum1
|
|
|
21ab4e |
+EXPECT "$mdsum0" echo $mdsum2
|
|
|
21ab4e |
+EXPECT "$mdsum1" echo $mdsum2
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+cleanup
|
|
|
21ab4e |
diff --git a/tests/include.rc b/tests/include.rc
|
|
|
21ab4e |
index ce2085f..21aabf5 100644
|
|
|
21ab4e |
--- a/tests/include.rc
|
|
|
21ab4e |
+++ b/tests/include.rc
|
|
|
21ab4e |
@@ -70,6 +70,7 @@ PROBE_TIMEOUT=60
|
|
|
21ab4e |
REBALANCE_TIMEOUT=360
|
|
|
21ab4e |
REOPEN_TIMEOUT=20
|
|
|
21ab4e |
HEAL_TIMEOUT=80
|
|
|
21ab4e |
+IO_HEAL_TIMEOUT=120
|
|
|
21ab4e |
MARKER_UPDATE_TIMEOUT=20
|
|
|
21ab4e |
JANITOR_TIMEOUT=60
|
|
|
21ab4e |
UMOUNT_TIMEOUT=5
|
|
|
21ab4e |
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
|
|
|
21ab4e |
index 1720d88..9a977d7 100644
|
|
|
21ab4e |
--- a/xlators/cluster/ec/src/ec-common.c
|
|
|
21ab4e |
+++ b/xlators/cluster/ec/src/ec-common.c
|
|
|
21ab4e |
@@ -1755,6 +1755,9 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk,
|
|
|
21ab4e |
}
|
|
|
21ab4e |
}
|
|
|
21ab4e |
|
|
|
21ab4e |
+ if (fop->healing) {
|
|
|
21ab4e |
+ lock->healing = fop->healing & (fop->good | fop->remaining);
|
|
|
21ab4e |
+ }
|
|
|
21ab4e |
ec_lock_update_good(lock, fop);
|
|
|
21ab4e |
|
|
|
21ab4e |
lock->exclusive -= (fop->flags & EC_FLAG_LOCK_SHARED) == 0;
|
|
|
21ab4e |
@@ -1952,6 +1955,8 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
|
|
|
21ab4e |
ec_lock_t *lock;
|
|
|
21ab4e |
ec_inode_t *ctx;
|
|
|
21ab4e |
dict_t * dict;
|
|
|
21ab4e |
+ uintptr_t update_on = 0;
|
|
|
21ab4e |
+
|
|
|
21ab4e |
int32_t err = -ENOMEM;
|
|
|
21ab4e |
|
|
|
21ab4e |
fop = link->fop;
|
|
|
21ab4e |
@@ -2006,12 +2011,14 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
|
|
|
21ab4e |
fop->frame->root->uid = 0;
|
|
|
21ab4e |
fop->frame->root->gid = 0;
|
|
|
21ab4e |
|
|
|
21ab4e |
+ update_on = lock->good_mask | lock->healing;
|
|
|
21ab4e |
+
|
|
|
21ab4e |
if (link->lock->fd == NULL) {
|
|
|
21ab4e |
- ec_xattrop(fop->frame, fop->xl, lock->good_mask, EC_MINIMUM_MIN,
|
|
|
21ab4e |
+ ec_xattrop(fop->frame, fop->xl, update_on, EC_MINIMUM_MIN,
|
|
|
21ab4e |
ec_update_size_version_done, link, &link->lock->loc,
|
|
|
21ab4e |
GF_XATTROP_ADD_ARRAY64, dict, NULL);
|
|
|
21ab4e |
} else {
|
|
|
21ab4e |
- ec_fxattrop(fop->frame, fop->xl, lock->good_mask, EC_MINIMUM_MIN,
|
|
|
21ab4e |
+ ec_fxattrop(fop->frame, fop->xl, update_on, EC_MINIMUM_MIN,
|
|
|
21ab4e |
ec_update_size_version_done, link, link->lock->fd,
|
|
|
21ab4e |
GF_XATTROP_ADD_ARRAY64, dict, NULL);
|
|
|
21ab4e |
}
|
|
|
21ab4e |
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
|
|
|
21ab4e |
index 02e8123..ae60d2f 100644
|
|
|
21ab4e |
--- a/xlators/cluster/ec/src/ec-heal.c
|
|
|
21ab4e |
+++ b/xlators/cluster/ec/src/ec-heal.c
|
|
|
21ab4e |
@@ -1908,8 +1908,8 @@ ec_rebuild_data (call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
|
|
|
21ab4e |
heal->fd = fd_ref (fd);
|
|
|
21ab4e |
heal->xl = ec->xl;
|
|
|
21ab4e |
heal->data = &barrier;
|
|
|
21ab4e |
- syncbarrier_init (heal->data);
|
|
|
21ab4e |
pool = ec->xl->ctx->iobuf_pool;
|
|
|
21ab4e |
+ size = ec_adjust_size (ec, size, 0);
|
|
|
21ab4e |
heal->total_size = size;
|
|
|
21ab4e |
heal->size = iobpool_default_pagesize (pool);
|
|
|
21ab4e |
/* We need to adjust the size to a multiple of the stripe size of the
|
|
|
21ab4e |
@@ -1947,13 +1947,15 @@ ec_rebuild_data (call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
|
|
|
21ab4e |
}
|
|
|
21ab4e |
|
|
|
21ab4e |
int
|
|
|
21ab4e |
-__ec_heal_trim_sinks (call_frame_t *frame, ec_t *ec, fd_t *fd,
|
|
|
21ab4e |
- unsigned char *healed_sinks, unsigned char *trim)
|
|
|
21ab4e |
+__ec_heal_trim_sinks (call_frame_t *frame, ec_t *ec,
|
|
|
21ab4e |
+ fd_t *fd, unsigned char *healed_sinks,
|
|
|
21ab4e |
+ unsigned char *trim, uint64_t size)
|
|
|
21ab4e |
{
|
|
|
21ab4e |
default_args_cbk_t *replies = NULL;
|
|
|
21ab4e |
unsigned char *output = NULL;
|
|
|
21ab4e |
int ret = 0;
|
|
|
21ab4e |
int i = 0;
|
|
|
21ab4e |
+ off_t trim_offset = 0;
|
|
|
21ab4e |
|
|
|
21ab4e |
EC_REPLIES_ALLOC (replies, ec->nodes);
|
|
|
21ab4e |
output = alloca0 (ec->nodes);
|
|
|
21ab4e |
@@ -1962,9 +1964,9 @@ __ec_heal_trim_sinks (call_frame_t *frame, ec_t *ec, fd_t *fd,
|
|
|
21ab4e |
ret = 0;
|
|
|
21ab4e |
goto out;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
-
|
|
|
21ab4e |
+ trim_offset = ec_adjust_size (ec, size, 1);
|
|
|
21ab4e |
ret = cluster_ftruncate (ec->xl_list, trim, ec->nodes, replies, output,
|
|
|
21ab4e |
- frame, ec->xl, fd, 0, NULL);
|
|
|
21ab4e |
+ frame, ec->xl, fd, trim_offset, NULL);
|
|
|
21ab4e |
for (i = 0; i < ec->nodes; i++) {
|
|
|
21ab4e |
if (!output[i] && trim[i])
|
|
|
21ab4e |
healed_sinks[i] = 0;
|
|
|
21ab4e |
@@ -2018,7 +2020,7 @@ ec_data_undo_pending (call_frame_t *frame, ec_t *ec, fd_t *fd, dict_t *xattr,
|
|
|
21ab4e |
|
|
|
21ab4e |
if ((memcmp (versions_xattr, allzero, sizeof (allzero)) == 0) &&
|
|
|
21ab4e |
(memcmp (dirty_xattr, allzero, sizeof (allzero)) == 0) &&
|
|
|
21ab4e |
- (size == 0)) {
|
|
|
21ab4e |
+ (size_xattr == 0)) {
|
|
|
21ab4e |
ret = 0;
|
|
|
21ab4e |
goto out;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
@@ -2224,7 +2226,8 @@ __ec_heal_data (call_frame_t *frame, ec_t *ec, fd_t *fd, unsigned char *heal_on,
|
|
|
21ab4e |
if (ret < 0)
|
|
|
21ab4e |
goto unlock;
|
|
|
21ab4e |
|
|
|
21ab4e |
- ret = __ec_heal_trim_sinks (frame, ec, fd, healed_sinks, trim);
|
|
|
21ab4e |
+ ret = __ec_heal_trim_sinks (frame, ec, fd, healed_sinks,
|
|
|
21ab4e |
+ trim, size[source]);
|
|
|
21ab4e |
}
|
|
|
21ab4e |
unlock:
|
|
|
21ab4e |
cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output,
|
|
|
21ab4e |
--
|
|
|
21ab4e |
1.8.3.1
|
|
|
21ab4e |
|