Tree - rpms/glusterfs - CentOS Git server

rpms / glusterfs

Blame SOURCES/0491-cluster-ec-Update-xattr-and-heal-size-properly.patch

Blob History Raw

		21ab4e	`From d544399c921895b81240d2efd9371d2c8139faac Mon Sep 17 00:00:00 2001`
		21ab4e	`From: Ashish Pandey <aspandey@redhat.com>`
		21ab4e	`Date: Mon, 3 Apr 2017 12:46:29 +0530`
		21ab4e	`Subject: [PATCH 491/509] cluster/ec: Update xattr and heal size properly`
		21ab4e
		21ab4e	`Problem-1 : Recursive healing of same file is happening`
		21ab4e	`when IO is going on even after data heal completes.`
		21ab4e
		21ab4e	`Solution:`
		21ab4e	`RCA: At the end of the write, when ec_update_size_version`
		21ab4e	`gets called, we send it only on good bricks and not`
		21ab4e	`on healing brick. Due to this, xattr on healing brick`
		21ab4e	`will always remain out of sync and when the background`
		21ab4e	`heal check source and sink, it finds this brick to be`
		21ab4e	`healed and start healing from scratch. That involve`
		21ab4e	`ftruncate and writing all of the data again.`
		21ab4e
		21ab4e	`To solve this, send xattrop on all the good bricks as`
		21ab4e	`well as healing bricks.`
		21ab4e
		21ab4e	`Problem-2: The above fix exposes the data corruption`
		21ab4e	`during heal. If the write on a file is going on and`
		21ab4e	`heal finishes, we find that the file gets corrupted.`
		21ab4e
		21ab4e	`RCA:`
		21ab4e	`The real problem happens in ec_rebuild_data(). Here we receive the`
		21ab4e	`'size' argument which contains the real file size at the time of`
		21ab4e	`starting self-heal and it's assigned to heal->total_size.`
		21ab4e
		21ab4e	`After that, a sequence of calls to ec_sync_heal_block() are done. Each`
		21ab4e	`call ends up calling ec_manager_heal_block(), which does the actual work`
		21ab4e	`of healing a block.`
		21ab4e
		21ab4e	`First a lock on the inode is taken in state EC_STATE_INIT using`
		21ab4e	`ec_heal_inodelk(). When the lock is acquired, ec_heal_lock_cbk() is`
		21ab4e	`called. This function calls ec_set_inode_size() to store the real size`
		21ab4e	`of the inode (it uses heal->total_size).`
		21ab4e
		21ab4e	`The next step is to read the block to be healed. This is done using a`
		21ab4e	`regular ec_readv(). One of the things this call does is to trim the`
		21ab4e	`returned size if the file is smaller than the requested size.`
		21ab4e
		21ab4e	`In our case, when we read the last block of a file whose size was = 512`
		21ab4e	`mod 1024 at the time of starting self-heal, ec_readv() will return only`
		21ab4e	`the first 512 bytes, not the whole 1024 bytes.`
		21ab4e
		21ab4e	`This isn't a problem since the following ec_writev() sent from the heal`
		21ab4e	`code only attempts to write the amount of data read, so it shouldn't`
		21ab4e	`modify the remaining 512 bytes.`
		21ab4e
		21ab4e	`However ec_writev() also checks the file size. If we are writing the`
		21ab4e	`last block of the file (determined by the size stored on the inode that`
		21ab4e	`we have set to heal->total_size), any data beyond the (imposed) end of`
		21ab4e	`file will be cleared with 0's. This causes the 512 bytes after the`
		21ab4e	`heal->total_size to be cleared. Since the file was written after heal`
		21ab4e	`started, the these bytes contained data, so the block written to the`
		21ab4e	`damaged brick will be incorrect.`
		21ab4e
		21ab4e	`Solution:`
		21ab4e	`Align heal->total_size to a multiple of the stripe size.`
		21ab4e
		21ab4e	`Thanks "Xavier Hernandez" <xhernandez@datalab.es>`
		21ab4e	`to find out the root cause and to fix the issue.`
		21ab4e
		21ab4e	`>Change-Id: I6c9f37b3ff9dd7f5dc1858ad6f9845c05b4e204e`
		21ab4e	`>BUG: 1428673`
		21ab4e	`>Signed-off-by: Ashish Pandey <aspandey@redhat.com>`
		21ab4e	`>Reviewed-on: https://review.gluster.org/16985`
		21ab4e	`>Smoke: Gluster Build System <jenkins@build.gluster.org>`
		21ab4e	`>NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>`
		21ab4e	`>CentOS-regression: Gluster Build System <jenkins@build.gluster.org>`
		21ab4e	`>Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>`
		21ab4e	`>Reviewed-by: Xavier Hernandez <xhernandez@datalab.es>`
		21ab4e	`>Signed-off-by: Ashish Pandey <aspandey@redhat.com>`
		21ab4e
		21ab4e	`Change-Id: I6c9f37b3ff9dd7f5dc1858ad6f9845c05b4e204e`
		21ab4e	`BUG: 1427159`
		21ab4e	`Signed-off-by: Ashish Pandey <aspandey@redhat.com>`
		21ab4e	`Reviewed-on: https://code.engineering.redhat.com/gerrit/108397`
		21ab4e	`Reviewed-by: Atin Mukherjee <amukherj@redhat.com>`
		21ab4e	`---`
		21ab4e	`tests/basic/ec/ec-data-heal.t \| 87 ++++++++++++++++++++++++++++++++++++++`
		21ab4e	`tests/include.rc \| 1 +`
		21ab4e	`xlators/cluster/ec/src/ec-common.c \| 11 ++++-`
		21ab4e	`xlators/cluster/ec/src/ec-heal.c \| 17 +++++---`
		21ab4e	`4 files changed, 107 insertions(+), 9 deletions(-)`
		21ab4e	`create mode 100755 tests/basic/ec/ec-data-heal.t`
		21ab4e
		21ab4e	`diff --git a/tests/basic/ec/ec-data-heal.t b/tests/basic/ec/ec-data-heal.t`
		21ab4e	`new file mode 100755`
		21ab4e	`index 0000000..4599c8a`
		21ab4e	`--- /dev/null`
		21ab4e	`+++ b/tests/basic/ec/ec-data-heal.t`
		21ab4e	`@@ -0,0 +1,87 @@`
		21ab4e	`+#!/bin/bash`
		21ab4e	`+`
		21ab4e	`+. $(dirname $0)/../../include.rc`
		21ab4e	`+. $(dirname $0)/../../volume.rc`
		21ab4e	`+`
		21ab4e	`+#This test checks data corruption after heal while IO is going on`
		21ab4e	`+`
		21ab4e	`+cleanup`
		21ab4e	`+TEST glusterd`
		21ab4e	`+TEST pidof glusterd`
		21ab4e	`+TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/${V0}{0..2}`
		21ab4e	`+TEST $CLI volume start $V0`
		21ab4e	`+`
		21ab4e	`+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;`
		21ab4e	`+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0`
		21ab4e	`+`
		21ab4e	`+############ Start IO ###########`
		21ab4e	`+TEST touch $M0/file`
		21ab4e	`+#start background IO on file`
		21ab4e	`+dd if=/dev/urandom of=$M0/file conv=fdatasync &`
		21ab4e	`+iopid=$(echo $!)`
		21ab4e	`+`
		21ab4e	`+`
		21ab4e	`+############ Kill and start brick0 for heal ###########`
		21ab4e	`+brick0=$(ps -p $(get_brick_pid $V0 $H0 $B0/${V0}0) -o args)`
		21ab4e	`+WORDTOREMOVE=COMMAND`
		21ab4e	`+brick0=${brick0//$WORDTOREMOVE/}`
		21ab4e	`+TEST kill_brick $V0 $H0 $B0/${V0}0`
		21ab4e	`+EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0`
		21ab4e	`+#sleep so that data can be written which will be healed later`
		21ab4e	`+sleep 10`
		21ab4e	`+TEST eval $brick0`
		21ab4e	`+##wait for heal info to become 0 and kill IO`
		21ab4e	`+EXPECT_WITHIN $IO_HEAL_TIMEOUT "^0$" get_pending_heal_count $V0`
		21ab4e	`+kill $iopid`
		21ab4e	`+EXPECT_WITHIN $IO_HEAL_TIMEOUT "^0$" get_pending_heal_count $V0`
		21ab4e	`+`
		21ab4e	`+############### Check md5sum #########################`
		21ab4e	`+`
		21ab4e	`+## unmount and mount get md5sum after killing brick0`
		21ab4e	`+`
		21ab4e	`+brick0=$(ps -p $(get_brick_pid $V0 $H0 $B0/${V0}0) -o args)`
		21ab4e	`+WORDTOREMOVE=COMMAND`
		21ab4e	`+brick0=${brick0//$WORDTOREMOVE/}`
		21ab4e	`+TEST kill_brick $V0 $H0 $B0/${V0}0`
		21ab4e	`+`
		21ab4e	`+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0`
		21ab4e	`+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;`
		21ab4e	`+EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0`
		21ab4e	+mdsum0=`md5sum $M0/file \| awk '{print $1}'`
		21ab4e	`+TEST eval $brick0`
		21ab4e	`+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0`
		21ab4e	`+`
		21ab4e	`+## unmount and mount get md5sum after killing brick1`
		21ab4e	`+`
		21ab4e	`+brick1=$(ps -p $(get_brick_pid $V0 $H0 $B0/${V0}1) -o args)`
		21ab4e	`+WORDTOREMOVE=COMMAND`
		21ab4e	`+brick1=${brick1//$WORDTOREMOVE/}`
		21ab4e	`+TEST kill_brick $V0 $H0 $B0/${V0}1`
		21ab4e	`+`
		21ab4e	`+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0`
		21ab4e	`+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;`
		21ab4e	`+EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0`
		21ab4e	+mdsum1=`md5sum $M0/file \| awk '{print $1}'`
		21ab4e	`+TEST eval $brick1`
		21ab4e	`+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0`
		21ab4e	`+`
		21ab4e	`+## unmount and mount get md5sum after killing brick2`
		21ab4e	`+`
		21ab4e	`+brick2=$(ps -p $(get_brick_pid $V0 $H0 $B0/${V0}2) -o args)`
		21ab4e	`+WORDTOREMOVE=COMMAND`
		21ab4e	`+brick2=${brick2//$WORDTOREMOVE/}`
		21ab4e	`+TEST kill_brick $V0 $H0 $B0/${V0}2`
		21ab4e	`+`
		21ab4e	`+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0`
		21ab4e	`+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;`
		21ab4e	`+EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0`
		21ab4e	+mdsum2=`md5sum $M0/file \| awk '{print $1}'`
		21ab4e	`+TEST eval $brick2`
		21ab4e	`+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0`
		21ab4e	`+`
		21ab4e	`+# compare all the three md5sums`
		21ab4e	`+EXPECT "$mdsum0" echo $mdsum1`
		21ab4e	`+EXPECT "$mdsum0" echo $mdsum2`
		21ab4e	`+EXPECT "$mdsum1" echo $mdsum2`
		21ab4e	`+`
		21ab4e	`+cleanup`
		21ab4e	`diff --git a/tests/include.rc b/tests/include.rc`
		21ab4e	`index ce2085f..21aabf5 100644`
		21ab4e	`--- a/tests/include.rc`
		21ab4e	`+++ b/tests/include.rc`
		21ab4e	`@@ -70,6 +70,7 @@ PROBE_TIMEOUT=60`
		21ab4e	`REBALANCE_TIMEOUT=360`
		21ab4e	`REOPEN_TIMEOUT=20`
		21ab4e	`HEAL_TIMEOUT=80`
		21ab4e	`+IO_HEAL_TIMEOUT=120`
		21ab4e	`MARKER_UPDATE_TIMEOUT=20`
		21ab4e	`JANITOR_TIMEOUT=60`
		21ab4e	`UMOUNT_TIMEOUT=5`
		21ab4e	`diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c`
		21ab4e	`index 1720d88..9a977d7 100644`
		21ab4e	`--- a/xlators/cluster/ec/src/ec-common.c`
		21ab4e	`+++ b/xlators/cluster/ec/src/ec-common.c`
		21ab4e	`@@ -1755,6 +1755,9 @@ ec_lock_next_owner(ec_lock_link_t link, ec_cbk_data_t cbk,`
		21ab4e	`}`
		21ab4e	`}`
		21ab4e
		21ab4e	`+ if (fop->healing) {`
		21ab4e	`+ lock->healing = fop->healing & (fop->good \| fop->remaining);`
		21ab4e	`+ }`
		21ab4e	`ec_lock_update_good(lock, fop);`
		21ab4e
		21ab4e	`lock->exclusive -= (fop->flags & EC_FLAG_LOCK_SHARED) == 0;`
		21ab4e	`@@ -1952,6 +1955,8 @@ ec_update_size_version(ec_lock_link_t link, uint64_t version,`
		21ab4e	`ec_lock_t *lock;`
		21ab4e	`ec_inode_t *ctx;`
		21ab4e	`dict_t * dict;`
		21ab4e	`+ uintptr_t update_on = 0;`
		21ab4e	`+`
		21ab4e	`int32_t err = -ENOMEM;`
		21ab4e
		21ab4e	`fop = link->fop;`
		21ab4e	`@@ -2006,12 +2011,14 @@ ec_update_size_version(ec_lock_link_t link, uint64_t version,`
		21ab4e	`fop->frame->root->uid = 0;`
		21ab4e	`fop->frame->root->gid = 0;`
		21ab4e
		21ab4e	`+ update_on = lock->good_mask \| lock->healing;`
		21ab4e	`+`
		21ab4e	`if (link->lock->fd == NULL) {`
		21ab4e	`- ec_xattrop(fop->frame, fop->xl, lock->good_mask, EC_MINIMUM_MIN,`
		21ab4e	`+ ec_xattrop(fop->frame, fop->xl, update_on, EC_MINIMUM_MIN,`
		21ab4e	`ec_update_size_version_done, link, &link->lock->loc,`
		21ab4e	`GF_XATTROP_ADD_ARRAY64, dict, NULL);`
		21ab4e	`} else {`
		21ab4e	`- ec_fxattrop(fop->frame, fop->xl, lock->good_mask, EC_MINIMUM_MIN,`
		21ab4e	`+ ec_fxattrop(fop->frame, fop->xl, update_on, EC_MINIMUM_MIN,`
		21ab4e	`ec_update_size_version_done, link, link->lock->fd,`
		21ab4e	`GF_XATTROP_ADD_ARRAY64, dict, NULL);`
		21ab4e	`}`
		21ab4e	`diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c`
		21ab4e	`index 02e8123..ae60d2f 100644`
		21ab4e	`--- a/xlators/cluster/ec/src/ec-heal.c`
		21ab4e	`+++ b/xlators/cluster/ec/src/ec-heal.c`
		21ab4e	`@@ -1908,8 +1908,8 @@ ec_rebuild_data (call_frame_t frame, ec_t ec, fd_t *fd, uint64_t size,`
		21ab4e	`heal->fd = fd_ref (fd);`
		21ab4e	`heal->xl = ec->xl;`
		21ab4e	`heal->data = &barrier;`
		21ab4e	`- syncbarrier_init (heal->data);`
		21ab4e	`pool = ec->xl->ctx->iobuf_pool;`
		21ab4e	`+ size = ec_adjust_size (ec, size, 0);`
		21ab4e	`heal->total_size = size;`
		21ab4e	`heal->size = iobpool_default_pagesize (pool);`
		21ab4e	`/* We need to adjust the size to a multiple of the stripe size of the`
		21ab4e	`@@ -1947,13 +1947,15 @@ ec_rebuild_data (call_frame_t frame, ec_t ec, fd_t *fd, uint64_t size,`
		21ab4e	`}`
		21ab4e
		21ab4e	`int`
		21ab4e	`-__ec_heal_trim_sinks (call_frame_t frame, ec_t ec, fd_t *fd,`
		21ab4e	`- unsigned char healed_sinks, unsigned char trim)`
		21ab4e	`+__ec_heal_trim_sinks (call_frame_t frame, ec_t ec,`
		21ab4e	`+ fd_t fd, unsigned char healed_sinks,`
		21ab4e	`+ unsigned char *trim, uint64_t size)`
		21ab4e	`{`
		21ab4e	`default_args_cbk_t *replies = NULL;`
		21ab4e	`unsigned char *output = NULL;`
		21ab4e	`int ret = 0;`
		21ab4e	`int i = 0;`
		21ab4e	`+ off_t trim_offset = 0;`
		21ab4e
		21ab4e	`EC_REPLIES_ALLOC (replies, ec->nodes);`
		21ab4e	`output = alloca0 (ec->nodes);`
		21ab4e	`@@ -1962,9 +1964,9 @@ __ec_heal_trim_sinks (call_frame_t frame, ec_t ec, fd_t *fd,`
		21ab4e	`ret = 0;`
		21ab4e	`goto out;`
		21ab4e	`}`
		21ab4e	`-`
		21ab4e	`+ trim_offset = ec_adjust_size (ec, size, 1);`
		21ab4e	`ret = cluster_ftruncate (ec->xl_list, trim, ec->nodes, replies, output,`
		21ab4e	`- frame, ec->xl, fd, 0, NULL);`
		21ab4e	`+ frame, ec->xl, fd, trim_offset, NULL);`
		21ab4e	`for (i = 0; i < ec->nodes; i++) {`
		21ab4e	`if (!output[i] && trim[i])`
		21ab4e	`healed_sinks[i] = 0;`
		21ab4e	`@@ -2018,7 +2020,7 @@ ec_data_undo_pending (call_frame_t frame, ec_t ec, fd_t fd, dict_t xattr,`
		21ab4e
		21ab4e	`if ((memcmp (versions_xattr, allzero, sizeof (allzero)) == 0) &&`
		21ab4e	`(memcmp (dirty_xattr, allzero, sizeof (allzero)) == 0) &&`
		21ab4e	`- (size == 0)) {`
		21ab4e	`+ (size_xattr == 0)) {`
		21ab4e	`ret = 0;`
		21ab4e	`goto out;`
		21ab4e	`}`
		21ab4e	`@@ -2224,7 +2226,8 @@ __ec_heal_data (call_frame_t frame, ec_t ec, fd_t fd, unsigned char heal_on,`
		21ab4e	`if (ret < 0)`
		21ab4e	`goto unlock;`
		21ab4e
		21ab4e	`- ret = __ec_heal_trim_sinks (frame, ec, fd, healed_sinks, trim);`
		21ab4e	`+ ret = __ec_heal_trim_sinks (frame, ec, fd, healed_sinks,`
		21ab4e	`+ trim, size[source]);`
		21ab4e	`}`
		21ab4e	`unlock:`
		21ab4e	`cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output,`
		21ab4e	`--`
		21ab4e	`1.8.3.1`
		21ab4e

rpms / glusterfs

Source Code

Blame SOURCES/0491-cluster-ec-Update-xattr-and-heal-size-properly.patch