21ab4e
From d544399c921895b81240d2efd9371d2c8139faac Mon Sep 17 00:00:00 2001
21ab4e
From: Ashish Pandey <aspandey@redhat.com>
21ab4e
Date: Mon, 3 Apr 2017 12:46:29 +0530
21ab4e
Subject: [PATCH 491/509] cluster/ec: Update xattr and heal size properly
21ab4e
21ab4e
Problem-1 : Recursive healing of same file is happening
21ab4e
when IO is going on even after data heal completes.
21ab4e
21ab4e
Solution:
21ab4e
RCA: At the end of the write, when ec_update_size_version
21ab4e
gets called, we send it only on good bricks and not
21ab4e
on healing brick. Due to this, xattr on healing brick
21ab4e
will always remain out of sync and when the background
21ab4e
heal check source and sink, it finds this brick to be
21ab4e
healed and start healing from scratch. That involve
21ab4e
ftruncate and writing all of the data again.
21ab4e
21ab4e
To solve this, send xattrop on all the good bricks as
21ab4e
well as healing bricks.
21ab4e
21ab4e
Problem-2: The above fix exposes the data corruption
21ab4e
during heal. If the write on a file is going on and
21ab4e
heal finishes, we find that the file gets corrupted.
21ab4e
21ab4e
RCA:
21ab4e
The real problem happens in ec_rebuild_data(). Here we receive the
21ab4e
'size' argument which contains the real file size at the time of
21ab4e
starting self-heal and it's assigned to heal->total_size.
21ab4e
21ab4e
After that, a sequence of calls to ec_sync_heal_block() are done. Each
21ab4e
call ends up calling ec_manager_heal_block(), which does the actual work
21ab4e
of healing a block.
21ab4e
21ab4e
First a lock on the inode is taken in state EC_STATE_INIT using
21ab4e
ec_heal_inodelk(). When the lock is acquired, ec_heal_lock_cbk() is
21ab4e
called. This function calls ec_set_inode_size() to store the real size
21ab4e
of the inode (it uses heal->total_size).
21ab4e
21ab4e
The next step is to read the block to be healed. This is done using a
21ab4e
regular ec_readv(). One of the things this call does is to trim the
21ab4e
returned size if the file is smaller than the requested size.
21ab4e
21ab4e
In our case, when we read the last block of a file whose size was = 512
21ab4e
mod 1024 at the time of starting self-heal, ec_readv() will return only
21ab4e
the first 512 bytes, not the whole 1024 bytes.
21ab4e
21ab4e
This isn't a problem since the following ec_writev() sent from the heal
21ab4e
code only attempts to write the amount of data read, so it shouldn't
21ab4e
modify the remaining 512 bytes.
21ab4e
21ab4e
However ec_writev() also checks the file size. If we are writing the
21ab4e
last block of the file (determined by the size stored on the inode that
21ab4e
we have set to heal->total_size), any data beyond the (imposed) end of
21ab4e
file will be cleared with 0's. This causes the 512 bytes after the
21ab4e
heal->total_size to be cleared. Since the file was written after heal
21ab4e
started, the these bytes contained data, so the block written to the
21ab4e
damaged brick will be incorrect.
21ab4e
21ab4e
Solution:
21ab4e
Align heal->total_size to a multiple of the stripe size.
21ab4e
21ab4e
Thanks "Xavier Hernandez" <xhernandez@datalab.es>
21ab4e
to find out the root cause and to fix the issue.
21ab4e
21ab4e
>Change-Id: I6c9f37b3ff9dd7f5dc1858ad6f9845c05b4e204e
21ab4e
>BUG: 1428673
21ab4e
>Signed-off-by: Ashish Pandey <aspandey@redhat.com>
21ab4e
>Reviewed-on: https://review.gluster.org/16985
21ab4e
>Smoke: Gluster Build System <jenkins@build.gluster.org>
21ab4e
>NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
21ab4e
>CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
21ab4e
>Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
21ab4e
>Reviewed-by: Xavier Hernandez <xhernandez@datalab.es>
21ab4e
>Signed-off-by: Ashish Pandey <aspandey@redhat.com>
21ab4e
21ab4e
Change-Id: I6c9f37b3ff9dd7f5dc1858ad6f9845c05b4e204e
21ab4e
BUG: 1427159
21ab4e
Signed-off-by: Ashish Pandey <aspandey@redhat.com>
21ab4e
Reviewed-on: https://code.engineering.redhat.com/gerrit/108397
21ab4e
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
21ab4e
---
21ab4e
 tests/basic/ec/ec-data-heal.t      | 87 ++++++++++++++++++++++++++++++++++++++
21ab4e
 tests/include.rc                   |  1 +
21ab4e
 xlators/cluster/ec/src/ec-common.c | 11 ++++-
21ab4e
 xlators/cluster/ec/src/ec-heal.c   | 17 +++++---
21ab4e
 4 files changed, 107 insertions(+), 9 deletions(-)
21ab4e
 create mode 100755 tests/basic/ec/ec-data-heal.t
21ab4e
21ab4e
diff --git a/tests/basic/ec/ec-data-heal.t b/tests/basic/ec/ec-data-heal.t
21ab4e
new file mode 100755
21ab4e
index 0000000..4599c8a
21ab4e
--- /dev/null
21ab4e
+++ b/tests/basic/ec/ec-data-heal.t
21ab4e
@@ -0,0 +1,87 @@
21ab4e
+#!/bin/bash
21ab4e
+
21ab4e
+. $(dirname $0)/../../include.rc
21ab4e
+. $(dirname $0)/../../volume.rc
21ab4e
+
21ab4e
+#This test checks data corruption after heal while IO is going on
21ab4e
+
21ab4e
+cleanup
21ab4e
+TEST glusterd
21ab4e
+TEST pidof glusterd
21ab4e
+TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/${V0}{0..2}
21ab4e
+TEST $CLI volume start $V0
21ab4e
+
21ab4e
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
21ab4e
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
21ab4e
+
21ab4e
+############ Start IO ###########
21ab4e
+TEST touch $M0/file
21ab4e
+#start background IO on file
21ab4e
+dd if=/dev/urandom of=$M0/file conv=fdatasync &
21ab4e
+iopid=$(echo $!)
21ab4e
+
21ab4e
+
21ab4e
+############ Kill and start brick0 for heal ###########
21ab4e
+brick0=$(ps -p $(get_brick_pid $V0 $H0 $B0/${V0}0) -o args)
21ab4e
+WORDTOREMOVE=COMMAND
21ab4e
+brick0=${brick0//$WORDTOREMOVE/}
21ab4e
+TEST kill_brick $V0 $H0 $B0/${V0}0
21ab4e
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0
21ab4e
+#sleep so that data can be written which will be healed later
21ab4e
+sleep 10
21ab4e
+TEST eval $brick0
21ab4e
+##wait for heal info to become 0 and kill IO
21ab4e
+EXPECT_WITHIN $IO_HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
21ab4e
+kill $iopid
21ab4e
+EXPECT_WITHIN $IO_HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
21ab4e
+
21ab4e
+############### Check md5sum #########################
21ab4e
+
21ab4e
+## unmount and mount get md5sum after killing brick0
21ab4e
+
21ab4e
+brick0=$(ps -p $(get_brick_pid $V0 $H0 $B0/${V0}0) -o args)
21ab4e
+WORDTOREMOVE=COMMAND
21ab4e
+brick0=${brick0//$WORDTOREMOVE/}
21ab4e
+TEST kill_brick $V0 $H0 $B0/${V0}0
21ab4e
+
21ab4e
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
21ab4e
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
21ab4e
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0
21ab4e
+mdsum0=`md5sum $M0/file | awk '{print $1}'`
21ab4e
+TEST eval $brick0
21ab4e
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
21ab4e
+
21ab4e
+## unmount and mount get md5sum after killing brick1
21ab4e
+
21ab4e
+brick1=$(ps -p $(get_brick_pid $V0 $H0 $B0/${V0}1) -o args)
21ab4e
+WORDTOREMOVE=COMMAND
21ab4e
+brick1=${brick1//$WORDTOREMOVE/}
21ab4e
+TEST kill_brick $V0 $H0 $B0/${V0}1
21ab4e
+
21ab4e
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
21ab4e
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
21ab4e
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0
21ab4e
+mdsum1=`md5sum $M0/file | awk '{print $1}'`
21ab4e
+TEST eval $brick1
21ab4e
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
21ab4e
+
21ab4e
+## unmount and mount get md5sum after killing brick2
21ab4e
+
21ab4e
+brick2=$(ps -p $(get_brick_pid $V0 $H0 $B0/${V0}2) -o args)
21ab4e
+WORDTOREMOVE=COMMAND
21ab4e
+brick2=${brick2//$WORDTOREMOVE/}
21ab4e
+TEST kill_brick $V0 $H0 $B0/${V0}2
21ab4e
+
21ab4e
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
21ab4e
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
21ab4e
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0
21ab4e
+mdsum2=`md5sum $M0/file | awk '{print $1}'`
21ab4e
+TEST eval $brick2
21ab4e
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
21ab4e
+
21ab4e
+# compare all the three md5sums
21ab4e
+EXPECT "$mdsum0" echo $mdsum1
21ab4e
+EXPECT "$mdsum0" echo $mdsum2
21ab4e
+EXPECT "$mdsum1" echo $mdsum2
21ab4e
+
21ab4e
+cleanup
21ab4e
diff --git a/tests/include.rc b/tests/include.rc
21ab4e
index ce2085f..21aabf5 100644
21ab4e
--- a/tests/include.rc
21ab4e
+++ b/tests/include.rc
21ab4e
@@ -70,6 +70,7 @@ PROBE_TIMEOUT=60
21ab4e
 REBALANCE_TIMEOUT=360
21ab4e
 REOPEN_TIMEOUT=20
21ab4e
 HEAL_TIMEOUT=80
21ab4e
+IO_HEAL_TIMEOUT=120
21ab4e
 MARKER_UPDATE_TIMEOUT=20
21ab4e
 JANITOR_TIMEOUT=60
21ab4e
 UMOUNT_TIMEOUT=5
21ab4e
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
21ab4e
index 1720d88..9a977d7 100644
21ab4e
--- a/xlators/cluster/ec/src/ec-common.c
21ab4e
+++ b/xlators/cluster/ec/src/ec-common.c
21ab4e
@@ -1755,6 +1755,9 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk,
21ab4e
         }
21ab4e
     }
21ab4e
 
21ab4e
+    if (fop->healing) {
21ab4e
+           lock->healing = fop->healing & (fop->good | fop->remaining);
21ab4e
+    }
21ab4e
     ec_lock_update_good(lock, fop);
21ab4e
 
21ab4e
     lock->exclusive -= (fop->flags & EC_FLAG_LOCK_SHARED) == 0;
21ab4e
@@ -1952,6 +1955,8 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
21ab4e
     ec_lock_t *lock;
21ab4e
     ec_inode_t *ctx;
21ab4e
     dict_t * dict;
21ab4e
+    uintptr_t   update_on = 0;
21ab4e
+
21ab4e
     int32_t err = -ENOMEM;
21ab4e
 
21ab4e
     fop = link->fop;
21ab4e
@@ -2006,12 +2011,14 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
21ab4e
     fop->frame->root->uid = 0;
21ab4e
     fop->frame->root->gid = 0;
21ab4e
 
21ab4e
+    update_on = lock->good_mask | lock->healing;
21ab4e
+
21ab4e
     if (link->lock->fd == NULL) {
21ab4e
-            ec_xattrop(fop->frame, fop->xl, lock->good_mask, EC_MINIMUM_MIN,
21ab4e
+            ec_xattrop(fop->frame, fop->xl, update_on, EC_MINIMUM_MIN,
21ab4e
                        ec_update_size_version_done, link, &link->lock->loc,
21ab4e
                        GF_XATTROP_ADD_ARRAY64, dict, NULL);
21ab4e
     } else {
21ab4e
-            ec_fxattrop(fop->frame, fop->xl, lock->good_mask, EC_MINIMUM_MIN,
21ab4e
+            ec_fxattrop(fop->frame, fop->xl, update_on, EC_MINIMUM_MIN,
21ab4e
                        ec_update_size_version_done, link, link->lock->fd,
21ab4e
                        GF_XATTROP_ADD_ARRAY64, dict, NULL);
21ab4e
     }
21ab4e
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
21ab4e
index 02e8123..ae60d2f 100644
21ab4e
--- a/xlators/cluster/ec/src/ec-heal.c
21ab4e
+++ b/xlators/cluster/ec/src/ec-heal.c
21ab4e
@@ -1908,8 +1908,8 @@ ec_rebuild_data (call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
21ab4e
         heal->fd = fd_ref (fd);
21ab4e
         heal->xl = ec->xl;
21ab4e
         heal->data = &barrier;
21ab4e
-        syncbarrier_init (heal->data);
21ab4e
         pool = ec->xl->ctx->iobuf_pool;
21ab4e
+        size = ec_adjust_size (ec, size, 0);
21ab4e
         heal->total_size = size;
21ab4e
         heal->size = iobpool_default_pagesize (pool);
21ab4e
         /* We need to adjust the size to a multiple of the stripe size of the
21ab4e
@@ -1947,13 +1947,15 @@ ec_rebuild_data (call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
21ab4e
 }
21ab4e
 
21ab4e
 int
21ab4e
-__ec_heal_trim_sinks (call_frame_t *frame, ec_t *ec, fd_t *fd,
21ab4e
-                      unsigned char *healed_sinks, unsigned char *trim)
21ab4e
+__ec_heal_trim_sinks (call_frame_t *frame, ec_t *ec,
21ab4e
+                      fd_t *fd, unsigned char *healed_sinks,
21ab4e
+                      unsigned char *trim, uint64_t size)
21ab4e
 {
21ab4e
         default_args_cbk_t *replies = NULL;
21ab4e
         unsigned char      *output  = NULL;
21ab4e
         int                ret      = 0;
21ab4e
         int                i        = 0;
21ab4e
+        off_t        trim_offset    = 0;
21ab4e
 
21ab4e
         EC_REPLIES_ALLOC (replies, ec->nodes);
21ab4e
         output       = alloca0 (ec->nodes);
21ab4e
@@ -1962,9 +1964,9 @@ __ec_heal_trim_sinks (call_frame_t *frame, ec_t *ec, fd_t *fd,
21ab4e
                 ret = 0;
21ab4e
                 goto out;
21ab4e
         }
21ab4e
-
21ab4e
+        trim_offset = ec_adjust_size (ec, size, 1);
21ab4e
         ret = cluster_ftruncate (ec->xl_list, trim, ec->nodes, replies, output,
21ab4e
-                                 frame, ec->xl, fd, 0, NULL);
21ab4e
+                                 frame, ec->xl, fd, trim_offset, NULL);
21ab4e
         for (i = 0; i < ec->nodes; i++) {
21ab4e
                 if (!output[i] && trim[i])
21ab4e
                         healed_sinks[i] = 0;
21ab4e
@@ -2018,7 +2020,7 @@ ec_data_undo_pending (call_frame_t *frame, ec_t *ec, fd_t *fd, dict_t *xattr,
21ab4e
 
21ab4e
         if ((memcmp (versions_xattr, allzero, sizeof (allzero)) == 0) &&
21ab4e
             (memcmp (dirty_xattr, allzero, sizeof (allzero)) == 0) &&
21ab4e
-             (size == 0)) {
21ab4e
+             (size_xattr == 0)) {
21ab4e
                 ret = 0;
21ab4e
                 goto out;
21ab4e
         }
21ab4e
@@ -2224,7 +2226,8 @@ __ec_heal_data (call_frame_t *frame, ec_t *ec, fd_t *fd, unsigned char *heal_on,
21ab4e
                 if (ret < 0)
21ab4e
                         goto unlock;
21ab4e
 
21ab4e
-                ret = __ec_heal_trim_sinks (frame, ec, fd, healed_sinks, trim);
21ab4e
+                ret = __ec_heal_trim_sinks (frame, ec, fd, healed_sinks,
21ab4e
+                                            trim, size[source]);
21ab4e
         }
21ab4e
 unlock:
21ab4e
         cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output,
21ab4e
-- 
21ab4e
1.8.3.1
21ab4e