Blob Blame History Raw
From d544399c921895b81240d2efd9371d2c8139faac Mon Sep 17 00:00:00 2001
From: Ashish Pandey <aspandey@redhat.com>
Date: Mon, 3 Apr 2017 12:46:29 +0530
Subject: [PATCH 491/509] cluster/ec: Update xattr and heal size properly

Problem-1 : Recursive healing of same file is happening
when IO is going on even after data heal completes.

Solution:
RCA: At the end of the write, when ec_update_size_version
gets called, we send it only on good bricks and not
on healing brick. Due to this, xattr on healing brick
will always remain out of sync and when the background
heal check source and sink, it finds this brick to be
healed and start healing from scratch. That involve
ftruncate and writing all of the data again.

To solve this, send xattrop on all the good bricks as
well as healing bricks.

Problem-2: The above fix exposes the data corruption
during heal. If the write on a file is going on and
heal finishes, we find that the file gets corrupted.

RCA:
The real problem happens in ec_rebuild_data(). Here we receive the
'size' argument which contains the real file size at the time of
starting self-heal and it's assigned to heal->total_size.

After that, a sequence of calls to ec_sync_heal_block() are done. Each
call ends up calling ec_manager_heal_block(), which does the actual work
of healing a block.

First a lock on the inode is taken in state EC_STATE_INIT using
ec_heal_inodelk(). When the lock is acquired, ec_heal_lock_cbk() is
called. This function calls ec_set_inode_size() to store the real size
of the inode (it uses heal->total_size).

The next step is to read the block to be healed. This is done using a
regular ec_readv(). One of the things this call does is to trim the
returned size if the file is smaller than the requested size.

In our case, when we read the last block of a file whose size was = 512
mod 1024 at the time of starting self-heal, ec_readv() will return only
the first 512 bytes, not the whole 1024 bytes.

This isn't a problem since the following ec_writev() sent from the heal
code only attempts to write the amount of data read, so it shouldn't
modify the remaining 512 bytes.

However ec_writev() also checks the file size. If we are writing the
last block of the file (determined by the size stored on the inode that
we have set to heal->total_size), any data beyond the (imposed) end of
file will be cleared with 0's. This causes the 512 bytes after the
heal->total_size to be cleared. Since the file was written after heal
started, the these bytes contained data, so the block written to the
damaged brick will be incorrect.

Solution:
Align heal->total_size to a multiple of the stripe size.

Thanks "Xavier Hernandez" <xhernandez@datalab.es>
to find out the root cause and to fix the issue.

>Change-Id: I6c9f37b3ff9dd7f5dc1858ad6f9845c05b4e204e
>BUG: 1428673
>Signed-off-by: Ashish Pandey <aspandey@redhat.com>
>Reviewed-on: https://review.gluster.org/16985
>Smoke: Gluster Build System <jenkins@build.gluster.org>
>NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
>CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
>Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
>Reviewed-by: Xavier Hernandez <xhernandez@datalab.es>
>Signed-off-by: Ashish Pandey <aspandey@redhat.com>

Change-Id: I6c9f37b3ff9dd7f5dc1858ad6f9845c05b4e204e
BUG: 1427159
Signed-off-by: Ashish Pandey <aspandey@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/108397
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 tests/basic/ec/ec-data-heal.t      | 87 ++++++++++++++++++++++++++++++++++++++
 tests/include.rc                   |  1 +
 xlators/cluster/ec/src/ec-common.c | 11 ++++-
 xlators/cluster/ec/src/ec-heal.c   | 17 +++++---
 4 files changed, 107 insertions(+), 9 deletions(-)
 create mode 100755 tests/basic/ec/ec-data-heal.t

diff --git a/tests/basic/ec/ec-data-heal.t b/tests/basic/ec/ec-data-heal.t
new file mode 100755
index 0000000..4599c8a
--- /dev/null
+++ b/tests/basic/ec/ec-data-heal.t
@@ -0,0 +1,87 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+#This test checks data corruption after heal while IO is going on
+
+cleanup
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/${V0}{0..2}
+TEST $CLI volume start $V0
+
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+
+############ Start IO ###########
+TEST touch $M0/file
+#start background IO on file
+dd if=/dev/urandom of=$M0/file conv=fdatasync &
+iopid=$(echo $!)
+
+
+############ Kill and start brick0 for heal ###########
+brick0=$(ps -p $(get_brick_pid $V0 $H0 $B0/${V0}0) -o args)
+WORDTOREMOVE=COMMAND
+brick0=${brick0//$WORDTOREMOVE/}
+TEST kill_brick $V0 $H0 $B0/${V0}0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0
+#sleep so that data can be written which will be healed later
+sleep 10
+TEST eval $brick0
+##wait for heal info to become 0 and kill IO
+EXPECT_WITHIN $IO_HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+kill $iopid
+EXPECT_WITHIN $IO_HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+############### Check md5sum #########################
+
+## unmount and mount get md5sum after killing brick0
+
+brick0=$(ps -p $(get_brick_pid $V0 $H0 $B0/${V0}0) -o args)
+WORDTOREMOVE=COMMAND
+brick0=${brick0//$WORDTOREMOVE/}
+TEST kill_brick $V0 $H0 $B0/${V0}0
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0
+mdsum0=`md5sum $M0/file | awk '{print $1}'`
+TEST eval $brick0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+
+## unmount and mount get md5sum after killing brick1
+
+brick1=$(ps -p $(get_brick_pid $V0 $H0 $B0/${V0}1) -o args)
+WORDTOREMOVE=COMMAND
+brick1=${brick1//$WORDTOREMOVE/}
+TEST kill_brick $V0 $H0 $B0/${V0}1
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0
+mdsum1=`md5sum $M0/file | awk '{print $1}'`
+TEST eval $brick1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+
+## unmount and mount get md5sum after killing brick2
+
+brick2=$(ps -p $(get_brick_pid $V0 $H0 $B0/${V0}2) -o args)
+WORDTOREMOVE=COMMAND
+brick2=${brick2//$WORDTOREMOVE/}
+TEST kill_brick $V0 $H0 $B0/${V0}2
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "2" ec_child_up_count $V0 0
+mdsum2=`md5sum $M0/file | awk '{print $1}'`
+TEST eval $brick2
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+
+# compare all the three md5sums
+EXPECT "$mdsum0" echo $mdsum1
+EXPECT "$mdsum0" echo $mdsum2
+EXPECT "$mdsum1" echo $mdsum2
+
+cleanup
diff --git a/tests/include.rc b/tests/include.rc
index ce2085f..21aabf5 100644
--- a/tests/include.rc
+++ b/tests/include.rc
@@ -70,6 +70,7 @@ PROBE_TIMEOUT=60
 REBALANCE_TIMEOUT=360
 REOPEN_TIMEOUT=20
 HEAL_TIMEOUT=80
+IO_HEAL_TIMEOUT=120
 MARKER_UPDATE_TIMEOUT=20
 JANITOR_TIMEOUT=60
 UMOUNT_TIMEOUT=5
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index 1720d88..9a977d7 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -1755,6 +1755,9 @@ ec_lock_next_owner(ec_lock_link_t *link, ec_cbk_data_t *cbk,
         }
     }
 
+    if (fop->healing) {
+           lock->healing = fop->healing & (fop->good | fop->remaining);
+    }
     ec_lock_update_good(lock, fop);
 
     lock->exclusive -= (fop->flags & EC_FLAG_LOCK_SHARED) == 0;
@@ -1952,6 +1955,8 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
     ec_lock_t *lock;
     ec_inode_t *ctx;
     dict_t * dict;
+    uintptr_t   update_on = 0;
+
     int32_t err = -ENOMEM;
 
     fop = link->fop;
@@ -2006,12 +2011,14 @@ ec_update_size_version(ec_lock_link_t *link, uint64_t *version,
     fop->frame->root->uid = 0;
     fop->frame->root->gid = 0;
 
+    update_on = lock->good_mask | lock->healing;
+
     if (link->lock->fd == NULL) {
-            ec_xattrop(fop->frame, fop->xl, lock->good_mask, EC_MINIMUM_MIN,
+            ec_xattrop(fop->frame, fop->xl, update_on, EC_MINIMUM_MIN,
                        ec_update_size_version_done, link, &link->lock->loc,
                        GF_XATTROP_ADD_ARRAY64, dict, NULL);
     } else {
-            ec_fxattrop(fop->frame, fop->xl, lock->good_mask, EC_MINIMUM_MIN,
+            ec_fxattrop(fop->frame, fop->xl, update_on, EC_MINIMUM_MIN,
                        ec_update_size_version_done, link, link->lock->fd,
                        GF_XATTROP_ADD_ARRAY64, dict, NULL);
     }
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index 02e8123..ae60d2f 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -1908,8 +1908,8 @@ ec_rebuild_data (call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
         heal->fd = fd_ref (fd);
         heal->xl = ec->xl;
         heal->data = &barrier;
-        syncbarrier_init (heal->data);
         pool = ec->xl->ctx->iobuf_pool;
+        size = ec_adjust_size (ec, size, 0);
         heal->total_size = size;
         heal->size = iobpool_default_pagesize (pool);
         /* We need to adjust the size to a multiple of the stripe size of the
@@ -1947,13 +1947,15 @@ ec_rebuild_data (call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
 }
 
 int
-__ec_heal_trim_sinks (call_frame_t *frame, ec_t *ec, fd_t *fd,
-                      unsigned char *healed_sinks, unsigned char *trim)
+__ec_heal_trim_sinks (call_frame_t *frame, ec_t *ec,
+                      fd_t *fd, unsigned char *healed_sinks,
+                      unsigned char *trim, uint64_t size)
 {
         default_args_cbk_t *replies = NULL;
         unsigned char      *output  = NULL;
         int                ret      = 0;
         int                i        = 0;
+        off_t        trim_offset    = 0;
 
         EC_REPLIES_ALLOC (replies, ec->nodes);
         output       = alloca0 (ec->nodes);
@@ -1962,9 +1964,9 @@ __ec_heal_trim_sinks (call_frame_t *frame, ec_t *ec, fd_t *fd,
                 ret = 0;
                 goto out;
         }
-
+        trim_offset = ec_adjust_size (ec, size, 1);
         ret = cluster_ftruncate (ec->xl_list, trim, ec->nodes, replies, output,
-                                 frame, ec->xl, fd, 0, NULL);
+                                 frame, ec->xl, fd, trim_offset, NULL);
         for (i = 0; i < ec->nodes; i++) {
                 if (!output[i] && trim[i])
                         healed_sinks[i] = 0;
@@ -2018,7 +2020,7 @@ ec_data_undo_pending (call_frame_t *frame, ec_t *ec, fd_t *fd, dict_t *xattr,
 
         if ((memcmp (versions_xattr, allzero, sizeof (allzero)) == 0) &&
             (memcmp (dirty_xattr, allzero, sizeof (allzero)) == 0) &&
-             (size == 0)) {
+             (size_xattr == 0)) {
                 ret = 0;
                 goto out;
         }
@@ -2224,7 +2226,8 @@ __ec_heal_data (call_frame_t *frame, ec_t *ec, fd_t *fd, unsigned char *heal_on,
                 if (ret < 0)
                         goto unlock;
 
-                ret = __ec_heal_trim_sinks (frame, ec, fd, healed_sinks, trim);
+                ret = __ec_heal_trim_sinks (frame, ec, fd, healed_sinks,
+                                            trim, size[source]);
         }
 unlock:
         cluster_uninodelk (ec->xl_list, locked_on, ec->nodes, replies, output,
-- 
1.8.3.1