f338ef
From 52d71ad0e5c27808e7d8eea8a0920298837e408c Mon Sep 17 00:00:00 2001
f338ef
From: Xavi Hernandez <xhernandez@redhat.com>
f338ef
Date: Wed, 17 Jul 2019 14:50:22 +0200
f338ef
Subject: [PATCH 271/276] cluster/ec: fix EIO error for concurrent writes on
f338ef
 sparse files
f338ef
f338ef
EC doesn't allow concurrent writes on overlapping areas, they are
f338ef
serialized. However non-overlapping writes are serviced in parallel.
f338ef
When a write is not aligned, EC first needs to read the entire chunk
f338ef
from disk, apply the modified fragment and write it again.
f338ef
f338ef
The problem appears on sparse files because a write to an offset
f338ef
implicitly creates data on offsets below it (so, in some way, they
f338ef
are overlapping). For example, if a file is empty and we read 10 bytes
f338ef
from offset 10, read() will return 0 bytes. Now, if we write one byte
f338ef
at offset 1M and retry the same read, the system call will return 10
f338ef
bytes (all containing 0's).
f338ef
f338ef
So if we have two writes, the first one at offset 10 and the second one
f338ef
at offset 1M, EC will send both in parallel because they do not overlap.
f338ef
However, the first one will try to read missing data from the first chunk
f338ef
(i.e. offsets 0 to 9) to recombine the entire chunk and do the final write.
f338ef
This read will happen in parallel with the write to 1M. What could happen
f338ef
is that half of the bricks process the write before the read, and the
f338ef
half do the read before the write. Some bricks will return 10 bytes of
f338ef
data while the otherw will return 0 bytes (because the file on the brick
f338ef
has not been expanded yet).
f338ef
f338ef
When EC tries to recombine the answers from the bricks, it can't, because
f338ef
it needs more than half consistent answers to recover the data. So this
f338ef
read fails with EIO error. This error is propagated to the parent write,
f338ef
which is aborted and EIO is returned to the application.
f338ef
f338ef
The issue happened because EC assumed that a write to a given offset
f338ef
implies that offsets below it exist.
f338ef
f338ef
This fix prevents the read of the chunk from bricks if the current size
f338ef
of the file is smaller than the read chunk offset. This size is
f338ef
correctly tracked, so this fixes the issue.
f338ef
f338ef
Also modifying ec-stripe.t file for Test #13 within it.
f338ef
In this patch, if a file size is less than the offset we are writing, we
f338ef
fill zeros in head and tail and do not consider it strip cache miss.
f338ef
That actually make sense as we know what data that part holds and there is
f338ef
no need of reading it from bricks.
f338ef
f338ef
Upstream-patch: https://review.gluster.org/c/glusterfs/+/23066
f338ef
Change-Id: Ic342e8c35c555b8534109e9314c9a0710b6225d6
f338ef
fixes: bz#1731448
f338ef
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
f338ef
Reviewed-on: https://code.engineering.redhat.com/gerrit/177975
f338ef
Tested-by: RHGS Build Bot <nigelb@redhat.com>
f338ef
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
f338ef
---
f338ef
 tests/basic/ec/ec-stripe.t              |  2 +-
f338ef
 xlators/cluster/ec/src/ec-inode-write.c | 26 +++++++++++++++++---------
f338ef
 2 files changed, 18 insertions(+), 10 deletions(-)
f338ef
f338ef
diff --git a/tests/basic/ec/ec-stripe.t b/tests/basic/ec/ec-stripe.t
f338ef
index 1e940eb..98b9229 100644
f338ef
--- a/tests/basic/ec/ec-stripe.t
f338ef
+++ b/tests/basic/ec/ec-stripe.t
f338ef
@@ -202,7 +202,7 @@ TEST truncate -s 0 $B0/test_file
f338ef
 TEST truncate -s 0 $M0/test_file
f338ef
 TEST dd if=$B0/misc_file of=$B0/test_file  bs=1022 count=5  oflag=seek_bytes,sync seek=400 conv=notrunc
f338ef
 TEST dd if=$B0/misc_file of=$M0/test_file  bs=1022 count=5  oflag=seek_bytes,sync seek=400 conv=notrunc
f338ef
-check_statedump_md5sum 4 5
f338ef
+check_statedump_md5sum 4 4
f338ef
 clean_file_unmount
f338ef
 
f338ef
 ### 14 - Truncate to invalidate  all but one the stripe in cache  ####
f338ef
diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c
f338ef
index ea55140..a45e6d6 100644
f338ef
--- a/xlators/cluster/ec/src/ec-inode-write.c
f338ef
+++ b/xlators/cluster/ec/src/ec-inode-write.c
f338ef
@@ -2013,20 +2013,28 @@ ec_writev_start(ec_fop_data_t *fop)
f338ef
     if (err != 0) {
f338ef
         goto failed_fd;
f338ef
     }
f338ef
+    tail = fop->size - fop->user_size - fop->head;
f338ef
     if (fop->head > 0) {
f338ef
-        found_stripe = ec_get_and_merge_stripe(ec, fop, EC_STRIPE_HEAD);
f338ef
-        if (!found_stripe) {
f338ef
-            if (ec_make_internal_fop_xdata(&xdata)) {
f338ef
-                err = -ENOMEM;
f338ef
-                goto failed_xdata;
f338ef
+        if (current > fop->offset) {
f338ef
+            found_stripe = ec_get_and_merge_stripe(ec, fop, EC_STRIPE_HEAD);
f338ef
+            if (!found_stripe) {
f338ef
+                if (ec_make_internal_fop_xdata(&xdata)) {
f338ef
+                    err = -ENOMEM;
f338ef
+                    goto failed_xdata;
f338ef
+                }
f338ef
+                ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,
f338ef
+                         ec_writev_merge_head, NULL, fd, ec->stripe_size,
f338ef
+                         fop->offset, 0, xdata);
f338ef
+            }
f338ef
+        } else {
f338ef
+            memset(fop->vector[0].iov_base, 0, fop->head);
f338ef
+            memset(fop->vector[0].iov_base + fop->size - tail, 0, tail);
f338ef
+            if (ec->stripe_cache && (fop->size <= ec->stripe_size)) {
f338ef
+                ec_add_stripe_in_cache(ec, fop);
f338ef
             }
f338ef
-            ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,
f338ef
-                     ec_writev_merge_head, NULL, fd, ec->stripe_size,
f338ef
-                     fop->offset, 0, xdata);
f338ef
         }
f338ef
     }
f338ef
 
f338ef
-    tail = fop->size - fop->user_size - fop->head;
f338ef
     if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) {
f338ef
         /* Current locking scheme will make sure the 'current' below will
f338ef
          * never decrease while the fop is in progress, so the checks will
f338ef
-- 
f338ef
1.8.3.1
f338ef