e3c68b
From 52d71ad0e5c27808e7d8eea8a0920298837e408c Mon Sep 17 00:00:00 2001
e3c68b
From: Xavi Hernandez <xhernandez@redhat.com>
e3c68b
Date: Wed, 17 Jul 2019 14:50:22 +0200
e3c68b
Subject: [PATCH 271/276] cluster/ec: fix EIO error for concurrent writes on
e3c68b
 sparse files
e3c68b
e3c68b
EC doesn't allow concurrent writes on overlapping areas, they are
e3c68b
serialized. However non-overlapping writes are serviced in parallel.
e3c68b
When a write is not aligned, EC first needs to read the entire chunk
e3c68b
from disk, apply the modified fragment and write it again.
e3c68b
e3c68b
The problem appears on sparse files because a write to an offset
e3c68b
implicitly creates data on offsets below it (so, in some way, they
e3c68b
are overlapping). For example, if a file is empty and we read 10 bytes
e3c68b
from offset 10, read() will return 0 bytes. Now, if we write one byte
e3c68b
at offset 1M and retry the same read, the system call will return 10
e3c68b
bytes (all containing 0's).
e3c68b
e3c68b
So if we have two writes, the first one at offset 10 and the second one
e3c68b
at offset 1M, EC will send both in parallel because they do not overlap.
e3c68b
However, the first one will try to read missing data from the first chunk
e3c68b
(i.e. offsets 0 to 9) to recombine the entire chunk and do the final write.
e3c68b
This read will happen in parallel with the write to 1M. What could happen
e3c68b
is that half of the bricks process the write before the read, and the
e3c68b
half do the read before the write. Some bricks will return 10 bytes of
e3c68b
data while the otherw will return 0 bytes (because the file on the brick
e3c68b
has not been expanded yet).
e3c68b
e3c68b
When EC tries to recombine the answers from the bricks, it can't, because
e3c68b
it needs more than half consistent answers to recover the data. So this
e3c68b
read fails with EIO error. This error is propagated to the parent write,
e3c68b
which is aborted and EIO is returned to the application.
e3c68b
e3c68b
The issue happened because EC assumed that a write to a given offset
e3c68b
implies that offsets below it exist.
e3c68b
e3c68b
This fix prevents the read of the chunk from bricks if the current size
e3c68b
of the file is smaller than the read chunk offset. This size is
e3c68b
correctly tracked, so this fixes the issue.
e3c68b
e3c68b
Also modifying ec-stripe.t file for Test #13 within it.
e3c68b
In this patch, if a file size is less than the offset we are writing, we
e3c68b
fill zeros in head and tail and do not consider it strip cache miss.
e3c68b
That actually make sense as we know what data that part holds and there is
e3c68b
no need of reading it from bricks.
e3c68b
e3c68b
Upstream-patch: https://review.gluster.org/c/glusterfs/+/23066
e3c68b
Change-Id: Ic342e8c35c555b8534109e9314c9a0710b6225d6
e3c68b
fixes: bz#1731448
e3c68b
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
e3c68b
Reviewed-on: https://code.engineering.redhat.com/gerrit/177975
e3c68b
Tested-by: RHGS Build Bot <nigelb@redhat.com>
e3c68b
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
e3c68b
---
e3c68b
 tests/basic/ec/ec-stripe.t              |  2 +-
e3c68b
 xlators/cluster/ec/src/ec-inode-write.c | 26 +++++++++++++++++---------
e3c68b
 2 files changed, 18 insertions(+), 10 deletions(-)
e3c68b
e3c68b
diff --git a/tests/basic/ec/ec-stripe.t b/tests/basic/ec/ec-stripe.t
e3c68b
index 1e940eb..98b9229 100644
e3c68b
--- a/tests/basic/ec/ec-stripe.t
e3c68b
+++ b/tests/basic/ec/ec-stripe.t
e3c68b
@@ -202,7 +202,7 @@ TEST truncate -s 0 $B0/test_file
e3c68b
 TEST truncate -s 0 $M0/test_file
e3c68b
 TEST dd if=$B0/misc_file of=$B0/test_file  bs=1022 count=5  oflag=seek_bytes,sync seek=400 conv=notrunc
e3c68b
 TEST dd if=$B0/misc_file of=$M0/test_file  bs=1022 count=5  oflag=seek_bytes,sync seek=400 conv=notrunc
e3c68b
-check_statedump_md5sum 4 5
e3c68b
+check_statedump_md5sum 4 4
e3c68b
 clean_file_unmount
e3c68b
 
e3c68b
 ### 14 - Truncate to invalidate  all but one the stripe in cache  ####
e3c68b
diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c
e3c68b
index ea55140..a45e6d6 100644
e3c68b
--- a/xlators/cluster/ec/src/ec-inode-write.c
e3c68b
+++ b/xlators/cluster/ec/src/ec-inode-write.c
e3c68b
@@ -2013,20 +2013,28 @@ ec_writev_start(ec_fop_data_t *fop)
e3c68b
     if (err != 0) {
e3c68b
         goto failed_fd;
e3c68b
     }
e3c68b
+    tail = fop->size - fop->user_size - fop->head;
e3c68b
     if (fop->head > 0) {
e3c68b
-        found_stripe = ec_get_and_merge_stripe(ec, fop, EC_STRIPE_HEAD);
e3c68b
-        if (!found_stripe) {
e3c68b
-            if (ec_make_internal_fop_xdata(&xdata)) {
e3c68b
-                err = -ENOMEM;
e3c68b
-                goto failed_xdata;
e3c68b
+        if (current > fop->offset) {
e3c68b
+            found_stripe = ec_get_and_merge_stripe(ec, fop, EC_STRIPE_HEAD);
e3c68b
+            if (!found_stripe) {
e3c68b
+                if (ec_make_internal_fop_xdata(&xdata)) {
e3c68b
+                    err = -ENOMEM;
e3c68b
+                    goto failed_xdata;
e3c68b
+                }
e3c68b
+                ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,
e3c68b
+                         ec_writev_merge_head, NULL, fd, ec->stripe_size,
e3c68b
+                         fop->offset, 0, xdata);
e3c68b
+            }
e3c68b
+        } else {
e3c68b
+            memset(fop->vector[0].iov_base, 0, fop->head);
e3c68b
+            memset(fop->vector[0].iov_base + fop->size - tail, 0, tail);
e3c68b
+            if (ec->stripe_cache && (fop->size <= ec->stripe_size)) {
e3c68b
+                ec_add_stripe_in_cache(ec, fop);
e3c68b
             }
e3c68b
-            ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,
e3c68b
-                     ec_writev_merge_head, NULL, fd, ec->stripe_size,
e3c68b
-                     fop->offset, 0, xdata);
e3c68b
         }
e3c68b
     }
e3c68b
 
e3c68b
-    tail = fop->size - fop->user_size - fop->head;
e3c68b
     if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) {
e3c68b
         /* Current locking scheme will make sure the 'current' below will
e3c68b
          * never decrease while the fop is in progress, so the checks will
e3c68b
-- 
e3c68b
1.8.3.1
e3c68b