14f8ab
From 52d71ad0e5c27808e7d8eea8a0920298837e408c Mon Sep 17 00:00:00 2001
14f8ab
From: Xavi Hernandez <xhernandez@redhat.com>
14f8ab
Date: Wed, 17 Jul 2019 14:50:22 +0200
14f8ab
Subject: [PATCH 271/276] cluster/ec: fix EIO error for concurrent writes on
14f8ab
 sparse files
14f8ab
14f8ab
EC doesn't allow concurrent writes on overlapping areas, they are
14f8ab
serialized. However non-overlapping writes are serviced in parallel.
14f8ab
When a write is not aligned, EC first needs to read the entire chunk
14f8ab
from disk, apply the modified fragment and write it again.
14f8ab
14f8ab
The problem appears on sparse files because a write to an offset
14f8ab
implicitly creates data on offsets below it (so, in some way, they
14f8ab
are overlapping). For example, if a file is empty and we read 10 bytes
14f8ab
from offset 10, read() will return 0 bytes. Now, if we write one byte
14f8ab
at offset 1M and retry the same read, the system call will return 10
14f8ab
bytes (all containing 0's).
14f8ab
14f8ab
So if we have two writes, the first one at offset 10 and the second one
14f8ab
at offset 1M, EC will send both in parallel because they do not overlap.
14f8ab
However, the first one will try to read missing data from the first chunk
14f8ab
(i.e. offsets 0 to 9) to recombine the entire chunk and do the final write.
14f8ab
This read will happen in parallel with the write to 1M. What could happen
14f8ab
is that half of the bricks process the write before the read, and the
14f8ab
half do the read before the write. Some bricks will return 10 bytes of
14f8ab
data while the otherw will return 0 bytes (because the file on the brick
14f8ab
has not been expanded yet).
14f8ab
14f8ab
When EC tries to recombine the answers from the bricks, it can't, because
14f8ab
it needs more than half consistent answers to recover the data. So this
14f8ab
read fails with EIO error. This error is propagated to the parent write,
14f8ab
which is aborted and EIO is returned to the application.
14f8ab
14f8ab
The issue happened because EC assumed that a write to a given offset
14f8ab
implies that offsets below it exist.
14f8ab
14f8ab
This fix prevents the read of the chunk from bricks if the current size
14f8ab
of the file is smaller than the read chunk offset. This size is
14f8ab
correctly tracked, so this fixes the issue.
14f8ab
14f8ab
Also modifying ec-stripe.t file for Test #13 within it.
14f8ab
In this patch, if a file size is less than the offset we are writing, we
14f8ab
fill zeros in head and tail and do not consider it strip cache miss.
14f8ab
That actually make sense as we know what data that part holds and there is
14f8ab
no need of reading it from bricks.
14f8ab
14f8ab
Upstream-patch: https://review.gluster.org/c/glusterfs/+/23066
14f8ab
Change-Id: Ic342e8c35c555b8534109e9314c9a0710b6225d6
14f8ab
fixes: bz#1731448
14f8ab
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
14f8ab
Reviewed-on: https://code.engineering.redhat.com/gerrit/177975
14f8ab
Tested-by: RHGS Build Bot <nigelb@redhat.com>
14f8ab
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
14f8ab
---
14f8ab
 tests/basic/ec/ec-stripe.t              |  2 +-
14f8ab
 xlators/cluster/ec/src/ec-inode-write.c | 26 +++++++++++++++++---------
14f8ab
 2 files changed, 18 insertions(+), 10 deletions(-)
14f8ab
14f8ab
diff --git a/tests/basic/ec/ec-stripe.t b/tests/basic/ec/ec-stripe.t
14f8ab
index 1e940eb..98b9229 100644
14f8ab
--- a/tests/basic/ec/ec-stripe.t
14f8ab
+++ b/tests/basic/ec/ec-stripe.t
14f8ab
@@ -202,7 +202,7 @@ TEST truncate -s 0 $B0/test_file
14f8ab
 TEST truncate -s 0 $M0/test_file
14f8ab
 TEST dd if=$B0/misc_file of=$B0/test_file  bs=1022 count=5  oflag=seek_bytes,sync seek=400 conv=notrunc
14f8ab
 TEST dd if=$B0/misc_file of=$M0/test_file  bs=1022 count=5  oflag=seek_bytes,sync seek=400 conv=notrunc
14f8ab
-check_statedump_md5sum 4 5
14f8ab
+check_statedump_md5sum 4 4
14f8ab
 clean_file_unmount
14f8ab
 
14f8ab
 ### 14 - Truncate to invalidate  all but one the stripe in cache  ####
14f8ab
diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c
14f8ab
index ea55140..a45e6d6 100644
14f8ab
--- a/xlators/cluster/ec/src/ec-inode-write.c
14f8ab
+++ b/xlators/cluster/ec/src/ec-inode-write.c
14f8ab
@@ -2013,20 +2013,28 @@ ec_writev_start(ec_fop_data_t *fop)
14f8ab
     if (err != 0) {
14f8ab
         goto failed_fd;
14f8ab
     }
14f8ab
+    tail = fop->size - fop->user_size - fop->head;
14f8ab
     if (fop->head > 0) {
14f8ab
-        found_stripe = ec_get_and_merge_stripe(ec, fop, EC_STRIPE_HEAD);
14f8ab
-        if (!found_stripe) {
14f8ab
-            if (ec_make_internal_fop_xdata(&xdata)) {
14f8ab
-                err = -ENOMEM;
14f8ab
-                goto failed_xdata;
14f8ab
+        if (current > fop->offset) {
14f8ab
+            found_stripe = ec_get_and_merge_stripe(ec, fop, EC_STRIPE_HEAD);
14f8ab
+            if (!found_stripe) {
14f8ab
+                if (ec_make_internal_fop_xdata(&xdata)) {
14f8ab
+                    err = -ENOMEM;
14f8ab
+                    goto failed_xdata;
14f8ab
+                }
14f8ab
+                ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,
14f8ab
+                         ec_writev_merge_head, NULL, fd, ec->stripe_size,
14f8ab
+                         fop->offset, 0, xdata);
14f8ab
+            }
14f8ab
+        } else {
14f8ab
+            memset(fop->vector[0].iov_base, 0, fop->head);
14f8ab
+            memset(fop->vector[0].iov_base + fop->size - tail, 0, tail);
14f8ab
+            if (ec->stripe_cache && (fop->size <= ec->stripe_size)) {
14f8ab
+                ec_add_stripe_in_cache(ec, fop);
14f8ab
             }
14f8ab
-            ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,
14f8ab
-                     ec_writev_merge_head, NULL, fd, ec->stripe_size,
14f8ab
-                     fop->offset, 0, xdata);
14f8ab
         }
14f8ab
     }
14f8ab
 
14f8ab
-    tail = fop->size - fop->user_size - fop->head;
14f8ab
     if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) {
14f8ab
         /* Current locking scheme will make sure the 'current' below will
14f8ab
          * never decrease while the fop is in progress, so the checks will
14f8ab
-- 
14f8ab
1.8.3.1
14f8ab