|
|
d2787b |
From 2cb90b7798fa469f2d7d938ae88733eb1962d63d Mon Sep 17 00:00:00 2001
|
|
|
d2787b |
From: Xavi Hernandez <xhernandez@gmail.com>
|
|
|
d2787b |
Date: Fri, 9 Apr 2021 18:13:30 +0200
|
|
|
d2787b |
Subject: [PATCH 554/584] dht: fix rebalance of sparse files
|
|
|
d2787b |
|
|
|
d2787b |
Current implementation of rebalance for sparse files has a bug that,
|
|
|
d2787b |
in some cases, causes a read of 0 bytes from the source subvolume.
|
|
|
d2787b |
Posix xlator doesn't allow 0 byte reads and fails them with EINVAL,
|
|
|
d2787b |
which causes rebalance to abort the migration.
|
|
|
d2787b |
|
|
|
d2787b |
This patch implements a more robust way of finding data segments in
|
|
|
d2787b |
a sparse file that avoids 0 byte reads, allowing the file to be
|
|
|
d2787b |
migrated successfully.
|
|
|
d2787b |
|
|
|
d2787b |
Backport of:
|
|
|
d2787b |
> Upstream-patch: https://github.com/gluster/glusterfs/pull/2318
|
|
|
d2787b |
> Fixes: #2317
|
|
|
d2787b |
> Change-Id: Iff168dda2fb0f2edf716b21eb04cc2cc8ac3915c
|
|
|
d2787b |
> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
|
|
|
d2787b |
|
|
|
d2787b |
BUG: 1957641
|
|
|
d2787b |
Change-Id: Iff168dda2fb0f2edf716b21eb04cc2cc8ac3915c
|
|
|
d2787b |
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
|
|
|
d2787b |
Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244551
|
|
|
d2787b |
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
|
|
d2787b |
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
|
|
|
d2787b |
---
|
|
|
d2787b |
tests/bugs/distribute/issue-2317.t | 29 ++++++++
|
|
|
d2787b |
tests/volume.rc | 4 ++
|
|
|
d2787b |
xlators/cluster/dht/src/dht-rebalance.c | 116 +++++++++++++++++---------------
|
|
|
d2787b |
3 files changed, 93 insertions(+), 56 deletions(-)
|
|
|
d2787b |
create mode 100755 tests/bugs/distribute/issue-2317.t
|
|
|
d2787b |
|
|
|
d2787b |
diff --git a/tests/bugs/distribute/issue-2317.t b/tests/bugs/distribute/issue-2317.t
|
|
|
d2787b |
new file mode 100755
|
|
|
d2787b |
index 0000000..e29d003
|
|
|
d2787b |
--- /dev/null
|
|
|
d2787b |
+++ b/tests/bugs/distribute/issue-2317.t
|
|
|
d2787b |
@@ -0,0 +1,29 @@
|
|
|
d2787b |
+#!/bin/bash
|
|
|
d2787b |
+
|
|
|
d2787b |
+. $(dirname $0)/../../include.rc
|
|
|
d2787b |
+. $(dirname $0)/../../volume.rc
|
|
|
d2787b |
+
|
|
|
d2787b |
+TESTS_EXPECTED_IN_LOOP=126
|
|
|
d2787b |
+
|
|
|
d2787b |
+cleanup
|
|
|
d2787b |
+
|
|
|
d2787b |
+TEST glusterd
|
|
|
d2787b |
+TEST ${CLI} volume create ${V0} replica 3 ${H0}:/$B0/${V0}_{0..2}
|
|
|
d2787b |
+TEST ${CLI} volume start ${V0}
|
|
|
d2787b |
+
|
|
|
d2787b |
+TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0}
|
|
|
d2787b |
+
|
|
|
d2787b |
+# Create several files to make sure that at least some of them will be
|
|
|
d2787b |
+# migrated by rebalance.
|
|
|
d2787b |
+for i in {0..63}; do
|
|
|
d2787b |
+ TEST dd if=/dev/urandom of=${M0}/file.${i} bs=4k count=1
|
|
|
d2787b |
+ TEST dd if=/dev/urandom of=${M0}/file.${i} bs=4k count=1 seek=128
|
|
|
d2787b |
+done
|
|
|
d2787b |
+
|
|
|
d2787b |
+TEST ${CLI} volume add-brick ${V0} ${H0}:${B0}/${V0}_{3..5}
|
|
|
d2787b |
+TEST ${CLI} volume rebalance ${V0} start force
|
|
|
d2787b |
+EXPECT_WITHIN ${REBALANCE_TIMEOUT} "completed" rebalance_status_field "${V0}"
|
|
|
d2787b |
+
|
|
|
d2787b |
+EXPECT "^0$" rebalance_failed_field "${V0}"
|
|
|
d2787b |
+
|
|
|
d2787b |
+cleanup
|
|
|
d2787b |
diff --git a/tests/volume.rc b/tests/volume.rc
|
|
|
d2787b |
index 9a002d9..f5dd0b1 100644
|
|
|
d2787b |
--- a/tests/volume.rc
|
|
|
d2787b |
+++ b/tests/volume.rc
|
|
|
d2787b |
@@ -75,6 +75,10 @@ function rebalance_status_field {
|
|
|
d2787b |
$CLI volume rebalance $1 status | awk '{print $7}' | sed -n 3p
|
|
|
d2787b |
}
|
|
|
d2787b |
|
|
|
d2787b |
+function rebalance_failed_field {
|
|
|
d2787b |
+ $CLI volume rebalance $1 status | awk '{print $5}' | sed -n 3p
|
|
|
d2787b |
+}
|
|
|
d2787b |
+
|
|
|
d2787b |
function fix-layout_status_field {
|
|
|
d2787b |
#The fix-layout status can be up to 3 words, (ex:'fix-layout in progress'), hence the awk-print $2 thru $4.
|
|
|
d2787b |
#But if the status is less than 3 words, it also prints the next field i.e the run_time_in_secs.(ex:'completed 3.00').
|
|
|
d2787b |
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
|
|
|
d2787b |
index 072896d..eab7558 100644
|
|
|
d2787b |
--- a/xlators/cluster/dht/src/dht-rebalance.c
|
|
|
d2787b |
+++ b/xlators/cluster/dht/src/dht-rebalance.c
|
|
|
d2787b |
@@ -1024,6 +1024,46 @@ out:
|
|
|
d2787b |
return ret;
|
|
|
d2787b |
}
|
|
|
d2787b |
|
|
|
d2787b |
+static int32_t
|
|
|
d2787b |
+dht_rebalance_sparse_segment(xlator_t *subvol, fd_t *fd, off_t *offset,
|
|
|
d2787b |
+ size_t *size)
|
|
|
d2787b |
+{
|
|
|
d2787b |
+ off_t hole;
|
|
|
d2787b |
+ int32_t ret;
|
|
|
d2787b |
+
|
|
|
d2787b |
+ do {
|
|
|
d2787b |
+ ret = syncop_seek(subvol, fd, *offset, GF_SEEK_DATA, NULL, offset);
|
|
|
d2787b |
+ if (ret >= 0) {
|
|
|
d2787b |
+ /* Starting at the offset of the last data segment, find the
|
|
|
d2787b |
+ * next hole. After a data segment there should always be a
|
|
|
d2787b |
+ * hole, since EOF is considered a hole. */
|
|
|
d2787b |
+ ret = syncop_seek(subvol, fd, *offset, GF_SEEK_HOLE, NULL, &hole);
|
|
|
d2787b |
+ }
|
|
|
d2787b |
+
|
|
|
d2787b |
+ if (ret < 0) {
|
|
|
d2787b |
+ if (ret == -ENXIO) {
|
|
|
d2787b |
+ /* This can happen if there are no more data segments (i.e.
|
|
|
d2787b |
+ * the offset is at EOF), or there was a data segment but the
|
|
|
d2787b |
+ * file has been truncated to a smaller size between both
|
|
|
d2787b |
+ * seek requests. In both cases we are done. The file doesn't
|
|
|
d2787b |
+ * contain more data. */
|
|
|
d2787b |
+ ret = 0;
|
|
|
d2787b |
+ }
|
|
|
d2787b |
+ return ret;
|
|
|
d2787b |
+ }
|
|
|
d2787b |
+
|
|
|
d2787b |
+ /* It could happen that at the same offset we detected data in the
|
|
|
d2787b |
+ * first seek, the second seek finds a hole if the user is
|
|
|
d2787b |
+ * modifying the file concurrently. In this case we need to find a
|
|
|
d2787b |
+ * new data segment to migrate. */
|
|
|
d2787b |
+ } while (hole <= *offset);
|
|
|
d2787b |
+
|
|
|
d2787b |
+ /* Calculate the total size of the current data block */
|
|
|
d2787b |
+ *size = hole - *offset;
|
|
|
d2787b |
+
|
|
|
d2787b |
+ return 1;
|
|
|
d2787b |
+}
|
|
|
d2787b |
+
|
|
|
d2787b |
static int
|
|
|
d2787b |
__dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
|
|
|
d2787b |
xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst,
|
|
|
d2787b |
@@ -1032,8 +1072,6 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
|
|
|
d2787b |
int ret = 0;
|
|
|
d2787b |
int count = 0;
|
|
|
d2787b |
off_t offset = 0;
|
|
|
d2787b |
- off_t data_offset = 0;
|
|
|
d2787b |
- off_t hole_offset = 0;
|
|
|
d2787b |
struct iovec *vector = NULL;
|
|
|
d2787b |
struct iobref *iobref = NULL;
|
|
|
d2787b |
uint64_t total = 0;
|
|
|
d2787b |
@@ -1048,71 +1086,36 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
|
|
|
d2787b |
while (total < ia_size) {
|
|
|
d2787b |
/* This is a regular file - read it sequentially */
|
|
|
d2787b |
if (!hole_exists) {
|
|
|
d2787b |
- read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
|
|
|
d2787b |
- ? DHT_REBALANCE_BLKSIZE
|
|
|
d2787b |
- : (ia_size - total));
|
|
|
d2787b |
+ data_block_size = ia_size - total;
|
|
|
d2787b |
} else {
|
|
|
d2787b |
/* This is a sparse file - read only the data segments in the file
|
|
|
d2787b |
*/
|
|
|
d2787b |
|
|
|
d2787b |
/* If the previous data block is fully copied, find the next data
|
|
|
d2787b |
- * segment
|
|
|
d2787b |
- * starting at the offset of the last read and written byte, */
|
|
|
d2787b |
+ * segment starting at the offset of the last read and written
|
|
|
d2787b |
+ * byte. */
|
|
|
d2787b |
if (data_block_size <= 0) {
|
|
|
d2787b |
- ret = syncop_seek(from, src, offset, GF_SEEK_DATA, NULL,
|
|
|
d2787b |
- &data_offset);
|
|
|
d2787b |
- if (ret) {
|
|
|
d2787b |
- if (ret == -ENXIO)
|
|
|
d2787b |
- ret = 0; /* No more data segments */
|
|
|
d2787b |
- else
|
|
|
d2787b |
- *fop_errno = -ret; /* Error occurred */
|
|
|
d2787b |
-
|
|
|
d2787b |
+ ret = dht_rebalance_sparse_segment(from, src, &offset,
|
|
|
d2787b |
+ &data_block_size);
|
|
|
d2787b |
+ if (ret <= 0) {
|
|
|
d2787b |
+ *fop_errno = -ret;
|
|
|
d2787b |
break;
|
|
|
d2787b |
}
|
|
|
d2787b |
-
|
|
|
d2787b |
- /* If the position of the current data segment is greater than
|
|
|
d2787b |
- * the position of the next hole, find the next hole in order to
|
|
|
d2787b |
- * calculate the length of the new data segment */
|
|
|
d2787b |
- if (data_offset > hole_offset) {
|
|
|
d2787b |
- /* Starting at the offset of the last data segment, find the
|
|
|
d2787b |
- * next hole */
|
|
|
d2787b |
- ret = syncop_seek(from, src, data_offset, GF_SEEK_HOLE,
|
|
|
d2787b |
- NULL, &hole_offset);
|
|
|
d2787b |
- if (ret) {
|
|
|
d2787b |
- /* If an error occurred here it's a real error because
|
|
|
d2787b |
- * if the seek for a data segment was successful then
|
|
|
d2787b |
- * necessarily another hole must exist (EOF is a hole)
|
|
|
d2787b |
- */
|
|
|
d2787b |
- *fop_errno = -ret;
|
|
|
d2787b |
- break;
|
|
|
d2787b |
- }
|
|
|
d2787b |
-
|
|
|
d2787b |
- /* Calculate the total size of the current data block */
|
|
|
d2787b |
- data_block_size = hole_offset - data_offset;
|
|
|
d2787b |
- }
|
|
|
d2787b |
- } else {
|
|
|
d2787b |
- /* There is still data in the current segment, move the
|
|
|
d2787b |
- * data_offset to the position of the last written byte */
|
|
|
d2787b |
- data_offset = offset;
|
|
|
d2787b |
}
|
|
|
d2787b |
-
|
|
|
d2787b |
- /* Calculate how much data needs to be read and written. If the data
|
|
|
d2787b |
- * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and
|
|
|
d2787b |
- * write DHT_REBALANCE_BLKSIZE data length and the rest in the
|
|
|
d2787b |
- * next iteration(s) */
|
|
|
d2787b |
- read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE)
|
|
|
d2787b |
- ? DHT_REBALANCE_BLKSIZE
|
|
|
d2787b |
- : data_block_size);
|
|
|
d2787b |
-
|
|
|
d2787b |
- /* Calculate the remaining size of the data block - maybe there's no
|
|
|
d2787b |
- * need to seek for data in the next iteration */
|
|
|
d2787b |
- data_block_size -= read_size;
|
|
|
d2787b |
-
|
|
|
d2787b |
- /* Set offset to the offset of the data segment so read and write
|
|
|
d2787b |
- * will have the correct position */
|
|
|
d2787b |
- offset = data_offset;
|
|
|
d2787b |
}
|
|
|
d2787b |
|
|
|
d2787b |
+ /* Calculate how much data needs to be read and written. If the data
|
|
|
d2787b |
+ * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and
|
|
|
d2787b |
+ * write DHT_REBALANCE_BLKSIZE data length and the rest in the
|
|
|
d2787b |
+ * next iteration(s) */
|
|
|
d2787b |
+ read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE)
|
|
|
d2787b |
+ ? DHT_REBALANCE_BLKSIZE
|
|
|
d2787b |
+ : data_block_size);
|
|
|
d2787b |
+
|
|
|
d2787b |
+ /* Calculate the remaining size of the data block - maybe there's no
|
|
|
d2787b |
+ * need to seek for data in the next iteration */
|
|
|
d2787b |
+ data_block_size -= read_size;
|
|
|
d2787b |
+
|
|
|
d2787b |
ret = syncop_readv(from, src, read_size, offset, 0, &vector, &count,
|
|
|
d2787b |
&iobref, NULL, NULL, NULL);
|
|
|
d2787b |
|
|
|
d2787b |
@@ -1177,6 +1180,7 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
|
|
|
d2787b |
iobref = NULL;
|
|
|
d2787b |
vector = NULL;
|
|
|
d2787b |
}
|
|
|
d2787b |
+
|
|
|
d2787b |
if (iobref)
|
|
|
d2787b |
iobref_unref(iobref);
|
|
|
d2787b |
GF_FREE(vector);
|
|
|
d2787b |
--
|
|
|
d2787b |
1.8.3.1
|
|
|
d2787b |
|