|
|
190130 |
From 7b2f1bd4e5a57ea3abd5f14a7d81b120735faecd Mon Sep 17 00:00:00 2001
|
|
|
190130 |
From: Barak Sason Rofman <bsasonro@redhat.com>
|
|
|
190130 |
Date: Wed, 6 May 2020 13:28:40 +0300
|
|
|
190130 |
Subject: [PATCH 438/449] dht - sparse files rebalance enhancements
|
|
|
190130 |
|
|
|
190130 |
Currently data migration in rebalance reads sparse files sequentially,
|
|
|
190130 |
disregarding which segments are holes and which are data. This can lead
|
|
|
190130 |
to extremely long migration times for large sparse files.
|
|
|
190130 |
The data migration mechanism needs to be enhanced so only data segments are
|
|
|
190130 |
read and migrated. This can be achieved using lseek to seek for holes
|
|
|
190130 |
and data in the file.
|
|
|
190130 |
This enhancement is a consequence of
|
|
|
190130 |
https://bugzilla.redhat.com/show_bug.cgi?id=1823703
|
|
|
190130 |
|
|
|
190130 |
> fixes: #1222
|
|
|
190130 |
> Change-Id: If5f448a0c532926464e1f34f504c5c94749b08c3
|
|
|
190130 |
> Signed-off-by: Barak Sason Rofman <bsasonro@redhat.com>
|
|
|
190130 |
> (Cherry pick from commit 7b7559733ca0c25c63f9d56cb7f4650dbd694c40)
|
|
|
190130 |
> (Reviewed on upstream link https://review.gluster.org/#/c/glusterfs/+/24409/)
|
|
|
190130 |
|
|
|
190130 |
BUG: 1836099
|
|
|
190130 |
Change-Id: If5f448a0c532926464e1f34f504c5c94749b08c3
|
|
|
190130 |
Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
|
|
|
190130 |
Reviewed-on: https://code.engineering.redhat.com/gerrit/202647
|
|
|
190130 |
Reviewed-by: Barak Sason Rofman <bsasonro@redhat.com>
|
|
|
190130 |
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
|
|
|
190130 |
---
|
|
|
190130 |
tests/basic/distribute/spare_file_rebalance.t | 51 ++++++++
|
|
|
190130 |
xlators/cluster/dht/src/dht-rebalance.c | 172 ++++++++++++--------------
|
|
|
190130 |
2 files changed, 130 insertions(+), 93 deletions(-)
|
|
|
190130 |
create mode 100644 tests/basic/distribute/spare_file_rebalance.t
|
|
|
190130 |
|
|
|
190130 |
diff --git a/tests/basic/distribute/spare_file_rebalance.t b/tests/basic/distribute/spare_file_rebalance.t
|
|
|
190130 |
new file mode 100644
|
|
|
190130 |
index 0000000..061c02f
|
|
|
190130 |
--- /dev/null
|
|
|
190130 |
+++ b/tests/basic/distribute/spare_file_rebalance.t
|
|
|
190130 |
@@ -0,0 +1,51 @@
|
|
|
190130 |
+#!/bin/bash
|
|
|
190130 |
+
|
|
|
190130 |
+. $(dirname $0)/../../include.rc
|
|
|
190130 |
+. $(dirname $0)/../../volume.rc
|
|
|
190130 |
+. $(dirname $0)/../../dht.rc
|
|
|
190130 |
+
|
|
|
190130 |
+# Initialize
|
|
|
190130 |
+#------------------------------------------------------------
|
|
|
190130 |
+cleanup;
|
|
|
190130 |
+
|
|
|
190130 |
+# Start glusterd
|
|
|
190130 |
+TEST glusterd;
|
|
|
190130 |
+TEST pidof glusterd;
|
|
|
190130 |
+TEST $CLI volume info;
|
|
|
190130 |
+
|
|
|
190130 |
+# Create a volume
|
|
|
190130 |
+TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2};
|
|
|
190130 |
+
|
|
|
190130 |
+# Verify volume creation
|
|
|
190130 |
+EXPECT "$V0" volinfo_field $V0 'Volume Name';
|
|
|
190130 |
+EXPECT 'Created' volinfo_field $V0 'Status';
|
|
|
190130 |
+
|
|
|
190130 |
+# Start volume and verify successful start
|
|
|
190130 |
+TEST $CLI volume start $V0;
|
|
|
190130 |
+EXPECT 'Started' volinfo_field $V0 'Status';
|
|
|
190130 |
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0;
|
|
|
190130 |
+
|
|
|
190130 |
+#------------------------------------------------------------
|
|
|
190130 |
+
|
|
|
190130 |
+# Test case - Create sparse files on the mount point and verify
|
|
|
190130 |
+# file info after rebalance
|
|
|
190130 |
+#------------------------------------------------------------
|
|
|
190130 |
+
|
|
|
190130 |
+# Create some sparse files and get their size
|
|
|
190130 |
+TEST cd $M0;
|
|
|
190130 |
+dd if=/dev/urandom of=sparse_file bs=10k count=1 seek=2M
|
|
|
190130 |
+cp --sparse=always sparse_file sparse_file_3;
|
|
|
190130 |
+
|
|
|
190130 |
+# Add a 3rd brick
|
|
|
190130 |
+TEST $CLI volume add-brick $V0 $H0:$B0/${V0}3;
|
|
|
190130 |
+
|
|
|
190130 |
+# Trigger rebalance
|
|
|
190130 |
+TEST $CLI volume rebalance $V0 start force;
|
|
|
190130 |
+EXPECT_WITHIN $REBALANCE_TIMEOUT "0" rebalance_completed;
|
|
|
190130 |
+
|
|
|
190130 |
+# Compare original and rebalanced files
|
|
|
190130 |
+TEST cd $B0/${V0}2
|
|
|
190130 |
+TEST cmp sparse_file $B0/${V0}3/sparse_file_3
|
|
|
190130 |
+EXPECT_WITHIN 30 "";
|
|
|
190130 |
+
|
|
|
190130 |
+cleanup;
|
|
|
190130 |
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
|
|
|
190130 |
index 88b6b54..d0c21b4 100644
|
|
|
190130 |
--- a/xlators/cluster/dht/src/dht-rebalance.c
|
|
|
190130 |
+++ b/xlators/cluster/dht/src/dht-rebalance.c
|
|
|
190130 |
@@ -18,8 +18,8 @@
|
|
|
190130 |
#include <glusterfs/events.h>
|
|
|
190130 |
|
|
|
190130 |
#define GF_DISK_SECTOR_SIZE 512
|
|
|
190130 |
-#define DHT_REBALANCE_PID 4242 /* Change it if required */
|
|
|
190130 |
-#define DHT_REBALANCE_BLKSIZE (1024 * 1024) /* 1 MB */
|
|
|
190130 |
+#define DHT_REBALANCE_PID 4242 /* Change it if required */
|
|
|
190130 |
+#define DHT_REBALANCE_BLKSIZE 1048576 /* 1 MB */
|
|
|
190130 |
#define MAX_MIGRATE_QUEUE_COUNT 500
|
|
|
190130 |
#define MIN_MIGRATE_QUEUE_COUNT 200
|
|
|
190130 |
#define MAX_REBAL_TYPE_SIZE 16
|
|
|
190130 |
@@ -178,75 +178,6 @@ dht_strip_out_acls(dict_t *dict)
|
|
|
190130 |
}
|
|
|
190130 |
}
|
|
|
190130 |
|
|
|
190130 |
-static int
|
|
|
190130 |
-dht_write_with_holes(xlator_t *to, fd_t *fd, struct iovec *vec, int count,
|
|
|
190130 |
- int32_t size, off_t offset, struct iobref *iobref,
|
|
|
190130 |
- int *fop_errno)
|
|
|
190130 |
-{
|
|
|
190130 |
- int i = 0;
|
|
|
190130 |
- int ret = -1;
|
|
|
190130 |
- int start_idx = 0;
|
|
|
190130 |
- int tmp_offset = 0;
|
|
|
190130 |
- int write_needed = 0;
|
|
|
190130 |
- int buf_len = 0;
|
|
|
190130 |
- int size_pending = 0;
|
|
|
190130 |
- char *buf = NULL;
|
|
|
190130 |
-
|
|
|
190130 |
- /* loop through each vector */
|
|
|
190130 |
- for (i = 0; i < count; i++) {
|
|
|
190130 |
- buf = vec[i].iov_base;
|
|
|
190130 |
- buf_len = vec[i].iov_len;
|
|
|
190130 |
-
|
|
|
190130 |
- for (start_idx = 0; (start_idx + GF_DISK_SECTOR_SIZE) <= buf_len;
|
|
|
190130 |
- start_idx += GF_DISK_SECTOR_SIZE) {
|
|
|
190130 |
- if (mem_0filled(buf + start_idx, GF_DISK_SECTOR_SIZE) != 0) {
|
|
|
190130 |
- write_needed = 1;
|
|
|
190130 |
- continue;
|
|
|
190130 |
- }
|
|
|
190130 |
-
|
|
|
190130 |
- if (write_needed) {
|
|
|
190130 |
- ret = syncop_write(
|
|
|
190130 |
- to, fd, (buf + tmp_offset), (start_idx - tmp_offset),
|
|
|
190130 |
- (offset + tmp_offset), iobref, 0, NULL, NULL);
|
|
|
190130 |
- /* 'path' will be logged in calling function */
|
|
|
190130 |
- if (ret < 0) {
|
|
|
190130 |
- gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)",
|
|
|
190130 |
- strerror(-ret));
|
|
|
190130 |
- *fop_errno = -ret;
|
|
|
190130 |
- ret = -1;
|
|
|
190130 |
- goto out;
|
|
|
190130 |
- }
|
|
|
190130 |
-
|
|
|
190130 |
- write_needed = 0;
|
|
|
190130 |
- }
|
|
|
190130 |
- tmp_offset = start_idx + GF_DISK_SECTOR_SIZE;
|
|
|
190130 |
- }
|
|
|
190130 |
-
|
|
|
190130 |
- if ((start_idx < buf_len) || write_needed) {
|
|
|
190130 |
- /* This means, last chunk is not yet written.. write it */
|
|
|
190130 |
- ret = syncop_write(to, fd, (buf + tmp_offset),
|
|
|
190130 |
- (buf_len - tmp_offset), (offset + tmp_offset),
|
|
|
190130 |
- iobref, 0, NULL, NULL);
|
|
|
190130 |
- if (ret < 0) {
|
|
|
190130 |
- /* 'path' will be logged in calling function */
|
|
|
190130 |
- gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)",
|
|
|
190130 |
- strerror(-ret));
|
|
|
190130 |
- *fop_errno = -ret;
|
|
|
190130 |
- ret = -1;
|
|
|
190130 |
- goto out;
|
|
|
190130 |
- }
|
|
|
190130 |
- }
|
|
|
190130 |
-
|
|
|
190130 |
- size_pending = (size - buf_len);
|
|
|
190130 |
- if (!size_pending)
|
|
|
190130 |
- break;
|
|
|
190130 |
- }
|
|
|
190130 |
-
|
|
|
190130 |
- ret = size;
|
|
|
190130 |
-out:
|
|
|
190130 |
- return ret;
|
|
|
190130 |
-}
|
|
|
190130 |
-
|
|
|
190130 |
/*
|
|
|
190130 |
return values:
|
|
|
190130 |
-1 : failure
|
|
|
190130 |
@@ -1101,32 +1032,97 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
|
|
|
190130 |
int ret = 0;
|
|
|
190130 |
int count = 0;
|
|
|
190130 |
off_t offset = 0;
|
|
|
190130 |
+ off_t data_offset = 0;
|
|
|
190130 |
+ off_t hole_offset = 0;
|
|
|
190130 |
struct iovec *vector = NULL;
|
|
|
190130 |
struct iobref *iobref = NULL;
|
|
|
190130 |
uint64_t total = 0;
|
|
|
190130 |
size_t read_size = 0;
|
|
|
190130 |
+ size_t data_block_size = 0;
|
|
|
190130 |
dict_t *xdata = NULL;
|
|
|
190130 |
dht_conf_t *conf = NULL;
|
|
|
190130 |
|
|
|
190130 |
conf = this->private;
|
|
|
190130 |
+
|
|
|
190130 |
/* if file size is '0', no need to enter this loop */
|
|
|
190130 |
while (total < ia_size) {
|
|
|
190130 |
- read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
|
|
|
190130 |
- ? DHT_REBALANCE_BLKSIZE
|
|
|
190130 |
- : (ia_size - total));
|
|
|
190130 |
+ /* This is a regular file - read it sequentially */
|
|
|
190130 |
+ if (!hole_exists) {
|
|
|
190130 |
+ read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
|
|
|
190130 |
+ ? DHT_REBALANCE_BLKSIZE
|
|
|
190130 |
+ : (ia_size - total));
|
|
|
190130 |
+ } else {
|
|
|
190130 |
+ /* This is a sparse file - read only the data segments in the file
|
|
|
190130 |
+ */
|
|
|
190130 |
+
|
|
|
190130 |
+ /* If the previous data block is fully copied, find the next data
|
|
|
190130 |
+ * segment
|
|
|
190130 |
+             * starting at the offset of the last read and written byte. */
|
|
|
190130 |
+ if (data_block_size <= 0) {
|
|
|
190130 |
+ ret = syncop_seek(from, src, offset, GF_SEEK_DATA, NULL,
|
|
|
190130 |
+ &data_offset);
|
|
|
190130 |
+ if (ret) {
|
|
|
190130 |
+ if (ret == -ENXIO)
|
|
|
190130 |
+ ret = 0; /* No more data segments */
|
|
|
190130 |
+ else
|
|
|
190130 |
+ *fop_errno = -ret; /* Error occurred */
|
|
|
190130 |
+
|
|
|
190130 |
+ break;
|
|
|
190130 |
+ }
|
|
|
190130 |
+
|
|
|
190130 |
+ /* If the position of the current data segment is greater than
|
|
|
190130 |
+ * the position of the next hole, find the next hole in order to
|
|
|
190130 |
+ * calculate the length of the new data segment */
|
|
|
190130 |
+ if (data_offset > hole_offset) {
|
|
|
190130 |
+ /* Starting at the offset of the last data segment, find the
|
|
|
190130 |
+ * next hole */
|
|
|
190130 |
+ ret = syncop_seek(from, src, data_offset, GF_SEEK_HOLE,
|
|
|
190130 |
+ NULL, &hole_offset);
|
|
|
190130 |
+ if (ret) {
|
|
|
190130 |
+ /* If an error occurred here it's a real error because
|
|
|
190130 |
+ * if the seek for a data segment was successful then
|
|
|
190130 |
+ * necessarily another hole must exist (EOF is a hole)
|
|
|
190130 |
+ */
|
|
|
190130 |
+ *fop_errno = -ret;
|
|
|
190130 |
+ break;
|
|
|
190130 |
+ }
|
|
|
190130 |
+
|
|
|
190130 |
+ /* Calculate the total size of the current data block */
|
|
|
190130 |
+ data_block_size = hole_offset - data_offset;
|
|
|
190130 |
+ }
|
|
|
190130 |
+ } else {
|
|
|
190130 |
+ /* There is still data in the current segment, move the
|
|
|
190130 |
+ * data_offset to the position of the last written byte */
|
|
|
190130 |
+ data_offset = offset;
|
|
|
190130 |
+ }
|
|
|
190130 |
+
|
|
|
190130 |
+ /* Calculate how much data needs to be read and written. If the data
|
|
|
190130 |
+ * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and
|
|
|
190130 |
+ * write DHT_REBALANCE_BLKSIZE data length and the rest in the
|
|
|
190130 |
+ * next iteration(s) */
|
|
|
190130 |
+ read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE)
|
|
|
190130 |
+ ? DHT_REBALANCE_BLKSIZE
|
|
|
190130 |
+ : data_block_size);
|
|
|
190130 |
+
|
|
|
190130 |
+ /* Calculate the remaining size of the data block - maybe there's no
|
|
|
190130 |
+ * need to seek for data in the next iteration */
|
|
|
190130 |
+ data_block_size -= read_size;
|
|
|
190130 |
+
|
|
|
190130 |
+ /* Set offset to the offset of the data segment so read and write
|
|
|
190130 |
+ * will have the correct position */
|
|
|
190130 |
+ offset = data_offset;
|
|
|
190130 |
+ }
|
|
|
190130 |
|
|
|
190130 |
ret = syncop_readv(from, src, read_size, offset, 0, &vector, &count,
|
|
|
190130 |
&iobref, NULL, NULL, NULL);
|
|
|
190130 |
+
|
|
|
190130 |
if (!ret || (ret < 0)) {
|
|
|
190130 |
*fop_errno = -ret;
|
|
|
190130 |
break;
|
|
|
190130 |
}
|
|
|
190130 |
|
|
|
190130 |
- if (hole_exists) {
|
|
|
190130 |
- ret = dht_write_with_holes(to, dst, vector, count, ret, offset,
|
|
|
190130 |
- iobref, fop_errno);
|
|
|
190130 |
- } else {
|
|
|
190130 |
- if (!conf->force_migration && !dht_is_tier_xlator(this)) {
|
|
|
190130 |
+ if (!conf->force_migration && !dht_is_tier_xlator(this)) {
|
|
|
190130 |
+ if (!xdata) {
|
|
|
190130 |
xdata = dict_new();
|
|
|
190130 |
if (!xdata) {
|
|
|
190130 |
gf_msg("dht", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
|
|
|
190130 |
@@ -1146,7 +1142,7 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
|
|
|
190130 |
* https://github.com/gluster/glusterfs/issues/308
|
|
|
190130 |
* for more details.
|
|
|
190130 |
*/
|
|
|
190130 |
- ret = dict_set_int32(xdata, GF_AVOID_OVERWRITE, 1);
|
|
|
190130 |
+ ret = dict_set_int32_sizen(xdata, GF_AVOID_OVERWRITE, 1);
|
|
|
190130 |
if (ret) {
|
|
|
190130 |
gf_msg("dht", GF_LOG_ERROR, 0, ENOMEM,
|
|
|
190130 |
"failed to set dict");
|
|
|
190130 |
@@ -1155,22 +1151,12 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
|
|
|
190130 |
break;
|
|
|
190130 |
}
|
|
|
190130 |
}
|
|
|
190130 |
-
|
|
|
190130 |
- ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL,
|
|
|
190130 |
- NULL, xdata, NULL);
|
|
|
190130 |
- if (ret < 0) {
|
|
|
190130 |
- *fop_errno = -ret;
|
|
|
190130 |
- }
|
|
|
190130 |
- }
|
|
|
190130 |
-
|
|
|
190130 |
- if ((defrag && defrag->cmd == GF_DEFRAG_CMD_START_TIER) &&
|
|
|
190130 |
- (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING)) {
|
|
|
190130 |
- gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_TIER_PAUSED,
|
|
|
190130 |
- "Migrate file paused");
|
|
|
190130 |
- ret = -1;
|
|
|
190130 |
}
|
|
|
190130 |
|
|
|
190130 |
+ ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL,
|
|
|
190130 |
+ NULL, xdata, NULL);
|
|
|
190130 |
if (ret < 0) {
|
|
|
190130 |
+ *fop_errno = -ret;
|
|
|
190130 |
break;
|
|
|
190130 |
}
|
|
|
190130 |
|
|
|
190130 |
--
|
|
|
190130 |
1.8.3.1
|
|
|
190130 |
|