14f8ab
From 7b2f1bd4e5a57ea3abd5f14a7d81b120735faecd Mon Sep 17 00:00:00 2001
14f8ab
From: Barak Sason Rofman <bsasonro@redhat.com>
14f8ab
Date: Wed, 6 May 2020 13:28:40 +0300
14f8ab
Subject: [PATCH 438/449] dht - sparse files rebalance enhancements
14f8ab
14f8ab
Currently, data migration in rebalance reads sparse files sequentially,
14f8ab
disregarding which segments are holes and which are data. This can lead
14f8ab
to extremely long migration times for large sparse files.
14f8ab
The data migration mechanism needs to be enhanced so that only data segments are
14f8ab
read and migrated. This can be achieved using lseek to seek for holes
14f8ab
and data in the file.
14f8ab
This enhancement is a consequence of
14f8ab
https://bugzilla.redhat.com/show_bug.cgi?id=1823703
14f8ab
14f8ab
> fixes: #1222
14f8ab
> Change-Id: If5f448a0c532926464e1f34f504c5c94749b08c3
14f8ab
> Signed-off-by: Barak Sason Rofman <bsasonro@redhat.com>
14f8ab
> (Cherry pick from commit 7b7559733ca0c25c63f9d56cb7f4650dbd694c40)
14f8ab
> (Reviewed on upstream link https://review.gluster.org/#/c/glusterfs/+/24409/)
14f8ab
14f8ab
BUG: 1836099
14f8ab
Change-Id: If5f448a0c532926464e1f34f504c5c94749b08c3
14f8ab
Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
14f8ab
Reviewed-on: https://code.engineering.redhat.com/gerrit/202647
14f8ab
Reviewed-by: Barak Sason Rofman <bsasonro@redhat.com>
14f8ab
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
14f8ab
---
14f8ab
 tests/basic/distribute/spare_file_rebalance.t |  51 ++++++++
14f8ab
 xlators/cluster/dht/src/dht-rebalance.c       | 172 ++++++++++++--------------
14f8ab
 2 files changed, 130 insertions(+), 93 deletions(-)
14f8ab
 create mode 100644 tests/basic/distribute/spare_file_rebalance.t
14f8ab
14f8ab
diff --git a/tests/basic/distribute/spare_file_rebalance.t b/tests/basic/distribute/spare_file_rebalance.t
14f8ab
new file mode 100644
14f8ab
index 0000000..061c02f
14f8ab
--- /dev/null
14f8ab
+++ b/tests/basic/distribute/spare_file_rebalance.t
14f8ab
@@ -0,0 +1,51 @@
14f8ab
+#!/bin/bash
14f8ab
+
14f8ab
+. $(dirname $0)/../../include.rc
14f8ab
+. $(dirname $0)/../../volume.rc
14f8ab
+. $(dirname $0)/../../dht.rc
14f8ab
+
14f8ab
+# Initialize
14f8ab
+#------------------------------------------------------------
14f8ab
+cleanup;
14f8ab
+
14f8ab
+# Start glusterd
14f8ab
+TEST glusterd;
14f8ab
+TEST pidof glusterd;
14f8ab
+TEST $CLI volume info;
14f8ab
+
14f8ab
+# Create a volume
14f8ab
+TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2};
14f8ab
+
14f8ab
+# Verify volume creation
14f8ab
+EXPECT "$V0" volinfo_field $V0 'Volume Name';
14f8ab
+EXPECT 'Created' volinfo_field $V0 'Status';
14f8ab
+
14f8ab
+# Start volume and verify successful start
14f8ab
+TEST $CLI volume start $V0;
14f8ab
+EXPECT 'Started' volinfo_field $V0 'Status';
14f8ab
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0;
14f8ab
+
14f8ab
+#------------------------------------------------------------
14f8ab
+
14f8ab
+# Test case - Create sparse files on MP and verify
14f8ab
+# file info after rebalance
14f8ab
+#------------------------------------------------------------
14f8ab
+
14f8ab
+# Create some sparse files and get their size
14f8ab
+TEST cd $M0;
14f8ab
+dd if=/dev/urandom of=sparse_file bs=10k count=1 seek=2M
14f8ab
+cp --sparse=always sparse_file sparse_file_3;
14f8ab
+
14f8ab
+# Add a 3rd brick
14f8ab
+TEST $CLI volume add-brick $V0 $H0:$B0/${V0}3;
14f8ab
+
14f8ab
+# Trigger rebalance
14f8ab
+TEST $CLI volume rebalance $V0 start force;
14f8ab
+EXPECT_WITHIN $REBALANCE_TIMEOUT "0" rebalance_completed;
14f8ab
+
14f8ab
+# Compare original and rebalanced files
14f8ab
+TEST cd $B0/${V0}2
14f8ab
+TEST cmp sparse_file $B0/${V0}3/sparse_file_3
14f8ab
+EXPECT_WITHIN 30 "";
14f8ab
+
14f8ab
+cleanup;
14f8ab
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
14f8ab
index 88b6b54..d0c21b4 100644
14f8ab
--- a/xlators/cluster/dht/src/dht-rebalance.c
14f8ab
+++ b/xlators/cluster/dht/src/dht-rebalance.c
14f8ab
@@ -18,8 +18,8 @@
14f8ab
 #include <glusterfs/events.h>
14f8ab
 
14f8ab
 #define GF_DISK_SECTOR_SIZE 512
14f8ab
-#define DHT_REBALANCE_PID 4242              /* Change it if required */
14f8ab
-#define DHT_REBALANCE_BLKSIZE (1024 * 1024) /* 1 MB */
14f8ab
+#define DHT_REBALANCE_PID 4242        /* Change it if required */
14f8ab
+#define DHT_REBALANCE_BLKSIZE 1048576 /* 1 MB */
14f8ab
 #define MAX_MIGRATE_QUEUE_COUNT 500
14f8ab
 #define MIN_MIGRATE_QUEUE_COUNT 200
14f8ab
 #define MAX_REBAL_TYPE_SIZE 16
14f8ab
@@ -178,75 +178,6 @@ dht_strip_out_acls(dict_t *dict)
14f8ab
     }
14f8ab
 }
14f8ab
 
14f8ab
-static int
14f8ab
-dht_write_with_holes(xlator_t *to, fd_t *fd, struct iovec *vec, int count,
14f8ab
-                     int32_t size, off_t offset, struct iobref *iobref,
14f8ab
-                     int *fop_errno)
14f8ab
-{
14f8ab
-    int i = 0;
14f8ab
-    int ret = -1;
14f8ab
-    int start_idx = 0;
14f8ab
-    int tmp_offset = 0;
14f8ab
-    int write_needed = 0;
14f8ab
-    int buf_len = 0;
14f8ab
-    int size_pending = 0;
14f8ab
-    char *buf = NULL;
14f8ab
-
14f8ab
-    /* loop through each vector */
14f8ab
-    for (i = 0; i < count; i++) {
14f8ab
-        buf = vec[i].iov_base;
14f8ab
-        buf_len = vec[i].iov_len;
14f8ab
-
14f8ab
-        for (start_idx = 0; (start_idx + GF_DISK_SECTOR_SIZE) <= buf_len;
14f8ab
-             start_idx += GF_DISK_SECTOR_SIZE) {
14f8ab
-            if (mem_0filled(buf + start_idx, GF_DISK_SECTOR_SIZE) != 0) {
14f8ab
-                write_needed = 1;
14f8ab
-                continue;
14f8ab
-            }
14f8ab
-
14f8ab
-            if (write_needed) {
14f8ab
-                ret = syncop_write(
14f8ab
-                    to, fd, (buf + tmp_offset), (start_idx - tmp_offset),
14f8ab
-                    (offset + tmp_offset), iobref, 0, NULL, NULL);
14f8ab
-                /* 'path' will be logged in calling function */
14f8ab
-                if (ret < 0) {
14f8ab
-                    gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)",
14f8ab
-                           strerror(-ret));
14f8ab
-                    *fop_errno = -ret;
14f8ab
-                    ret = -1;
14f8ab
-                    goto out;
14f8ab
-                }
14f8ab
-
14f8ab
-                write_needed = 0;
14f8ab
-            }
14f8ab
-            tmp_offset = start_idx + GF_DISK_SECTOR_SIZE;
14f8ab
-        }
14f8ab
-
14f8ab
-        if ((start_idx < buf_len) || write_needed) {
14f8ab
-            /* This means, last chunk is not yet written.. write it */
14f8ab
-            ret = syncop_write(to, fd, (buf + tmp_offset),
14f8ab
-                               (buf_len - tmp_offset), (offset + tmp_offset),
14f8ab
-                               iobref, 0, NULL, NULL);
14f8ab
-            if (ret < 0) {
14f8ab
-                /* 'path' will be logged in calling function */
14f8ab
-                gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)",
14f8ab
-                       strerror(-ret));
14f8ab
-                *fop_errno = -ret;
14f8ab
-                ret = -1;
14f8ab
-                goto out;
14f8ab
-            }
14f8ab
-        }
14f8ab
-
14f8ab
-        size_pending = (size - buf_len);
14f8ab
-        if (!size_pending)
14f8ab
-            break;
14f8ab
-    }
14f8ab
-
14f8ab
-    ret = size;
14f8ab
-out:
14f8ab
-    return ret;
14f8ab
-}
14f8ab
-
14f8ab
 /*
14f8ab
    return values:
14f8ab
    -1 : failure
14f8ab
@@ -1101,32 +1032,97 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
14f8ab
     int ret = 0;
14f8ab
     int count = 0;
14f8ab
     off_t offset = 0;
14f8ab
+    off_t data_offset = 0;
14f8ab
+    off_t hole_offset = 0;
14f8ab
     struct iovec *vector = NULL;
14f8ab
     struct iobref *iobref = NULL;
14f8ab
     uint64_t total = 0;
14f8ab
     size_t read_size = 0;
14f8ab
+    size_t data_block_size = 0;
14f8ab
     dict_t *xdata = NULL;
14f8ab
     dht_conf_t *conf = NULL;
14f8ab
 
14f8ab
     conf = this->private;
14f8ab
+
14f8ab
     /* if file size is '0', no need to enter this loop */
14f8ab
     while (total < ia_size) {
14f8ab
-        read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
14f8ab
-                         ? DHT_REBALANCE_BLKSIZE
14f8ab
-                         : (ia_size - total));
14f8ab
+        /* This is a regular file - read it sequentially */
14f8ab
+        if (!hole_exists) {
14f8ab
+            read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
14f8ab
+                             ? DHT_REBALANCE_BLKSIZE
14f8ab
+                             : (ia_size - total));
14f8ab
+        } else {
14f8ab
+            /* This is a sparse file - read only the data segments in the file
14f8ab
+             */
14f8ab
+
14f8ab
+            /* If the previous data block is fully copied, find the next data
14f8ab
+             * segment
14f8ab
+             * starting at the offset of the last read and written byte. */
14f8ab
+            if (data_block_size <= 0) {
14f8ab
+                ret = syncop_seek(from, src, offset, GF_SEEK_DATA, NULL,
14f8ab
+                                  &data_offset);
14f8ab
+                if (ret) {
14f8ab
+                    if (ret == -ENXIO)
14f8ab
+                        ret = 0; /* No more data segments */
14f8ab
+                    else
14f8ab
+                        *fop_errno = -ret; /* Error occurred */
14f8ab
+
14f8ab
+                    break;
14f8ab
+                }
14f8ab
+
14f8ab
+                /* If the position of the current data segment is greater than
14f8ab
+                 * the position of the next hole, find the next hole in order to
14f8ab
+                 * calculate the length of the new data segment */
14f8ab
+                if (data_offset > hole_offset) {
14f8ab
+                    /* Starting at the offset of the last data segment, find the
14f8ab
+                     * next hole */
14f8ab
+                    ret = syncop_seek(from, src, data_offset, GF_SEEK_HOLE,
14f8ab
+                                      NULL, &hole_offset);
14f8ab
+                    if (ret) {
14f8ab
+                        /* If an error occurred here it's a real error because
14f8ab
+                         * if the seek for a data segment was successful then
14f8ab
+                         * necessarily another hole must exist (EOF is a hole)
14f8ab
+                         */
14f8ab
+                        *fop_errno = -ret;
14f8ab
+                        break;
14f8ab
+                    }
14f8ab
+
14f8ab
+                    /* Calculate the total size of the current data block */
14f8ab
+                    data_block_size = hole_offset - data_offset;
14f8ab
+                }
14f8ab
+            } else {
14f8ab
+                /* There is still data in the current segment, move the
14f8ab
+                 * data_offset to the position of the last written byte */
14f8ab
+                data_offset = offset;
14f8ab
+            }
14f8ab
+
14f8ab
+            /* Calculate how much data needs to be read and written. If the data
14f8ab
+             * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and
14f8ab
+             * write DHT_REBALANCE_BLKSIZE data length and the rest in the
14f8ab
+             * next iteration(s) */
14f8ab
+            read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE)
14f8ab
+                             ? DHT_REBALANCE_BLKSIZE
14f8ab
+                             : data_block_size);
14f8ab
+
14f8ab
+            /* Calculate the remaining size of the data block - maybe there's no
14f8ab
+             * need to seek for data in the next iteration */
14f8ab
+            data_block_size -= read_size;
14f8ab
+
14f8ab
+            /* Set offset to the offset of the data segment so read and write
14f8ab
+             * will have the correct position */
14f8ab
+            offset = data_offset;
14f8ab
+        }
14f8ab
 
14f8ab
         ret = syncop_readv(from, src, read_size, offset, 0, &vector, &count,
14f8ab
                            &iobref, NULL, NULL, NULL);
14f8ab
+
14f8ab
         if (!ret || (ret < 0)) {
14f8ab
             *fop_errno = -ret;
14f8ab
             break;
14f8ab
         }
14f8ab
 
14f8ab
-        if (hole_exists) {
14f8ab
-            ret = dht_write_with_holes(to, dst, vector, count, ret, offset,
14f8ab
-                                       iobref, fop_errno);
14f8ab
-        } else {
14f8ab
-            if (!conf->force_migration && !dht_is_tier_xlator(this)) {
14f8ab
+        if (!conf->force_migration && !dht_is_tier_xlator(this)) {
14f8ab
+            if (!xdata) {
14f8ab
                 xdata = dict_new();
14f8ab
                 if (!xdata) {
14f8ab
                     gf_msg("dht", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
14f8ab
@@ -1146,7 +1142,7 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
14f8ab
                  * https://github.com/gluster/glusterfs/issues/308
14f8ab
                  * for more details.
14f8ab
                  */
14f8ab
-                ret = dict_set_int32(xdata, GF_AVOID_OVERWRITE, 1);
14f8ab
+                ret = dict_set_int32_sizen(xdata, GF_AVOID_OVERWRITE, 1);
14f8ab
                 if (ret) {
14f8ab
                     gf_msg("dht", GF_LOG_ERROR, 0, ENOMEM,
14f8ab
                            "failed to set dict");
14f8ab
@@ -1155,22 +1151,12 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
14f8ab
                     break;
14f8ab
                 }
14f8ab
             }
14f8ab
-
14f8ab
-            ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL,
14f8ab
-                                NULL, xdata, NULL);
14f8ab
-            if (ret < 0) {
14f8ab
-                *fop_errno = -ret;
14f8ab
-            }
14f8ab
-        }
14f8ab
-
14f8ab
-        if ((defrag && defrag->cmd == GF_DEFRAG_CMD_START_TIER) &&
14f8ab
-            (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING)) {
14f8ab
-            gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_TIER_PAUSED,
14f8ab
-                   "Migrate file paused");
14f8ab
-            ret = -1;
14f8ab
         }
14f8ab
 
14f8ab
+        ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL,
14f8ab
+                            NULL, xdata, NULL);
14f8ab
         if (ret < 0) {
14f8ab
+            *fop_errno = -ret;
14f8ab
             break;
14f8ab
         }
14f8ab
 
14f8ab
-- 
14f8ab
1.8.3.1
14f8ab