d84cf8
From 7b2f1bd4e5a57ea3abd5f14a7d81b120735faecd Mon Sep 17 00:00:00 2001
d84cf8
From: Barak Sason Rofman <bsasonro@redhat.com>
d84cf8
Date: Wed, 6 May 2020 13:28:40 +0300
d84cf8
Subject: [PATCH 438/449] dht - sparse files rebalance enhancements
d84cf8
d84cf8
Currently, data migration in rebalance reads sparse files sequentially,
d84cf8
disregarding which segments are holes and which are data. This can lead
d84cf8
to extremely long migration times for large sparse files.
d84cf8
The data migration mechanism needs to be enhanced so that only data segments are
d84cf8
read and migrated. This can be achieved using lseek to seek for holes
d84cf8
and data in the file.
d84cf8
This enhancement is a consequence of
d84cf8
https://bugzilla.redhat.com/show_bug.cgi?id=1823703
d84cf8
d84cf8
> fixes: #1222
d84cf8
> Change-Id: If5f448a0c532926464e1f34f504c5c94749b08c3
d84cf8
> Signed-off-by: Barak Sason Rofman <bsasonro@redhat.com>
d84cf8
> (Cherry pick from commit 7b7559733ca0c25c63f9d56cb7f4650dbd694c40)
d84cf8
> (Reviewed on upstream link https://review.gluster.org/#/c/glusterfs/+/24409/)
d84cf8
d84cf8
BUG: 1836099
d84cf8
Change-Id: If5f448a0c532926464e1f34f504c5c94749b08c3
d84cf8
Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
d84cf8
Reviewed-on: https://code.engineering.redhat.com/gerrit/202647
d84cf8
Reviewed-by: Barak Sason Rofman <bsasonro@redhat.com>
d84cf8
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
d84cf8
---
d84cf8
 tests/basic/distribute/spare_file_rebalance.t |  51 ++++++++
d84cf8
 xlators/cluster/dht/src/dht-rebalance.c       | 172 ++++++++++++--------------
d84cf8
 2 files changed, 130 insertions(+), 93 deletions(-)
d84cf8
 create mode 100644 tests/basic/distribute/spare_file_rebalance.t
d84cf8
d84cf8
diff --git a/tests/basic/distribute/spare_file_rebalance.t b/tests/basic/distribute/spare_file_rebalance.t
d84cf8
new file mode 100644
d84cf8
index 0000000..061c02f
d84cf8
--- /dev/null
d84cf8
+++ b/tests/basic/distribute/spare_file_rebalance.t
d84cf8
@@ -0,0 +1,51 @@
d84cf8
+#!/bin/bash
d84cf8
+
d84cf8
+. $(dirname $0)/../../include.rc
d84cf8
+. $(dirname $0)/../../volume.rc
d84cf8
+. $(dirname $0)/../../dht.rc
d84cf8
+
d84cf8
+# Initialize
d84cf8
+#------------------------------------------------------------
d84cf8
+cleanup;
d84cf8
+
d84cf8
+# Start glusterd
d84cf8
+TEST glusterd;
d84cf8
+TEST pidof glusterd;
d84cf8
+TEST $CLI volume info;
d84cf8
+
d84cf8
+# Create a volume
d84cf8
+TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2};
d84cf8
+
d84cf8
+# Verify volume creation
d84cf8
+EXPECT "$V0" volinfo_field $V0 'Volume Name';
d84cf8
+EXPECT 'Created' volinfo_field $V0 'Status';
d84cf8
+
d84cf8
+# Start volume and verify successful start
d84cf8
+TEST $CLI volume start $V0;
d84cf8
+EXPECT 'Started' volinfo_field $V0 'Status';
d84cf8
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0;
d84cf8
+
d84cf8
+#------------------------------------------------------------
d84cf8
+
d84cf8
+# Test case - Create sparse files on MP and verify
d84cf8
+# file info after rebalance
d84cf8
+#------------------------------------------------------------
d84cf8
+
d84cf8
+# Create some sparse files and get their size
d84cf8
+TEST cd $M0;
d84cf8
+dd if=/dev/urandom of=sparse_file bs=10k count=1 seek=2M
d84cf8
+cp --sparse=always sparse_file sparse_file_3;
d84cf8
+
d84cf8
+# Add a 3rd brick
d84cf8
+TEST $CLI volume add-brick $V0 $H0:$B0/${V0}3;
d84cf8
+
d84cf8
+# Trigger rebalance
d84cf8
+TEST $CLI volume rebalance $V0 start force;
d84cf8
+EXPECT_WITHIN $REBALANCE_TIMEOUT "0" rebalance_completed;
d84cf8
+
d84cf8
+# Compare original and rebalanced files
d84cf8
+TEST cd $B0/${V0}2
d84cf8
+TEST cmp sparse_file $B0/${V0}3/sparse_file_3
d84cf8
+EXPECT_WITHIN 30 "";
d84cf8
+
d84cf8
+cleanup;
d84cf8
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
d84cf8
index 88b6b54..d0c21b4 100644
d84cf8
--- a/xlators/cluster/dht/src/dht-rebalance.c
d84cf8
+++ b/xlators/cluster/dht/src/dht-rebalance.c
d84cf8
@@ -18,8 +18,8 @@
d84cf8
 #include <glusterfs/events.h>
d84cf8
 
d84cf8
 #define GF_DISK_SECTOR_SIZE 512
d84cf8
-#define DHT_REBALANCE_PID 4242              /* Change it if required */
d84cf8
-#define DHT_REBALANCE_BLKSIZE (1024 * 1024) /* 1 MB */
d84cf8
+#define DHT_REBALANCE_PID 4242        /* Change it if required */
d84cf8
+#define DHT_REBALANCE_BLKSIZE 1048576 /* 1 MB */
d84cf8
 #define MAX_MIGRATE_QUEUE_COUNT 500
d84cf8
 #define MIN_MIGRATE_QUEUE_COUNT 200
d84cf8
 #define MAX_REBAL_TYPE_SIZE 16
d84cf8
@@ -178,75 +178,6 @@ dht_strip_out_acls(dict_t *dict)
d84cf8
     }
d84cf8
 }
d84cf8
 
d84cf8
-static int
d84cf8
-dht_write_with_holes(xlator_t *to, fd_t *fd, struct iovec *vec, int count,
d84cf8
-                     int32_t size, off_t offset, struct iobref *iobref,
d84cf8
-                     int *fop_errno)
d84cf8
-{
d84cf8
-    int i = 0;
d84cf8
-    int ret = -1;
d84cf8
-    int start_idx = 0;
d84cf8
-    int tmp_offset = 0;
d84cf8
-    int write_needed = 0;
d84cf8
-    int buf_len = 0;
d84cf8
-    int size_pending = 0;
d84cf8
-    char *buf = NULL;
d84cf8
-
d84cf8
-    /* loop through each vector */
d84cf8
-    for (i = 0; i < count; i++) {
d84cf8
-        buf = vec[i].iov_base;
d84cf8
-        buf_len = vec[i].iov_len;
d84cf8
-
d84cf8
-        for (start_idx = 0; (start_idx + GF_DISK_SECTOR_SIZE) <= buf_len;
d84cf8
-             start_idx += GF_DISK_SECTOR_SIZE) {
d84cf8
-            if (mem_0filled(buf + start_idx, GF_DISK_SECTOR_SIZE) != 0) {
d84cf8
-                write_needed = 1;
d84cf8
-                continue;
d84cf8
-            }
d84cf8
-
d84cf8
-            if (write_needed) {
d84cf8
-                ret = syncop_write(
d84cf8
-                    to, fd, (buf + tmp_offset), (start_idx - tmp_offset),
d84cf8
-                    (offset + tmp_offset), iobref, 0, NULL, NULL);
d84cf8
-                /* 'path' will be logged in calling function */
d84cf8
-                if (ret < 0) {
d84cf8
-                    gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)",
d84cf8
-                           strerror(-ret));
d84cf8
-                    *fop_errno = -ret;
d84cf8
-                    ret = -1;
d84cf8
-                    goto out;
d84cf8
-                }
d84cf8
-
d84cf8
-                write_needed = 0;
d84cf8
-            }
d84cf8
-            tmp_offset = start_idx + GF_DISK_SECTOR_SIZE;
d84cf8
-        }
d84cf8
-
d84cf8
-        if ((start_idx < buf_len) || write_needed) {
d84cf8
-            /* This means, last chunk is not yet written.. write it */
d84cf8
-            ret = syncop_write(to, fd, (buf + tmp_offset),
d84cf8
-                               (buf_len - tmp_offset), (offset + tmp_offset),
d84cf8
-                               iobref, 0, NULL, NULL);
d84cf8
-            if (ret < 0) {
d84cf8
-                /* 'path' will be logged in calling function */
d84cf8
-                gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)",
d84cf8
-                       strerror(-ret));
d84cf8
-                *fop_errno = -ret;
d84cf8
-                ret = -1;
d84cf8
-                goto out;
d84cf8
-            }
d84cf8
-        }
d84cf8
-
d84cf8
-        size_pending = (size - buf_len);
d84cf8
-        if (!size_pending)
d84cf8
-            break;
d84cf8
-    }
d84cf8
-
d84cf8
-    ret = size;
d84cf8
-out:
d84cf8
-    return ret;
d84cf8
-}
d84cf8
-
d84cf8
 /*
d84cf8
    return values:
d84cf8
    -1 : failure
d84cf8
@@ -1101,32 +1032,97 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
d84cf8
     int ret = 0;
d84cf8
     int count = 0;
d84cf8
     off_t offset = 0;
d84cf8
+    off_t data_offset = 0;
d84cf8
+    off_t hole_offset = 0;
d84cf8
     struct iovec *vector = NULL;
d84cf8
     struct iobref *iobref = NULL;
d84cf8
     uint64_t total = 0;
d84cf8
     size_t read_size = 0;
d84cf8
+    size_t data_block_size = 0;
d84cf8
     dict_t *xdata = NULL;
d84cf8
     dht_conf_t *conf = NULL;
d84cf8
 
d84cf8
     conf = this->private;
d84cf8
+
d84cf8
     /* if file size is '0', no need to enter this loop */
d84cf8
     while (total < ia_size) {
d84cf8
-        read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
d84cf8
-                         ? DHT_REBALANCE_BLKSIZE
d84cf8
-                         : (ia_size - total));
d84cf8
+        /* This is a regular file - read it sequentially */
d84cf8
+        if (!hole_exists) {
d84cf8
+            read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
d84cf8
+                             ? DHT_REBALANCE_BLKSIZE
d84cf8
+                             : (ia_size - total));
d84cf8
+        } else {
d84cf8
+            /* This is a sparse file - read only the data segments in the file
d84cf8
+             */
d84cf8
+
d84cf8
+            /* If the previous data block is fully copied, find the next data
d84cf8
+             * segment
d84cf8
+             * starting at the offset of the last read and written byte,  */
d84cf8
+            if (data_block_size <= 0) {
d84cf8
+                ret = syncop_seek(from, src, offset, GF_SEEK_DATA, NULL,
d84cf8
+                                  &data_offset);
d84cf8
+                if (ret) {
d84cf8
+                    if (ret == -ENXIO)
d84cf8
+                        ret = 0; /* No more data segments */
d84cf8
+                    else
d84cf8
+                        *fop_errno = -ret; /* Error occurred */
d84cf8
+
d84cf8
+                    break;
d84cf8
+                }
d84cf8
+
d84cf8
+                /* If the position of the current data segment is greater than
d84cf8
+                 * the position of the next hole, find the next hole in order to
d84cf8
+                 * calculate the length of the new data segment */
d84cf8
+                if (data_offset > hole_offset) {
d84cf8
+                    /* Starting at the offset of the last data segment, find the
d84cf8
+                     * next hole */
d84cf8
+                    ret = syncop_seek(from, src, data_offset, GF_SEEK_HOLE,
d84cf8
+                                      NULL, &hole_offset);
d84cf8
+                    if (ret) {
d84cf8
+                        /* If an error occurred here it's a real error because
d84cf8
+                         * if the seek for a data segment was successful then
d84cf8
+                         * necessarily another hole must exist (EOF is a hole)
d84cf8
+                         */
d84cf8
+                        *fop_errno = -ret;
d84cf8
+                        break;
d84cf8
+                    }
d84cf8
+
d84cf8
+                    /* Calculate the total size of the current data block */
d84cf8
+                    data_block_size = hole_offset - data_offset;
d84cf8
+                }
d84cf8
+            } else {
d84cf8
+                /* There is still data in the current segment, move the
d84cf8
+                 * data_offset to the position of the last written byte */
d84cf8
+                data_offset = offset;
d84cf8
+            }
d84cf8
+
d84cf8
+            /* Calculate how much data needs to be read and written. If the data
d84cf8
+             * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and
d84cf8
+             * write DHT_REBALANCE_BLKSIZE data length and the rest in the
d84cf8
+             * next iteration(s) */
d84cf8
+            read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE)
d84cf8
+                             ? DHT_REBALANCE_BLKSIZE
d84cf8
+                             : data_block_size);
d84cf8
+
d84cf8
+            /* Calculate the remaining size of the data block - maybe there's no
d84cf8
+             * need to seek for data in the next iteration */
d84cf8
+            data_block_size -= read_size;
d84cf8
+
d84cf8
+            /* Set offset to the offset of the data segment so read and write
d84cf8
+             * will have the correct position */
d84cf8
+            offset = data_offset;
d84cf8
+        }
d84cf8
 
d84cf8
         ret = syncop_readv(from, src, read_size, offset, 0, &vector, &count,
d84cf8
                            &iobref, NULL, NULL, NULL);
d84cf8
+
d84cf8
         if (!ret || (ret < 0)) {
d84cf8
             *fop_errno = -ret;
d84cf8
             break;
d84cf8
         }
d84cf8
 
d84cf8
-        if (hole_exists) {
d84cf8
-            ret = dht_write_with_holes(to, dst, vector, count, ret, offset,
d84cf8
-                                       iobref, fop_errno);
d84cf8
-        } else {
d84cf8
-            if (!conf->force_migration && !dht_is_tier_xlator(this)) {
d84cf8
+        if (!conf->force_migration && !dht_is_tier_xlator(this)) {
d84cf8
+            if (!xdata) {
d84cf8
                 xdata = dict_new();
d84cf8
                 if (!xdata) {
d84cf8
                     gf_msg("dht", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
d84cf8
@@ -1146,7 +1142,7 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
d84cf8
                  * https://github.com/gluster/glusterfs/issues/308
d84cf8
                  * for more details.
d84cf8
                  */
d84cf8
-                ret = dict_set_int32(xdata, GF_AVOID_OVERWRITE, 1);
d84cf8
+                ret = dict_set_int32_sizen(xdata, GF_AVOID_OVERWRITE, 1);
d84cf8
                 if (ret) {
d84cf8
                     gf_msg("dht", GF_LOG_ERROR, 0, ENOMEM,
d84cf8
                            "failed to set dict");
d84cf8
@@ -1155,22 +1151,12 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
d84cf8
                     break;
d84cf8
                 }
d84cf8
             }
d84cf8
-
d84cf8
-            ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL,
d84cf8
-                                NULL, xdata, NULL);
d84cf8
-            if (ret < 0) {
d84cf8
-                *fop_errno = -ret;
d84cf8
-            }
d84cf8
-        }
d84cf8
-
d84cf8
-        if ((defrag && defrag->cmd == GF_DEFRAG_CMD_START_TIER) &&
d84cf8
-            (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING)) {
d84cf8
-            gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_TIER_PAUSED,
d84cf8
-                   "Migrate file paused");
d84cf8
-            ret = -1;
d84cf8
         }
d84cf8
 
d84cf8
+        ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL,
d84cf8
+                            NULL, xdata, NULL);
d84cf8
         if (ret < 0) {
d84cf8
+            *fop_errno = -ret;
d84cf8
             break;
d84cf8
         }
d84cf8
 
d84cf8
-- 
d84cf8
1.8.3.1
d84cf8