17b94a
From 7b2f1bd4e5a57ea3abd5f14a7d81b120735faecd Mon Sep 17 00:00:00 2001
17b94a
From: Barak Sason Rofman <bsasonro@redhat.com>
17b94a
Date: Wed, 6 May 2020 13:28:40 +0300
17b94a
Subject: [PATCH 438/449] dht - sparse files rebalance enhancements
17b94a
17b94a
Currently, data migration in rebalance reads sparse files sequentially,
17b94a
disregarding which segments are holes and which are data. This can lead
17b94a
to extremely long migration times for large sparse files.
17b94a
The data migration mechanism needs to be enhanced so that only data segments are
17b94a
read and migrated. This can be achieved using lseek to seek for holes
17b94a
and data in the file.
17b94a
This enhancement is a consequence of
17b94a
https://bugzilla.redhat.com/show_bug.cgi?id=1823703
17b94a
17b94a
> fixes: #1222
17b94a
> Change-Id: If5f448a0c532926464e1f34f504c5c94749b08c3
17b94a
> Signed-off-by: Barak Sason Rofman <bsasonro@redhat.com>
17b94a
> (Cherry pick from commit 7b7559733ca0c25c63f9d56cb7f4650dbd694c40)
17b94a
> (Reviewed on upstream link https://review.gluster.org/#/c/glusterfs/+/24409/)
17b94a
17b94a
BUG: 1836099
17b94a
Change-Id: If5f448a0c532926464e1f34f504c5c94749b08c3
17b94a
Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
17b94a
Reviewed-on: https://code.engineering.redhat.com/gerrit/202647
17b94a
Reviewed-by: Barak Sason Rofman <bsasonro@redhat.com>
17b94a
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
17b94a
---
17b94a
 tests/basic/distribute/spare_file_rebalance.t |  51 ++++++++
17b94a
 xlators/cluster/dht/src/dht-rebalance.c       | 172 ++++++++++++--------------
17b94a
 2 files changed, 130 insertions(+), 93 deletions(-)
17b94a
 create mode 100644 tests/basic/distribute/spare_file_rebalance.t
17b94a
17b94a
diff --git a/tests/basic/distribute/spare_file_rebalance.t b/tests/basic/distribute/spare_file_rebalance.t
17b94a
new file mode 100644
17b94a
index 0000000..061c02f
17b94a
--- /dev/null
17b94a
+++ b/tests/basic/distribute/spare_file_rebalance.t
17b94a
@@ -0,0 +1,51 @@
17b94a
+#!/bin/bash
17b94a
+
17b94a
+. $(dirname $0)/../../include.rc
17b94a
+. $(dirname $0)/../../volume.rc
17b94a
+. $(dirname $0)/../../dht.rc
17b94a
+
17b94a
+# Initialize
17b94a
+#------------------------------------------------------------
17b94a
+cleanup;
17b94a
+
17b94a
+# Start glusterd
17b94a
+TEST glusterd;
17b94a
+TEST pidof glusterd;
17b94a
+TEST $CLI volume info;
17b94a
+
17b94a
+# Create a volume
17b94a
+TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2};
17b94a
+
17b94a
+# Verify volume creation
17b94a
+EXPECT "$V0" volinfo_field $V0 'Volume Name';
17b94a
+EXPECT 'Created' volinfo_field $V0 'Status';
17b94a
+
17b94a
+# Start volume and verify successful start
17b94a
+TEST $CLI volume start $V0;
17b94a
+EXPECT 'Started' volinfo_field $V0 'Status';
17b94a
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0;
17b94a
+
17b94a
+#------------------------------------------------------------
17b94a
+
17b94a
+# Test case - Create sparse files on MP and verify
17b94a
+# file info after rebalance
17b94a
+#------------------------------------------------------------
17b94a
+
17b94a
+# Create a sparse file and a sparse copy of it
17b94a
+TEST cd $M0;
17b94a
+dd if=/dev/urandom of=sparse_file bs=10k count=1 seek=2M
17b94a
+cp --sparse=always sparse_file sparse_file_3;
17b94a
+
17b94a
+# Add a 3rd brick
17b94a
+TEST $CLI volume add-brick $V0 $H0:$B0/${V0}3;
17b94a
+
17b94a
+# Trigger rebalance
17b94a
+TEST $CLI volume rebalance $V0 start force;
17b94a
+EXPECT_WITHIN $REBALANCE_TIMEOUT "0" rebalance_completed;
17b94a
+
17b94a
+# Compare original and rebalanced files
17b94a
+TEST cd $B0/${V0}2
17b94a
+TEST cmp sparse_file $B0/${V0}3/sparse_file_3
17b94a
+EXPECT_WITHIN 30 "";
17b94a
+
17b94a
+cleanup;
17b94a
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
17b94a
index 88b6b54..d0c21b4 100644
17b94a
--- a/xlators/cluster/dht/src/dht-rebalance.c
17b94a
+++ b/xlators/cluster/dht/src/dht-rebalance.c
17b94a
@@ -18,8 +18,8 @@
17b94a
 #include <glusterfs/events.h>
17b94a
 
17b94a
 #define GF_DISK_SECTOR_SIZE 512
17b94a
-#define DHT_REBALANCE_PID 4242              /* Change it if required */
17b94a
-#define DHT_REBALANCE_BLKSIZE (1024 * 1024) /* 1 MB */
17b94a
+#define DHT_REBALANCE_PID 4242        /* Change it if required */
17b94a
+#define DHT_REBALANCE_BLKSIZE 1048576 /* 1 MB */
17b94a
 #define MAX_MIGRATE_QUEUE_COUNT 500
17b94a
 #define MIN_MIGRATE_QUEUE_COUNT 200
17b94a
 #define MAX_REBAL_TYPE_SIZE 16
17b94a
@@ -178,75 +178,6 @@ dht_strip_out_acls(dict_t *dict)
17b94a
     }
17b94a
 }
17b94a
 
17b94a
-static int
17b94a
-dht_write_with_holes(xlator_t *to, fd_t *fd, struct iovec *vec, int count,
17b94a
-                     int32_t size, off_t offset, struct iobref *iobref,
17b94a
-                     int *fop_errno)
17b94a
-{
17b94a
-    int i = 0;
17b94a
-    int ret = -1;
17b94a
-    int start_idx = 0;
17b94a
-    int tmp_offset = 0;
17b94a
-    int write_needed = 0;
17b94a
-    int buf_len = 0;
17b94a
-    int size_pending = 0;
17b94a
-    char *buf = NULL;
17b94a
-
17b94a
-    /* loop through each vector */
17b94a
-    for (i = 0; i < count; i++) {
17b94a
-        buf = vec[i].iov_base;
17b94a
-        buf_len = vec[i].iov_len;
17b94a
-
17b94a
-        for (start_idx = 0; (start_idx + GF_DISK_SECTOR_SIZE) <= buf_len;
17b94a
-             start_idx += GF_DISK_SECTOR_SIZE) {
17b94a
-            if (mem_0filled(buf + start_idx, GF_DISK_SECTOR_SIZE) != 0) {
17b94a
-                write_needed = 1;
17b94a
-                continue;
17b94a
-            }
17b94a
-
17b94a
-            if (write_needed) {
17b94a
-                ret = syncop_write(
17b94a
-                    to, fd, (buf + tmp_offset), (start_idx - tmp_offset),
17b94a
-                    (offset + tmp_offset), iobref, 0, NULL, NULL);
17b94a
-                /* 'path' will be logged in calling function */
17b94a
-                if (ret < 0) {
17b94a
-                    gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)",
17b94a
-                           strerror(-ret));
17b94a
-                    *fop_errno = -ret;
17b94a
-                    ret = -1;
17b94a
-                    goto out;
17b94a
-                }
17b94a
-
17b94a
-                write_needed = 0;
17b94a
-            }
17b94a
-            tmp_offset = start_idx + GF_DISK_SECTOR_SIZE;
17b94a
-        }
17b94a
-
17b94a
-        if ((start_idx < buf_len) || write_needed) {
17b94a
-            /* This means, last chunk is not yet written.. write it */
17b94a
-            ret = syncop_write(to, fd, (buf + tmp_offset),
17b94a
-                               (buf_len - tmp_offset), (offset + tmp_offset),
17b94a
-                               iobref, 0, NULL, NULL);
17b94a
-            if (ret < 0) {
17b94a
-                /* 'path' will be logged in calling function */
17b94a
-                gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)",
17b94a
-                       strerror(-ret));
17b94a
-                *fop_errno = -ret;
17b94a
-                ret = -1;
17b94a
-                goto out;
17b94a
-            }
17b94a
-        }
17b94a
-
17b94a
-        size_pending = (size - buf_len);
17b94a
-        if (!size_pending)
17b94a
-            break;
17b94a
-    }
17b94a
-
17b94a
-    ret = size;
17b94a
-out:
17b94a
-    return ret;
17b94a
-}
17b94a
-
17b94a
 /*
17b94a
    return values:
17b94a
    -1 : failure
17b94a
@@ -1101,32 +1032,97 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
17b94a
     int ret = 0;
17b94a
     int count = 0;
17b94a
     off_t offset = 0;
17b94a
+    off_t data_offset = 0;
17b94a
+    off_t hole_offset = 0;
17b94a
     struct iovec *vector = NULL;
17b94a
     struct iobref *iobref = NULL;
17b94a
     uint64_t total = 0;
17b94a
     size_t read_size = 0;
17b94a
+    size_t data_block_size = 0;
17b94a
     dict_t *xdata = NULL;
17b94a
     dht_conf_t *conf = NULL;
17b94a
 
17b94a
     conf = this->private;
17b94a
+
17b94a
     /* if file size is '0', no need to enter this loop */
17b94a
     while (total < ia_size) {
17b94a
-        read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
17b94a
-                         ? DHT_REBALANCE_BLKSIZE
17b94a
-                         : (ia_size - total));
17b94a
+        /* This is a regular file - read it sequentially */
17b94a
+        if (!hole_exists) {
17b94a
+            read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
17b94a
+                             ? DHT_REBALANCE_BLKSIZE
17b94a
+                             : (ia_size - total));
17b94a
+        } else {
17b94a
+            /* This is a sparse file - read only the data segments in the file
17b94a
+             */
17b94a
+
17b94a
+            /* If the previous data block is fully copied, find the next data
17b94a
+             * segment
17b94a
+             * starting at the offset of the last read and written byte. */
17b94a
+            if (data_block_size <= 0) {
17b94a
+                ret = syncop_seek(from, src, offset, GF_SEEK_DATA, NULL,
17b94a
+                                  &data_offset);
17b94a
+                if (ret) {
17b94a
+                    if (ret == -ENXIO)
17b94a
+                        ret = 0; /* No more data segments */
17b94a
+                    else
17b94a
+                        *fop_errno = -ret; /* Error occurred */
17b94a
+
17b94a
+                    break;
17b94a
+                }
17b94a
+
17b94a
+                /* If the position of the current data segment is greater than
17b94a
+                 * the position of the next hole, find the next hole in order to
17b94a
+                 * calculate the length of the new data segment */
17b94a
+                if (data_offset > hole_offset) {
17b94a
+                    /* Starting at the offset of the last data segment, find the
17b94a
+                     * next hole */
17b94a
+                    ret = syncop_seek(from, src, data_offset, GF_SEEK_HOLE,
17b94a
+                                      NULL, &hole_offset);
17b94a
+                    if (ret) {
17b94a
+                        /* If an error occurred here it's a real error because
17b94a
+                         * if the seek for a data segment was successful then
17b94a
+                         * necessarily another hole must exist (EOF is a hole)
17b94a
+                         */
17b94a
+                        *fop_errno = -ret;
17b94a
+                        break;
17b94a
+                    }
17b94a
+
17b94a
+                    /* Calculate the total size of the current data block */
17b94a
+                    data_block_size = hole_offset - data_offset;
17b94a
+                }
17b94a
+            } else {
17b94a
+                /* There is still data in the current segment, move the
17b94a
+                 * data_offset to the position of the last written byte */
17b94a
+                data_offset = offset;
17b94a
+            }
17b94a
+
17b94a
+            /* Calculate how much data needs to be read and written. If the data
17b94a
+             * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and
17b94a
+             * write DHT_REBALANCE_BLKSIZE data length and the rest in the
17b94a
+             * next iteration(s) */
17b94a
+            read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE)
17b94a
+                             ? DHT_REBALANCE_BLKSIZE
17b94a
+                             : data_block_size);
17b94a
+
17b94a
+            /* Calculate the remaining size of the data block - maybe there's no
17b94a
+             * need to seek for data in the next iteration */
17b94a
+            data_block_size -= read_size;
17b94a
+
17b94a
+            /* Set offset to the offset of the data segment so read and write
17b94a
+             * will have the correct position */
17b94a
+            offset = data_offset;
17b94a
+        }
17b94a
 
17b94a
         ret = syncop_readv(from, src, read_size, offset, 0, &vector, &count,
17b94a
                            &iobref, NULL, NULL, NULL);
17b94a
+
17b94a
         if (!ret || (ret < 0)) {
17b94a
             *fop_errno = -ret;
17b94a
             break;
17b94a
         }
17b94a
 
17b94a
-        if (hole_exists) {
17b94a
-            ret = dht_write_with_holes(to, dst, vector, count, ret, offset,
17b94a
-                                       iobref, fop_errno);
17b94a
-        } else {
17b94a
-            if (!conf->force_migration && !dht_is_tier_xlator(this)) {
17b94a
+        if (!conf->force_migration && !dht_is_tier_xlator(this)) {
17b94a
+            if (!xdata) {
17b94a
                 xdata = dict_new();
17b94a
                 if (!xdata) {
17b94a
                     gf_msg("dht", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
17b94a
@@ -1146,7 +1142,7 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
17b94a
                  * https://github.com/gluster/glusterfs/issues/308
17b94a
                  * for more details.
17b94a
                  */
17b94a
-                ret = dict_set_int32(xdata, GF_AVOID_OVERWRITE, 1);
17b94a
+                ret = dict_set_int32_sizen(xdata, GF_AVOID_OVERWRITE, 1);
17b94a
                 if (ret) {
17b94a
                     gf_msg("dht", GF_LOG_ERROR, 0, ENOMEM,
17b94a
                            "failed to set dict");
17b94a
@@ -1155,22 +1151,12 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
17b94a
                     break;
17b94a
                 }
17b94a
             }
17b94a
-
17b94a
-            ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL,
17b94a
-                                NULL, xdata, NULL);
17b94a
-            if (ret < 0) {
17b94a
-                *fop_errno = -ret;
17b94a
-            }
17b94a
-        }
17b94a
-
17b94a
-        if ((defrag && defrag->cmd == GF_DEFRAG_CMD_START_TIER) &&
17b94a
-            (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING)) {
17b94a
-            gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_TIER_PAUSED,
17b94a
-                   "Migrate file paused");
17b94a
-            ret = -1;
17b94a
         }
17b94a
 
17b94a
+        ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL,
17b94a
+                            NULL, xdata, NULL);
17b94a
         if (ret < 0) {
17b94a
+            *fop_errno = -ret;
17b94a
             break;
17b94a
         }
17b94a
 
17b94a
-- 
17b94a
1.8.3.1
17b94a