74096c
From 7b2f1bd4e5a57ea3abd5f14a7d81b120735faecd Mon Sep 17 00:00:00 2001
74096c
From: Barak Sason Rofman <bsasonro@redhat.com>
74096c
Date: Wed, 6 May 2020 13:28:40 +0300
74096c
Subject: [PATCH 438/449] dht - sparse files rebalance enhancements
74096c
74096c
Currently, data migration in rebalance reads sparse files sequentially,
74096c
disregarding which segments are holes and which are data. This can lead
74096c
to extremely long migration times for large sparse files.
74096c
The data migration mechanism needs to be enhanced so that only data segments are
74096c
read and migrated. This can be achieved using lseek to seek for holes
74096c
and data in the file.
74096c
This enhancement is a consequence of
74096c
https://bugzilla.redhat.com/show_bug.cgi?id=1823703
74096c
74096c
> fixes: #1222
74096c
> Change-Id: If5f448a0c532926464e1f34f504c5c94749b08c3
74096c
> Signed-off-by: Barak Sason Rofman <bsasonro@redhat.com>
74096c
> (Cherry pick from commit 7b7559733ca0c25c63f9d56cb7f4650dbd694c40)
74096c
> (Reviewed on upstream link https://review.gluster.org/#/c/glusterfs/+/24409/)
74096c
74096c
BUG: 1836099
74096c
Change-Id: If5f448a0c532926464e1f34f504c5c94749b08c3
74096c
Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
74096c
Reviewed-on: https://code.engineering.redhat.com/gerrit/202647
74096c
Reviewed-by: Barak Sason Rofman <bsasonro@redhat.com>
74096c
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
74096c
---
74096c
 tests/basic/distribute/spare_file_rebalance.t |  51 ++++++++
74096c
 xlators/cluster/dht/src/dht-rebalance.c       | 172 ++++++++++++--------------
74096c
 2 files changed, 130 insertions(+), 93 deletions(-)
74096c
 create mode 100644 tests/basic/distribute/spare_file_rebalance.t
74096c
74096c
diff --git a/tests/basic/distribute/spare_file_rebalance.t b/tests/basic/distribute/spare_file_rebalance.t
74096c
new file mode 100644
74096c
index 0000000..061c02f
74096c
--- /dev/null
74096c
+++ b/tests/basic/distribute/spare_file_rebalance.t
74096c
@@ -0,0 +1,51 @@
74096c
+#!/bin/bash
74096c
+
74096c
+. $(dirname $0)/../../include.rc
74096c
+. $(dirname $0)/../../volume.rc
74096c
+. $(dirname $0)/../../dht.rc
74096c
+
74096c
+# Initialize
74096c
+#------------------------------------------------------------
74096c
+cleanup;
74096c
+
74096c
+# Start glusterd
74096c
+TEST glusterd;
74096c
+TEST pidof glusterd;
74096c
+TEST $CLI volume info;
74096c
+
74096c
+# Create a volume
74096c
+TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2};
74096c
+
74096c
+# Verify volume creation
74096c
+EXPECT "$V0" volinfo_field $V0 'Volume Name';
74096c
+EXPECT 'Created' volinfo_field $V0 'Status';
74096c
+
74096c
+# Start volume and verify successful start
74096c
+TEST $CLI volume start $V0;
74096c
+EXPECT 'Started' volinfo_field $V0 'Status';
74096c
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0;
74096c
+
74096c
+#------------------------------------------------------------
74096c
+
74096c
+# Test case - Create sparse files on MP and verify
74096c
+# file info after rebalance
74096c
+#------------------------------------------------------------
74096c
+
74096c
+# Create some sparse files and get their size
74096c
+TEST cd $M0;
74096c
+dd if=/dev/urandom of=sparse_file bs=10k count=1 seek=2M
74096c
+cp --sparse=always sparse_file sparse_file_3;
74096c
+
74096c
+# Add a 3rd brick
74096c
+TEST $CLI volume add-brick $V0 $H0:$B0/${V0}3;
74096c
+
74096c
+# Trigger rebalance
74096c
+TEST $CLI volume rebalance $V0 start force;
74096c
+EXPECT_WITHIN $REBALANCE_TIMEOUT "0" rebalance_completed;
74096c
+
74096c
+# Compare original and rebalanced files
74096c
+TEST cd $B0/${V0}2
74096c
+TEST cmp sparse_file $B0/${V0}3/sparse_file_3
74096c
+EXPECT_WITHIN 30 "";
74096c
+
74096c
+cleanup;
74096c
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
74096c
index 88b6b54..d0c21b4 100644
74096c
--- a/xlators/cluster/dht/src/dht-rebalance.c
74096c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
74096c
@@ -18,8 +18,8 @@
74096c
 #include <glusterfs/events.h>
74096c
 
74096c
 #define GF_DISK_SECTOR_SIZE 512
74096c
-#define DHT_REBALANCE_PID 4242              /* Change it if required */
74096c
-#define DHT_REBALANCE_BLKSIZE (1024 * 1024) /* 1 MB */
74096c
+#define DHT_REBALANCE_PID 4242        /* Change it if required */
74096c
+#define DHT_REBALANCE_BLKSIZE 1048576 /* 1 MB */
74096c
 #define MAX_MIGRATE_QUEUE_COUNT 500
74096c
 #define MIN_MIGRATE_QUEUE_COUNT 200
74096c
 #define MAX_REBAL_TYPE_SIZE 16
74096c
@@ -178,75 +178,6 @@ dht_strip_out_acls(dict_t *dict)
74096c
     }
74096c
 }
74096c
 
74096c
-static int
74096c
-dht_write_with_holes(xlator_t *to, fd_t *fd, struct iovec *vec, int count,
74096c
-                     int32_t size, off_t offset, struct iobref *iobref,
74096c
-                     int *fop_errno)
74096c
-{
74096c
-    int i = 0;
74096c
-    int ret = -1;
74096c
-    int start_idx = 0;
74096c
-    int tmp_offset = 0;
74096c
-    int write_needed = 0;
74096c
-    int buf_len = 0;
74096c
-    int size_pending = 0;
74096c
-    char *buf = NULL;
74096c
-
74096c
-    /* loop through each vector */
74096c
-    for (i = 0; i < count; i++) {
74096c
-        buf = vec[i].iov_base;
74096c
-        buf_len = vec[i].iov_len;
74096c
-
74096c
-        for (start_idx = 0; (start_idx + GF_DISK_SECTOR_SIZE) <= buf_len;
74096c
-             start_idx += GF_DISK_SECTOR_SIZE) {
74096c
-            if (mem_0filled(buf + start_idx, GF_DISK_SECTOR_SIZE) != 0) {
74096c
-                write_needed = 1;
74096c
-                continue;
74096c
-            }
74096c
-
74096c
-            if (write_needed) {
74096c
-                ret = syncop_write(
74096c
-                    to, fd, (buf + tmp_offset), (start_idx - tmp_offset),
74096c
-                    (offset + tmp_offset), iobref, 0, NULL, NULL);
74096c
-                /* 'path' will be logged in calling function */
74096c
-                if (ret < 0) {
74096c
-                    gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)",
74096c
-                           strerror(-ret));
74096c
-                    *fop_errno = -ret;
74096c
-                    ret = -1;
74096c
-                    goto out;
74096c
-                }
74096c
-
74096c
-                write_needed = 0;
74096c
-            }
74096c
-            tmp_offset = start_idx + GF_DISK_SECTOR_SIZE;
74096c
-        }
74096c
-
74096c
-        if ((start_idx < buf_len) || write_needed) {
74096c
-            /* This means, last chunk is not yet written.. write it */
74096c
-            ret = syncop_write(to, fd, (buf + tmp_offset),
74096c
-                               (buf_len - tmp_offset), (offset + tmp_offset),
74096c
-                               iobref, 0, NULL, NULL);
74096c
-            if (ret < 0) {
74096c
-                /* 'path' will be logged in calling function */
74096c
-                gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)",
74096c
-                       strerror(-ret));
74096c
-                *fop_errno = -ret;
74096c
-                ret = -1;
74096c
-                goto out;
74096c
-            }
74096c
-        }
74096c
-
74096c
-        size_pending = (size - buf_len);
74096c
-        if (!size_pending)
74096c
-            break;
74096c
-    }
74096c
-
74096c
-    ret = size;
74096c
-out:
74096c
-    return ret;
74096c
-}
74096c
-
74096c
 /*
74096c
    return values:
74096c
    -1 : failure
74096c
@@ -1101,32 +1032,97 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
74096c
     int ret = 0;
74096c
     int count = 0;
74096c
     off_t offset = 0;
74096c
+    off_t data_offset = 0;
74096c
+    off_t hole_offset = 0;
74096c
     struct iovec *vector = NULL;
74096c
     struct iobref *iobref = NULL;
74096c
     uint64_t total = 0;
74096c
     size_t read_size = 0;
74096c
+    size_t data_block_size = 0;
74096c
     dict_t *xdata = NULL;
74096c
     dht_conf_t *conf = NULL;
74096c
 
74096c
     conf = this->private;
74096c
+
74096c
     /* if file size is '0', no need to enter this loop */
74096c
     while (total < ia_size) {
74096c
-        read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
74096c
-                         ? DHT_REBALANCE_BLKSIZE
74096c
-                         : (ia_size - total));
74096c
+        /* This is a regular file - read it sequentially */
74096c
+        if (!hole_exists) {
74096c
+            read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
74096c
+                             ? DHT_REBALANCE_BLKSIZE
74096c
+                             : (ia_size - total));
74096c
+        } else {
74096c
+            /* This is a sparse file - read only the data segments in the file
74096c
+             */
74096c
+
74096c
+            /* If the previous data block is fully copied, find the next data
74096c
+             * segment
74096c
+             * starting at the offset of the last read and written byte,  */
74096c
+            if (data_block_size <= 0) {
74096c
+                ret = syncop_seek(from, src, offset, GF_SEEK_DATA, NULL,
74096c
+                                  &data_offset);
74096c
+                if (ret) {
74096c
+                    if (ret == -ENXIO)
74096c
+                        ret = 0; /* No more data segments */
74096c
+                    else
74096c
+                        *fop_errno = -ret; /* Error occurred */
74096c
+
74096c
+                    break;
74096c
+                }
74096c
+
74096c
+                /* If the position of the current data segment is greater than
74096c
+                 * the position of the next hole, find the next hole in order to
74096c
+                 * calculate the length of the new data segment */
74096c
+                if (data_offset > hole_offset) {
74096c
+                    /* Starting at the offset of the last data segment, find the
74096c
+                     * next hole */
74096c
+                    ret = syncop_seek(from, src, data_offset, GF_SEEK_HOLE,
74096c
+                                      NULL, &hole_offset);
74096c
+                    if (ret) {
74096c
+                        /* If an error occurred here it's a real error because
74096c
+                         * if the seek for a data segment was successful then
74096c
+                         * necessarily another hole must exist (EOF is a hole)
74096c
+                         */
74096c
+                        *fop_errno = -ret;
74096c
+                        break;
74096c
+                    }
74096c
+
74096c
+                    /* Calculate the total size of the current data block */
74096c
+                    data_block_size = hole_offset - data_offset;
74096c
+                }
74096c
+            } else {
74096c
+                /* There is still data in the current segment, move the
74096c
+                 * data_offset to the position of the last written byte */
74096c
+                data_offset = offset;
74096c
+            }
74096c
+
74096c
+            /* Calculate how much data needs to be read and written. If the data
74096c
+             * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and
74096c
+             * write DHT_REBALANCE_BLKSIZE data length and the rest in the
74096c
+             * next iteration(s) */
74096c
+            read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE)
74096c
+                             ? DHT_REBALANCE_BLKSIZE
74096c
+                             : data_block_size);
74096c
+
74096c
+            /* Calculate the remaining size of the data block - maybe there's no
74096c
+             * need to seek for data in the next iteration */
74096c
+            data_block_size -= read_size;
74096c
+
74096c
+            /* Set offset to the offset of the data segment so read and write
74096c
+             * will have the correct position */
74096c
+            offset = data_offset;
74096c
+        }
74096c
 
74096c
         ret = syncop_readv(from, src, read_size, offset, 0, &vector, &count,
74096c
                            &iobref, NULL, NULL, NULL);
74096c
+
74096c
         if (!ret || (ret < 0)) {
74096c
             *fop_errno = -ret;
74096c
             break;
74096c
         }
74096c
 
74096c
-        if (hole_exists) {
74096c
-            ret = dht_write_with_holes(to, dst, vector, count, ret, offset,
74096c
-                                       iobref, fop_errno);
74096c
-        } else {
74096c
-            if (!conf->force_migration && !dht_is_tier_xlator(this)) {
74096c
+        if (!conf->force_migration && !dht_is_tier_xlator(this)) {
74096c
+            if (!xdata) {
74096c
                 xdata = dict_new();
74096c
                 if (!xdata) {
74096c
                     gf_msg("dht", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
74096c
@@ -1146,7 +1142,7 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
74096c
                  * https://github.com/gluster/glusterfs/issues/308
74096c
                  * for more details.
74096c
                  */
74096c
-                ret = dict_set_int32(xdata, GF_AVOID_OVERWRITE, 1);
74096c
+                ret = dict_set_int32_sizen(xdata, GF_AVOID_OVERWRITE, 1);
74096c
                 if (ret) {
74096c
                     gf_msg("dht", GF_LOG_ERROR, 0, ENOMEM,
74096c
                            "failed to set dict");
74096c
@@ -1155,22 +1151,12 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
74096c
                     break;
74096c
                 }
74096c
             }
74096c
-
74096c
-            ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL,
74096c
-                                NULL, xdata, NULL);
74096c
-            if (ret < 0) {
74096c
-                *fop_errno = -ret;
74096c
-            }
74096c
-        }
74096c
-
74096c
-        if ((defrag && defrag->cmd == GF_DEFRAG_CMD_START_TIER) &&
74096c
-            (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING)) {
74096c
-            gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_TIER_PAUSED,
74096c
-                   "Migrate file paused");
74096c
-            ret = -1;
74096c
         }
74096c
 
74096c
+        ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL,
74096c
+                            NULL, xdata, NULL);
74096c
         if (ret < 0) {
74096c
+            *fop_errno = -ret;
74096c
             break;
74096c
         }
74096c
 
74096c
-- 
74096c
1.8.3.1
74096c