From 7b2f1bd4e5a57ea3abd5f14a7d81b120735faecd Mon Sep 17 00:00:00 2001
From: Barak Sason Rofman <bsasonro@redhat.com>
Date: Wed, 6 May 2020 13:28:40 +0300
Subject: [PATCH 438/449] dht - sparse files rebalance enhancements

Currently, data migration in rebalance reads sparse files sequentially,
disregarding which segments are holes and which are data. This can lead
to extremely long migration times for large sparse files.
The data migration mechanism needs to be enhanced so that only data
segments are read and migrated. This can be achieved by using lseek to
seek for holes and data in the file.
This enhancement is a consequence of
https://bugzilla.redhat.com/show_bug.cgi?id=1823703
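
For reference, the copy loop below is a minimal userspace sketch of the
seek-based approach this patch adopts: walk the source file with
lseek(SEEK_DATA)/lseek(SEEK_HOLE) and copy only the data segments at their
original offsets. It is an illustration built on plain lseek(2)/pread(2)/
pwrite(2), and the helper name copy_data_segments is made up for this
sketch; the patch itself does the equivalent with syncop_seek(),
syncop_readv() and syncop_writev() inside __dht_rebalance_migrate_data().

    /* Sketch only: copy the data segments of a sparse file from src to dst,
     * preserving offsets so dst stays sparse. The caller is assumed to have
     * sized dst already (e.g. via ftruncate) so trailing holes survive. */
    #define _GNU_SOURCE /* SEEK_DATA / SEEK_HOLE */
    #include <errno.h>
    #include <stdlib.h>
    #include <unistd.h>

    #define BLKSIZE (1024 * 1024) /* mirrors DHT_REBALANCE_BLKSIZE (1 MB) */

    static int
    copy_data_segments(int src, int dst)
    {
        char *buf = malloc(BLKSIZE);
        off_t data = 0, hole = 0;

        if (!buf)
            return -1;

        for (;;) {
            /* Find the start of the next data segment. ENXIO means there is
             * no more data past this offset, i.e. we are done. */
            data = lseek(src, hole, SEEK_DATA);
            if (data < 0) {
                free(buf);
                return (errno == ENXIO) ? 0 : -1;
            }

            /* Every data segment is followed by a hole (EOF counts as one),
             * so this seek is expected to succeed. */
            hole = lseek(src, data, SEEK_HOLE);
            if (hole < 0) {
                free(buf);
                return -1;
            }

            /* Copy the [data, hole) range in BLKSIZE chunks at the same
             * offsets in the destination. */
            while (data < hole) {
                size_t want = ((hole - data) > BLKSIZE) ? BLKSIZE
                                                        : (size_t)(hole - data);
                ssize_t got = pread(src, buf, want, data);
                if (got <= 0 || pwrite(dst, buf, got, data) != got) {
                    free(buf);
                    return -1;
                }
                data += got;
            }
        }
    }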

> fixes: #1222
> Change-Id: If5f448a0c532926464e1f34f504c5c94749b08c3
> Signed-off-by: Barak Sason Rofman <bsasonro@redhat.com>
> (Cherry picked from commit 7b7559733ca0c25c63f9d56cb7f4650dbd694c40)
> (Reviewed on upstream link https://review.gluster.org/#/c/glusterfs/+/24409/)

BUG: 1836099
Change-Id: If5f448a0c532926464e1f34f504c5c94749b08c3
Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/202647
Reviewed-by: Barak Sason Rofman <bsasonro@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 tests/basic/distribute/spare_file_rebalance.t |  51 ++++++++
 xlators/cluster/dht/src/dht-rebalance.c       | 172 ++++++++++++--------------
 2 files changed, 130 insertions(+), 93 deletions(-)
 create mode 100644 tests/basic/distribute/spare_file_rebalance.t

diff --git a/tests/basic/distribute/spare_file_rebalance.t b/tests/basic/distribute/spare_file_rebalance.t
new file mode 100644
index 0000000..061c02f
--- /dev/null
+++ b/tests/basic/distribute/spare_file_rebalance.t
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../dht.rc
+
+# Initialize
+#------------------------------------------------------------
+cleanup;
+
+# Start glusterd
+TEST glusterd;
+TEST pidof glusterd;
+TEST $CLI volume info;
+
+# Create a volume
+TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2};
+
+# Verify volume creation
+EXPECT "$V0" volinfo_field $V0 'Volume Name';
+EXPECT 'Created' volinfo_field $V0 'Status';
+
+# Start volume and verify successful start
+TEST $CLI volume start $V0;
+EXPECT 'Started' volinfo_field $V0 'Status';
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0;
+
+#------------------------------------------------------------
+
+# Test case - Create sparse files on MP and verify
+# file info after rebalance
+#------------------------------------------------------------
+
+# Create some sparse files and get their size
+TEST cd $M0;
+dd if=/dev/urandom of=sparse_file bs=10k count=1 seek=2M
+cp --sparse=always sparse_file sparse_file_3;
+
+# Add a 3rd brick
+TEST $CLI volume add-brick $V0 $H0:$B0/${V0}3;
+
+# Trigger rebalance
+TEST $CLI volume rebalance $V0 start force;
+EXPECT_WITHIN $REBALANCE_TIMEOUT "0" rebalance_completed;
+
+# Compare original and rebalanced files
+TEST cd $B0/${V0}2
+TEST cmp sparse_file $B0/${V0}3/sparse_file_3
+EXPECT_WITHIN 30 "";
+
+cleanup;
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index 88b6b54..d0c21b4 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -18,8 +18,8 @@
 #include <glusterfs/events.h>
 
 #define GF_DISK_SECTOR_SIZE 512
-#define DHT_REBALANCE_PID 4242              /* Change it if required */
-#define DHT_REBALANCE_BLKSIZE (1024 * 1024) /* 1 MB */
+#define DHT_REBALANCE_PID 4242        /* Change it if required */
+#define DHT_REBALANCE_BLKSIZE 1048576 /* 1 MB */
 #define MAX_MIGRATE_QUEUE_COUNT 500
 #define MIN_MIGRATE_QUEUE_COUNT 200
 #define MAX_REBAL_TYPE_SIZE 16
@@ -178,75 +178,6 @@ dht_strip_out_acls(dict_t *dict)
     }
 }
 
-static int
-dht_write_with_holes(xlator_t *to, fd_t *fd, struct iovec *vec, int count,
-                     int32_t size, off_t offset, struct iobref *iobref,
-                     int *fop_errno)
-{
-    int i = 0;
-    int ret = -1;
-    int start_idx = 0;
-    int tmp_offset = 0;
-    int write_needed = 0;
-    int buf_len = 0;
-    int size_pending = 0;
-    char *buf = NULL;
-
-    /* loop through each vector */
-    for (i = 0; i < count; i++) {
-        buf = vec[i].iov_base;
-        buf_len = vec[i].iov_len;
-
-        for (start_idx = 0; (start_idx + GF_DISK_SECTOR_SIZE) <= buf_len;
-             start_idx += GF_DISK_SECTOR_SIZE) {
-            if (mem_0filled(buf + start_idx, GF_DISK_SECTOR_SIZE) != 0) {
-                write_needed = 1;
-                continue;
-            }
-
-            if (write_needed) {
-                ret = syncop_write(
-                    to, fd, (buf + tmp_offset), (start_idx - tmp_offset),
-                    (offset + tmp_offset), iobref, 0, NULL, NULL);
-                /* 'path' will be logged in calling function */
-                if (ret < 0) {
-                    gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)",
-                           strerror(-ret));
-                    *fop_errno = -ret;
-                    ret = -1;
-                    goto out;
-                }
-
-                write_needed = 0;
-            }
-            tmp_offset = start_idx + GF_DISK_SECTOR_SIZE;
-        }
-
-        if ((start_idx < buf_len) || write_needed) {
-            /* This means, last chunk is not yet written.. write it */
-            ret = syncop_write(to, fd, (buf + tmp_offset),
-                               (buf_len - tmp_offset), (offset + tmp_offset),
-                               iobref, 0, NULL, NULL);
-            if (ret < 0) {
-                /* 'path' will be logged in calling function */
-                gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)",
-                       strerror(-ret));
-                *fop_errno = -ret;
-                ret = -1;
-                goto out;
-            }
-        }
-
-        size_pending = (size - buf_len);
-        if (!size_pending)
-            break;
-    }
-
-    ret = size;
-out:
-    return ret;
-}
-
 /*
    return values:
    -1 : failure
@@ -1101,32 +1032,97 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
     int ret = 0;
     int count = 0;
     off_t offset = 0;
+    off_t data_offset = 0;
+    off_t hole_offset = 0;
     struct iovec *vector = NULL;
     struct iobref *iobref = NULL;
     uint64_t total = 0;
     size_t read_size = 0;
+    size_t data_block_size = 0;
     dict_t *xdata = NULL;
     dht_conf_t *conf = NULL;
 
     conf = this->private;
+
     /* if file size is '0', no need to enter this loop */
     while (total < ia_size) {
-        read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
-                         ? DHT_REBALANCE_BLKSIZE
-                         : (ia_size - total));
+        /* This is a regular file - read it sequentially */
+        if (!hole_exists) {
+            read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
+                             ? DHT_REBALANCE_BLKSIZE
+                             : (ia_size - total));
+        } else {
+            /* This is a sparse file - read only the data segments in the file
+             */
+
+            /* If the previous data block has been fully copied, find the next
+             * data segment, starting at the offset of the last byte that was
+             * read and written. */
+            if (data_block_size <= 0) {
+                ret = syncop_seek(from, src, offset, GF_SEEK_DATA, NULL,
+                                  &data_offset);
+                if (ret) {
+                    if (ret == -ENXIO)
+                        ret = 0; /* No more data segments */
+                    else
+                        *fop_errno = -ret; /* Error occurred */
+
+                    break;
+                }
+
+                /* If the position of the current data segment is greater than
+                 * the position of the next hole, find the next hole in order to
+                 * calculate the length of the new data segment */
+                if (data_offset > hole_offset) {
+                    /* Starting at the offset of the last data segment, find the
+                     * next hole */
+                    ret = syncop_seek(from, src, data_offset, GF_SEEK_HOLE,
+                                      NULL, &hole_offset);
+                    if (ret) {
+                        /* If an error occurred here it's a real error because
+                         * if the seek for a data segment was successful then
+                         * necessarily another hole must exist (EOF is a hole)
+                         */
+                        *fop_errno = -ret;
+                        break;
+                    }
+
+                    /* Calculate the total size of the current data block */
+                    data_block_size = hole_offset - data_offset;
+                }
+            } else {
+                /* There is still data in the current segment, move the
+                 * data_offset to the position of the last written byte */
+                data_offset = offset;
+            }
+
+            /* Calculate how much data needs to be read and written. If the
+             * data segment's length is bigger than DHT_REBALANCE_BLKSIZE, read
+             * and write DHT_REBALANCE_BLKSIZE bytes now and handle the rest in
+             * the next iteration(s) */
+            read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE)
+                             ? DHT_REBALANCE_BLKSIZE
+                             : data_block_size);
+
+            /* Calculate the remaining size of the data block - maybe there's no
+             * need to seek for data in the next iteration */
+            data_block_size -= read_size;
+
+            /* Set offset to the offset of the data segment so read and write
+             * will have the correct position */
+            offset = data_offset;
+        }
 
         ret = syncop_readv(from, src, read_size, offset, 0, &vector, &count,
                            &iobref, NULL, NULL, NULL);
+
         if (!ret || (ret < 0)) {
             *fop_errno = -ret;
             break;
         }
 
-        if (hole_exists) {
-            ret = dht_write_with_holes(to, dst, vector, count, ret, offset,
-                                       iobref, fop_errno);
-        } else {
-            if (!conf->force_migration && !dht_is_tier_xlator(this)) {
+        if (!conf->force_migration && !dht_is_tier_xlator(this)) {
+            if (!xdata) {
                 xdata = dict_new();
                 if (!xdata) {
                     gf_msg("dht", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
@@ -1146,7 +1142,7 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
                  * https://github.com/gluster/glusterfs/issues/308
                  * for more details.
                  */
-                ret = dict_set_int32(xdata, GF_AVOID_OVERWRITE, 1);
+                ret = dict_set_int32_sizen(xdata, GF_AVOID_OVERWRITE, 1);
                 if (ret) {
                     gf_msg("dht", GF_LOG_ERROR, 0, ENOMEM,
                            "failed to set dict");
@@ -1155,22 +1151,12 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
                     break;
                 }
             }
-
-            ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL,
-                                NULL, xdata, NULL);
-            if (ret < 0) {
-                *fop_errno = -ret;
-            }
-        }
-
-        if ((defrag && defrag->cmd == GF_DEFRAG_CMD_START_TIER) &&
-            (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING)) {
-            gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_TIER_PAUSED,
-                   "Migrate file paused");
-            ret = -1;
         }
 
+        ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL,
+                            NULL, xdata, NULL);
         if (ret < 0) {
+            *fop_errno = -ret;
             break;
         }
 
-- 
1.8.3.1