190130
From 7b2f1bd4e5a57ea3abd5f14a7d81b120735faecd Mon Sep 17 00:00:00 2001
190130
From: Barak Sason Rofman <bsasonro@redhat.com>
190130
Date: Wed, 6 May 2020 13:28:40 +0300
190130
Subject: [PATCH 438/449] dht - sparse files rebalance enhancements
190130
190130
Currently, data migration in rebalance reads sparse files sequentially,
190130
disregarding which segments are holes and which are data. This can lead
190130
to extremely long migration times for large sparse files.
190130
The data migration mechanism needs to be enhanced so that only data segments are
190130
read and migrated. This can be achieved using lseek to seek for holes
190130
and data in the file.
190130
This enhancement is a consequence of
190130
https://bugzilla.redhat.com/show_bug.cgi?id=1823703
190130
190130
> fixes: #1222
190130
> Change-Id: If5f448a0c532926464e1f34f504c5c94749b08c3
190130
> Signed-off-by: Barak Sason Rofman <bsasonro@redhat.com>
190130
> (Cherry pick from commit 7b7559733ca0c25c63f9d56cb7f4650dbd694c40)
190130
> (Reviewed on upstream link https://review.gluster.org/#/c/glusterfs/+/24409/)
190130
190130
BUG: 1836099
190130
Change-Id: If5f448a0c532926464e1f34f504c5c94749b08c3
190130
Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
190130
Reviewed-on: https://code.engineering.redhat.com/gerrit/202647
190130
Reviewed-by: Barak Sason Rofman <bsasonro@redhat.com>
190130
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
190130
---
190130
 tests/basic/distribute/spare_file_rebalance.t |  51 ++++++++
190130
 xlators/cluster/dht/src/dht-rebalance.c       | 172 ++++++++++++--------------
190130
 2 files changed, 130 insertions(+), 93 deletions(-)
190130
 create mode 100644 tests/basic/distribute/spare_file_rebalance.t
190130
190130
diff --git a/tests/basic/distribute/spare_file_rebalance.t b/tests/basic/distribute/spare_file_rebalance.t
190130
new file mode 100644
190130
index 0000000..061c02f
190130
--- /dev/null
190130
+++ b/tests/basic/distribute/spare_file_rebalance.t
190130
@@ -0,0 +1,51 @@
190130
+#!/bin/bash
190130
+
190130
+. $(dirname $0)/../../include.rc
190130
+. $(dirname $0)/../../volume.rc
190130
+. $(dirname $0)/../../dht.rc
190130
+
190130
+# Initialize
190130
+#------------------------------------------------------------
190130
+cleanup;
190130
+
190130
+# Start glusterd
190130
+TEST glusterd;
190130
+TEST pidof glusterd;
190130
+TEST $CLI volume info;
190130
+
190130
+# Create a volume
190130
+TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2};
190130
+
190130
+# Verify volume creation
190130
+EXPECT "$V0" volinfo_field $V0 'Volume Name';
190130
+EXPECT 'Created' volinfo_field $V0 'Status';
190130
+
190130
+# Start volume and verify successful start
190130
+TEST $CLI volume start $V0;
190130
+EXPECT 'Started' volinfo_field $V0 'Status';
190130
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0;
190130
+
190130
+#------------------------------------------------------------
190130
+
190130
+# Test case - Create sparse files on MP and verify
190130
+# file info after rebalance
190130
+#------------------------------------------------------------
190130
+
190130
+# Create some sparse files and get their size
190130
+TEST cd $M0;
190130
+dd if=/dev/urandom of=sparse_file bs=10k count=1 seek=2M
190130
+cp --sparse=always sparse_file sparse_file_3;
190130
+
190130
+# Add a 3rd brick
190130
+TEST $CLI volume add-brick $V0 $H0:$B0/${V0}3;
190130
+
190130
+# Trigger rebalance
190130
+TEST $CLI volume rebalance $V0 start force;
190130
+EXPECT_WITHIN $REBALANCE_TIMEOUT "0" rebalance_completed;
190130
+
190130
+# Compare original and rebalanced files
190130
+TEST cd $B0/${V0}2
190130
+TEST cmp sparse_file $B0/${V0}3/sparse_file_3
190130
+EXPECT_WITHIN 30 "";
190130
+
190130
+cleanup;
190130
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
190130
index 88b6b54..d0c21b4 100644
190130
--- a/xlators/cluster/dht/src/dht-rebalance.c
190130
+++ b/xlators/cluster/dht/src/dht-rebalance.c
190130
@@ -18,8 +18,8 @@
190130
 #include <glusterfs/events.h>
190130
 
190130
 #define GF_DISK_SECTOR_SIZE 512
190130
-#define DHT_REBALANCE_PID 4242              /* Change it if required */
190130
-#define DHT_REBALANCE_BLKSIZE (1024 * 1024) /* 1 MB */
190130
+#define DHT_REBALANCE_PID 4242        /* Change it if required */
190130
+#define DHT_REBALANCE_BLKSIZE 1048576 /* 1 MB */
190130
 #define MAX_MIGRATE_QUEUE_COUNT 500
190130
 #define MIN_MIGRATE_QUEUE_COUNT 200
190130
 #define MAX_REBAL_TYPE_SIZE 16
190130
@@ -178,75 +178,6 @@ dht_strip_out_acls(dict_t *dict)
190130
     }
190130
 }
190130
 
190130
-static int
190130
-dht_write_with_holes(xlator_t *to, fd_t *fd, struct iovec *vec, int count,
190130
-                     int32_t size, off_t offset, struct iobref *iobref,
190130
-                     int *fop_errno)
190130
-{
190130
-    int i = 0;
190130
-    int ret = -1;
190130
-    int start_idx = 0;
190130
-    int tmp_offset = 0;
190130
-    int write_needed = 0;
190130
-    int buf_len = 0;
190130
-    int size_pending = 0;
190130
-    char *buf = NULL;
190130
-
190130
-    /* loop through each vector */
190130
-    for (i = 0; i < count; i++) {
190130
-        buf = vec[i].iov_base;
190130
-        buf_len = vec[i].iov_len;
190130
-
190130
-        for (start_idx = 0; (start_idx + GF_DISK_SECTOR_SIZE) <= buf_len;
190130
-             start_idx += GF_DISK_SECTOR_SIZE) {
190130
-            if (mem_0filled(buf + start_idx, GF_DISK_SECTOR_SIZE) != 0) {
190130
-                write_needed = 1;
190130
-                continue;
190130
-            }
190130
-
190130
-            if (write_needed) {
190130
-                ret = syncop_write(
190130
-                    to, fd, (buf + tmp_offset), (start_idx - tmp_offset),
190130
-                    (offset + tmp_offset), iobref, 0, NULL, NULL);
190130
-                /* 'path' will be logged in calling function */
190130
-                if (ret < 0) {
190130
-                    gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)",
190130
-                           strerror(-ret));
190130
-                    *fop_errno = -ret;
190130
-                    ret = -1;
190130
-                    goto out;
190130
-                }
190130
-
190130
-                write_needed = 0;
190130
-            }
190130
-            tmp_offset = start_idx + GF_DISK_SECTOR_SIZE;
190130
-        }
190130
-
190130
-        if ((start_idx < buf_len) || write_needed) {
190130
-            /* This means, last chunk is not yet written.. write it */
190130
-            ret = syncop_write(to, fd, (buf + tmp_offset),
190130
-                               (buf_len - tmp_offset), (offset + tmp_offset),
190130
-                               iobref, 0, NULL, NULL);
190130
-            if (ret < 0) {
190130
-                /* 'path' will be logged in calling function */
190130
-                gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)",
190130
-                       strerror(-ret));
190130
-                *fop_errno = -ret;
190130
-                ret = -1;
190130
-                goto out;
190130
-            }
190130
-        }
190130
-
190130
-        size_pending = (size - buf_len);
190130
-        if (!size_pending)
190130
-            break;
190130
-    }
190130
-
190130
-    ret = size;
190130
-out:
190130
-    return ret;
190130
-}
190130
-
190130
 /*
190130
    return values:
190130
    -1 : failure
190130
@@ -1101,32 +1032,97 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
190130
     int ret = 0;
190130
     int count = 0;
190130
     off_t offset = 0;
190130
+    off_t data_offset = 0;
190130
+    off_t hole_offset = 0;
190130
     struct iovec *vector = NULL;
190130
     struct iobref *iobref = NULL;
190130
     uint64_t total = 0;
190130
     size_t read_size = 0;
190130
+    size_t data_block_size = 0;
190130
     dict_t *xdata = NULL;
190130
     dht_conf_t *conf = NULL;
190130
 
190130
     conf = this->private;
190130
+
190130
     /* if file size is '0', no need to enter this loop */
190130
     while (total < ia_size) {
190130
-        read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
190130
-                         ? DHT_REBALANCE_BLKSIZE
190130
-                         : (ia_size - total));
190130
+        /* This is a regular file - read it sequentially */
190130
+        if (!hole_exists) {
190130
+            read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
190130
+                             ? DHT_REBALANCE_BLKSIZE
190130
+                             : (ia_size - total));
190130
+        } else {
190130
+            /* This is a sparse file - read only the data segments in the file
190130
+             */
190130
+
190130
+            /* If the previous data block is fully copied, find the next data
190130
+             * segment
190130
+             * starting at the offset of the last read and written byte,  */
190130
+            if (data_block_size <= 0) {
190130
+                ret = syncop_seek(from, src, offset, GF_SEEK_DATA, NULL,
190130
+                                  &data_offset);
190130
+                if (ret) {
190130
+                    if (ret == -ENXIO)
190130
+                        ret = 0; /* No more data segments */
190130
+                    else
190130
+                        *fop_errno = -ret; /* Error occurred */
190130
+
190130
+                    break;
190130
+                }
190130
+
190130
+                /* If the position of the current data segment is greater than
190130
+                 * the position of the next hole, find the next hole in order to
190130
+                 * calculate the length of the new data segment */
190130
+                if (data_offset > hole_offset) {
190130
+                    /* Starting at the offset of the last data segment, find the
190130
+                     * next hole */
190130
+                    ret = syncop_seek(from, src, data_offset, GF_SEEK_HOLE,
190130
+                                      NULL, &hole_offset);
190130
+                    if (ret) {
190130
+                        /* If an error occurred here it's a real error because
190130
+                         * if the seek for a data segment was successful then
190130
+                         * necessarily another hole must exist (EOF is a hole)
190130
+                         */
190130
+                        *fop_errno = -ret;
190130
+                        break;
190130
+                    }
190130
+
190130
+                    /* Calculate the total size of the current data block */
190130
+                    data_block_size = hole_offset - data_offset;
190130
+                }
190130
+            } else {
190130
+                /* There is still data in the current segment, move the
190130
+                 * data_offset to the position of the last written byte */
190130
+                data_offset = offset;
190130
+            }
190130
+
190130
+            /* Calculate how much data needs to be read and written. If the data
190130
+             * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and
190130
+             * write DHT_REBALANCE_BLKSIZE data length and the rest in the
190130
+             * next iteration(s) */
190130
+            read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE)
190130
+                             ? DHT_REBALANCE_BLKSIZE
190130
+                             : data_block_size);
190130
+
190130
+            /* Calculate the remaining size of the data block - maybe there's no
190130
+             * need to seek for data in the next iteration */
190130
+            data_block_size -= read_size;
190130
+
190130
+            /* Set offset to the offset of the data segment so read and write
190130
+             * will have the correct position */
190130
+            offset = data_offset;
190130
+        }
190130
 
190130
         ret = syncop_readv(from, src, read_size, offset, 0, &vector, &count,
190130
                            &iobref, NULL, NULL, NULL);
190130
+
190130
         if (!ret || (ret < 0)) {
190130
             *fop_errno = -ret;
190130
             break;
190130
         }
190130
 
190130
-        if (hole_exists) {
190130
-            ret = dht_write_with_holes(to, dst, vector, count, ret, offset,
190130
-                                       iobref, fop_errno);
190130
-        } else {
190130
-            if (!conf->force_migration && !dht_is_tier_xlator(this)) {
190130
+        if (!conf->force_migration && !dht_is_tier_xlator(this)) {
190130
+            if (!xdata) {
190130
                 xdata = dict_new();
190130
                 if (!xdata) {
190130
                     gf_msg("dht", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
190130
@@ -1146,7 +1142,7 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
190130
                  * https://github.com/gluster/glusterfs/issues/308
190130
                  * for more details.
190130
                  */
190130
-                ret = dict_set_int32(xdata, GF_AVOID_OVERWRITE, 1);
190130
+                ret = dict_set_int32_sizen(xdata, GF_AVOID_OVERWRITE, 1);
190130
                 if (ret) {
190130
                     gf_msg("dht", GF_LOG_ERROR, 0, ENOMEM,
190130
                            "failed to set dict");
190130
@@ -1155,22 +1151,12 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
190130
                     break;
190130
                 }
190130
             }
190130
-
190130
-            ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL,
190130
-                                NULL, xdata, NULL);
190130
-            if (ret < 0) {
190130
-                *fop_errno = -ret;
190130
-            }
190130
-        }
190130
-
190130
-        if ((defrag && defrag->cmd == GF_DEFRAG_CMD_START_TIER) &&
190130
-            (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING)) {
190130
-            gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_TIER_PAUSED,
190130
-                   "Migrate file paused");
190130
-            ret = -1;
190130
         }
190130
 
190130
+        ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL,
190130
+                            NULL, xdata, NULL);
190130
         if (ret < 0) {
190130
+            *fop_errno = -ret;
190130
             break;
190130
         }
190130
 
190130
-- 
190130
1.8.3.1
190130