21ab4e
From 9eee0cea926bf4a953972fc6ed37a2c925c9c748 Mon Sep 17 00:00:00 2001
21ab4e
From: Susant Palai <spalai@redhat.com>
21ab4e
Date: Mon, 17 Apr 2017 13:00:54 +0530
21ab4e
Subject: [PATCH 395/406] cluster/dht: Skip file migration if the subvol that
21ab4e
 meets min-free-disk criteria happens to be the same subvol containing
21ab4e
 data-file
21ab4e
21ab4e
Rebalance needs to figure out a new subvol in case the hashed subvol
21ab4e
does not have enough space. In the process of figuring out the new subvol,
21ab4e
we need to ignore the source subvol, otherwise it will lead to data loss.
21ab4e
21ab4e
Test: Manual
21ab4e
Ran the following
21ab4e
sizeof /tmp/1: 1.5GB
21ab4e
sizeof /brick/1: 16GB
21ab4e
sizeof /tmp/2: 1.5GB
21ab4e
<start>
21ab4e
21ab4e
glusterd;  gluster v create test1 vm1:/brick/1 vm1:/tmp/1;
21ab4e
gluster v start test1;
21ab4e
mount -t glusterfs vm1:test1 /mnt;
21ab4e
for i in {1..2000}
21ab4e
do
21ab4e
dd if=/dev/zero of=/mnt/file$i bs=1KB count=1 &> /dev/null;
21ab4e
done
21ab4e
gluster v add-brick test1 vm1:/tmp/2
21ab4e
gluster v set test1 min-free-disk 12GB
21ab4e
gluster v remove-brick test1 vm1:/tmp/1 start
21ab4e
<end>
21ab4e
21ab4e
file count and data were intact.
21ab4e
21ab4e
> Signed-off-by: Susant Palai <spalai@redhat.com>
21ab4e
> Reviewed-on: https://review.gluster.org/17064
21ab4e
> Smoke: Gluster Build System <jenkins@build.gluster.org>
21ab4e
> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
21ab4e
> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
21ab4e
> Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
21ab4e
> Signed-off-by: Susant Palai <spalai@redhat.com>
21ab4e
21ab4e
Change-Id: Ib8fc8467a3d48a7c12958824c4f0b88e160b86c1
21ab4e
BUG: 1360317
21ab4e
Signed-off-by: Susant Palai <spalai@redhat.com>
21ab4e
Reviewed-on: https://code.engineering.redhat.com/gerrit/103915
21ab4e
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
21ab4e
---
21ab4e
 xlators/cluster/dht/src/dht-common.h    |  2 +-
21ab4e
 xlators/cluster/dht/src/dht-diskusage.c | 19 ++++---
21ab4e
 xlators/cluster/dht/src/dht-rebalance.c | 96 ++++++++++++++++++++++++++-------
21ab4e
 3 files changed, 92 insertions(+), 25 deletions(-)
21ab4e
21ab4e
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
21ab4e
index 37a6e61..eb6d1e8 100644
21ab4e
--- a/xlators/cluster/dht/src/dht-common.h
21ab4e
+++ b/xlators/cluster/dht/src/dht-common.h
21ab4e
@@ -1114,7 +1114,7 @@ dht_dir_has_layout (dict_t *xattr, char *name);
21ab4e
 gf_boolean_t
21ab4e
 dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator);
21ab4e
 xlator_t *
21ab4e
-dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol,
21ab4e
+dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol, xlator_t *ignore,
21ab4e
                                    dht_layout_t *layout, uint64_t filesize);
21ab4e
 xlator_t *
21ab4e
 dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol,
21ab4e
diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c
21ab4e
index 13698a9..0559215 100644
21ab4e
--- a/xlators/cluster/dht/src/dht-diskusage.c
21ab4e
+++ b/xlators/cluster/dht/src/dht-diskusage.c
21ab4e
@@ -315,7 +315,7 @@ dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol,
21ab4e
 
21ab4e
         LOCK (&conf->subvolume_lock);
21ab4e
 	{
21ab4e
-                avail_subvol = dht_subvol_with_free_space_inodes(this, subvol,
21ab4e
+                avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, NULL,
21ab4e
                                                                  layout, 0);
21ab4e
                 if(!avail_subvol)
21ab4e
                 {
21ab4e
@@ -340,8 +340,8 @@ out:
21ab4e
 }
21ab4e
 
21ab4e
 static inline
21ab4e
-int32_t dht_subvol_has_err (dht_conf_t *conf, xlator_t *this,
21ab4e
-                                         dht_layout_t *layout)
21ab4e
+int32_t dht_subvol_has_err (dht_conf_t *conf, xlator_t *this, xlator_t *ignore,
21ab4e
+                            dht_layout_t *layout)
21ab4e
 {
21ab4e
         int ret = -1;
21ab4e
         int i   = 0;
21ab4e
@@ -349,6 +349,13 @@ int32_t dht_subvol_has_err (dht_conf_t *conf, xlator_t *this,
21ab4e
         if (!this || !layout)
21ab4e
                 goto out;
21ab4e
 
21ab4e
+        /* this check is meant for rebalance process. The source of the file
21ab4e
+         * should be ignored for space check */
21ab4e
+        if (this == ignore) {
21ab4e
+                goto out;
21ab4e
+        }
21ab4e
+
21ab4e
+
21ab4e
         /* check if subvol has layout errors, before selecting it */
21ab4e
         for (i = 0; i < layout->cnt; i++) {
21ab4e
                 if (!strcmp (layout->list[i].xlator->name, this->name) &&
21ab4e
@@ -376,7 +383,7 @@ out:
21ab4e
 
21ab4e
 /*Get subvolume which has both space and inodes more than the min criteria*/
21ab4e
 xlator_t *
21ab4e
-dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol,
21ab4e
+dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol, xlator_t *ignore,
21ab4e
                                   dht_layout_t *layout, uint64_t filesize)
21ab4e
 {
21ab4e
         int i = 0;
21ab4e
@@ -398,7 +405,7 @@ dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol,
21ab4e
                 /* check if subvol has layout errors and also it is not a
21ab4e
                  * decommissioned brick, before selecting it */
21ab4e
                 ignore_subvol = dht_subvol_has_err (conf, conf->subvolumes[i],
21ab4e
-                                                    layout);
21ab4e
+                                                    ignore, layout);
21ab4e
                 if (ignore_subvol)
21ab4e
                         continue;
21ab4e
 
21ab4e
@@ -463,7 +470,7 @@ dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol,
21ab4e
                 /* check if subvol has layout errors and also it is not a
21ab4e
                  * decommissioned brick, before selecting it*/
21ab4e
 
21ab4e
-                ignore_subvol = dht_subvol_has_err (conf, conf->subvolumes[i],
21ab4e
+                ignore_subvol = dht_subvol_has_err (conf, conf->subvolumes[i], NULL,
21ab4e
                                                     layout);
21ab4e
                 if (ignore_subvol)
21ab4e
                         continue;
21ab4e
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
21ab4e
index 9465cde..49b2230 100644
21ab4e
--- a/xlators/cluster/dht/src/dht-rebalance.c
21ab4e
+++ b/xlators/cluster/dht/src/dht-rebalance.c
21ab4e
@@ -719,27 +719,30 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc
21ab4e
                         loc->path, to->name, strerror (-ret));
21ab4e
         */
21ab4e
 
21ab4e
+        ret = syncop_fsetattr (to, fd, stbuf,
21ab4e
+                               (GF_SET_ATTR_UID | GF_SET_ATTR_GID),
21ab4e
+                                NULL, NULL, NULL, NULL);
21ab4e
+        if (ret < 0)
21ab4e
+                gf_msg (this->name, GF_LOG_ERROR, 0,
21ab4e
+                        DHT_MSG_MIGRATE_FILE_FAILED,
21ab4e
+                        "chown failed for %s on %s (%s)",
21ab4e
+                        loc->path, to->name, strerror (-ret));
21ab4e
+
21ab4e
         /* Fallocate does not work for size 0, hence the check. Anyway we don't
21ab4e
          * need to care about min-free-disk for 0 byte size file */
21ab4e
         if (stbuf->ia_size > 0) {
21ab4e
                 ret = syncop_fallocate (to, fd, 0, 0, stbuf->ia_size, NULL,
21ab4e
                                         NULL);
21ab4e
-                if (ret < 0)
21ab4e
+                if (ret < 0) {
21ab4e
                         gf_msg (this->name, GF_LOG_ERROR, 0,
21ab4e
                                 DHT_MSG_MIGRATE_FILE_FAILED,
21ab4e
                                 "fallocate failed for %s on %s (%s)",
21ab4e
                                 loc->path, to->name, strerror (-ret));
21ab4e
+                        ret = -1;
21ab4e
+                        goto out;
21ab4e
+                }
21ab4e
         }
21ab4e
 
21ab4e
-        ret = syncop_fsetattr (to, fd, stbuf,
21ab4e
-                               (GF_SET_ATTR_UID | GF_SET_ATTR_GID),
21ab4e
-                                NULL, NULL, NULL, NULL);
21ab4e
-        if (ret < 0)
21ab4e
-                gf_msg (this->name, GF_LOG_ERROR, 0,
21ab4e
-                        DHT_MSG_MIGRATE_FILE_FAILED,
21ab4e
-                        "chown failed for %s on %s (%s)",
21ab4e
-                        loc->path, to->name, strerror (-ret));
21ab4e
-
21ab4e
         /* success */
21ab4e
         ret = 0;
21ab4e
 
21ab4e
@@ -761,7 +764,8 @@ out:
21ab4e
 static int
21ab4e
 __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc,
21ab4e
                         struct iatt *stbuf, int flag, dht_conf_t *conf,
21ab4e
-                        gf_boolean_t *target_changed, xlator_t **new_subvol)
21ab4e
+                        gf_boolean_t *target_changed, xlator_t **new_subvol,
21ab4e
+                        gf_boolean_t *ignore_failure)
21ab4e
 {
21ab4e
         struct statvfs  src_statfs = {0,};
21ab4e
         struct statvfs  dst_statfs = {0,};
21ab4e
@@ -773,6 +777,7 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc,
21ab4e
         uint64_t        dst_statfs_blocks = 1;
21ab4e
         double   post_availspace = 0;
21ab4e
         double   post_percent = 0;
21ab4e
+        int             i = 0;
21ab4e
 
21ab4e
         this = THIS;
21ab4e
 
21ab4e
@@ -897,13 +902,27 @@ find_new_subvol:
21ab4e
                 goto out;
21ab4e
         }
21ab4e
 
21ab4e
-        *new_subvol = dht_subvol_with_free_space_inodes (this, to,
21ab4e
-                      layout, stbuf->ia_size);
21ab4e
-        if (!(*new_subvol)) {
21ab4e
+        *new_subvol = dht_subvol_with_free_space_inodes (this, to, from, layout,
21ab4e
+                                                         stbuf->ia_size);
21ab4e
+        if ((!(*new_subvol)) || (*new_subvol == from)) {
21ab4e
                 gf_msg (this->name, GF_LOG_WARNING, 0,
21ab4e
                         DHT_MSG_SUBVOL_INSUFF_SPACE, "Could not find any subvol"
21ab4e
-                        " with space accomodating the file. Consider adding "
21ab4e
-                        "bricks");
21ab4e
+                        " with space accomodating the file - %s. Consider adding "
21ab4e
+                        "bricks", loc->path);
21ab4e
+
21ab4e
+                /* For remove-brick case if the source is not one of the
21ab4e
+                 * removed-brick, do not mark the error as failure */
21ab4e
+                if (conf->decommission_subvols_cnt) {
21ab4e
+                        *ignore_failure = _gf_true;
21ab4e
+                        for (i = 0; i < conf->decommission_subvols_cnt; i++) {
21ab4e
+                                if (conf->decommissioned_bricks[i] == from) {
21ab4e
+                                        *ignore_failure = _gf_false;
21ab4e
+                                         break;
21ab4e
+                                }
21ab4e
+                        }
21ab4e
+                } else {
21ab4e
+                        *ignore_failure = _gf_false;
21ab4e
+                }
21ab4e
 
21ab4e
                 *target_changed = _gf_false;
21ab4e
                 ret = -1;
21ab4e
@@ -1382,6 +1401,8 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
21ab4e
         gf_boolean_t            target_changed          = _gf_false;
21ab4e
         xlator_t                *new_target             = NULL;
21ab4e
         xlator_t                *old_target             = NULL;
21ab4e
+        fd_t                    *linkto_fd              = NULL;
21ab4e
+        gf_boolean_t            ignore_failure          = _gf_false;
21ab4e
 
21ab4e
         defrag = conf->defrag;
21ab4e
         if (!defrag)
21ab4e
@@ -1499,7 +1520,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
21ab4e
         clean_dst = _gf_true;
21ab4e
 
21ab4e
         ret = __dht_check_free_space (to, from, loc, &stbuf, flag, conf,
21ab4e
-                                      &target_changed, &new_target);
21ab4e
+                                      &target_changed, &new_target, &ignore_failure);
21ab4e
         if (target_changed) {
21ab4e
                 /* Can't handle for hardlinks. Marking this as failure */
21ab4e
                 if (flag == GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS || stbuf.ia_nlink > 1) {
21ab4e
@@ -1543,6 +1564,9 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
21ab4e
         }
21ab4e
 
21ab4e
         if (ret) {
21ab4e
+                if (ignore_failure)
21ab4e
+                        ret = 0;
21ab4e
+
21ab4e
                 goto out;
21ab4e
         }
21ab4e
 
21ab4e
@@ -1792,13 +1816,47 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
21ab4e
                         }
21ab4e
 
21ab4e
                         ret = syncop_setxattr (old_target, loc, dict, 0, NULL, NULL);
21ab4e
-                        if (ret) {
21ab4e
+                        if (ret && -ret != ESTALE && -ret != ENOENT) {
21ab4e
                                 gf_msg (this->name, GF_LOG_ERROR, 0,
21ab4e
                                         DHT_MSG_MIGRATE_FILE_FAILED,
21ab4e
                                         "failed to set xattr on %s in %s (%s)",
21ab4e
                                         loc->path, old_target->name, strerror (-ret));
21ab4e
                                 ret = -1;
21ab4e
                                 goto out;
21ab4e
+                        } else if (-ret == ESTALE || -ret == ENOENT) {
21ab4e
+                               /* The failure ESTALE indicates that the linkto
21ab4e
+                                * file on the hashed subvol might have been deleted.
21ab4e
+                                * In this case will create a linkto file with new target
21ab4e
+                                * as linkto xattr value*/
21ab4e
+                                linkto_fd = fd_create (loc->inode, DHT_REBALANCE_PID);
21ab4e
+                                if (!linkto_fd) {
21ab4e
+                                        gf_msg (this->name, GF_LOG_ERROR, 0,
21ab4e
+                                                DHT_MSG_MIGRATE_FILE_FAILED,
21ab4e
+                                                "%s: fd create failed (%s)",
21ab4e
+                                                loc->path, strerror (errno));
21ab4e
+                                        ret = -1;
21ab4e
+                                        goto out;
21ab4e
+                                }
21ab4e
+                                ret = syncop_create (old_target, loc, O_RDWR,
21ab4e
+                                                     DHT_LINKFILE_MODE, linkto_fd,
21ab4e
+                                                     NULL, dict, NULL);
21ab4e
+                                if (ret != 0 && -ret != EEXIST && -ret != ESTALE) {
21ab4e
+                                        ret = -1;
21ab4e
+                                        gf_msg (this->name, GF_LOG_ERROR, 0,
21ab4e
+                                                DHT_MSG_MIGRATE_FILE_FAILED,
21ab4e
+                                                "failed to create linkto file on %s in %s (%s)",
21ab4e
+                                                loc->path, old_target->name, strerror (-ret));
21ab4e
+                                        goto out;
21ab4e
+                                } else if (ret == 0) {
21ab4e
+                                        ret = syncop_fsetattr (old_target, linkto_fd, &stbuf,
21ab4e
+                                                               (GF_SET_ATTR_UID | GF_SET_ATTR_GID),
21ab4e
+                                                               NULL, NULL, NULL, NULL);
21ab4e
+                                        if (ret < 0)
21ab4e
+                                                gf_msg (this->name, GF_LOG_ERROR, 0,
21ab4e
+                                                DHT_MSG_MIGRATE_FILE_FAILED,
21ab4e
+                                                "chown failed for %s on %s (%s)",
21ab4e
+                                                loc->path, old_target->name, strerror (-ret));
21ab4e
+                                }
21ab4e
                         }
21ab4e
                }
21ab4e
         }
21ab4e
@@ -2044,6 +2102,8 @@ out:
21ab4e
                 syncop_close (dst_fd);
21ab4e
         if (src_fd)
21ab4e
                 syncop_close (src_fd);
21ab4e
+        if (linkto_fd)
21ab4e
+                syncop_close (linkto_fd);
21ab4e
 
21ab4e
         loc_wipe (&tmp_loc);
21ab4e
 
21ab4e
-- 
21ab4e
1.8.3.1
21ab4e