Blob Blame History Raw
From 9eee0cea926bf4a953972fc6ed37a2c925c9c748 Mon Sep 17 00:00:00 2001
From: Susant Palai <spalai@redhat.com>
Date: Mon, 17 Apr 2017 13:00:54 +0530
Subject: [PATCH 395/406] cluster/dht: Skip file migration if the subvol that
 meets min-free-disk criteria happens to be the same subvol containing
 data-file

Rebalance need to figure out a new subvol in case the hashed subvol
does not have enough space. In the process of figuring out the new subvol,
we need to ignore the source subvol, otherwise it will lead to data loss.

Test: Manual
Ran the following
sizeof /tmp/1: 1.5GB
sizeof /brick/1: 16GB
sizeof /tmp/2: 1.5GB
<start>

glusterd;  gluster v create test1 vm1:/brick/1 vm1:/tmp/1;
gluster v start test1;
mount -t glusterfs vm1:test1 /mnt;
for i in {1..2000}
do
dd if=/dev/zero of=/mnt/file$i bs=1KB count=1 &> /dev/null;
done
gluster v add-brick test1 vm1:/tmp/2
gluster v set test1 min-free-disk 12GB
gluster v remove-brick test1 vm1:/tmp/1 star
<end>

file count and data were intact.

> Signed-off-by: Susant Palai <spalai@redhat.com>
> Reviewed-on: https://review.gluster.org/17064
> Smoke: Gluster Build System <jenkins@build.gluster.org>
> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
> Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
> Signed-off-by: Susant Palai <spalai@redhat.com>

Change-Id: Ib8fc8467a3d48a7c12958824c4f0b88e160b86c1
BUG: 1360317
Signed-off-by: Susant Palai <spalai@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/103915
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 xlators/cluster/dht/src/dht-common.h    |  2 +-
 xlators/cluster/dht/src/dht-diskusage.c | 19 ++++---
 xlators/cluster/dht/src/dht-rebalance.c | 96 ++++++++++++++++++++++++++-------
 3 files changed, 92 insertions(+), 25 deletions(-)

diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 37a6e61..eb6d1e8 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -1114,7 +1114,7 @@ dht_dir_has_layout (dict_t *xattr, char *name);
 gf_boolean_t
 dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator);
 xlator_t *
-dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol,
+dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol, xlator_t *ignore,
                                    dht_layout_t *layout, uint64_t filesize);
 xlator_t *
 dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol,
diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c
index 13698a9..0559215 100644
--- a/xlators/cluster/dht/src/dht-diskusage.c
+++ b/xlators/cluster/dht/src/dht-diskusage.c
@@ -315,7 +315,7 @@ dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol,
 
         LOCK (&conf->subvolume_lock);
 	{
-                avail_subvol = dht_subvol_with_free_space_inodes(this, subvol,
+                avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, NULL,
                                                                  layout, 0);
                 if(!avail_subvol)
                 {
@@ -340,8 +340,8 @@ out:
 }
 
 static inline
-int32_t dht_subvol_has_err (dht_conf_t *conf, xlator_t *this,
-                                         dht_layout_t *layout)
+int32_t dht_subvol_has_err (dht_conf_t *conf, xlator_t *this, xlator_t *ignore,
+                            dht_layout_t *layout)
 {
         int ret = -1;
         int i   = 0;
@@ -349,6 +349,13 @@ int32_t dht_subvol_has_err (dht_conf_t *conf, xlator_t *this,
         if (!this || !layout)
                 goto out;
 
+        /* this check is meant for rebalance process. The source of the file
+         * should be ignored for space check */
+        if (this == ignore) {
+                goto out;
+        }
+
+
         /* check if subvol has layout errors, before selecting it */
         for (i = 0; i < layout->cnt; i++) {
                 if (!strcmp (layout->list[i].xlator->name, this->name) &&
@@ -376,7 +383,7 @@ out:
 
 /*Get subvolume which has both space and inodes more than the min criteria*/
 xlator_t *
-dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol,
+dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol, xlator_t *ignore,
                                   dht_layout_t *layout, uint64_t filesize)
 {
         int i = 0;
@@ -398,7 +405,7 @@ dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol,
                 /* check if subvol has layout errors and also it is not a
                  * decommissioned brick, before selecting it */
                 ignore_subvol = dht_subvol_has_err (conf, conf->subvolumes[i],
-                                                    layout);
+                                                    ignore, layout);
                 if (ignore_subvol)
                         continue;
 
@@ -463,7 +470,7 @@ dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol,
                 /* check if subvol has layout errors and also it is not a
                  * decommissioned brick, before selecting it*/
 
-                ignore_subvol = dht_subvol_has_err (conf, conf->subvolumes[i],
+                ignore_subvol = dht_subvol_has_err (conf, conf->subvolumes[i], NULL,
                                                     layout);
                 if (ignore_subvol)
                         continue;
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index 9465cde..49b2230 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -719,27 +719,30 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc
                         loc->path, to->name, strerror (-ret));
         */
 
+        ret = syncop_fsetattr (to, fd, stbuf,
+                               (GF_SET_ATTR_UID | GF_SET_ATTR_GID),
+                                NULL, NULL, NULL, NULL);
+        if (ret < 0)
+                gf_msg (this->name, GF_LOG_ERROR, 0,
+                        DHT_MSG_MIGRATE_FILE_FAILED,
+                        "chown failed for %s on %s (%s)",
+                        loc->path, to->name, strerror (-ret));
+
         /* Fallocate does not work for size 0, hence the check. Anyway we don't
          * need to care about min-free-disk for 0 byte size file */
         if (stbuf->ia_size > 0) {
                 ret = syncop_fallocate (to, fd, 0, 0, stbuf->ia_size, NULL,
                                         NULL);
-                if (ret < 0)
+                if (ret < 0) {
                         gf_msg (this->name, GF_LOG_ERROR, 0,
                                 DHT_MSG_MIGRATE_FILE_FAILED,
                                 "fallocate failed for %s on %s (%s)",
                                 loc->path, to->name, strerror (-ret));
+                        ret = -1;
+                        goto out;
+                }
         }
 
-        ret = syncop_fsetattr (to, fd, stbuf,
-                               (GF_SET_ATTR_UID | GF_SET_ATTR_GID),
-                                NULL, NULL, NULL, NULL);
-        if (ret < 0)
-                gf_msg (this->name, GF_LOG_ERROR, 0,
-                        DHT_MSG_MIGRATE_FILE_FAILED,
-                        "chown failed for %s on %s (%s)",
-                        loc->path, to->name, strerror (-ret));
-
         /* success */
         ret = 0;
 
@@ -761,7 +764,8 @@ out:
 static int
 __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc,
                         struct iatt *stbuf, int flag, dht_conf_t *conf,
-                        gf_boolean_t *target_changed, xlator_t **new_subvol)
+                        gf_boolean_t *target_changed, xlator_t **new_subvol,
+                        gf_boolean_t *ignore_failure)
 {
         struct statvfs  src_statfs = {0,};
         struct statvfs  dst_statfs = {0,};
@@ -773,6 +777,7 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc,
         uint64_t        dst_statfs_blocks = 1;
         double   post_availspace = 0;
         double   post_percent = 0;
+        int             i = 0;
 
         this = THIS;
 
@@ -897,13 +902,27 @@ find_new_subvol:
                 goto out;
         }
 
-        *new_subvol = dht_subvol_with_free_space_inodes (this, to,
-                      layout, stbuf->ia_size);
-        if (!(*new_subvol)) {
+        *new_subvol = dht_subvol_with_free_space_inodes (this, to, from, layout,
+                                                         stbuf->ia_size);
+        if ((!(*new_subvol)) || (*new_subvol == from)) {
                 gf_msg (this->name, GF_LOG_WARNING, 0,
                         DHT_MSG_SUBVOL_INSUFF_SPACE, "Could not find any subvol"
-                        " with space accomodating the file. Consider adding "
-                        "bricks");
+                        " with space accomodating the file - %s. Consider adding "
+                        "bricks", loc->path);
+
+                /* For remove-brick case if the source is not one of the
+                 * removed-brick, do not mark the error as failure */
+                if (conf->decommission_subvols_cnt) {
+                        *ignore_failure = _gf_true;
+                        for (i = 0; i < conf->decommission_subvols_cnt; i++) {
+                                if (conf->decommissioned_bricks[i] == from) {
+                                        *ignore_failure = _gf_false;
+                                         break;
+                                }
+                        }
+                } else {
+                        *ignore_failure = _gf_false;
+                }
 
                 *target_changed = _gf_false;
                 ret = -1;
@@ -1382,6 +1401,8 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
         gf_boolean_t            target_changed          = _gf_false;
         xlator_t                *new_target             = NULL;
         xlator_t                *old_target             = NULL;
+        fd_t                    *linkto_fd              = NULL;
+        gf_boolean_t            ignore_failure          = _gf_false;
 
         defrag = conf->defrag;
         if (!defrag)
@@ -1499,7 +1520,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
         clean_dst = _gf_true;
 
         ret = __dht_check_free_space (to, from, loc, &stbuf, flag, conf,
-                                      &target_changed, &new_target);
+                                      &target_changed, &new_target, &ignore_failure);
         if (target_changed) {
                 /* Can't handle for hardlinks. Marking this as failure */
                 if (flag == GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS || stbuf.ia_nlink > 1) {
@@ -1543,6 +1564,9 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
         }
 
         if (ret) {
+                if (ignore_failure)
+                        ret = 0;
+
                 goto out;
         }
 
@@ -1792,13 +1816,47 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
                         }
 
                         ret = syncop_setxattr (old_target, loc, dict, 0, NULL, NULL);
-                        if (ret) {
+                        if (ret && -ret != ESTALE && -ret != ENOENT) {
                                 gf_msg (this->name, GF_LOG_ERROR, 0,
                                         DHT_MSG_MIGRATE_FILE_FAILED,
                                         "failed to set xattr on %s in %s (%s)",
                                         loc->path, old_target->name, strerror (-ret));
                                 ret = -1;
                                 goto out;
+                        } else if (-ret == ESTALE || -ret == ENOENT) {
+                               /* The failure ESTALE indicates that the linkto
+                                * file on the hashed subvol might have been deleted.
+                                * In this case will create a linkto file with new target
+                                * as linkto xattr value*/
+                                linkto_fd = fd_create (loc->inode, DHT_REBALANCE_PID);
+                                if (!linkto_fd) {
+                                        gf_msg (this->name, GF_LOG_ERROR, 0,
+                                                DHT_MSG_MIGRATE_FILE_FAILED,
+                                                "%s: fd create failed (%s)",
+                                                loc->path, strerror (errno));
+                                        ret = -1;
+                                        goto out;
+                                }
+                                ret = syncop_create (old_target, loc, O_RDWR,
+                                                     DHT_LINKFILE_MODE, linkto_fd,
+                                                     NULL, dict, NULL);
+                                if (ret != 0 && -ret != EEXIST && -ret != ESTALE) {
+                                        ret = -1;
+                                        gf_msg (this->name, GF_LOG_ERROR, 0,
+                                                DHT_MSG_MIGRATE_FILE_FAILED,
+                                                "failed to create linkto file on %s in %s (%s)",
+                                                loc->path, old_target->name, strerror (-ret));
+                                        goto out;
+                                } else if (ret == 0) {
+                                        ret = syncop_fsetattr (old_target, linkto_fd, &stbuf,
+                                                               (GF_SET_ATTR_UID | GF_SET_ATTR_GID),
+                                                               NULL, NULL, NULL, NULL);
+                                        if (ret < 0)
+                                                gf_msg (this->name, GF_LOG_ERROR, 0,
+                                                DHT_MSG_MIGRATE_FILE_FAILED,
+                                                "chown failed for %s on %s (%s)",
+                                                loc->path, old_target->name, strerror (-ret));
+                                }
                         }
                }
         }
@@ -2044,6 +2102,8 @@ out:
                 syncop_close (dst_fd);
         if (src_fd)
                 syncop_close (src_fd);
+        if (linkto_fd)
+                syncop_close (linkto_fd);
 
         loc_wipe (&tmp_loc);
 
-- 
1.8.3.1