21ab4e
From 593d8e1087a3de532e891d127f081d5bc4b12ba5 Mon Sep 17 00:00:00 2001
21ab4e
From: Susant Palai <spalai@redhat.com>
21ab4e
Date: Wed, 12 Jul 2017 12:01:40 +0530
21ab4e
Subject: [PATCH 565/566] cluster/rebalance: Fix hardlink migration failures
21ab4e
21ab4e
A brief about how hardlink migration works:
21ab4e
  - Different hardlinks (to the same file) may hash to different bricks,
but their cached subvol will be the same. Rebalance picks up the first hardlink,
calculates its hash (call it TARGET) and sets the hashed subvolume as an xattr
on the data file.
21ab4e
  - Now all the hardlinks that come after this will fetch that xattr and will
create linkto files on TARGET (all linkto files for the hardlinks will be hardlinks
to each other on TARGET).
21ab4e
  - When number of hardlinks on source is equal to the number of hardlinks on
21ab4e
TARGET, the data migration will happen.
21ab4e
21ab4e
RACE:1
21ab4e
  Since rebalance is multi-threaded, the first lookup (which decides where the TARGET
subvol should be) can be called by two hardlink migrations in parallel, and they may end
up creating linkto files on two different TARGET subvols. Hence, hardlinks won't be
migrated.
21ab4e
21ab4e
Fix: Rely on the xattr response of lookup inside gf_defrag_handle_hardlink since it
21ab4e
is executed under synclock.
21ab4e
21ab4e
RACE:2
21ab4e
  The linkto files on TARGET can be created by other clients also if they are doing
21ab4e
lookup on the hardlinks.  Consider a scenario where you have 100 hardlinks.  When
21ab4e
rebalance is migrating 99th hardlink, as a result of continuous lookups from other
21ab4e
client, the linkcount on TARGET is equal to the source linkcount. Rebalance will migrate data
on the 99th hardlink itself. On the 100th hardlink migration, the hardlink will have TARGET as
its cached subvolume. If its hash is also the same, then a migration will be triggered from
TARGET to TARGET, leading to data loss.
21ab4e
21ab4e
Fix: Make sure that, before the final data migration, the source is not the same as the destination.
21ab4e
21ab4e
RACE:3
21ab4e
  Since a hardlink can be migrating to a non-hashed subvolume, a lookup from another
client, or even rebalance itself, might delete the linkto file on TARGET, leading
to hardlinks never getting migrated.
21ab4e
21ab4e
This will be addressed in a different patch in future.
21ab4e
21ab4e
> Change-Id: If0f6852f0e662384ee3875a2ac9d19ac4a6cea98
21ab4e
> BUG: 1469964
21ab4e
> Signed-off-by: Susant Palai <spalai@redhat.com>
21ab4e
> Reviewed-on: https://review.gluster.org/17755
21ab4e
> Smoke: Gluster Build System <jenkins@build.gluster.org>
21ab4e
> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
21ab4e
> Reviewed-by: N Balachandran <nbalacha@redhat.com>
21ab4e
> Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
21ab4e
> Signed-off-by: Susant Palai <spalai@redhat.com>
21ab4e
21ab4e
Change-Id: If0f6852f0e662384ee3875a2ac9d19ac4a6cea98
21ab4e
BUG: 1469971
21ab4e
Signed-off-by: Susant Palai <spalai@redhat.com>
21ab4e
Reviewed-on: https://code.engineering.redhat.com/gerrit/112423
21ab4e
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
21ab4e
---
21ab4e
 xlators/cluster/dht/src/dht-common.h    |  3 +-
21ab4e
 xlators/cluster/dht/src/dht-rebalance.c | 94 ++++++++++++++++++++++++++++-----
21ab4e
 2 files changed, 82 insertions(+), 15 deletions(-)
21ab4e
21ab4e
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
21ab4e
index 8112b01..6db4c06 100644
21ab4e
--- a/xlators/cluster/dht/src/dht-common.h
21ab4e
+++ b/xlators/cluster/dht/src/dht-common.h
21ab4e
@@ -1107,8 +1107,7 @@ void*
21ab4e
 gf_defrag_start (void *this);
21ab4e
 
21ab4e
 int32_t
21ab4e
-gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t  *xattrs,
21ab4e
-                           struct iatt *stbuf, int *fop_errno);
21ab4e
+gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, int *fop_errno);
21ab4e
 int
21ab4e
 dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
21ab4e
                  int flag, int *fop_errno);
21ab4e
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
21ab4e
index 869c8b3..c60ce27 100644
21ab4e
--- a/xlators/cluster/dht/src/dht-rebalance.c
21ab4e
+++ b/xlators/cluster/dht/src/dht-rebalance.c
21ab4e
@@ -276,8 +276,7 @@ be converted to "0" in dht_migrate_file.
21ab4e
 */
21ab4e
 
21ab4e
 int32_t
21ab4e
-gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t  *xattrs,
21ab4e
-                           struct iatt *stbuf, int *fop_errno)
21ab4e
+gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, int *fop_errno)
21ab4e
 {
21ab4e
         int32_t                 ret             = -1;
21ab4e
         xlator_t               *cached_subvol   = NULL;
21ab4e
@@ -289,15 +288,15 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t  *xattrs,
21ab4e
         dht_conf_t             *conf            = NULL;
21ab4e
         gf_loglevel_t          loglevel         = 0;
21ab4e
         dict_t                 *link_xattr      = NULL;
21ab4e
-
21ab4e
+        dict_t                 *dict            = NULL;
21ab4e
+        dict_t                 *xattr_rsp       = NULL;
21ab4e
+        struct iatt             stbuf                   = {0,};
21ab4e
 
21ab4e
         *fop_errno = EINVAL;
21ab4e
 
21ab4e
         GF_VALIDATE_OR_GOTO ("defrag", loc, out);
21ab4e
         GF_VALIDATE_OR_GOTO ("defrag", loc->name, out);
21ab4e
-        GF_VALIDATE_OR_GOTO ("defrag", stbuf, out);
21ab4e
         GF_VALIDATE_OR_GOTO ("defrag", this, out);
21ab4e
-        GF_VALIDATE_OR_GOTO ("defrag", xattrs, out);
21ab4e
         GF_VALIDATE_OR_GOTO ("defrag", this->private, out);
21ab4e
 
21ab4e
         conf = this->private;
21ab4e
@@ -353,8 +352,27 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t  *xattrs,
21ab4e
           file or not.
21ab4e
         */
21ab4e
 
21ab4e
-        ret = syncop_lookup (this, loc, NULL, NULL,
21ab4e
-                                             NULL, NULL);
21ab4e
+        dict = dict_new ();
21ab4e
+        if (!dict) {
21ab4e
+                ret = -1;
21ab4e
+                *fop_errno = ENOMEM;
21ab4e
+                gf_msg (this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
21ab4e
+                        "could not allocate memory for dict");
21ab4e
+                goto out;
21ab4e
+        }
21ab4e
+
21ab4e
+        ret = dict_set_int32 (dict, conf->link_xattr_name, 256);
21ab4e
+        if (ret) {
21ab4e
+                *fop_errno = ENOMEM;
21ab4e
+                ret = -1;
21ab4e
+                gf_msg (this->name, GF_LOG_ERROR, 0,
21ab4e
+                        DHT_MSG_MIGRATE_FILE_FAILED,
21ab4e
+                        "Migrate file failed:"
21ab4e
+                        "%s: failed to set 'linkto' key in dict", loc->path);
21ab4e
+                goto out;
21ab4e
+        }
21ab4e
+
21ab4e
+        ret = syncop_lookup (this, loc, &stbuf, NULL, dict, &xattr_rsp);
21ab4e
         if (ret) {
21ab4e
                 /*Ignore ENOENT and ESTALE as file might have been
21ab4e
                   migrated already*/
21ab4e
@@ -395,6 +413,10 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t  *xattrs,
21ab4e
                 goto out;
21ab4e
         }
21ab4e
 
21ab4e
+        /* Hardlink migration happens only with remove-brick. So this condition will
21ab4e
+         * be true only when the migration has happened. In case hardlinks are migrated
21ab4e
+         * for rebalance case, remove this check. Having this check here avoid redundant
21ab4e
+         * calls below*/
21ab4e
         if (hashed_subvol == cached_subvol) {
21ab4e
                 ret = -2;
21ab4e
                 goto out;
21ab4e
@@ -403,7 +425,8 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t  *xattrs,
21ab4e
         gf_log (this->name, GF_LOG_INFO, "Attempting to migrate hardlink %s "
21ab4e
                 "with gfid %s from %s -> %s", loc->name, uuid_utoa (loc->gfid),
21ab4e
                 cached_subvol->name, hashed_subvol->name);
21ab4e
-        data = dict_get (xattrs, conf->link_xattr_name);
21ab4e
+
21ab4e
+        data = dict_get (xattr_rsp, conf->link_xattr_name);
21ab4e
         /* set linkto on cached -> hashed if not present, else link it */
21ab4e
         if (!data) {
21ab4e
                 ret = dict_set_str (link_xattr, conf->link_xattr_name,
21ab4e
@@ -433,10 +456,15 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t  *xattrs,
21ab4e
                         ret = -1;
21ab4e
                         goto out;
21ab4e
                 }
21ab4e
+
21ab4e
+                gf_msg_debug (this->name, 0, "hardlink target subvol created on %s "
21ab4e
+                              ",cached %s, file %s",
21ab4e
+                              hashed_subvol->name, cached_subvol->name, loc->path);
21ab4e
+
21ab4e
                 ret = -2;
21ab4e
                 goto out;
21ab4e
         } else {
21ab4e
-                linkto_subvol = dht_linkfile_subvol (this, NULL, NULL, xattrs);
21ab4e
+                linkto_subvol = dht_linkfile_subvol (this, NULL, NULL, xattr_rsp);
21ab4e
                 if (!linkto_subvol) {
21ab4e
                         gf_msg (this->name, GF_LOG_ERROR, 0,
21ab4e
                                 DHT_MSG_SUBVOL_ERROR,
21ab4e
@@ -463,8 +491,14 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t  *xattrs,
21ab4e
                                 *fop_errno = op_errno;
21ab4e
                                 goto out;
21ab4e
                         }
21ab4e
+                } else {
21ab4e
+                        gf_msg_debug (this->name, 0, "syncop_link successful for"
21ab4e
+                                      " hardlink %s on subvol %s, cached %s", loc->path,
21ab4e
+                                      hashed_subvol->name, cached_subvol->name);
21ab4e
+
21ab4e
                 }
21ab4e
         }
21ab4e
+
21ab4e
         ret = syncop_lookup (hashed_subvol, loc, &iatt, NULL, NULL, NULL);
21ab4e
         if (ret) {
21ab4e
                 gf_msg (this->name, GF_LOG_ERROR, -ret,
21ab4e
@@ -477,7 +511,22 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t  *xattrs,
21ab4e
                 goto out;
21ab4e
         }
21ab4e
 
21ab4e
-        if (iatt.ia_nlink == stbuf->ia_nlink) {
21ab4e
+        /* There is a race where on the target subvol for the hardlink
21ab4e
+         * (note: hash subvol for the hardlink might differ from this), some
21ab4e
+         * other client(non-rebalance) would have created a linkto file for that
21ab4e
+         * hardlink as part of lookup. So let say there are 10 hardlinks, on the
21ab4e
+         * 5th hardlink it self the hardlinks might have migrated. Now for
21ab4e
+         * (6..10th) hardlinks the cached and target would be same as the file
21ab4e
+         * has already migrated. Hence this check is needed  */
21ab4e
+        if (cached_subvol == hashed_subvol) {
21ab4e
+                gf_msg_debug (this->name, 0, "source %s and destination %s "
21ab4e
+                             "for hardlink %s are same", cached_subvol->name,
21ab4e
+                             hashed_subvol->name, loc->path);
21ab4e
+                ret = -2;
21ab4e
+                goto out;
21ab4e
+        }
21ab4e
+
21ab4e
+        if (iatt.ia_nlink == stbuf.ia_nlink) {
21ab4e
                 ret = dht_migrate_file (this, loc, cached_subvol, hashed_subvol,
21ab4e
                                         GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS,
21ab4e
                                         fop_errno);
21ab4e
@@ -489,6 +538,13 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t  *xattrs,
21ab4e
 out:
21ab4e
         if (link_xattr)
21ab4e
                 dict_unref (link_xattr);
21ab4e
+
21ab4e
+        if (xattr_rsp)
21ab4e
+                dict_unref (xattr_rsp);
21ab4e
+
21ab4e
+        if (dict)
21ab4e
+                dict_unref (dict);
21ab4e
+
21ab4e
         return ret;
21ab4e
 }
21ab4e
 
21ab4e
@@ -509,7 +565,7 @@ __check_file_has_hardlink (xlator_t *this, loc_t *loc,
21ab4e
                 if (flags == GF_DHT_MIGRATE_HARDLINK) {
21ab4e
                         synclock_lock (&conf->link_lock);
21ab4e
                         ret = gf_defrag_handle_hardlink
21ab4e
-                                (this, loc, xattrs, stbuf, fop_errno);
21ab4e
+                                (this, loc, fop_errno);
21ab4e
                         synclock_unlock (&conf->link_lock);
21ab4e
                         /*
21ab4e
                           Returning zero will force the file to be remigrated.
21ab4e
@@ -1497,6 +1553,13 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
21ab4e
         gf_boolean_t            ignore_failure          = _gf_false;
21ab4e
 
21ab4e
 
21ab4e
+        if (from == to) {
21ab4e
+                gf_msg_debug (this->name, 0, "destination and source are same. file %s"
21ab4e
+                              " might have migrated already", loc->path);
21ab4e
+                ret = 0;
21ab4e
+                goto out;
21ab4e
+        }
21ab4e
+
21ab4e
         /* If defrag is NULL, it should be assumed that migration is triggered
21ab4e
          * from client */
21ab4e
         defrag = conf->defrag;
21ab4e
@@ -1561,6 +1624,11 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
21ab4e
         gf_uuid_copy (tmp_loc.gfid, loc->gfid);
21ab4e
         tmp_loc.path = gf_strdup(loc->path);
21ab4e
 
21ab4e
+        /* this inodelk happens with flock.owner being zero. But to synchronize
21ab4e
+         * hardlink migration we need to have different lkowner for each migration
21ab4e
+         * Filed a bug here: https://bugzilla.redhat.com/show_bug.cgi?id=1468202 to
21ab4e
+         * track the fix for this. Currently synclock takes care of synchronizing
21ab4e
+         * hardlink migration. Once this bug is fixed we can avoid taking synclock */
21ab4e
         ret = syncop_inodelk (from, DHT_FILE_MIGRATE_DOMAIN, &tmp_loc, F_SETLKW,
21ab4e
                               &flock, NULL, NULL);
21ab4e
         if (ret < 0) {
21ab4e
@@ -2671,8 +2739,8 @@ gf_defrag_migrate_single_file (void *opaque)
21ab4e
                         }
21ab4e
                         UNLOCK (&defrag->lock);
21ab4e
                 } else if (fop_errno != EEXIST) {
21ab4e
-                        gf_msg (this->name, GF_LOG_ERROR,
21ab4e
-                                DHT_MSG_MIGRATE_FILE_FAILED, fop_errno,
21ab4e
+                        gf_msg (this->name, GF_LOG_ERROR, fop_errno,
21ab4e
+                                DHT_MSG_MIGRATE_FILE_FAILED,
21ab4e
                                 "migrate-data failed for %s", entry_loc.path);
21ab4e
 
21ab4e
                         LOCK (&defrag->lock);
21ab4e
-- 
21ab4e
1.8.3.1
21ab4e