|
|
21ab4e |
From 593d8e1087a3de532e891d127f081d5bc4b12ba5 Mon Sep 17 00:00:00 2001
|
|
|
21ab4e |
From: Susant Palai <spalai@redhat.com>
|
|
|
21ab4e |
Date: Wed, 12 Jul 2017 12:01:40 +0530
|
|
|
21ab4e |
Subject: [PATCH 565/566] cluster/rebalance: Fix hardlink migration failures
|
|
|
21ab4e |
|
|
|
21ab4e |
A brief about how hardlink migration works:
|
|
|
21ab4e |
- Different hardlinks (to the same file) may hash to different bricks,
|
|
|
21ab4e |
but their cached subvol will be same. Rebalance picks up the first hardlink,
|
|
|
21ab4e |
calculates its hash (call it TARGET) and sets the hashed subvolume as an xattr
|
|
|
21ab4e |
on the data file.
|
|
|
21ab4e |
- Now all the hardlinks that come after this will fetch that xattr and will
|
|
|
21ab4e |
create linkto files on TARGET (all linkto files for the hardlinks will be hardlink
|
|
|
21ab4e |
to each other on TARGET).
|
|
|
21ab4e |
- When number of hardlinks on source is equal to the number of hardlinks on
|
|
|
21ab4e |
TARGET, the data migration will happen.
|
|
|
21ab4e |
|
|
|
21ab4e |
RACE:1
|
|
|
21ab4e |
Since rebalance is multi-threaded, the first lookup (which decides where the TARGET
|
|
|
21ab4e |
subvol should be), can be called by two hardlink migrations in parallel and they may end
|
|
|
21ab4e |
up creating linkto files on two different TARGET subvols. Hence, hardlinks won't be
|
|
|
21ab4e |
migrated.
|
|
|
21ab4e |
|
|
|
21ab4e |
Fix: Rely on the xattr response of lookup inside gf_defrag_handle_hardlink since it
|
|
|
21ab4e |
is executed under synclock.
|
|
|
21ab4e |
|
|
|
21ab4e |
RACE:2
|
|
|
21ab4e |
The linkto files on TARGET can be created by other clients also if they are doing
|
|
|
21ab4e |
lookup on the hardlinks. Consider a scenario where you have 100 hardlinks. When
|
|
|
21ab4e |
rebalance is migrating 99th hardlink, as a result of continuous lookups from other
|
|
|
21ab4e |
client, linkcount on TARGET is equal to source linkcount. Rebalance will migrate data
|
|
|
21ab4e |
on the 99th hardlink itself. On 100th hardlink migration, hardlink will have TARGET as
|
|
|
21ab4e |
cached subvolume. If its hash is also the same, then a migration will be triggered from
|
|
|
21ab4e |
TARGET to TARGET leading to data loss.
|
|
|
21ab4e |
|
|
|
21ab4e |
Fix: Make sure before the final data migration, source is not same as destination.
|
|
|
21ab4e |
|
|
|
21ab4e |
RACE:3
|
|
|
21ab4e |
Since a hardlink can be migrating to a non-hashed subvolume, a lookup from other
|
|
|
21ab4e |
client or even rebalance itself, might delete the linkto file on TARGET leading
|
|
|
21ab4e |
to hardlinks never getting migrated.
|
|
|
21ab4e |
|
|
|
21ab4e |
This will be addressed in a different patch in future.
|
|
|
21ab4e |
|
|
|
21ab4e |
> Change-Id: If0f6852f0e662384ee3875a2ac9d19ac4a6cea98
|
|
|
21ab4e |
> BUG: 1469964
|
|
|
21ab4e |
> Signed-off-by: Susant Palai <spalai@redhat.com>
|
|
|
21ab4e |
> Reviewed-on: https://review.gluster.org/17755
|
|
|
21ab4e |
> Smoke: Gluster Build System <jenkins@build.gluster.org>
|
|
|
21ab4e |
> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
|
|
|
21ab4e |
> Reviewed-by: N Balachandran <nbalacha@redhat.com>
|
|
|
21ab4e |
> Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
|
|
|
21ab4e |
> Signed-off-by: Susant Palai <spalai@redhat.com>
|
|
|
21ab4e |
|
|
|
21ab4e |
Change-Id: If0f6852f0e662384ee3875a2ac9d19ac4a6cea98
|
|
|
21ab4e |
BUG: 1469971
|
|
|
21ab4e |
Signed-off-by: Susant Palai <spalai@redhat.com>
|
|
|
21ab4e |
Reviewed-on: https://code.engineering.redhat.com/gerrit/112423
|
|
|
21ab4e |
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
|
|
|
21ab4e |
---
|
|
|
21ab4e |
xlators/cluster/dht/src/dht-common.h | 3 +-
|
|
|
21ab4e |
xlators/cluster/dht/src/dht-rebalance.c | 94 ++++++++++++++++++++++++++++-----
|
|
|
21ab4e |
2 files changed, 82 insertions(+), 15 deletions(-)
|
|
|
21ab4e |
|
|
|
21ab4e |
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
|
|
|
21ab4e |
index 8112b01..6db4c06 100644
|
|
|
21ab4e |
--- a/xlators/cluster/dht/src/dht-common.h
|
|
|
21ab4e |
+++ b/xlators/cluster/dht/src/dht-common.h
|
|
|
21ab4e |
@@ -1107,8 +1107,7 @@ void*
|
|
|
21ab4e |
gf_defrag_start (void *this);
|
|
|
21ab4e |
|
|
|
21ab4e |
int32_t
|
|
|
21ab4e |
-gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
|
|
|
21ab4e |
- struct iatt *stbuf, int *fop_errno);
|
|
|
21ab4e |
+gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, int *fop_errno);
|
|
|
21ab4e |
int
|
|
|
21ab4e |
dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
|
|
|
21ab4e |
int flag, int *fop_errno);
|
|
|
21ab4e |
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
|
|
|
21ab4e |
index 869c8b3..c60ce27 100644
|
|
|
21ab4e |
--- a/xlators/cluster/dht/src/dht-rebalance.c
|
|
|
21ab4e |
+++ b/xlators/cluster/dht/src/dht-rebalance.c
|
|
|
21ab4e |
@@ -276,8 +276,7 @@ be converted to "0" in dht_migrate_file.
|
|
|
21ab4e |
*/
|
|
|
21ab4e |
|
|
|
21ab4e |
int32_t
|
|
|
21ab4e |
-gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
|
|
|
21ab4e |
- struct iatt *stbuf, int *fop_errno)
|
|
|
21ab4e |
+gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, int *fop_errno)
|
|
|
21ab4e |
{
|
|
|
21ab4e |
int32_t ret = -1;
|
|
|
21ab4e |
xlator_t *cached_subvol = NULL;
|
|
|
21ab4e |
@@ -289,15 +288,15 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
|
|
|
21ab4e |
dht_conf_t *conf = NULL;
|
|
|
21ab4e |
gf_loglevel_t loglevel = 0;
|
|
|
21ab4e |
dict_t *link_xattr = NULL;
|
|
|
21ab4e |
-
|
|
|
21ab4e |
+ dict_t *dict = NULL;
|
|
|
21ab4e |
+ dict_t *xattr_rsp = NULL;
|
|
|
21ab4e |
+ struct iatt stbuf = {0,};
|
|
|
21ab4e |
|
|
|
21ab4e |
*fop_errno = EINVAL;
|
|
|
21ab4e |
|
|
|
21ab4e |
GF_VALIDATE_OR_GOTO ("defrag", loc, out);
|
|
|
21ab4e |
GF_VALIDATE_OR_GOTO ("defrag", loc->name, out);
|
|
|
21ab4e |
- GF_VALIDATE_OR_GOTO ("defrag", stbuf, out);
|
|
|
21ab4e |
GF_VALIDATE_OR_GOTO ("defrag", this, out);
|
|
|
21ab4e |
- GF_VALIDATE_OR_GOTO ("defrag", xattrs, out);
|
|
|
21ab4e |
GF_VALIDATE_OR_GOTO ("defrag", this->private, out);
|
|
|
21ab4e |
|
|
|
21ab4e |
conf = this->private;
|
|
|
21ab4e |
@@ -353,8 +352,27 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
|
|
|
21ab4e |
file or not.
|
|
|
21ab4e |
*/
|
|
|
21ab4e |
|
|
|
21ab4e |
- ret = syncop_lookup (this, loc, NULL, NULL,
|
|
|
21ab4e |
- NULL, NULL);
|
|
|
21ab4e |
+ dict = dict_new ();
|
|
|
21ab4e |
+ if (!dict) {
|
|
|
21ab4e |
+ ret = -1;
|
|
|
21ab4e |
+ *fop_errno = ENOMEM;
|
|
|
21ab4e |
+ gf_msg (this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
|
|
|
21ab4e |
+ "could not allocate memory for dict");
|
|
|
21ab4e |
+ goto out;
|
|
|
21ab4e |
+ }
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ ret = dict_set_int32 (dict, conf->link_xattr_name, 256);
|
|
|
21ab4e |
+ if (ret) {
|
|
|
21ab4e |
+ *fop_errno = ENOMEM;
|
|
|
21ab4e |
+ ret = -1;
|
|
|
21ab4e |
+ gf_msg (this->name, GF_LOG_ERROR, 0,
|
|
|
21ab4e |
+ DHT_MSG_MIGRATE_FILE_FAILED,
|
|
|
21ab4e |
+ "Migrate file failed:"
|
|
|
21ab4e |
+ "%s: failed to set 'linkto' key in dict", loc->path);
|
|
|
21ab4e |
+ goto out;
|
|
|
21ab4e |
+ }
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ ret = syncop_lookup (this, loc, &stbuf, NULL, dict, &xattr_rsp);
|
|
|
21ab4e |
if (ret) {
|
|
|
21ab4e |
/*Ignore ENOENT and ESTALE as file might have been
|
|
|
21ab4e |
migrated already*/
|
|
|
21ab4e |
@@ -395,6 +413,10 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
|
|
|
21ab4e |
goto out;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
|
|
|
21ab4e |
+ /* Hardlink migration happens only with remove-brick. So this condition will
|
|
|
21ab4e |
+ * be true only when the migration has happened. In case hardlinks are migrated
|
|
|
21ab4e |
+ * for rebalance case, remove this check. Having this check here avoid redundant
|
|
|
21ab4e |
+ * calls below*/
|
|
|
21ab4e |
if (hashed_subvol == cached_subvol) {
|
|
|
21ab4e |
ret = -2;
|
|
|
21ab4e |
goto out;
|
|
|
21ab4e |
@@ -403,7 +425,8 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
|
|
|
21ab4e |
gf_log (this->name, GF_LOG_INFO, "Attempting to migrate hardlink %s "
|
|
|
21ab4e |
"with gfid %s from %s -> %s", loc->name, uuid_utoa (loc->gfid),
|
|
|
21ab4e |
cached_subvol->name, hashed_subvol->name);
|
|
|
21ab4e |
- data = dict_get (xattrs, conf->link_xattr_name);
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ data = dict_get (xattr_rsp, conf->link_xattr_name);
|
|
|
21ab4e |
/* set linkto on cached -> hashed if not present, else link it */
|
|
|
21ab4e |
if (!data) {
|
|
|
21ab4e |
ret = dict_set_str (link_xattr, conf->link_xattr_name,
|
|
|
21ab4e |
@@ -433,10 +456,15 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
|
|
|
21ab4e |
ret = -1;
|
|
|
21ab4e |
goto out;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ gf_msg_debug (this->name, 0, "hardlink target subvol created on %s "
|
|
|
21ab4e |
+ ",cached %s, file %s",
|
|
|
21ab4e |
+ hashed_subvol->name, cached_subvol->name, loc->path);
|
|
|
21ab4e |
+
|
|
|
21ab4e |
ret = -2;
|
|
|
21ab4e |
goto out;
|
|
|
21ab4e |
} else {
|
|
|
21ab4e |
- linkto_subvol = dht_linkfile_subvol (this, NULL, NULL, xattrs);
|
|
|
21ab4e |
+ linkto_subvol = dht_linkfile_subvol (this, NULL, NULL, xattr_rsp);
|
|
|
21ab4e |
if (!linkto_subvol) {
|
|
|
21ab4e |
gf_msg (this->name, GF_LOG_ERROR, 0,
|
|
|
21ab4e |
DHT_MSG_SUBVOL_ERROR,
|
|
|
21ab4e |
@@ -463,8 +491,14 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
|
|
|
21ab4e |
*fop_errno = op_errno;
|
|
|
21ab4e |
goto out;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
+ } else {
|
|
|
21ab4e |
+ gf_msg_debug (this->name, 0, "syncop_link successful for"
|
|
|
21ab4e |
+ " hardlink %s on subvol %s, cached %s", loc->path,
|
|
|
21ab4e |
+ hashed_subvol->name, cached_subvol->name);
|
|
|
21ab4e |
+
|
|
|
21ab4e |
}
|
|
|
21ab4e |
}
|
|
|
21ab4e |
+
|
|
|
21ab4e |
ret = syncop_lookup (hashed_subvol, loc, &iatt, NULL, NULL, NULL);
|
|
|
21ab4e |
if (ret) {
|
|
|
21ab4e |
gf_msg (this->name, GF_LOG_ERROR, -ret,
|
|
|
21ab4e |
@@ -477,7 +511,22 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
|
|
|
21ab4e |
goto out;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
|
|
|
21ab4e |
- if (iatt.ia_nlink == stbuf->ia_nlink) {
|
|
|
21ab4e |
+ /* There is a race where on the target subvol for the hardlink
|
|
|
21ab4e |
+ * (note: hash subvol for the hardlink might differ from this), some
|
|
|
21ab4e |
+ * other client(non-rebalance) would have created a linkto file for that
|
|
|
21ab4e |
+ * hardlink as part of lookup. So let say there are 10 hardlinks, on the
|
|
|
21ab4e |
+ * 5th hardlink it self the hardlinks might have migrated. Now for
|
|
|
21ab4e |
+ * (6..10th) hardlinks the cached and target would be same as the file
|
|
|
21ab4e |
+ * has already migrated. Hence this check is needed */
|
|
|
21ab4e |
+ if (cached_subvol == hashed_subvol) {
|
|
|
21ab4e |
+ gf_msg_debug (this->name, 0, "source %s and destination %s "
|
|
|
21ab4e |
+ "for hardlink %s are same", cached_subvol->name,
|
|
|
21ab4e |
+ hashed_subvol->name, loc->path);
|
|
|
21ab4e |
+ ret = -2;
|
|
|
21ab4e |
+ goto out;
|
|
|
21ab4e |
+ }
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ if (iatt.ia_nlink == stbuf.ia_nlink) {
|
|
|
21ab4e |
ret = dht_migrate_file (this, loc, cached_subvol, hashed_subvol,
|
|
|
21ab4e |
GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS,
|
|
|
21ab4e |
fop_errno);
|
|
|
21ab4e |
@@ -489,6 +538,13 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
|
|
|
21ab4e |
out:
|
|
|
21ab4e |
if (link_xattr)
|
|
|
21ab4e |
dict_unref (link_xattr);
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ if (xattr_rsp)
|
|
|
21ab4e |
+ dict_unref (xattr_rsp);
|
|
|
21ab4e |
+
|
|
|
21ab4e |
+ if (dict)
|
|
|
21ab4e |
+ dict_unref (dict);
|
|
|
21ab4e |
+
|
|
|
21ab4e |
return ret;
|
|
|
21ab4e |
}
|
|
|
21ab4e |
|
|
|
21ab4e |
@@ -509,7 +565,7 @@ __check_file_has_hardlink (xlator_t *this, loc_t *loc,
|
|
|
21ab4e |
if (flags == GF_DHT_MIGRATE_HARDLINK) {
|
|
|
21ab4e |
synclock_lock (&conf->link_lock);
|
|
|
21ab4e |
ret = gf_defrag_handle_hardlink
|
|
|
21ab4e |
- (this, loc, xattrs, stbuf, fop_errno);
|
|
|
21ab4e |
+ (this, loc, fop_errno);
|
|
|
21ab4e |
synclock_unlock (&conf->link_lock);
|
|
|
21ab4e |
/*
|
|
|
21ab4e |
Returning zero will force the file to be remigrated.
|
|
|
21ab4e |
@@ -1497,6 +1553,13 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
|
|
|
21ab4e |
gf_boolean_t ignore_failure = _gf_false;
|
|
|
21ab4e |
|
|
|
21ab4e |
|
|
|
21ab4e |
+ if (from == to) {
|
|
|
21ab4e |
+ gf_msg_debug (this->name, 0, "destination and source are same. file %s"
|
|
|
21ab4e |
+ " might have migrated already", loc->path);
|
|
|
21ab4e |
+ ret = 0;
|
|
|
21ab4e |
+ goto out;
|
|
|
21ab4e |
+ }
|
|
|
21ab4e |
+
|
|
|
21ab4e |
/* If defrag is NULL, it should be assumed that migration is triggered
|
|
|
21ab4e |
* from client */
|
|
|
21ab4e |
defrag = conf->defrag;
|
|
|
21ab4e |
@@ -1561,6 +1624,11 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
|
|
|
21ab4e |
gf_uuid_copy (tmp_loc.gfid, loc->gfid);
|
|
|
21ab4e |
tmp_loc.path = gf_strdup(loc->path);
|
|
|
21ab4e |
|
|
|
21ab4e |
+ /* this inodelk happens with flock.owner being zero. But to synchronize
|
|
|
21ab4e |
+ * hardlink migration we need to have different lkowner for each migration
|
|
|
21ab4e |
+ * Filed a bug here: https://bugzilla.redhat.com/show_bug.cgi?id=1468202 to
|
|
|
21ab4e |
+ * track the fix for this. Currently synclock takes care of synchronizing
|
|
|
21ab4e |
+ * hardlink migration. Once this bug is fixed we can avoid taking synclock */
|
|
|
21ab4e |
ret = syncop_inodelk (from, DHT_FILE_MIGRATE_DOMAIN, &tmp_loc, F_SETLKW,
|
|
|
21ab4e |
&flock, NULL, NULL);
|
|
|
21ab4e |
if (ret < 0) {
|
|
|
21ab4e |
@@ -2671,8 +2739,8 @@ gf_defrag_migrate_single_file (void *opaque)
|
|
|
21ab4e |
}
|
|
|
21ab4e |
UNLOCK (&defrag->lock);
|
|
|
21ab4e |
} else if (fop_errno != EEXIST) {
|
|
|
21ab4e |
- gf_msg (this->name, GF_LOG_ERROR,
|
|
|
21ab4e |
- DHT_MSG_MIGRATE_FILE_FAILED, fop_errno,
|
|
|
21ab4e |
+ gf_msg (this->name, GF_LOG_ERROR, fop_errno,
|
|
|
21ab4e |
+ DHT_MSG_MIGRATE_FILE_FAILED,
|
|
|
21ab4e |
"migrate-data failed for %s", entry_loc.path);
|
|
|
21ab4e |
|
|
|
21ab4e |
LOCK (&defrag->lock);
|
|
|
21ab4e |
--
|
|
|
21ab4e |
1.8.3.1
|
|
|
21ab4e |
|