|
|
3604df |
From 7fde925c55250e925df4045a849918c8eebe2b48 Mon Sep 17 00:00:00 2001
|
|
|
3604df |
From: Susant Palai <spalai@redhat.com>
|
|
|
3604df |
Date: Tue, 13 Dec 2016 14:26:53 +0530
|
|
|
3604df |
Subject: [PATCH 256/257] dht/rebalance: reverify lookup failures
|
|
|
3604df |
|
|
|
3604df |
race: readdirp has read one entry, and doing a lookup on
|
|
|
3604df |
that entry, but user might have renamed/removed that entry just
|
|
|
3604df |
after readdirp but before lookup.
|
|
|
3604df |
|
|
|
3604df |
Since remove-brick is a costly opertaion,will ingore any
|
|
|
3604df |
ENOENT/ESTALE failures and move on.
|
|
|
3604df |
|
|
|
3604df |
upstream patch: http://review.gluster.org/#/c/15846/
|
|
|
3604df |
|
|
|
3604df |
Change-Id: I62c7fa93c0b9b7e764065ad1574b97acf51b5996
|
|
|
3604df |
BUG: 1405000
|
|
|
3604df |
(cherry picked from commit bb71f13a338e7b8e24ef162ac0669523bc52783f)
|
|
|
3604df |
Signed-off-by: Susant Palai <spalai@redhat.com>
|
|
|
3604df |
Reviewed-on: https://code.engineering.redhat.com/gerrit/93573
|
|
|
3604df |
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
|
|
|
3604df |
---
|
|
|
3604df |
xlators/cluster/dht/src/dht-messages.h | 11 +-
|
|
|
3604df |
xlators/cluster/dht/src/dht-rebalance.c | 177 ++++++++++++++++++++++----------
|
|
|
3604df |
2 files changed, 130 insertions(+), 58 deletions(-)
|
|
|
3604df |
|
|
|
3604df |
diff --git a/xlators/cluster/dht/src/dht-messages.h b/xlators/cluster/dht/src/dht-messages.h
|
|
|
3604df |
index 153f4de..30b64eb 100644
|
|
|
3604df |
--- a/xlators/cluster/dht/src/dht-messages.h
|
|
|
3604df |
+++ b/xlators/cluster/dht/src/dht-messages.h
|
|
|
3604df |
@@ -40,7 +40,7 @@
|
|
|
3604df |
*/
|
|
|
3604df |
|
|
|
3604df |
#define GLFS_DHT_BASE GLFS_MSGID_COMP_DHT
|
|
|
3604df |
-#define GLFS_DHT_NUM_MESSAGES 117
|
|
|
3604df |
+#define GLFS_DHT_NUM_MESSAGES 118
|
|
|
3604df |
#define GLFS_MSGID_END (GLFS_DHT_BASE + GLFS_DHT_NUM_MESSAGES + 1)
|
|
|
3604df |
|
|
|
3604df |
/* Messages with message IDs */
|
|
|
3604df |
@@ -1072,11 +1072,18 @@
|
|
|
3604df |
#define DHT_MSG_LOCK_INODE_UNREF_FAILED (GLFS_DHT_BASE + 116)
|
|
|
3604df |
|
|
|
3604df |
/*
|
|
|
3604df |
- * @messageid 109116
|
|
|
3604df |
+ * @messageid 109117
|
|
|
3604df |
* @diagnosis
|
|
|
3604df |
* @recommendedaction None
|
|
|
3604df |
*/
|
|
|
3604df |
#define DHT_MSG_ASPRINTF_FAILED (GLFS_DHT_BASE + 117)
|
|
|
3604df |
|
|
|
3604df |
+/*
|
|
|
3604df |
+ * @messageid 109118
|
|
|
3604df |
+ * @diagnosis
|
|
|
3604df |
+ * @recommendedaction None
|
|
|
3604df |
+ */
|
|
|
3604df |
+#define DHT_MSG_DIR_LOOKUP_FAILED (GLFS_DHT_BASE + 118)
|
|
|
3604df |
+
|
|
|
3604df |
#define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"
|
|
|
3604df |
#endif /* _DHT_MESSAGES_H_ */
|
|
|
3604df |
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
|
|
|
3604df |
index c82700e..ed9991f 100644
|
|
|
3604df |
--- a/xlators/cluster/dht/src/dht-rebalance.c
|
|
|
3604df |
+++ b/xlators/cluster/dht/src/dht-rebalance.c
|
|
|
3604df |
@@ -2446,6 +2446,7 @@ gf_defrag_get_entry (xlator_t *this, int i, struct dht_container **container,
|
|
|
3604df |
struct dht_container *tmp_container = NULL;
|
|
|
3604df |
xlator_t *hashed_subvol = NULL;
|
|
|
3604df |
xlator_t *cached_subvol = NULL;
|
|
|
3604df |
+ int fop_errno = 0;
|
|
|
3604df |
|
|
|
3604df |
if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
|
|
|
3604df |
ret = -1;
|
|
|
3604df |
@@ -2469,11 +2470,11 @@ gf_defrag_get_entry (xlator_t *this, int i, struct dht_container **container,
|
|
|
3604df |
}
|
|
|
3604df |
|
|
|
3604df |
if (ret < 0) {
|
|
|
3604df |
- gf_msg (this->name, GF_LOG_ERROR, 0,
|
|
|
3604df |
+ gf_msg (this->name, GF_LOG_WARNING, -ret,
|
|
|
3604df |
DHT_MSG_MIGRATE_DATA_FAILED,
|
|
|
3604df |
- "%s: Migrate data failed: Readdir returned"
|
|
|
3604df |
- " %s. Aborting migrate-data", loc->path,
|
|
|
3604df |
- strerror(-ret));
|
|
|
3604df |
+ "Readdirp failed. Aborting data migration for "
|
|
|
3604df |
+ "directory: %s", loc->path);
|
|
|
3604df |
+ fop_errno = -ret;
|
|
|
3604df |
ret = -1;
|
|
|
3604df |
goto out;
|
|
|
3604df |
}
|
|
|
3604df |
@@ -2605,9 +2606,9 @@ gf_defrag_get_entry (xlator_t *this, int i, struct dht_container **container,
|
|
|
3604df |
ret = syncop_lookup (this, &entry_loc, NULL, NULL,
|
|
|
3604df |
NULL, NULL);
|
|
|
3604df |
if (ret) {
|
|
|
3604df |
- gf_msg (this->name, GF_LOG_ERROR, 0,
|
|
|
3604df |
+ gf_msg (this->name, GF_LOG_WARNING, -ret,
|
|
|
3604df |
DHT_MSG_MIGRATE_FILE_FAILED,
|
|
|
3604df |
- "Migrate file failed:%s lookup failed",
|
|
|
3604df |
+ "lookup failed for file:%s",
|
|
|
3604df |
entry_loc.path);
|
|
|
3604df |
|
|
|
3604df |
if (-ret != ENOENT && -ret != ESTALE) {
|
|
|
3604df |
@@ -2728,6 +2729,9 @@ out:
|
|
|
3604df |
|
|
|
3604df |
if (xattr_rsp)
|
|
|
3604df |
dict_unref (xattr_rsp);
|
|
|
3604df |
+
|
|
|
3604df |
+
|
|
|
3604df |
+ errno = fop_errno;
|
|
|
3604df |
return ret;
|
|
|
3604df |
}
|
|
|
3604df |
|
|
|
3604df |
@@ -2753,6 +2757,7 @@ gf_defrag_process_dir (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
|
|
|
3604df |
int throttle_up = 0;
|
|
|
3604df |
struct dir_dfmeta *dir_dfmeta = NULL;
|
|
|
3604df |
int should_commit_hash = 1;
|
|
|
3604df |
+ int fop_errno = 0;
|
|
|
3604df |
|
|
|
3604df |
gf_log (this->name, GF_LOG_INFO, "migrate data called on %s",
|
|
|
3604df |
loc->path);
|
|
|
3604df |
@@ -2775,10 +2780,11 @@ gf_defrag_process_dir (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
|
|
|
3604df |
|
|
|
3604df |
ret = syncop_opendir (this, loc, fd, NULL, NULL);
|
|
|
3604df |
if (ret) {
|
|
|
3604df |
- gf_msg (this->name, GF_LOG_ERROR, 0,
|
|
|
3604df |
+ gf_msg (this->name, GF_LOG_WARNING, 0,
|
|
|
3604df |
DHT_MSG_MIGRATE_DATA_FAILED,
|
|
|
3604df |
"Migrate data failed: Failed to open dir %s",
|
|
|
3604df |
loc->path);
|
|
|
3604df |
+ fop_errno = -ret;
|
|
|
3604df |
ret = -1;
|
|
|
3604df |
goto out;
|
|
|
3604df |
}
|
|
|
3604df |
@@ -2925,9 +2931,12 @@ gf_defrag_process_dir (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
|
|
|
3604df |
migrate_data, dir_dfmeta,
|
|
|
3604df |
xattr_req,
|
|
|
3604df |
&should_commit_hash);
|
|
|
3604df |
+
|
|
|
3604df |
if (ret) {
|
|
|
3604df |
- gf_log ("DHT", GF_LOG_INFO, "Found critical "
|
|
|
3604df |
+ fop_errno = errno;
|
|
|
3604df |
+ gf_log ("this->name", GF_LOG_WARNING, "Found "
|
|
|
3604df |
"error from gf_defrag_get_entry");
|
|
|
3604df |
+
|
|
|
3604df |
ret = -1;
|
|
|
3604df |
goto out;
|
|
|
3604df |
}
|
|
|
3604df |
@@ -2985,6 +2994,7 @@ out:
|
|
|
3604df |
ret = 2;
|
|
|
3604df |
}
|
|
|
3604df |
|
|
|
3604df |
+ errno = fop_errno;
|
|
|
3604df |
return ret;
|
|
|
3604df |
}
|
|
|
3604df |
int
|
|
|
3604df |
@@ -3154,7 +3164,6 @@ out:
|
|
|
3604df |
return ret;
|
|
|
3604df |
}
|
|
|
3604df |
|
|
|
3604df |
-
|
|
|
3604df |
int
|
|
|
3604df |
gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
|
|
|
3604df |
dict_t *fix_layout, dict_t *migrate_data)
|
|
|
3604df |
@@ -3178,14 +3187,33 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
|
|
|
3604df |
goto out;
|
|
|
3604df |
}
|
|
|
3604df |
|
|
|
3604df |
-
|
|
|
3604df |
-
|
|
|
3604df |
ret = syncop_lookup (this, loc, &iatt, NULL, NULL, NULL);
|
|
|
3604df |
if (ret) {
|
|
|
3604df |
- gf_log (this->name, GF_LOG_ERROR, "Lookup failed on %s",
|
|
|
3604df |
- loc->path);
|
|
|
3604df |
- ret = -1;
|
|
|
3604df |
- goto out;
|
|
|
3604df |
+ if (strcmp (loc->path, "/") == 0) {
|
|
|
3604df |
+ gf_msg (this->name, GF_LOG_ERROR, -ret,
|
|
|
3604df |
+ DHT_MSG_DIR_LOOKUP_FAILED,
|
|
|
3604df |
+ "lookup failed for:%s", loc->path);
|
|
|
3604df |
+
|
|
|
3604df |
+ defrag->total_failures++;
|
|
|
3604df |
+ ret = -1;
|
|
|
3604df |
+ goto out;
|
|
|
3604df |
+ }
|
|
|
3604df |
+
|
|
|
3604df |
+ if (-ret == ENOENT || -ret == ESTALE) {
|
|
|
3604df |
+ gf_msg (this->name, GF_LOG_INFO, errno,
|
|
|
3604df |
+ DHT_MSG_DIR_LOOKUP_FAILED,
|
|
|
3604df |
+ "Dir:%s renamed or removed. Skipping",
|
|
|
3604df |
+ loc->path);
|
|
|
3604df |
+ ret = 0;
|
|
|
3604df |
+ goto out;
|
|
|
3604df |
+ } else {
|
|
|
3604df |
+ gf_msg (this->name, GF_LOG_ERROR, -ret,
|
|
|
3604df |
+ DHT_MSG_DIR_LOOKUP_FAILED,
|
|
|
3604df |
+ "lookup failed for:%s", loc->path);
|
|
|
3604df |
+
|
|
|
3604df |
+ defrag->total_failures++;
|
|
|
3604df |
+ goto out;
|
|
|
3604df |
+ }
|
|
|
3604df |
}
|
|
|
3604df |
|
|
|
3604df |
if ((defrag->cmd != GF_DEFRAG_CMD_START_TIER) &&
|
|
|
3604df |
@@ -3193,18 +3221,24 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
|
|
|
3604df |
ret = gf_defrag_process_dir (this, defrag, loc, migrate_data);
|
|
|
3604df |
|
|
|
3604df |
if (ret && ret != 2) {
|
|
|
3604df |
- defrag->total_failures++;
|
|
|
3604df |
+ if (errno == ENOENT || errno == ESTALE) {
|
|
|
3604df |
+ ret = 0;
|
|
|
3604df |
+ goto out;
|
|
|
3604df |
+ } else {
|
|
|
3604df |
|
|
|
3604df |
- gf_msg (this->name, GF_LOG_ERROR, 0,
|
|
|
3604df |
- DHT_MSG_DEFRAG_PROCESS_DIR_FAILED,
|
|
|
3604df |
- "gf_defrag_process_dir failed for directory: %s"
|
|
|
3604df |
- , loc->path);
|
|
|
3604df |
+ defrag->total_failures++;
|
|
|
3604df |
|
|
|
3604df |
- if (conf->decommission_in_progress) {
|
|
|
3604df |
- goto out;
|
|
|
3604df |
- }
|
|
|
3604df |
+ gf_msg (this->name, GF_LOG_ERROR, 0,
|
|
|
3604df |
+ DHT_MSG_DEFRAG_PROCESS_DIR_FAILED,
|
|
|
3604df |
+ "gf_defrag_process_dir failed for "
|
|
|
3604df |
+ "directory: %s", loc->path);
|
|
|
3604df |
|
|
|
3604df |
- should_commit_hash = 0;
|
|
|
3604df |
+ if (conf->decommission_in_progress) {
|
|
|
3604df |
+ goto out;
|
|
|
3604df |
+ }
|
|
|
3604df |
+
|
|
|
3604df |
+ should_commit_hash = 0;
|
|
|
3604df |
+ }
|
|
|
3604df |
} else if (ret == 2) {
|
|
|
3604df |
should_commit_hash = 0;
|
|
|
3604df |
}
|
|
|
3604df |
@@ -3221,8 +3255,14 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
|
|
|
3604df |
|
|
|
3604df |
ret = syncop_opendir (this, loc, fd, NULL, NULL);
|
|
|
3604df |
if (ret) {
|
|
|
3604df |
- gf_log (this->name, GF_LOG_ERROR, "Failed to open dir %s",
|
|
|
3604df |
- loc->path);
|
|
|
3604df |
+ if (-ret == ENOENT || -ret == ESTALE) {
|
|
|
3604df |
+ ret = 0;
|
|
|
3604df |
+ goto out;
|
|
|
3604df |
+ }
|
|
|
3604df |
+
|
|
|
3604df |
+ gf_log (this->name, GF_LOG_ERROR, "Failed to open dir %s, "
|
|
|
3604df |
+ "err:%d", loc->path, -ret);
|
|
|
3604df |
+
|
|
|
3604df |
ret = -1;
|
|
|
3604df |
goto out;
|
|
|
3604df |
}
|
|
|
3604df |
@@ -3234,8 +3274,15 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
|
|
|
3604df |
{
|
|
|
3604df |
|
|
|
3604df |
if (ret < 0) {
|
|
|
3604df |
- gf_log (this->name, GF_LOG_ERROR, "Readdir returned %s"
|
|
|
3604df |
- ". Aborting fix-layout",strerror(-ret));
|
|
|
3604df |
+ if (-ret == ENOENT || -ret == ESTALE) {
|
|
|
3604df |
+ ret = 0;
|
|
|
3604df |
+ goto out;
|
|
|
3604df |
+ }
|
|
|
3604df |
+
|
|
|
3604df |
+ gf_msg (this->name, GF_LOG_ERROR, -ret,
|
|
|
3604df |
+ DHT_MSG_READDIR_ERROR, "readdirp failed for "
|
|
|
3604df |
+ "path %s. Aborting fix-layout", loc->path);
|
|
|
3604df |
+
|
|
|
3604df |
ret = -1;
|
|
|
3604df |
goto out;
|
|
|
3604df |
}
|
|
|
3604df |
@@ -3327,43 +3374,63 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
|
|
|
3604df |
|
|
|
3604df |
ret = syncop_lookup (this, &entry_loc, &iatt, NULL,
|
|
|
3604df |
NULL, NULL);
|
|
|
3604df |
- /*Check whether it is ENOENT or ESTALE*/
|
|
|
3604df |
if (ret) {
|
|
|
3604df |
- gf_log (this->name, GF_LOG_ERROR, "%s"
|
|
|
3604df |
- " lookup failed with %d",
|
|
|
3604df |
- entry_loc.path, -ret);
|
|
|
3604df |
-
|
|
|
3604df |
- if (!conf->decommission_in_progress &&
|
|
|
3604df |
- -ret != ENOENT && -ret != ESTALE) {
|
|
|
3604df |
- should_commit_hash = 0;
|
|
|
3604df |
+ if (-ret == ENOENT || -ret == ESTALE) {
|
|
|
3604df |
+ gf_msg (this->name, GF_LOG_INFO, errno,
|
|
|
3604df |
+ DHT_MSG_DIR_LOOKUP_FAILED,
|
|
|
3604df |
+ "Dir:%s renamed or removed. "
|
|
|
3604df |
+ "Skipping", loc->path);
|
|
|
3604df |
+ ret = 0;
|
|
|
3604df |
+ continue;
|
|
|
3604df |
+ } else {
|
|
|
3604df |
+ gf_msg (this->name, GF_LOG_ERROR, -ret,
|
|
|
3604df |
+ DHT_MSG_DIR_LOOKUP_FAILED,
|
|
|
3604df |
+ "lookup failed for:%s",
|
|
|
3604df |
+ entry_loc.path);
|
|
|
3604df |
+ defrag->total_failures++;
|
|
|
3604df |
+ if (conf->decommission_in_progress) {
|
|
|
3604df |
+ defrag->defrag_status =
|
|
|
3604df |
+ GF_DEFRAG_STATUS_FAILED;
|
|
|
3604df |
+ ret = -1;
|
|
|
3604df |
+ goto out;
|
|
|
3604df |
+ } else {
|
|
|
3604df |
+ should_commit_hash = 0;
|
|
|
3604df |
+ continue;
|
|
|
3604df |
+ }
|
|
|
3604df |
}
|
|
|
3604df |
-
|
|
|
3604df |
- continue;
|
|
|
3604df |
}
|
|
|
3604df |
|
|
|
3604df |
ret = syncop_setxattr (this, &entry_loc, fix_layout,
|
|
|
3604df |
0, NULL, NULL);
|
|
|
3604df |
if (ret) {
|
|
|
3604df |
- gf_log (this->name, GF_LOG_ERROR, "Setxattr "
|
|
|
3604df |
- "failed for %s", entry_loc.path);
|
|
|
3604df |
-
|
|
|
3604df |
- defrag->total_failures++;
|
|
|
3604df |
-
|
|
|
3604df |
- /*Don't go for fix-layout of child subtree if"
|
|
|
3604df |
- fix-layout failed*/
|
|
|
3604df |
- if (conf->decommission_in_progress) {
|
|
|
3604df |
- defrag->defrag_status =
|
|
|
3604df |
- GF_DEFRAG_STATUS_FAILED;
|
|
|
3604df |
-
|
|
|
3604df |
- ret = -1;
|
|
|
3604df |
-
|
|
|
3604df |
- goto out;
|
|
|
3604df |
- } else {
|
|
|
3604df |
+ if (-ret == ENOENT || -ret == ESTALE) {
|
|
|
3604df |
+ gf_msg (this->name, GF_LOG_INFO, -ret,
|
|
|
3604df |
+ DHT_MSG_LAYOUT_FIX_FAILED,
|
|
|
3604df |
+ "Setxattr failed. Dir %s "
|
|
|
3604df |
+ "renamed or removed",
|
|
|
3604df |
+ entry_loc.path);
|
|
|
3604df |
+ ret = 0;
|
|
|
3604df |
continue;
|
|
|
3604df |
+ } else {
|
|
|
3604df |
+
|
|
|
3604df |
+ gf_msg (this->name, GF_LOG_ERROR, -ret,
|
|
|
3604df |
+ DHT_MSG_LAYOUT_FIX_FAILED,
|
|
|
3604df |
+ "Setxattr failed for %s",
|
|
|
3604df |
+ entry_loc.path);
|
|
|
3604df |
+
|
|
|
3604df |
+ defrag->total_failures++;
|
|
|
3604df |
+
|
|
|
3604df |
+ if (conf->decommission_in_progress) {
|
|
|
3604df |
+ defrag->defrag_status =
|
|
|
3604df |
+ GF_DEFRAG_STATUS_FAILED;
|
|
|
3604df |
+ ret = -1;
|
|
|
3604df |
+ goto out;
|
|
|
3604df |
+ } else {
|
|
|
3604df |
+ continue;
|
|
|
3604df |
+ }
|
|
|
3604df |
}
|
|
|
3604df |
}
|
|
|
3604df |
|
|
|
3604df |
-
|
|
|
3604df |
/* A return value of 2 means, either process_dir or
|
|
|
3604df |
* lookup of a dir failed. Hence, don't commit hash
|
|
|
3604df |
* for the current directory*/
|
|
|
3604df |
@@ -3383,8 +3450,6 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
|
|
|
3604df |
defrag->defrag_status =
|
|
|
3604df |
GF_DEFRAG_STATUS_FAILED;
|
|
|
3604df |
|
|
|
3604df |
- ret = -1;
|
|
|
3604df |
-
|
|
|
3604df |
goto out;
|
|
|
3604df |
} else {
|
|
|
3604df |
/* Let's not commit-hash if
|
|
|
3604df |
--
|
|
|
3604df |
2.9.3
|
|
|
3604df |
|