From 4c91b33219078226a3cccabc409b366f3bccfbc0 Mon Sep 17 00:00:00 2001
From: Susant Palai
Date: Tue, 16 Jun 2015 20:35:46 +0530
Subject: [PATCH 292/304] dht: block/handle create op falling to decommissioned brick

Problem: From remove-brick start until the commit phase, the client layout
may not be in sync with the disk layout because of a missing lookup. Hence,
a create call may fall on the decommissioned brick.

Solution: Acquire a lock on the hashed subvolume so that a fix-layout or
selfheal cannot modify the layout while it is being read. Even if we read a
layout before the remove-brick fix-layout and the file falls on the
decommissioned brick, the file will be migrated to a new brick as per the
fix-layout.

BUG: 1225452
Change-Id: Ice13674d61522a64c92ba26fe402333335da0462
Signed-off-by: Susant Palai
Reviewed-on: http://review.gluster.org/11260
Tested-by: Gluster Build System
Tested-by: NetBSD Build System
Reviewed-by: Raghavendra G
Signed-off-by: Susant Palai
Reviewed-on: https://code.engineering.redhat.com/gerrit/56166
Reviewed-by: Raghavendra Gowdappa
Tested-by: Raghavendra Gowdappa
---
 xlators/cluster/dht/src/dht-common.c    | 456 +++++++++++++++++++++++++++----
 xlators/cluster/dht/src/dht-common.h    |  11 +
 xlators/cluster/dht/src/dht-diskusage.c |  25 ++-
 xlators/cluster/dht/src/dht-linkfile.c  |   2 +-
 xlators/cluster/dht/src/dht-selfheal.c  |  18 +-
 5 files changed, 455 insertions(+), 57 deletions(-)

diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 8b2715f..ebc183d 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -39,7 +39,6 @@ dht_setxattr2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame);
 
 int run_defrag = 0;
 
-
 int
 dht_aggregate_quota_xattr (dict_t *dst, char *key, data_t *value)
 {
@@ -3505,7 +3504,6 @@ err:
         return 0;
 }
 
-
 static int
 dht_common_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                          int32_t op_ret, int32_t op_errno,
@@ -5462,9 +5460,6 @@ dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
         int           ret = -1;
         dht_local_t  *local = NULL;
 
-        if (op_ret == -1)
-                goto out;
-
         local = frame->local;
         if (!local) {
                 op_ret = -1;
@@ -5472,6 +5467,9 @@ dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                 goto out;
         }
 
+        if (op_ret == -1)
+                goto out;
+
         prev = cookie;
 
         if (local->loc.parent) {
@@ -5491,18 +5489,34 @@ dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                 op_errno = EINVAL;
                 goto out;
         }
+
+        local->op_errno = op_errno;
+
         if (local->linked == _gf_true) {
                 local->stbuf = *stbuf;
                 dht_linkfile_attr_heal (frame, this);
         }
 out:
+
         DHT_STRIP_PHASE1_FLAGS (stbuf);
-        DHT_STACK_UNWIND (create, frame, op_ret, op_errno, fd, inode, stbuf, preparent,
-                          postparent, xdata);
+
+        if (local && local->lock.locks) {
+                /* store op_errno for failure case */
+                local->op_errno = op_errno;
+                local->refresh_layout_unlock (frame, this, op_ret);
+
+                if (op_ret == 0) {
+                        DHT_STACK_UNWIND (create, frame, op_ret, op_errno, fd,
+                                          inode, stbuf, preparent, postparent,
+                                          xdata);
+                }
+        } else {
+                DHT_STACK_UNWIND (create, frame, op_ret, op_errno, fd, inode,
+                                  stbuf, preparent, postparent, xdata);
+        }
         return 0;
 }
 
-
 int
 dht_create_linkfile_create_cbk (call_frame_t *frame, void *cookie,
                                 xlator_t *this,
@@ -5514,8 +5528,10 @@ dht_create_linkfile_create_cbk (call_frame_t *frame, void *cookie,
         dht_local_t  *local = NULL;
         xlator_t     *cached_subvol = NULL;
 
-        if (op_ret == -1)
+        if (op_ret == -1) {
+                local->op_errno = op_errno;
                 goto err;
+        }
 
         local = frame->local;
         cached_subvol = local->cached_subvol;
@@ -5527,25 +5543,327 @@ dht_create_linkfile_create_cbk (call_frame_t *frame, void *cookie,
         return 0;
 err:
-        DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL,
-                          NULL, NULL, NULL);
+        if (local->lock.locks)
+                local->refresh_layout_unlock (frame, this, -1);
+
+        return 0;
+}
+
+int
+dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,
+                                 xlator_t *subvol, loc_t *loc, int32_t flags,
+                                 mode_t mode, mode_t umask, fd_t *fd,
+                                 dict_t *params)
+{
+        dht_local_t *local        = NULL;
+        xlator_t    *avail_subvol = NULL;
+
+        local = frame->local;
+
+        if (!dht_is_subvol_filled (this, subvol)) {
+                gf_msg_debug (this->name, 0,
+                              "creating %s on %s", loc->path,
+                              subvol->name);
+
+                STACK_WIND (frame, dht_create_cbk,
+                            subvol, subvol->fops->create,
+                            loc, flags, mode, umask, fd, params);
+
+        } else {
+                avail_subvol = dht_free_disk_available_subvol (this, subvol, local);
+
+                if (avail_subvol != subvol) {
+                        local->params = dict_ref (params);
+                        local->flags = flags;
+                        local->mode = mode;
+                        local->umask = umask;
+                        local->cached_subvol = avail_subvol;
+                        local->hashed_subvol = subvol;
+
+                        gf_msg_debug (this->name, 0,
+                                      "creating %s on %s (link at %s)", loc->path,
+                                      avail_subvol->name, subvol->name);
+
+                        dht_linkfile_create (frame, dht_create_linkfile_create_cbk,
+                                             this, avail_subvol, subvol, loc);
+
+                        goto out;
+                }
+
+                gf_msg_debug (this->name, 0,
+                              "creating %s on %s", loc->path, subvol->name);
+
+                STACK_WIND (frame, dht_create_cbk,
+                            subvol, subvol->fops->create,
+                            loc, flags, mode, umask, fd, params);
+        }
+out:
         return 0;
 }
 
 int
+dht_build_parent_loc (xlator_t *this, loc_t *parent, loc_t *child,
+                      int32_t *op_errno)
+{
+        inode_table_t *table = NULL;
+        int            ret   = -1;
+
+        if (!parent || !child) {
+                if (op_errno)
+                        *op_errno = EINVAL;
+                goto out;
+        }
+
+        if (child->parent) {
+                parent->inode = inode_ref (child->parent);
+                if (!parent->inode) {
+                        if (op_errno)
+                                *op_errno = EINVAL;
+                        goto out;
+                }
+
+                gf_uuid_copy (parent->gfid, child->pargfid);
+
+                ret = 0;
+
+                goto out;
+        } else {
+                if (gf_uuid_is_null (child->pargfid)) {
+                        if (op_errno)
+                                *op_errno = EINVAL;
+                        goto out;
+                }
+
+                table = this->itable;
+
+                if (!table) {
+                        if (op_errno) {
+                                *op_errno = EINVAL;
+                                goto out;
+                        }
+                }
+
+                parent->inode = inode_find (table, child->pargfid);
+
+                if (!parent->inode) {
+                        if (op_errno) {
+                                *op_errno = EINVAL;
+                                goto out;
+                        }
+                }
+
+                gf_uuid_copy (parent->gfid, child->pargfid);
+
+                ret = 0;
+        }
+
+out:
+        return ret;
+}
+
+
+int32_t
+dht_create_do (call_frame_t *frame)
+{
+        dht_local_t   *local     = NULL;
+        dht_layout_t  *refreshed = NULL;
+        xlator_t      *subvol    = NULL;
+        xlator_t      *this      = NULL;
+        dht_conf_t    *conf      = NULL;
+        dht_methods_t *methods   = NULL;
+
+        local = frame->local;
+
+        this = THIS;
+
+        conf = this->private;
+
+        GF_VALIDATE_OR_GOTO (this->name, conf, err);
+
+        methods = conf->methods;
+
+        GF_VALIDATE_OR_GOTO (this->name, conf->methods, err);
+
+        /* We don't need parent_loc anymore */
+        loc_wipe (&local->loc);
+
+        loc_copy (&local->loc, &local->loc2);
+
+        loc_wipe (&local->loc2);
+
+        refreshed = local->selfheal.refreshed_layout;
+
+        subvol = methods->layout_search (this, refreshed, local->loc.name);
+
+        if (!subvol) {
+                gf_msg (this->name, GF_LOG_ERROR, 0,
+                        DHT_MSG_HASHED_SUBVOL_GET_FAILED, "no subvolume in "
+                        "layout for path=%s", local->loc.path);
+                local->op_errno = ENOENT;
+                goto err;
+        }
+
+        dht_create_wind_to_avail_subvol (frame, this, subvol, &local->loc,
+                                         local->flags, local->mode,
+                                         local->umask, local->fd, local->params);
+        return 0;
+err:
+        local->refresh_layout_unlock (frame, this, -1);
+
+        return 0;
+}
+
+int32_t
+dht_create_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+        DHT_STACK_DESTROY (frame);
+        return 0;
+}
+
+int32_t
+dht_create_finish (call_frame_t *frame, xlator_t *this, int op_ret)
+{
+        dht_local_t  *local = NULL, *lock_local = NULL;
+        call_frame_t *lock_frame = NULL;
+        int           lock_count = 0;
+
+        local = frame->local;
+        lock_count = dht_lock_count (local->lock.locks, local->lock.lk_count);
+        if (lock_count == 0)
+                goto done;
+
+        lock_frame = copy_frame (frame);
+        if (lock_frame == NULL) {
+                goto done;
+        }
+
+        lock_local = dht_local_init (lock_frame, &local->loc, NULL,
+                                     lock_frame->root->op);
+        if (lock_local == NULL) {
+                goto done;
+        }
+
+        lock_local->lock.locks = local->lock.locks;
+        lock_local->lock.lk_count = local->lock.lk_count;
+
+        local->lock.locks = NULL;
+        local->lock.lk_count = 0;
+
+        dht_unlock_inodelk (lock_frame, lock_local->lock.locks,
+                            lock_local->lock.lk_count,
+                            dht_create_unlock_cbk);
+        lock_frame = NULL;
+
+done:
+        if (lock_frame != NULL) {
+                DHT_STACK_DESTROY (lock_frame);
+        }
+
+        if (op_ret == 0)
+                return 0;
+
+        DHT_STACK_UNWIND (create, frame, op_ret, local->op_errno, NULL, NULL,
+                          NULL, NULL, NULL, NULL);
+        return 0;
+}
+
+int32_t
+dht_create_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+        dht_local_t *local = NULL;
+
+        local = frame->local;
+
+        if (!local) {
+                goto err;
+        }
+
+        if (op_ret < 0) {
+                gf_msg ("DHT", GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR,
+                        "Create lock failed for file: %s", local->loc2.name);
+
+                local->op_errno = op_errno;
+
+                goto err;
+        }
+
+        local->refresh_layout_unlock = dht_create_finish;
+
+        local->refresh_layout_done = dht_create_do;
+
+        dht_refresh_layout (frame);
+
+        return 0;
+err:
+        dht_create_finish (frame, this, -1);
+        return 0;
+}
+
+int32_t
+dht_create_lock (call_frame_t *frame, xlator_t *subvol)
+{
+        dht_local_t  *local = NULL;
+        int           count = 1, ret = -1;
+        dht_lock_t  **lk_array = NULL;
+
+        GF_VALIDATE_OR_GOTO ("dht", frame, err);
+        GF_VALIDATE_OR_GOTO (frame->this->name, frame->local, err);
+
+        local = frame->local;
+
+        lk_array = GF_CALLOC (count, sizeof (*lk_array), gf_common_mt_char);
+
+        if (lk_array == NULL)
+                goto err;
+
+        lk_array[0] = dht_lock_new (frame->this, subvol, &local->loc, F_RDLCK,
+                                    DHT_LAYOUT_HEAL_DOMAIN);
+
+        if (lk_array[0] == NULL)
+                goto err;
+
+        local->lock.locks = lk_array;
+        local->lock.lk_count = count;
+
+        ret = dht_blocking_inodelk (frame, lk_array, count,
+                                    dht_create_lock_cbk);
+
+        if (ret < 0) {
+                local->lock.locks = NULL;
+                local->lock.lk_count = 0;
+                goto err;
+        }
+
+        return 0;
+err:
+        if (lk_array != NULL) {
+                dht_lock_array_free (lk_array, count);
+                GF_FREE (lk_array);
+        }
+
+        return -1;
+}
+
+int
 dht_create (call_frame_t *frame, xlator_t *this,
             loc_t *loc, int32_t flags, mode_t mode,
             mode_t umask, fd_t *fd, dict_t *params)
 {
-        int          op_errno = -1;
-        xlator_t    *subvol = NULL;
-        dht_local_t *local = NULL;
-        xlator_t    *avail_subvol = NULL;
+        int          op_errno = -1;
+        xlator_t    *subvol = NULL;
+        dht_local_t *local = NULL;
+        int          i = 0;
+        dht_conf_t  *conf = NULL;
+        int          ret = 0;
 
         VALIDATE_OR_GOTO (frame, err);
         VALIDATE_OR_GOTO (this, err);
         VALIDATE_OR_GOTO (loc, err);
 
+        conf = this->private;
+
         dht_get_du_info (frame, this, loc);
 
         local = dht_local_init (frame, loc, fd, GF_FOP_CREATE);
@@ -5568,48 +5886,90 @@ dht_create (call_frame_t *frame, xlator_t *this,
 
         subvol = dht_subvol_get_hashed (this, loc);
         if (!subvol) {
-                gf_msg_debug (this->name, 0,
-                              "no subvolume in layout for path=%s",
-                              loc->path);
+                gf_msg (this->name, GF_LOG_ERROR, 0,
+                        DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+                        "no subvolume in layout for path=%s",
+                        loc->path);
+
                 op_errno = ENOENT;
                 goto err;
         }
 
-        if (!dht_is_subvol_filled (this, subvol)) {
-                gf_msg_trace (this->name, 0,
-                              "creating %s on %s", loc->path,
-                              subvol->name);
-                STACK_WIND (frame, dht_create_cbk,
-                            subvol, subvol->fops->create,
-                            loc, flags, mode, umask, fd, params);
-                goto done;
-        }
-        /* Choose the minimum filled volume, and create the
-           files there */
-        avail_subvol = dht_free_disk_available_subvol (this, subvol, local);
-        if (avail_subvol != subvol) {
-                local->params = dict_ref (params);
-                local->flags = flags;
-                local->mode = mode;
-                local->umask = umask;
-                local->cached_subvol = avail_subvol;
-                local->hashed_subvol = subvol;
-                gf_msg_trace (this->name, 0,
-                              "creating %s on %s (link at %s)", loc->path,
-                              avail_subvol->name, subvol->name);
-                dht_linkfile_create (frame, dht_create_linkfile_create_cbk,
-                                     this, avail_subvol, subvol, loc);
-                goto done;
+        /* Post remove-brick, the client layout may not be in sync with
+         * disk layout because of lack of lookup. Hence, a create call
+         * may fall on the decommissioned brick. Hence, if the
+         * hashed_subvol is part of the decommissioned bricks list, do a
+         * lookup on parent dir. If a fix-layout is already done by the
+         * remove-brick process, the parent directory layout will be in
+         * sync with that of the disk. If fix-layout is still pending
+         * on the parent directory, we can let the file get created on
+         * the decommissioned brick which will be eventually migrated to
+         * a non-decommissioned brick based on the new layout.
+         */
+
+        if (conf->decommission_subvols_cnt) {
+                for (i = 0; i < conf->subvolume_cnt; i++) {
+                        if (conf->decommissioned_bricks[i] &&
+                            conf->decommissioned_bricks[i] == subvol) {
+
+                                gf_msg_debug (this->name, 0, "hashed subvol:%s is "
+                                              "part of decommission brick list for "
+                                              "file: %s", subvol->name, loc->path);
+
+                                /* dht_refresh_layout needs directory info in
+                                 * local->loc. Hence, storing the parent_loc in
+                                 * local->loc and storing the create context in
+                                 * local->loc2. We will restore this information
+                                 * in dht_create_do */
+
+                                ret = loc_copy (&local->loc2, &local->loc);
+                                if (ret) {
+                                        gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
+                                                DHT_MSG_NO_MEMORY,
+                                                "loc_copy failed %s", loc->path);
+
+                                        goto err;
+                                }
+
+                                local->params = dict_ref (params);
+                                local->flags = flags;
+                                local->mode = mode;
+                                local->umask = umask;
+
+                                loc_wipe (&local->loc);
+
+                                ret = dht_build_parent_loc (this, &local->loc, loc,
+                                                            &op_errno);
+
+                                if (ret) {
+                                        gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
+                                                DHT_MSG_NO_MEMORY,
+                                                "parent loc build failed");
+                                        goto err;
+                                }
+
+                                ret = dht_create_lock (frame, subvol);
+
+                                if (ret < 0) {
+                                        gf_msg (this->name, GF_LOG_ERROR, 0,
+                                                DHT_MSG_INODE_LK_ERROR,
+                                                "locking parent failed");
+                                        goto err;
+                                }
+
+                                goto done;
+                        }
+                }
         }
-        gf_msg_trace (this->name, 0,
-                      "creating %s on %s", loc->path, subvol->name);
-        STACK_WIND (frame, dht_create_cbk,
-                    subvol, subvol->fops->create,
-                    loc, flags, mode, umask, fd, params);
+
+
+        dht_create_wind_to_avail_subvol (frame, this, subvol, loc, flags, mode,
+                                         umask, fd, params);
 done:
         return 0;
 
 err:
+        op_errno = (op_errno == -1) ? errno : op_errno;
 
         DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL,
                           NULL, NULL, NULL);
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index 5e86b32..f583d30 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -44,6 +44,10 @@ typedef int (*dht_selfheal_dir_cbk_t) (call_frame_t *frame, void *cookie,
 typedef int (*dht_defrag_cbk_fn_t) (xlator_t *this, xlator_t *dst_node,
                                     call_frame_t *frame);
 
+typedef int (*dht_refresh_layout_unlock) (call_frame_t *frame, xlator_t *this,
+                                          int op_ret);
+
+typedef int (*dht_refresh_layout_done_handle) (call_frame_t *frame);
 
 struct dht_layout {
         int               spread_cnt;  /* layout spread count per directory,
@@ -212,6 +216,10 @@ struct dht_local {
                 gf_boolean_t         force_mkdir;
                 dht_layout_t        *layout, *refreshed_layout;
         } selfheal;
+
+        dht_refresh_layout_unlock refresh_layout_unlock;
+        dht_refresh_layout_done_handle refresh_layout_done;
+
 
         uint32_t                 uid;
         uint32_t                 gid;
@@ -510,6 +518,7 @@ typedef struct dht_migrate_info {
         GF_REF_DECL;
 } dht_migrate_info_t;
 
+
 #define ENTRY_MISSING(op_ret, op_errno) (op_ret == -1 && op_errno == ENOENT)
 
 #define is_revalidate(loc) (dht_inode_ctx_layout_get (loc->inode, this, NULL) == 0)
@@ -1067,4 +1076,6 @@ dht_layout_sort (dht_layout_t *layout);
 int
 dht_layout_missing_dirs (dht_layout_t *layout);
 
+int
+dht_refresh_layout (call_frame_t *frame);
 #endif/* _DHT_H */
diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c
index 000494c..2a9ad37 100644
--- a/xlators/cluster/dht/src/dht-diskusage.c
+++ b/xlators/cluster/dht/src/dht-diskusage.c
@@ -339,7 +339,8 @@ out:
 }
 
 static inline
-int32_t dht_subvol_has_err (xlator_t *this, dht_layout_t *layout)
+int32_t dht_subvol_has_err (dht_conf_t *conf, xlator_t *this,
+                            dht_layout_t *layout)
 {
         int ret = -1;
         int i = 0;
@@ -355,6 +356,17 @@ int32_t dht_subvol_has_err (xlator_t *this, dht_layout_t *layout)
                         goto out;
                 }
         }
+
+        /* discard decommissioned subvol */
+        if (conf->decommission_subvols_cnt) {
+                for (i = 0; i < conf->subvolume_cnt; i++) {
+                        if (conf->decommissioned_bricks[i] &&
+                            conf->decommissioned_bricks[i] == this)
+                                ret = -1;
+                                goto out;
+                }
+        }
+
         ret = 0;
 out:
         return ret;
@@ -376,8 +388,9 @@ dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol,
         conf = this->private;
 
         for(i=0; i < conf->subvolume_cnt; i++) {
-                /* check if subvol has layout errors, before selecting it */
-                ignore_subvol = dht_subvol_has_err (conf->subvolumes[i],
+                /* check if subvol has layout errors and also it is not a
+                 * decommissioned brick, before selecting it */
+                ignore_subvol = dht_subvol_has_err (conf, conf->subvolumes[i],
                                                     layout);
                 if (ignore_subvol)
                         continue;
@@ -424,8 +437,10 @@ dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol,
         conf = this->private;
 
         for (i = 0; i < conf->subvolume_cnt; i++) {
-                /* check if subvol has layout errors, before selecting it */
-                ignore_subvol = dht_subvol_has_err (conf->subvolumes[i],
+                /* check if subvol has layout errors and also it is not a
+                 * decommissioned brick, before selecting it */
+
+                ignore_subvol = dht_subvol_has_err (conf, conf->subvolumes[i],
                                                     layout);
                 if (ignore_subvol)
                         continue;
diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c
index a247170..a3e6d99 100644
--- a/xlators/cluster/dht/src/dht-linkfile.c
+++ b/xlators/cluster/dht/src/dht-linkfile.c
@@ -120,7 +120,7 @@ dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
         int           need_unref = 0;
         int           ret = 0;
         dht_conf_t   *conf = this->private;
-        char gfid[GF_UUID_BUF_SIZE] = {0};
+        char          gfid[GF_UUID_BUF_SIZE] = {0};
 
         local = frame->local;
         local->linkfile.linkfile_cbk = linkfile_cbk;
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
index cfe7e5a..cd1d97f 100644
--- a/xlators/cluster/dht/src/dht-selfheal.c
+++ b/xlators/cluster/dht/src/dht-selfheal.c
@@ -214,7 +214,7 @@ unlock:
 
         if (is_last_call (this_call_cnt)) {
                 if (local->op_ret == 0) {
-                        dht_refresh_layout_done (frame);
+                        local->refresh_layout_done (frame);
                 } else {
                         goto err;
                 }
@@ -224,7 +224,8 @@ unlock:
         return 0;
 
 err:
-        dht_selfheal_dir_finish (frame, this, -1);
+        local->refresh_layout_unlock (frame, this, -1);
+
         return 0;
 }
 
@@ -290,7 +291,7 @@ dht_refresh_layout (call_frame_t *frame)
         return 0;
 
 out:
-        dht_selfheal_dir_finish (frame, this, -1);
+        local->refresh_layout_unlock (frame, this, -1);
         return 0;
 }
 
@@ -299,10 +300,21 @@ int32_t
 dht_selfheal_layout_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                               int32_t op_ret, int32_t op_errno, dict_t *xdata)
 {
+        dht_local_t *local = NULL;
+
+        local = frame->local;
+
+        if (!local) {
+                goto err;
+        }
+
         if (op_ret < 0) {
                 goto err;
         }
 
+        local->refresh_layout_unlock = dht_selfheal_dir_finish;
+        local->refresh_layout_done = dht_refresh_layout_done;
+
         dht_refresh_layout (frame);
         return 0;
 
-- 
1.7.1
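
For readers who want the gist of the change without walking the whole diff:
the patch makes dht_create() check whether the hashed subvolume is on the
decommissioned-brick list. If it is, the create path first takes an inodelk
on the parent directory and refreshes the layout (dht_create_lock ->
dht_create_lock_cbk -> dht_refresh_layout -> dht_create_do) before winding
the create; otherwise it winds the create directly. The standalone C sketch
below models only that decision; the subvol_t type and helper names are
invented for illustration and are not GlusterFS APIs.

/* Minimal sketch of the decision dht_create() makes after this patch.
 * Not GlusterFS code: subvol_t and is_decommissioned() are illustrative. */

#include <stdio.h>
#include <string.h>

typedef struct {
        const char *name;
        int         decommissioned;
} subvol_t;

/* Return 1 if the subvolume the file hashes to is being decommissioned. */
static int
is_decommissioned (const subvol_t *subvols, int count, const char *hashed)
{
        int i = 0;

        for (i = 0; i < count; i++) {
                if (subvols[i].decommissioned &&
                    strcmp (subvols[i].name, hashed) == 0)
                        return 1;
        }
        return 0;
}

int
main (void)
{
        subvol_t    subvols[] = { {"brick-0", 0}, {"brick-1", 1}, {"brick-2", 0} };
        const char *hashed    = "brick-1";  /* pretend the new file hashes here */

        if (is_decommissioned (subvols, 3, hashed))
                printf ("lock parent dir, refresh layout, then create\n");
        else
                printf ("wind the create directly to %s\n", hashed);

        return 0;
}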