cb8e9e
From 4c91b33219078226a3cccabc409b366f3bccfbc0 Mon Sep 17 00:00:00 2001
cb8e9e
From: Susant Palai <spalai@redhat.com>
cb8e9e
Date: Tue, 16 Jun 2015 20:35:46 +0530
cb8e9e
Subject: [PATCH 292/304] dht: block/handle create op falling to decommissioned brick
cb8e9e
cb8e9e
Problem:
cb8e9e
Post remove-brick start till commit phase, the client layout
cb8e9e
may not be in sync with disk layout because of lack of lookup.
cb8e9e
Hence, a create call may fall on the decommissioned brick.
cb8e9e
cb8e9e
Solution:
cb8e9e
Acquire a lock on the hashed subvol, so that a fix-layout or
cb8e9e
selfheal cannot change the layout while it is being read.
cb8e9e
cb8e9e
Even if we read a layout before remove-brick fix-layout and the
cb8e9e
file falls on the decommissioned brick, the file should be
cb8e9e
migrated to a new brick as per the fix-layout.
cb8e9e
cb8e9e
BUG: 1225452
cb8e9e
Change-Id: Ice13674d61522a64c92ba26fe402333335da0462
cb8e9e
Signed-off-by: Susant Palai <spalai@redhat.com>
cb8e9e
Reviewed-on: http://review.gluster.org/11260
cb8e9e
Tested-by: Gluster Build System <jenkins@build.gluster.com>
cb8e9e
Tested-by: NetBSD Build System <jenkins@build.gluster.org>
cb8e9e
Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
cb8e9e
Signed-off-by: Susant Palai <spalai@redhat.com>
cb8e9e
Reviewed-on: https://code.engineering.redhat.com/gerrit/56166
cb8e9e
Reviewed-by: Raghavendra Gowdappa <rgowdapp@redhat.com>
cb8e9e
Tested-by: Raghavendra Gowdappa <rgowdapp@redhat.com>
cb8e9e
---
cb8e9e
 xlators/cluster/dht/src/dht-common.c    |  456 +++++++++++++++++++++++++++----
cb8e9e
 xlators/cluster/dht/src/dht-common.h    |   11 +
cb8e9e
 xlators/cluster/dht/src/dht-diskusage.c |   25 ++-
cb8e9e
 xlators/cluster/dht/src/dht-linkfile.c  |    2 +-
cb8e9e
 xlators/cluster/dht/src/dht-selfheal.c  |   18 +-
cb8e9e
 5 files changed, 455 insertions(+), 57 deletions(-)
cb8e9e
cb8e9e
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
cb8e9e
index 8b2715f..ebc183d 100644
cb8e9e
--- a/xlators/cluster/dht/src/dht-common.c
cb8e9e
+++ b/xlators/cluster/dht/src/dht-common.c
cb8e9e
@@ -39,7 +39,6 @@ dht_setxattr2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame);
cb8e9e
 
cb8e9e
 int run_defrag = 0;
cb8e9e
 
cb8e9e
-
cb8e9e
 int
cb8e9e
 dht_aggregate_quota_xattr (dict_t *dst, char *key, data_t *value)
cb8e9e
 {
cb8e9e
@@ -3505,7 +3504,6 @@ err:
cb8e9e
         return 0;
cb8e9e
 }
cb8e9e
 
cb8e9e
-
cb8e9e
 static int
cb8e9e
 dht_common_setxattr_cbk (call_frame_t *frame, void *cookie,
cb8e9e
                          xlator_t *this, int32_t op_ret, int32_t op_errno,
cb8e9e
@@ -5462,9 +5460,6 @@ dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
cb8e9e
         int           ret = -1;
cb8e9e
         dht_local_t  *local = NULL;
cb8e9e
 
cb8e9e
-        if (op_ret == -1)
cb8e9e
-                goto out;
cb8e9e
-
cb8e9e
         local = frame->local;
cb8e9e
         if (!local) {
cb8e9e
                 op_ret = -1;
cb8e9e
@@ -5472,6 +5467,9 @@ dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
cb8e9e
                 goto out;
cb8e9e
         }
cb8e9e
 
cb8e9e
+        if (op_ret == -1)
cb8e9e
+                goto out;
cb8e9e
+
cb8e9e
         prev = cookie;
cb8e9e
 
cb8e9e
         if (local->loc.parent) {
cb8e9e
@@ -5491,18 +5489,34 @@ dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
cb8e9e
                 op_errno = EINVAL;
cb8e9e
                 goto out;
cb8e9e
         }
cb8e9e
+
cb8e9e
+        local->op_errno = op_errno;
cb8e9e
+
cb8e9e
         if (local->linked == _gf_true) {
cb8e9e
                 local->stbuf = *stbuf;
cb8e9e
                 dht_linkfile_attr_heal (frame, this);
cb8e9e
         }
cb8e9e
 out:
cb8e9e
+
cb8e9e
         DHT_STRIP_PHASE1_FLAGS (stbuf);
cb8e9e
-        DHT_STACK_UNWIND (create, frame, op_ret, op_errno, fd, inode, stbuf, preparent,
cb8e9e
-                          postparent, xdata);
cb8e9e
+
cb8e9e
+        if (local && local->lock.locks) {
cb8e9e
+                /* store op_errno for failure case*/
cb8e9e
+                local->op_errno = op_errno;
cb8e9e
+                local->refresh_layout_unlock (frame, this, op_ret);
cb8e9e
+
cb8e9e
+                if (op_ret == 0) {
cb8e9e
+                        DHT_STACK_UNWIND (create, frame, op_ret, op_errno, fd,
cb8e9e
+                                          inode, stbuf, preparent, postparent,
cb8e9e
+                                          xdata);
cb8e9e
+                }
cb8e9e
+        } else {
cb8e9e
+                DHT_STACK_UNWIND (create, frame, op_ret, op_errno, fd, inode,
cb8e9e
+                                  stbuf, preparent, postparent, xdata);
cb8e9e
+        }
cb8e9e
         return 0;
cb8e9e
 }
cb8e9e
 
cb8e9e
-
cb8e9e
 int
cb8e9e
 dht_create_linkfile_create_cbk (call_frame_t *frame, void *cookie,
cb8e9e
                                 xlator_t *this,
cb8e9e
@@ -5514,8 +5528,18 @@ dht_create_linkfile_create_cbk (call_frame_t *frame, void *cookie,
cb8e9e
         dht_local_t  *local = NULL;
cb8e9e
         xlator_t     *cached_subvol = NULL;
cb8e9e
 
cb8e9e
-        if (op_ret == -1)
cb8e9e
+        /* Fetch local before any use: the failure branch below and the
cb8e9e
+         * err label both dereference it. */
cb8e9e
+        local = frame->local;
cb8e9e
+        if (!local) {
cb8e9e
+                op_errno = EINVAL;
cb8e9e
+                goto err;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        if (op_ret == -1) {
cb8e9e
+                /* kept for the unwind done by refresh_layout_unlock */
cb8e9e
+                local->op_errno = op_errno;
cb8e9e
                 goto err;
cb8e9e
+        }
cb8e9e
 
cb8e9e
-        local = frame->local;
cb8e9e
         cached_subvol = local->cached_subvol;
cb8e9e
@@ -5527,25 +5551,334 @@ dht_create_linkfile_create_cbk (call_frame_t *frame, void *cookie,
cb8e9e
 
cb8e9e
         return 0;
cb8e9e
 err:
cb8e9e
-        DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL,
cb8e9e
-                          NULL, NULL, NULL);
cb8e9e
+        if (local && local->lock.locks) {
cb8e9e
+                /* release the lock; the unlock handler unwinds the create */
cb8e9e
+                local->refresh_layout_unlock (frame, this, -1);
cb8e9e
+        } else {
cb8e9e
+                /* no lock held (or no local): unwind here, otherwise the
cb8e9e
+                 * failed create would never be answered */
cb8e9e
+                DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL,
cb8e9e
+                                  NULL, NULL, NULL, NULL);
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        return 0;
cb8e9e
+}
cb8e9e
+
cb8e9e
+int
cb8e9e
+dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,
cb8e9e
+                                 xlator_t *subvol, loc_t *loc, int32_t flags,
cb8e9e
+                                 mode_t mode, mode_t umask, fd_t *fd,
cb8e9e
+                                 dict_t *params)
cb8e9e
+{
cb8e9e
+        dht_local_t     *local          = NULL;
cb8e9e
+        xlator_t        *avail_subvol   = NULL;
cb8e9e
+
cb8e9e
+        local = frame->local;
cb8e9e
+
cb8e9e
+        if (!dht_is_subvol_filled (this, subvol)) {
cb8e9e
+                gf_msg_debug (this->name, 0,
cb8e9e
+                              "creating %s on %s", loc->path,
cb8e9e
+                              subvol->name);
cb8e9e
+
cb8e9e
+                STACK_WIND (frame, dht_create_cbk,
cb8e9e
+                            subvol, subvol->fops->create,
cb8e9e
+                            loc, flags, mode, umask, fd, params);
cb8e9e
+
cb8e9e
+        } else {
cb8e9e
+                avail_subvol = dht_free_disk_available_subvol (this, subvol, local);
cb8e9e
+
cb8e9e
+                if (avail_subvol != subvol) {
cb8e9e
+                        local->params = dict_ref (params);
cb8e9e
+                        local->flags = flags;
cb8e9e
+                        local->mode = mode;
cb8e9e
+                        local->umask = umask;
cb8e9e
+                        local->cached_subvol = avail_subvol;
cb8e9e
+                        local->hashed_subvol = subvol;
cb8e9e
+
cb8e9e
+                        gf_msg_debug (this->name, 0,
cb8e9e
+                                      "creating %s on %s (link at %s)", loc->path,
cb8e9e
+                                      avail_subvol->name, subvol->name);
cb8e9e
+
cb8e9e
+                        dht_linkfile_create (frame, dht_create_linkfile_create_cbk,
cb8e9e
+                                             this, avail_subvol, subvol, loc);
cb8e9e
+
cb8e9e
+                        goto out;
cb8e9e
+                }
cb8e9e
+
cb8e9e
+                gf_msg_debug (this->name, 0,
cb8e9e
+                              "creating %s on %s", loc->path, subvol->name);
cb8e9e
+
cb8e9e
+                STACK_WIND (frame, dht_create_cbk,
cb8e9e
+                            subvol, subvol->fops->create,
cb8e9e
+                            loc, flags, mode, umask, fd, params);
cb8e9e
+        }
cb8e9e
+out:
cb8e9e
         return 0;
cb8e9e
 }
cb8e9e
 
cb8e9e
 int
cb8e9e
+dht_build_parent_loc (xlator_t *this, loc_t *parent, loc_t *child,
cb8e9e
+                                                 int32_t *op_errno)
cb8e9e
+{
cb8e9e
+        inode_table_t   *table = NULL;
cb8e9e
+        int     ret = -1;
cb8e9e
+
cb8e9e
+        if (!parent || !child) {
cb8e9e
+                if (op_errno)
cb8e9e
+                        *op_errno = EINVAL;
cb8e9e
+                goto out;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        if (child->parent) {
cb8e9e
+                parent->inode = inode_ref (child->parent);
cb8e9e
+                if (!parent->inode) {
cb8e9e
+                        if (op_errno)
cb8e9e
+                                *op_errno = EINVAL;
cb8e9e
+                        goto out;
cb8e9e
+                }
cb8e9e
+
cb8e9e
+                gf_uuid_copy (parent->gfid, child->pargfid);
cb8e9e
+
cb8e9e
+                ret = 0;
cb8e9e
+
cb8e9e
+                goto out;
cb8e9e
+        } else {
cb8e9e
+                if (gf_uuid_is_null (child->pargfid)) {
cb8e9e
+                        if (op_errno)
cb8e9e
+                                *op_errno = EINVAL;
cb8e9e
+                        goto out;
cb8e9e
+                }
cb8e9e
+
cb8e9e
+                table = this->itable;
cb8e9e
+
cb8e9e
+                if (!table) {
cb8e9e
+                        /* bail out even when the caller passed no op_errno */
cb8e9e
+                        if (op_errno)
cb8e9e
+                                *op_errno = EINVAL;
cb8e9e
+                        goto out;
cb8e9e
+                }
cb8e9e
+
cb8e9e
+                parent->inode = inode_find (table, child->pargfid);
cb8e9e
+
cb8e9e
+                if (!parent->inode) {
cb8e9e
+                        /* never report success without a parent inode */
cb8e9e
+                        if (op_errno)
cb8e9e
+                                *op_errno = EINVAL;
cb8e9e
+                        goto out;
cb8e9e
+                }
cb8e9e
+
cb8e9e
+                gf_uuid_copy (parent->gfid, child->pargfid);
cb8e9e
+
cb8e9e
+                ret = 0;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+out:
cb8e9e
+        return ret;
cb8e9e
+}
cb8e9e
+
cb8e9e
+
cb8e9e
+int32_t
cb8e9e
+dht_create_do (call_frame_t *frame)
cb8e9e
+{
cb8e9e
+        dht_local_t     *local          = NULL;
cb8e9e
+        dht_layout_t    *refreshed      = NULL;
cb8e9e
+        xlator_t        *subvol         = NULL;
cb8e9e
+        xlator_t        *this           = NULL;
cb8e9e
+        dht_conf_t      *conf           = NULL;
cb8e9e
+        dht_methods_t   *methods        = NULL;
cb8e9e
+
cb8e9e
+        local = frame->local;
cb8e9e
+
cb8e9e
+        this = THIS;
cb8e9e
+
cb8e9e
+        conf = this->private;
cb8e9e
+
cb8e9e
+        GF_VALIDATE_OR_GOTO (this->name, conf, err);
cb8e9e
+
cb8e9e
+        methods = conf->methods;
cb8e9e
+
cb8e9e
+        GF_VALIDATE_OR_GOTO (this->name, conf->methods, err);
cb8e9e
+
cb8e9e
+        /* We don't need parent_loc anymore */
cb8e9e
+        loc_wipe (&local->loc);
cb8e9e
+
cb8e9e
+        loc_copy (&local->loc, &local->loc2);
cb8e9e
+
cb8e9e
+        loc_wipe (&local->loc2);
cb8e9e
+
cb8e9e
+        refreshed = local->selfheal.refreshed_layout;
cb8e9e
+
cb8e9e
+        subvol = methods->layout_search (this, refreshed, local->loc.name);
cb8e9e
+
cb8e9e
+        if (!subvol) {
cb8e9e
+                gf_msg (this->name, GF_LOG_ERROR, 0,
cb8e9e
+                        DHT_MSG_HASHED_SUBVOL_GET_FAILED, "no subvolume in "
cb8e9e
+                        "layout for path=%s", local->loc.path);
cb8e9e
+                local->op_errno = ENOENT;
cb8e9e
+                goto err;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        dht_create_wind_to_avail_subvol (frame, this, subvol, &local->loc,
cb8e9e
+                                         local->flags, local->mode,
cb8e9e
+                                         local->umask, local->fd, local->params);
cb8e9e
+        return 0;
cb8e9e
+err:
cb8e9e
+        local->refresh_layout_unlock (frame, this, -1);
cb8e9e
+
cb8e9e
+        return 0;
cb8e9e
+}
cb8e9e
+
cb8e9e
+int32_t
cb8e9e
+dht_create_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
cb8e9e
+                         int32_t op_ret, int32_t op_errno, dict_t *xdata)
cb8e9e
+{
cb8e9e
+        DHT_STACK_DESTROY (frame);
cb8e9e
+        return 0;
cb8e9e
+}
cb8e9e
+
cb8e9e
+int32_t
cb8e9e
+dht_create_finish (call_frame_t *frame, xlator_t *this, int op_ret)
cb8e9e
+{
cb8e9e
+        dht_local_t  *local      = NULL, *lock_local = NULL;
cb8e9e
+        call_frame_t *lock_frame = NULL;
cb8e9e
+        int           lock_count = 0;
cb8e9e
+
cb8e9e
+        local = frame->local;
cb8e9e
+        lock_count = dht_lock_count (local->lock.locks, local->lock.lk_count);
cb8e9e
+        if (lock_count == 0)
cb8e9e
+                goto done;
cb8e9e
+
cb8e9e
+        lock_frame = copy_frame (frame);
cb8e9e
+        if (lock_frame == NULL) {
cb8e9e
+                goto done;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        lock_local = dht_local_init (lock_frame, &local->loc, NULL,
cb8e9e
+                                     lock_frame->root->op);
cb8e9e
+        if (lock_local == NULL) {
cb8e9e
+                goto done;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        lock_local->lock.locks = local->lock.locks;
cb8e9e
+        lock_local->lock.lk_count = local->lock.lk_count;
cb8e9e
+
cb8e9e
+        local->lock.locks = NULL;
cb8e9e
+        local->lock.lk_count = 0;
cb8e9e
+
cb8e9e
+        dht_unlock_inodelk (lock_frame, lock_local->lock.locks,
cb8e9e
+                            lock_local->lock.lk_count,
cb8e9e
+                            dht_create_unlock_cbk);
cb8e9e
+        lock_frame = NULL;
cb8e9e
+
cb8e9e
+done:
cb8e9e
+        if (lock_frame != NULL) {
cb8e9e
+                DHT_STACK_DESTROY (lock_frame);
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        if (op_ret == 0)
cb8e9e
+                return 0;
cb8e9e
+
cb8e9e
+        DHT_STACK_UNWIND (create, frame, op_ret, local->op_errno, NULL, NULL,
cb8e9e
+                          NULL, NULL, NULL, NULL);
cb8e9e
+        return 0;
cb8e9e
+}
cb8e9e
+
cb8e9e
+int32_t
cb8e9e
+dht_create_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
cb8e9e
+                     int32_t op_ret, int32_t op_errno, dict_t *xdata)
cb8e9e
+{
cb8e9e
+        dht_local_t     *local = NULL;
cb8e9e
+
cb8e9e
+        local = frame->local;
cb8e9e
+
cb8e9e
+        if (!local) {
cb8e9e
+                goto err;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        if (op_ret < 0) {
cb8e9e
+                gf_msg ("DHT", GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR,
cb8e9e
+                        "Create lock failed for file: %s", local->loc2.name);
cb8e9e
+
cb8e9e
+                local->op_errno = op_errno;
cb8e9e
+
cb8e9e
+                goto err;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        local->refresh_layout_unlock = dht_create_finish;
cb8e9e
+
cb8e9e
+        local->refresh_layout_done = dht_create_do;
cb8e9e
+
cb8e9e
+        dht_refresh_layout (frame);
cb8e9e
+
cb8e9e
+        return 0;
cb8e9e
+err:
cb8e9e
+        dht_create_finish (frame, this, -1);
cb8e9e
+        return 0;
cb8e9e
+}
cb8e9e
+
cb8e9e
+int32_t
cb8e9e
+dht_create_lock (call_frame_t *frame, xlator_t *subvol)
cb8e9e
+{
cb8e9e
+        dht_local_t     *local          = NULL;
cb8e9e
+        int              count  = 1,    ret = -1;
cb8e9e
+        dht_lock_t     **lk_array       = NULL;
cb8e9e
+
cb8e9e
+        GF_VALIDATE_OR_GOTO ("dht", frame, err);
cb8e9e
+        GF_VALIDATE_OR_GOTO (frame->this->name, frame->local, err);
cb8e9e
+
cb8e9e
+        local = frame->local;
cb8e9e
+
cb8e9e
+        lk_array = GF_CALLOC (count, sizeof (*lk_array), gf_common_mt_char);
cb8e9e
+
cb8e9e
+        if (lk_array == NULL)
cb8e9e
+                goto err;
cb8e9e
+
cb8e9e
+        lk_array[0] = dht_lock_new (frame->this, subvol, &local->loc, F_RDLCK,
cb8e9e
+                                    DHT_LAYOUT_HEAL_DOMAIN);
cb8e9e
+
cb8e9e
+        if (lk_array[0] == NULL)
cb8e9e
+                goto err;
cb8e9e
+
cb8e9e
+        local->lock.locks = lk_array;
cb8e9e
+        local->lock.lk_count = count;
cb8e9e
+
cb8e9e
+        ret = dht_blocking_inodelk (frame, lk_array, count,
cb8e9e
+                                    dht_create_lock_cbk);
cb8e9e
+
cb8e9e
+        if (ret < 0) {
cb8e9e
+                local->lock.locks = NULL;
cb8e9e
+                local->lock.lk_count = 0;
cb8e9e
+                goto err;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        return 0;
cb8e9e
+err:
cb8e9e
+        if (lk_array != NULL) {
cb8e9e
+                dht_lock_array_free (lk_array, count);
cb8e9e
+                GF_FREE (lk_array);
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        return -1;
cb8e9e
+}
cb8e9e
+
cb8e9e
+int
cb8e9e
 dht_create (call_frame_t *frame, xlator_t *this,
cb8e9e
             loc_t *loc, int32_t flags, mode_t mode,
cb8e9e
             mode_t umask, fd_t *fd, dict_t *params)
cb8e9e
 {
cb8e9e
-        int          op_errno = -1;
cb8e9e
-        xlator_t    *subvol = NULL;
cb8e9e
-        dht_local_t *local = NULL;
cb8e9e
-        xlator_t    *avail_subvol = NULL;
cb8e9e
+        int             op_errno           = -1;
cb8e9e
+        xlator_t       *subvol             = NULL;
cb8e9e
+        dht_local_t    *local              = NULL;
cb8e9e
+        int             i                  = 0;
cb8e9e
+        dht_conf_t     *conf               = NULL;
cb8e9e
+        int             ret                = 0;
cb8e9e
 
cb8e9e
         VALIDATE_OR_GOTO (frame, err);
cb8e9e
         VALIDATE_OR_GOTO (this, err);
cb8e9e
         VALIDATE_OR_GOTO (loc, err);
cb8e9e
 
cb8e9e
+        conf = this->private;
cb8e9e
+
cb8e9e
         dht_get_du_info (frame, this, loc);
cb8e9e
 
cb8e9e
         local = dht_local_init (frame, loc, fd, GF_FOP_CREATE);
cb8e9e
@@ -5568,48 +5886,90 @@ dht_create (call_frame_t *frame, xlator_t *this,
cb8e9e
 
cb8e9e
         subvol = dht_subvol_get_hashed (this, loc);
cb8e9e
         if (!subvol) {
cb8e9e
-                gf_msg_debug (this->name, 0,
cb8e9e
-                              "no subvolume in layout for path=%s",
cb8e9e
-                              loc->path);
cb8e9e
+                gf_msg (this->name, GF_LOG_ERROR, 0,
cb8e9e
+                        DHT_MSG_HASHED_SUBVOL_GET_FAILED,
cb8e9e
+                        "no subvolume in layout for path=%s",
cb8e9e
+                        loc->path);
cb8e9e
+
cb8e9e
                 op_errno = ENOENT;
cb8e9e
                 goto err;
cb8e9e
         }
cb8e9e
 
cb8e9e
-        if (!dht_is_subvol_filled (this, subvol)) {
cb8e9e
-                gf_msg_trace (this->name, 0,
cb8e9e
-                              "creating %s on %s", loc->path,
cb8e9e
-                               subvol->name);
cb8e9e
-                STACK_WIND (frame, dht_create_cbk,
cb8e9e
-                            subvol, subvol->fops->create,
cb8e9e
-                            loc, flags, mode, umask, fd, params);
cb8e9e
-                goto done;
cb8e9e
-        }
cb8e9e
-        /* Choose the minimum filled volume, and create the
cb8e9e
-           files there */
cb8e9e
-        avail_subvol = dht_free_disk_available_subvol (this, subvol, local);
cb8e9e
-        if (avail_subvol != subvol) {
cb8e9e
-                local->params = dict_ref (params);
cb8e9e
-                local->flags = flags;
cb8e9e
-                local->mode = mode;
cb8e9e
-                local->umask = umask;
cb8e9e
-                local->cached_subvol = avail_subvol;
cb8e9e
-                local->hashed_subvol = subvol;
cb8e9e
-                gf_msg_trace (this->name, 0,
cb8e9e
-                              "creating %s on %s (link at %s)", loc->path,
cb8e9e
-                              avail_subvol->name, subvol->name);
cb8e9e
-                dht_linkfile_create (frame, dht_create_linkfile_create_cbk,
cb8e9e
-                                     this, avail_subvol, subvol, loc);
cb8e9e
-                goto done;
cb8e9e
+       /* Post remove-brick, the client layout may not be in sync with
cb8e9e
+        * disk layout because of lack of lookup. Hence, a create call
cb8e9e
+        * may fall on the decommissioned brick.  Hence, if the
cb8e9e
+        * hashed_subvol is part of decommissioned bricks  list, do a
cb8e9e
+        * lookup on parent dir. If a fix-layout is already done by the
cb8e9e
+        * remove-brick process, the parent directory layout will be in
cb8e9e
+        * sync with that of the disk. If fix-layout is still pending
cb8e9e
+        * on the parent directory, we can let the file get created on
cb8e9e
+        * the decommissioned brick which will be eventually migrated to
cb8e9e
+        * non-decommissioned brick based on the new layout.
cb8e9e
+        */
cb8e9e
+
cb8e9e
+        if (conf->decommission_subvols_cnt) {
cb8e9e
+            for (i = 0; i < conf->subvolume_cnt; i++) {
cb8e9e
+                if (conf->decommissioned_bricks[i] &&
cb8e9e
+                        conf->decommissioned_bricks[i] == subvol) {
cb8e9e
+
cb8e9e
+                        gf_msg_debug (this->name, 0, "hashed subvol:%s is "
cb8e9e
+                                      "part of decommission brick list for "
cb8e9e
+                                      "file: %s", subvol->name, loc->path);
cb8e9e
+
cb8e9e
+                        /* dht_refresh_layout needs directory info in
cb8e9e
+                         * local->loc. Hence, storing the parent_loc in
cb8e9e
+                         * local->loc and storing the create context in
cb8e9e
+                         * local->loc2. We will restore this information
cb8e9e
+                         * in dht_create_do */
cb8e9e
+
cb8e9e
+                        ret = loc_copy (&local->loc2, &local->loc);
cb8e9e
+                        if (ret) {
cb8e9e
+                                gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
cb8e9e
+                                        DHT_MSG_NO_MEMORY,
cb8e9e
+                                        "loc_copy failed %s", loc->path);
cb8e9e
+
cb8e9e
+                                goto err;
cb8e9e
+                        }
cb8e9e
+
cb8e9e
+                        local->params = dict_ref (params);
cb8e9e
+                        local->flags = flags;
cb8e9e
+                        local->mode = mode;
cb8e9e
+                        local->umask = umask;
cb8e9e
+
cb8e9e
+                        loc_wipe (&local->loc);
cb8e9e
+
cb8e9e
+                        ret = dht_build_parent_loc (this, &local->loc, loc,
cb8e9e
+                                                                 &op_errno);
cb8e9e
+
cb8e9e
+                        if (ret) {
cb8e9e
+                                gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
cb8e9e
+                                        DHT_MSG_NO_MEMORY,
cb8e9e
+                                        "parent loc build failed");
cb8e9e
+                                goto err;
cb8e9e
+                        }
cb8e9e
+
cb8e9e
+                        ret = dht_create_lock (frame, subvol);
cb8e9e
+
cb8e9e
+                        if (ret < 0) {
cb8e9e
+                                gf_msg (this->name, GF_LOG_ERROR, 0,
cb8e9e
+                                        DHT_MSG_INODE_LK_ERROR,
cb8e9e
+                                        "locking parent failed");
cb8e9e
+                                goto err;
cb8e9e
+                        }
cb8e9e
+
cb8e9e
+                        goto done;
cb8e9e
+               }
cb8e9e
+            }
cb8e9e
         }
cb8e9e
-        gf_msg_trace (this->name, 0,
cb8e9e
-                      "creating %s on %s", loc->path, subvol->name);
cb8e9e
-        STACK_WIND (frame, dht_create_cbk,
cb8e9e
-                    subvol, subvol->fops->create,
cb8e9e
-                    loc, flags, mode, umask, fd, params);
cb8e9e
+
cb8e9e
+
cb8e9e
+        dht_create_wind_to_avail_subvol (frame, this, subvol, loc, flags, mode,
cb8e9e
+                                         umask, fd, params);
cb8e9e
 done:
cb8e9e
         return 0;
cb8e9e
 
cb8e9e
 err:
cb8e9e
+
cb8e9e
         op_errno = (op_errno == -1) ? errno : op_errno;
cb8e9e
         DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL,
cb8e9e
                           NULL, NULL, NULL);
cb8e9e
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
cb8e9e
index 5e86b32..f583d30 100644
cb8e9e
--- a/xlators/cluster/dht/src/dht-common.h
cb8e9e
+++ b/xlators/cluster/dht/src/dht-common.h
cb8e9e
@@ -44,6 +44,10 @@ typedef int (*dht_selfheal_dir_cbk_t) (call_frame_t *frame, void *cookie,
cb8e9e
 typedef int (*dht_defrag_cbk_fn_t) (xlator_t        *this, xlator_t *dst_node,
cb8e9e
                                     call_frame_t    *frame);
cb8e9e
 
cb8e9e
+typedef int (*dht_refresh_layout_unlock) (call_frame_t *frame, xlator_t *this,
cb8e9e
+                                         int op_ret);
cb8e9e
+
cb8e9e
+typedef int (*dht_refresh_layout_done_handle) (call_frame_t *frame);
cb8e9e
 
cb8e9e
 struct dht_layout {
cb8e9e
         int                spread_cnt;  /* layout spread count per directory,
cb8e9e
@@ -212,6 +216,10 @@ struct dht_local {
cb8e9e
                 gf_boolean_t            force_mkdir;
cb8e9e
                 dht_layout_t           *layout, *refreshed_layout;
cb8e9e
         } selfheal;
cb8e9e
+
cb8e9e
+        dht_refresh_layout_unlock              refresh_layout_unlock;
cb8e9e
+        dht_refresh_layout_done_handle         refresh_layout_done;
cb8e9e
+
cb8e9e
         uint32_t                 uid;
cb8e9e
         uint32_t                 gid;
cb8e9e
 
cb8e9e
@@ -510,6 +518,7 @@ typedef struct dht_migrate_info {
cb8e9e
         GF_REF_DECL;
cb8e9e
 } dht_migrate_info_t;
cb8e9e
 
cb8e9e
+
cb8e9e
 #define ENTRY_MISSING(op_ret, op_errno) (op_ret == -1 && op_errno == ENOENT)
cb8e9e
 
cb8e9e
 #define is_revalidate(loc) (dht_inode_ctx_layout_get (loc->inode, this, NULL) == 0)
cb8e9e
@@ -1067,4 +1076,6 @@ dht_layout_sort (dht_layout_t *layout);
cb8e9e
 int
cb8e9e
 dht_layout_missing_dirs (dht_layout_t *layout);
cb8e9e
 
cb8e9e
+int
cb8e9e
+dht_refresh_layout (call_frame_t *frame);
cb8e9e
 #endif/* _DHT_H */
cb8e9e
diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c
cb8e9e
index 000494c..2a9ad37 100644
cb8e9e
--- a/xlators/cluster/dht/src/dht-diskusage.c
cb8e9e
+++ b/xlators/cluster/dht/src/dht-diskusage.c
cb8e9e
@@ -339,7 +339,8 @@ out:
cb8e9e
 }
cb8e9e
 
cb8e9e
 static inline
cb8e9e
-int32_t dht_subvol_has_err (xlator_t *this, dht_layout_t *layout)
cb8e9e
+int32_t dht_subvol_has_err (dht_conf_t *conf, xlator_t *this,
cb8e9e
+                                         dht_layout_t *layout)
cb8e9e
 {
cb8e9e
         int ret = -1;
cb8e9e
         int i   = 0;
cb8e9e
@@ -355,6 +356,19 @@ int32_t dht_subvol_has_err (xlator_t *this, dht_layout_t *layout)
cb8e9e
                         goto out;
cb8e9e
                 }
cb8e9e
         }
cb8e9e
+
cb8e9e
+        /* Discard a decommissioned subvol: return -1 so that the
cb8e9e
+         * callers' ignore_subvol check skips it. */
cb8e9e
+        if (conf->decommission_subvols_cnt) {
cb8e9e
+                for (i = 0; i < conf->subvolume_cnt; i++) {
cb8e9e
+                        if (conf->decommissioned_bricks[i] &&
cb8e9e
+                            conf->decommissioned_bricks[i] == this) {
cb8e9e
+                                ret = -1;
cb8e9e
+                                goto out;
cb8e9e
+                        }
cb8e9e
+                }
cb8e9e
+        }
cb8e9e
+
cb8e9e
         ret = 0;
cb8e9e
 out:
cb8e9e
         return ret;
cb8e9e
@@ -376,8 +388,9 @@ dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol,
cb8e9e
         conf = this->private;
cb8e9e
 
cb8e9e
         for(i=0; i < conf->subvolume_cnt; i++) {
cb8e9e
-                /* check if subvol has layout errors, before selecting it */
cb8e9e
-                ignore_subvol = dht_subvol_has_err (conf->subvolumes[i],
cb8e9e
+                /* check if subvol has layout errors and also it is not a
cb8e9e
+                 * decommissioned brick, before selecting it */
cb8e9e
+                ignore_subvol = dht_subvol_has_err (conf, conf->subvolumes[i],
cb8e9e
                                                     layout);
cb8e9e
                 if (ignore_subvol)
cb8e9e
                         continue;
cb8e9e
@@ -424,8 +437,10 @@ dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol,
cb8e9e
         conf = this->private;
cb8e9e
 
cb8e9e
         for (i = 0; i < conf->subvolume_cnt; i++) {
cb8e9e
-                /* check if subvol has layout errors, before selecting it */
cb8e9e
-                ignore_subvol = dht_subvol_has_err (conf->subvolumes[i],
cb8e9e
+                /* check if subvol has layout errors and also it is not a
cb8e9e
+                 * decommissioned brick, before selecting it*/
cb8e9e
+
cb8e9e
+                ignore_subvol = dht_subvol_has_err (conf, conf->subvolumes[i],
cb8e9e
                                                     layout);
cb8e9e
                 if (ignore_subvol)
cb8e9e
                         continue;
cb8e9e
diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c
cb8e9e
index a247170..a3e6d99 100644
cb8e9e
--- a/xlators/cluster/dht/src/dht-linkfile.c
cb8e9e
+++ b/xlators/cluster/dht/src/dht-linkfile.c
cb8e9e
@@ -120,7 +120,7 @@ dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
cb8e9e
         int          need_unref = 0;
cb8e9e
         int          ret = 0;
cb8e9e
         dht_conf_t  *conf = this->private;
cb8e9e
-        char           gfid[GF_UUID_BUF_SIZE] = {0};
cb8e9e
+        char         gfid[GF_UUID_BUF_SIZE] = {0};
cb8e9e
 
cb8e9e
         local = frame->local;
cb8e9e
         local->linkfile.linkfile_cbk = linkfile_cbk;
cb8e9e
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
cb8e9e
index cfe7e5a..cd1d97f 100644
cb8e9e
--- a/xlators/cluster/dht/src/dht-selfheal.c
cb8e9e
+++ b/xlators/cluster/dht/src/dht-selfheal.c
cb8e9e
@@ -214,7 +214,7 @@ unlock:
cb8e9e
 
cb8e9e
         if (is_last_call (this_call_cnt)) {
cb8e9e
                 if (local->op_ret == 0) {
cb8e9e
-                        dht_refresh_layout_done (frame);
cb8e9e
+                        local->refresh_layout_done (frame);
cb8e9e
                 } else {
cb8e9e
                         goto err;
cb8e9e
                 }
cb8e9e
@@ -224,7 +224,8 @@ unlock:
cb8e9e
         return 0;
cb8e9e
 
cb8e9e
 err:
cb8e9e
-        dht_selfheal_dir_finish (frame, this, -1);
cb8e9e
+        local->refresh_layout_unlock (frame, this, -1);
cb8e9e
+
cb8e9e
         return 0;
cb8e9e
 }
cb8e9e
 
cb8e9e
@@ -290,7 +291,7 @@ dht_refresh_layout (call_frame_t *frame)
cb8e9e
         return 0;
cb8e9e
 
cb8e9e
 out:
cb8e9e
-        dht_selfheal_dir_finish (frame, this, -1);
cb8e9e
+        local->refresh_layout_unlock (frame, this, -1);
cb8e9e
         return 0;
cb8e9e
 }
cb8e9e
 
cb8e9e
@@ -299,10 +300,21 @@ int32_t
cb8e9e
 dht_selfheal_layout_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
cb8e9e
                               int32_t op_ret, int32_t op_errno, dict_t *xdata)
cb8e9e
 {
cb8e9e
+        dht_local_t     *local = NULL;
cb8e9e
+
cb8e9e
+        local = frame->local;
cb8e9e
+
cb8e9e
+        if (!local) {
cb8e9e
+                goto err;
cb8e9e
+        }
cb8e9e
+
cb8e9e
         if (op_ret < 0) {
cb8e9e
                 goto err;
cb8e9e
         }
cb8e9e
 
cb8e9e
+        local->refresh_layout_unlock = dht_selfheal_dir_finish;
cb8e9e
+        local->refresh_layout_done = dht_refresh_layout_done;
cb8e9e
+
cb8e9e
         dht_refresh_layout (frame);
cb8e9e
         return 0;
cb8e9e
 
cb8e9e
-- 
cb8e9e
1.7.1
cb8e9e