cb8e9e
From 3095da8cb7b78a755700b6edfaa9fab20d98df2b Mon Sep 17 00:00:00 2001
cb8e9e
From: Susant Palai <spalai@redhat.com>
cb8e9e
Date: Mon, 24 Aug 2015 03:04:41 -0400
cb8e9e
Subject: [PATCH 293/304] cluster/dht: avoid mknod on decommissioned brick
cb8e9e
cb8e9e
BUG: 1225452
cb8e9e
Change-Id: I838bce1a28d53d437def149d85d6e1770a292f19
cb8e9e
Signed-off-by: Susant Palai <spalai@redhat.com>
cb8e9e
Reviewed-on: http://review.gluster.org/11998
cb8e9e
Tested-by: NetBSD Build System <jenkins@build.gluster.org>
cb8e9e
Tested-by: Gluster Build System <jenkins@build.gluster.com>
cb8e9e
Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
cb8e9e
Signed-off-by: Susant Palai <spalai@redhat.com>
cb8e9e
Reviewed-on: https://code.engineering.redhat.com/gerrit/56186
cb8e9e
Reviewed-by: Raghavendra Gowdappa <rgowdapp@redhat.com>
cb8e9e
Tested-by: Raghavendra Gowdappa <rgowdapp@redhat.com>
cb8e9e
---
cb8e9e
 xlators/cluster/dht/src/dht-common.c |  366 ++++++++++++++++++++++++++++++----
cb8e9e
 xlators/cluster/dht/src/dht-common.h |    5 +
cb8e9e
 2 files changed, 335 insertions(+), 36 deletions(-)
cb8e9e
cb8e9e
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
cb8e9e
index ebc183d..5c1a693 100644
cb8e9e
--- a/xlators/cluster/dht/src/dht-common.c
cb8e9e
+++ b/xlators/cluster/dht/src/dht-common.c
cb8e9e
@@ -4995,8 +4995,22 @@ out:
cb8e9e
          * See dht_iatt_merge for reference.
cb8e9e
          */
cb8e9e
         DHT_STRIP_PHASE1_FLAGS (stbuf);
cb8e9e
-        DHT_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode, stbuf,
cb8e9e
-                          preparent, postparent, xdata);
cb8e9e
+
cb8e9e
+        if (local && local->lock.locks) {
cb8e9e
+                /* store op_errno for failure case */
cb8e9e
+                local->op_errno = op_errno;
cb8e9e
+                local->refresh_layout_unlock (frame, this, op_ret);
cb8e9e
+
cb8e9e
+                if (op_ret == 0) {
cb8e9e
+                        DHT_STACK_UNWIND (mknod, frame, op_ret, op_errno,
cb8e9e
+                                          inode, stbuf, preparent, postparent,
cb8e9e
+                                          xdata);
cb8e9e
+                }
cb8e9e
+        } else {
cb8e9e
+                DHT_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode,
cb8e9e
+                                  stbuf, preparent, postparent, xdata);
cb8e9e
+        }
cb8e9e
+
cb8e9e
         return 0;
cb8e9e
 }
cb8e9e
 
cb8e9e
@@ -5029,24 +5043,269 @@ dht_mknod_linkfile_create_cbk (call_frame_t *frame, void *cookie,
cb8e9e
 
cb8e9e
         return 0;
cb8e9e
 err:
cb8e9e
-        DHT_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL,
cb8e9e
-                          NULL);
cb8e9e
+        if (local->lock.locks)
cb8e9e
+                local->refresh_layout_unlock (frame, this, -1);
cb8e9e
+
cb8e9e
         return 0;
cb8e9e
 }
cb8e9e
 
cb8e9e
 int
cb8e9e
+dht_mknod_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,
cb8e9e
+                                 xlator_t *subvol, loc_t *loc, dev_t rdev,
cb8e9e
+                                 mode_t mode, mode_t umask, dict_t *params)
cb8e9e
+{
cb8e9e
+        dht_local_t     *local          = NULL;
cb8e9e
+        xlator_t        *avail_subvol   = NULL;
cb8e9e
+
cb8e9e
+        local = frame->local;
cb8e9e
+
cb8e9e
+        if (!dht_is_subvol_filled (this, subvol)) {
cb8e9e
+                gf_msg_debug (this->name, 0,
cb8e9e
+                              "creating %s on %s", loc->path,
cb8e9e
+                              subvol->name);
cb8e9e
+
cb8e9e
+                STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol,
cb8e9e
+                                   subvol, subvol->fops->mknod, loc, mode,
cb8e9e
+                                   rdev, umask, params);
cb8e9e
+        } else {
cb8e9e
+                avail_subvol = dht_free_disk_available_subvol (this, subvol, local);
cb8e9e
+
cb8e9e
+                if (avail_subvol != subvol) {
cb8e9e
+                        local->params = dict_ref (params);
cb8e9e
+                        local->rdev = rdev;
cb8e9e
+                        local->mode = mode;
cb8e9e
+                        local->umask = umask;
cb8e9e
+                        local->cached_subvol = avail_subvol;
cb8e9e
+                        local->hashed_subvol = subvol;
cb8e9e
+
cb8e9e
+                        gf_msg_debug (this->name, 0,
cb8e9e
+                                      "creating %s on %s (link at %s)", loc->path,
cb8e9e
+                                      avail_subvol->name, subvol->name);
cb8e9e
+
cb8e9e
+                        dht_linkfile_create (frame,
cb8e9e
+                                             dht_mknod_linkfile_create_cbk,
cb8e9e
+                                             this, avail_subvol, subvol, loc);
cb8e9e
+
cb8e9e
+                        goto out;
cb8e9e
+                }
cb8e9e
+
cb8e9e
+                gf_msg_debug (this->name, 0,
cb8e9e
+                              "creating %s on %s", loc->path, subvol->name);
cb8e9e
+
cb8e9e
+                STACK_WIND_COOKIE (frame, dht_newfile_cbk,
cb8e9e
+                                   (void *)subvol, subvol,
cb8e9e
+                                   subvol->fops->mknod, loc, mode,
cb8e9e
+                                   rdev, umask, params);
cb8e9e
+
cb8e9e
+        }
cb8e9e
+out:
cb8e9e
+        return 0;
cb8e9e
+}
cb8e9e
+
cb8e9e
+
cb8e9e
+int32_t
cb8e9e
+dht_mknod_do (call_frame_t *frame)
cb8e9e
+{
cb8e9e
+        dht_local_t     *local          = NULL;
cb8e9e
+        dht_layout_t    *refreshed      = NULL;
cb8e9e
+        xlator_t        *subvol         = NULL;
cb8e9e
+        xlator_t        *this           = NULL;
cb8e9e
+        dht_conf_t      *conf           = NULL;
cb8e9e
+        dht_methods_t   *methods        = NULL;
cb8e9e
+
cb8e9e
+        local = frame->local;
cb8e9e
+
cb8e9e
+        this = THIS;
cb8e9e
+
cb8e9e
+        conf = this->private;
cb8e9e
+
cb8e9e
+        GF_VALIDATE_OR_GOTO (this->name, conf, err);
cb8e9e
+
cb8e9e
+        methods = conf->methods;
cb8e9e
+
cb8e9e
+        GF_VALIDATE_OR_GOTO (this->name, conf->methods, err);
cb8e9e
+
cb8e9e
+        /* We don't need parent_loc anymore */
cb8e9e
+        loc_wipe (&local->loc);
cb8e9e
+
cb8e9e
+        loc_copy (&local->loc, &local->loc2);
cb8e9e
+
cb8e9e
+        loc_wipe (&local->loc2);
cb8e9e
+
cb8e9e
+        refreshed = local->selfheal.refreshed_layout;
cb8e9e
+
cb8e9e
+        subvol = methods->layout_search (this, refreshed, local->loc.name);
cb8e9e
+
cb8e9e
+        if (!subvol) {
cb8e9e
+                gf_msg (this->name, GF_LOG_ERROR, 0,
cb8e9e
+                        DHT_MSG_HASHED_SUBVOL_GET_FAILED, "no subvolume in "
cb8e9e
+                        "layout for path=%s", local->loc.path);
cb8e9e
+                local->op_errno = ENOENT;
cb8e9e
+                goto err;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        dht_mknod_wind_to_avail_subvol (frame, this, subvol, &local->loc,
cb8e9e
+                                         local->rdev, local->mode,
cb8e9e
+                                         local->umask, local->params);
cb8e9e
+        return 0;
cb8e9e
+err:
cb8e9e
+        local->refresh_layout_unlock (frame, this, -1);
cb8e9e
+
cb8e9e
+        return 0;
cb8e9e
+}
cb8e9e
+
cb8e9e
+
cb8e9e
+int32_t
cb8e9e
+dht_mknod_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
cb8e9e
+                         int32_t op_ret, int32_t op_errno, dict_t *xdata)
cb8e9e
+{
cb8e9e
+        DHT_STACK_DESTROY (frame);
cb8e9e
+        return 0;
cb8e9e
+}
cb8e9e
+
cb8e9e
+int32_t
cb8e9e
+dht_mknod_finish (call_frame_t *frame, xlator_t *this, int op_ret)
cb8e9e
+{
cb8e9e
+        dht_local_t  *local      = NULL, *lock_local = NULL;
cb8e9e
+        call_frame_t *lock_frame = NULL;
cb8e9e
+        int           lock_count = 0;
cb8e9e
+
cb8e9e
+        local = frame->local;
cb8e9e
+        lock_count = dht_lock_count (local->lock.locks, local->lock.lk_count);
cb8e9e
+        if (lock_count == 0)
cb8e9e
+                goto done;
cb8e9e
+
cb8e9e
+        lock_frame = copy_frame (frame);
cb8e9e
+        if (lock_frame == NULL) {
cb8e9e
+                goto done;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        lock_local = dht_local_init (lock_frame, &local->loc, NULL,
cb8e9e
+                                     lock_frame->root->op);
cb8e9e
+        if (lock_local == NULL) {
cb8e9e
+                goto done;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        lock_local->lock.locks = local->lock.locks;
cb8e9e
+        lock_local->lock.lk_count = local->lock.lk_count;
cb8e9e
+
cb8e9e
+        local->lock.locks = NULL;
cb8e9e
+        local->lock.lk_count = 0;
cb8e9e
+
cb8e9e
+        dht_unlock_inodelk (lock_frame, lock_local->lock.locks,
cb8e9e
+                            lock_local->lock.lk_count,
cb8e9e
+                            dht_mknod_unlock_cbk);
cb8e9e
+        lock_frame = NULL;
cb8e9e
+
cb8e9e
+done:
cb8e9e
+        if (lock_frame != NULL) {
cb8e9e
+                DHT_STACK_DESTROY (lock_frame);
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        if (op_ret == 0)
cb8e9e
+                return 0;
cb8e9e
+
cb8e9e
+        DHT_STACK_UNWIND (mknod, frame, op_ret, local->op_errno, NULL, NULL,
cb8e9e
+                          NULL, NULL, NULL);
cb8e9e
+        return 0;
cb8e9e
+}
cb8e9e
+
cb8e9e
+int32_t
cb8e9e
+dht_mknod_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
cb8e9e
+                     int32_t op_ret, int32_t op_errno, dict_t *xdata)
cb8e9e
+{
cb8e9e
+        dht_local_t     *local = NULL;
cb8e9e
+
cb8e9e
+        local = frame->local;
cb8e9e
+
cb8e9e
+        if (!local) {
cb8e9e
+                goto err;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        if (op_ret < 0) {
cb8e9e
+                gf_msg ("DHT", GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR,
cb8e9e
+                        "mknod lock failed for file: %s", local->loc2.name);
cb8e9e
+
cb8e9e
+                local->op_errno = op_errno;
cb8e9e
+
cb8e9e
+                goto err;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        local->refresh_layout_unlock = dht_mknod_finish;
cb8e9e
+
cb8e9e
+        local->refresh_layout_done = dht_mknod_do;
cb8e9e
+
cb8e9e
+        dht_refresh_layout (frame);
cb8e9e
+
cb8e9e
+        return 0;
cb8e9e
+err:
cb8e9e
+        dht_mknod_finish (frame, this, -1);
cb8e9e
+        return 0;
cb8e9e
+}
cb8e9e
+
cb8e9e
+int32_t
cb8e9e
+dht_mknod_lock (call_frame_t *frame, xlator_t *subvol)
cb8e9e
+{
cb8e9e
+        dht_local_t     *local          = NULL;
cb8e9e
+        int              count  = 1,    ret = -1;
cb8e9e
+        dht_lock_t     **lk_array       = NULL;
cb8e9e
+
cb8e9e
+        GF_VALIDATE_OR_GOTO ("dht", frame, err);
cb8e9e
+        GF_VALIDATE_OR_GOTO (frame->this->name, frame->local, err);
cb8e9e
+
cb8e9e
+        local = frame->local;
cb8e9e
+
cb8e9e
+        lk_array = GF_CALLOC (count, sizeof (*lk_array), gf_common_mt_char);
cb8e9e
+
cb8e9e
+        if (lk_array == NULL)
cb8e9e
+                goto err;
cb8e9e
+
cb8e9e
+        lk_array[0] = dht_lock_new (frame->this, subvol, &local->loc, F_RDLCK,
cb8e9e
+                                    DHT_LAYOUT_HEAL_DOMAIN);
cb8e9e
+
cb8e9e
+        if (lk_array[0] == NULL)
cb8e9e
+                goto err;
cb8e9e
+
cb8e9e
+        local->lock.locks = lk_array;
cb8e9e
+        local->lock.lk_count = count;
cb8e9e
+
cb8e9e
+        ret = dht_blocking_inodelk (frame, lk_array, count,
cb8e9e
+                                    dht_mknod_lock_cbk);
cb8e9e
+
cb8e9e
+        if (ret < 0) {
cb8e9e
+                local->lock.locks = NULL;
cb8e9e
+                local->lock.lk_count = 0;
cb8e9e
+                goto err;
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        return 0;
cb8e9e
+err:
cb8e9e
+        if (lk_array != NULL) {
cb8e9e
+                dht_lock_array_free (lk_array, count);
cb8e9e
+                GF_FREE (lk_array);
cb8e9e
+        }
cb8e9e
+
cb8e9e
+        return -1;
cb8e9e
+}
cb8e9e
+
cb8e9e
+
cb8e9e
+int
cb8e9e
 dht_mknod (call_frame_t *frame, xlator_t *this,
cb8e9e
            loc_t *loc, mode_t mode, dev_t rdev, mode_t umask, dict_t *params)
cb8e9e
 {
cb8e9e
-        xlator_t    *subvol = NULL;
cb8e9e
-        int          op_errno = -1;
cb8e9e
-        xlator_t    *avail_subvol = NULL;
cb8e9e
-        dht_local_t *local = NULL;
cb8e9e
+        xlator_t       *subvol     = NULL;
cb8e9e
+        int             op_errno   = -1;
cb8e9e
+        int             i          = 0;
cb8e9e
+        int             ret        = 0;
cb8e9e
+        dht_local_t    *local      = NULL;
cb8e9e
+        dht_conf_t     *conf       = NULL;
cb8e9e
 
cb8e9e
         VALIDATE_OR_GOTO (frame, err);
cb8e9e
         VALIDATE_OR_GOTO (this, err);
cb8e9e
         VALIDATE_OR_GOTO (loc, err);
cb8e9e
 
cb8e9e
+        conf = this->private;
cb8e9e
+
cb8e9e
         dht_get_du_info (frame, this, loc);
cb8e9e
 
cb8e9e
         local = dht_local_init (frame, loc, NULL, GF_FOP_MKNOD);
cb8e9e
@@ -5064,42 +5323,77 @@ dht_mknod (call_frame_t *frame, xlator_t *this,
cb8e9e
                 goto err;
cb8e9e
         }
cb8e9e
 
cb8e9e
-        if (!dht_is_subvol_filled (this, subvol)) {
cb8e9e
-                gf_msg_trace (this->name, 0,
cb8e9e
-                              "creating %s on %s", loc->path,
cb8e9e
-                              subvol->name);
cb8e9e
+        /* Post remove-brick, the client layout may not be in sync with
cb8e9e
+        * disk layout because of lack of lookup. Hence, a mknod call
cb8e9e
+        * may fall on the decommissioned brick.  Hence, if the
cb8e9e
+        * hashed_subvol is part of decommissioned bricks list, do a
cb8e9e
+        * lookup on parent dir. If a fix-layout is already done by the
cb8e9e
+        * remove-brick process, the parent directory layout will be in
cb8e9e
+        * sync with that of the disk. If fix-layout is still pending
cb8e9e
+        * on the parent directory, we can let the file get created on
cb8e9e
+        * the decommissioned brick which will be eventually migrated to
cb8e9e
+        * non-decommissioned brick based on the new layout.
cb8e9e
+        */
cb8e9e
 
cb8e9e
-                STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol,
cb8e9e
-                                   subvol, subvol->fops->mknod, loc, mode,
cb8e9e
-                                   rdev, umask, params);
cb8e9e
-        } else {
cb8e9e
+        if (conf->decommission_subvols_cnt) {
cb8e9e
+            for (i = 0; i < conf->subvolume_cnt; i++) {
cb8e9e
+                if (conf->decommissioned_bricks[i] &&
cb8e9e
+                        conf->decommissioned_bricks[i] == subvol) {
cb8e9e
 
cb8e9e
-                avail_subvol = dht_free_disk_available_subvol (this, subvol,
cb8e9e
-                                                               local);
cb8e9e
-                if (avail_subvol != subvol) {
cb8e9e
-                        /* Choose the minimum filled volume, and create the
cb8e9e
-                           files there */
cb8e9e
+                        gf_msg_debug (this->name, 0, "hashed subvol:%s is "
cb8e9e
+                                      "part of decommission brick list for "
cb8e9e
+                                      "file: %s", subvol->name, loc->path);
cb8e9e
+
cb8e9e
+                        /* dht_refresh_layout needs directory info in
cb8e9e
+                         * local->loc. Hence, storing the parent_loc in
cb8e9e
+                         * local->loc and storing the mknod context in
cb8e9e
+                         * local->loc2. We will restore this information
cb8e9e
+                         * in dht_mknod_do */
cb8e9e
+
cb8e9e
+                        ret = loc_copy (&local->loc2, &local->loc);
cb8e9e
+                        if (ret) {
cb8e9e
+                                gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
cb8e9e
+                                        DHT_MSG_NO_MEMORY,
cb8e9e
+                                        "loc_copy failed %s", loc->path);
cb8e9e
+
cb8e9e
+                                goto err;
cb8e9e
+                        }
cb8e9e
 
cb8e9e
                         local->params = dict_ref (params);
cb8e9e
-                        local->cached_subvol = avail_subvol;
cb8e9e
-                        local->mode = mode;
cb8e9e
                         local->rdev = rdev;
cb8e9e
+                        local->mode = mode;
cb8e9e
                         local->umask = umask;
cb8e9e
-                        dht_linkfile_create (frame,
cb8e9e
-                                             dht_mknod_linkfile_create_cbk,
cb8e9e
-                                             this, avail_subvol, subvol, loc);
cb8e9e
-                } else {
cb8e9e
-                        gf_msg_trace (this->name, 0,
cb8e9e
-                                      "creating %s on %s", loc->path,
cb8e9e
-                                      subvol->name);
cb8e9e
 
cb8e9e
-                        STACK_WIND_COOKIE (frame, dht_newfile_cbk,
cb8e9e
-                                           (void *)subvol, subvol,
cb8e9e
-                                           subvol->fops->mknod, loc, mode,
cb8e9e
-                                           rdev, umask, params);
cb8e9e
-                }
cb8e9e
+                        loc_wipe (&local->loc);
cb8e9e
+
cb8e9e
+                        ret = dht_build_parent_loc (this, &local->loc, loc,
cb8e9e
+                                                                 &op_errno);
cb8e9e
+
cb8e9e
+                        if (ret) {
cb8e9e
+                                gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
cb8e9e
+                                        DHT_MSG_NO_MEMORY,
cb8e9e
+                                        "parent loc build failed");
cb8e9e
+                                goto err;
cb8e9e
+                        }
cb8e9e
+
cb8e9e
+                        ret = dht_mknod_lock (frame, subvol);
cb8e9e
+
cb8e9e
+                        if (ret < 0) {
cb8e9e
+                                gf_msg (this->name, GF_LOG_ERROR, 0,
cb8e9e
+                                        DHT_MSG_INODE_LK_ERROR,
cb8e9e
+                                        "locking parent failed");
cb8e9e
+                                goto err;
cb8e9e
+                        }
cb8e9e
+
cb8e9e
+                        goto done;
cb8e9e
+               }
cb8e9e
+            }
cb8e9e
         }
cb8e9e
 
cb8e9e
+        dht_mknod_wind_to_avail_subvol (frame, this, subvol, loc, rdev, mode,
cb8e9e
+                                        umask, params);
cb8e9e
+
cb8e9e
+done:
cb8e9e
         return 0;
cb8e9e
 
cb8e9e
 err:
cb8e9e
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
cb8e9e
index f583d30..1b5a084 100644
cb8e9e
--- a/xlators/cluster/dht/src/dht-common.h
cb8e9e
+++ b/xlators/cluster/dht/src/dht-common.h
cb8e9e
@@ -1078,4 +1078,9 @@ dht_layout_missing_dirs (dht_layout_t *layout);
cb8e9e
 
cb8e9e
 int
cb8e9e
 dht_refresh_layout (call_frame_t *frame);
cb8e9e
+
cb8e9e
+int
cb8e9e
+dht_build_parent_loc (xlator_t *this, loc_t *parent, loc_t *child,
cb8e9e
+                                                 int32_t *op_errno);
cb8e9e
+
cb8e9e
 #endif/* _DHT_H */
cb8e9e
-- 
cb8e9e
1.7.1
cb8e9e