a3470f
From edd4d523869cc65c389253a225b02c578ad3af85 Mon Sep 17 00:00:00 2001
a3470f
From: Mohit Agrawal <moagrawa@redhat.com>
a3470f
Date: Fri, 6 Oct 2017 15:13:02 +0530
a3470f
Subject: [PATCH 214/236] cluster/dht: Serialize mds update code path with
a3470f
 lookup unwind in selfheal
a3470f
a3470f
Problem: Sometimes the test case ./tests/bugs/bug-1371806_1.t fails on
a3470f
         CentOS due to a race condition between a fresh lookup and a setxattr fop.
a3470f
a3470f
Solution: In the selfheal code path the mds is saved on the inode ctx, but
a3470f
          this was not serialized with the lookup unwind. As a result, if the
a3470f
          lookup unwound before the mds was saved on the inode ctx, any
a3470f
          subsequent setxattr fop failed with ENOENT because no mds was
a3470f
          found on the inode ctx. To resolve this, saving the mds on the
a3470f
          inode ctx has been serialized with the lookup unwind.
a3470f
a3470f
> BUG: 1498966
a3470f
> Change-Id: I8d4bb40a6cbf0cec35d181ec0095cc7142b02e29
a3470f
> Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
a3470f
> (cherry picked from commit https://review.gluster.org/#/c/18436/)
a3470f
> (Upstream patch link https://review.gluster.org/#/c/18436/)
a3470f
a3470f
BUG: 1550315
a3470f
Change-Id: I0d3c03cb6ab9a3729f8c4219fd54058d97ed526b
a3470f
Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
a3470f
Reviewed-on: https://code.engineering.redhat.com/gerrit/134282
a3470f
Tested-by: RHGS Build Bot <nigelb@redhat.com>
a3470f
Reviewed-by: Nithya Balachandran <nbalacha@redhat.com>
a3470f
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
a3470f
---
a3470f
 tests/bugs/bug-1371806_1.t             |   1 -
a3470f
 xlators/cluster/dht/src/dht-common.c   | 314 ++++++++++++++++++++-------------
a3470f
 xlators/cluster/dht/src/dht-common.h   |  14 +-
a3470f
 xlators/cluster/dht/src/dht-selfheal.c | 188 +++-----------------
a3470f
 4 files changed, 231 insertions(+), 286 deletions(-)
a3470f
a3470f
diff --git a/tests/bugs/bug-1371806_1.t b/tests/bugs/bug-1371806_1.t
a3470f
index 44a57a9..df19a8c 100644
a3470f
--- a/tests/bugs/bug-1371806_1.t
a3470f
+++ b/tests/bugs/bug-1371806_1.t
a3470f
@@ -46,4 +46,3 @@ EXPECT "abc" get_getfattr ./tmp{1..10}
a3470f
 
a3470f
 cd -
a3470f
 cleanup
a3470f
-exit
a3470f
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
a3470f
index 6319a87..2fd145d 100644
a3470f
--- a/xlators/cluster/dht/src/dht-common.c
a3470f
+++ b/xlators/cluster/dht/src/dht-common.c
a3470f
@@ -579,6 +579,7 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)
a3470f
         uint32_t         vol_commit_hash = 0;
a3470f
         xlator_t        *source          = NULL;
a3470f
         int              heal_path       = 0;
a3470f
+        int              error_while_marking_mds   = 0;
a3470f
         int              i               = 0;
a3470f
         loc_t            loc             = {0 };
a3470f
         int8_t           is_read_only    = 0, layout_anomalies = 0;
a3470f
@@ -684,7 +685,8 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)
a3470f
                    internal mds xattr is not present and all subvols are up
a3470f
                 */
a3470f
                 if (!local->op_ret && !__is_root_gfid (local->stbuf.ia_gfid))
a3470f
-                        (void) dht_mark_mds_subvolume (discover_frame, this);
a3470f
+                        (void) dht_common_mark_mdsxattr (discover_frame,
a3470f
+                                                         &error_while_marking_mds, 1);
a3470f
 
a3470f
                 if (local->need_xattr_heal && !heal_path) {
a3470f
                         local->need_xattr_heal = 0;
a3470f
@@ -699,7 +701,7 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)
a3470f
                 }
a3470f
         }
a3470f
 
a3470f
-        if (source && (heal_path || layout_anomalies)) {
a3470f
+        if (source && (heal_path || layout_anomalies || error_while_marking_mds)) {
a3470f
                 gf_uuid_copy (loc.gfid, local->gfid);
a3470f
                 if (gf_uuid_is_null (loc.gfid)) {
a3470f
                         goto done;
a3470f
@@ -761,62 +763,82 @@ out:
a3470f
 }
a3470f
 
a3470f
 int
a3470f
-dht_mds_internal_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
a3470f
-                               int op_ret, int op_errno, dict_t *xdata)
a3470f
+dht_common_mark_mdsxattr_cbk (call_frame_t *frame, void *cookie,
a3470f
+                              xlator_t *this, int op_ret, int op_errno,
a3470f
+                              dict_t *xdata)
a3470f
 {
a3470f
-        dht_local_t  *local                   = NULL;
a3470f
-        xlator_t     *hashed_subvol           = NULL;
a3470f
-        dht_conf_t   *conf                    = NULL;
a3470f
-        int           ret                     = 0;
a3470f
+        dht_local_t  *local = NULL;
a3470f
+        xlator_t     *prev  = cookie;
a3470f
+        int           ret   = -1;
a3470f
+        dht_conf_t   *conf  = 0;
a3470f
+        dht_layout_t *layout = NULL;
a3470f
 
a3470f
         GF_VALIDATE_OR_GOTO (this->name, frame, out);
a3470f
         GF_VALIDATE_OR_GOTO (this->name, frame->local, out);
a3470f
 
a3470f
         local = frame->local;
a3470f
-        hashed_subvol  = cookie;
a3470f
         conf = this->private;
a3470f
+        layout = local->selfheal.layout;
a3470f
 
a3470f
         if (op_ret) {
a3470f
                 gf_msg_debug (this->name, op_ret,
a3470f
-                              "Failed to set %s on the MDS for path %s. ",
a3470f
-                              conf->mds_xattr_key, local->loc.path);
a3470f
+                              "Failed to set %s on the MDS %s for path %s. ",
a3470f
+                               conf->mds_xattr_key, prev->name, local->loc.path);
a3470f
         } else {
a3470f
-               /* Save mds subvol on inode ctx */
a3470f
-                ret = dht_inode_ctx_mdsvol_set (local->inode, this,
a3470f
-                                                hashed_subvol);
a3470f
+                /* Save mds subvol on inode ctx */
a3470f
+                ret = dht_inode_ctx_mdsvol_set (local->inode, this, prev);
a3470f
                 if (ret) {
a3470f
                         gf_msg (this->name, GF_LOG_ERROR, 0,
a3470f
                                 DHT_MSG_SET_INODE_CTX_FAILED,
a3470f
                                 "Failed to set mds subvol on inode ctx"
a3470f
-                                " %s for %s", hashed_subvol->name,
a3470f
+                                " %s for %s ", prev->name,
a3470f
                                 local->loc.path);
a3470f
                 }
a3470f
         }
a3470f
+        if (!local->mds_heal_fresh_lookup && layout) {
a3470f
+                dht_selfheal_dir_setattr (frame, &local->loc, &local->stbuf,
a3470f
+                                          0xffffffff, layout);
a3470f
+        }
a3470f
 out:
a3470f
-        DHT_STACK_DESTROY (frame);
a3470f
+        if (local && local->mds_heal_fresh_lookup)
a3470f
+                DHT_STACK_DESTROY (frame);
a3470f
         return 0;
a3470f
 }
a3470f
 
a3470f
 
a3470f
 
a3470f
-/* Code to save hashed subvol on inode ctx only while no
a3470f
-   mds xattr is availble and all subvols are up for fresh
a3470f
+/* Common function called by the revalidate/selfheal code paths to populate
a3470f
+   the internal xattr if it is not present. The mark_during_fresh_lookup
a3470f
+   value tells whether the function is called by revalidate_cbk
a3470f
+   (discover_complete) or by the selfheal code path during a fresh lookup.
a3470f
+   The call is wound serially in the case of a fresh lookup and in
a3470f
+   parallel for the other lookup code paths. The reason to wind the call
a3470f
+   serially is that at the time of a fresh lookup the directory is not yet
a3470f
+   discovered, whereas at the time of a revalidate lookup the directory is
a3470f
+   already discovered. So, the revalidate codepath can race with the setxattr
a3470f
+   codepath and can get into spurious heals because of an ongoing setxattr.
a3470f
+   This can slow down revalidates, if healing happens in foreground.
a3470f
+   However, if healing happens in background, there is no direct performance
a3470f
+   penalty.
a3470f
 */
a3470f
 int
a3470f
-dht_mark_mds_subvolume (call_frame_t *frame, xlator_t *this)
a3470f
+dht_common_mark_mdsxattr (call_frame_t *frame, int *errst, int mark_during_fresh_lookup)
a3470f
 {
a3470f
-        dht_local_t  *local                   = NULL;
a3470f
-        xlator_t     *hashed_subvol           = NULL;
a3470f
-        int           i                       = 0;
a3470f
-        gf_boolean_t  vol_down                = _gf_false;
a3470f
-        dht_conf_t   *conf                    = 0;
a3470f
-        int           ret                     = -1;
a3470f
-        char          gfid_local[GF_UUID_BUF_SIZE] = {0};
a3470f
-        dict_t       *xattrs                      = NULL;
a3470f
-        dht_local_t  *copy_local                  = NULL;
a3470f
-        call_frame_t *xattr_frame                 = NULL;
a3470f
-        int32_t       zero[1]                     = {0};
a3470f
+        dht_local_t  *local          = NULL;
a3470f
+        xlator_t     *this           = NULL;
a3470f
+        xlator_t     *hashed_subvol  = NULL;
a3470f
+        int           ret            = 0;
a3470f
+        int           i              = 0;
a3470f
+        dict_t       *xattrs         = NULL;
a3470f
+        char          gfid_local[GF_UUID_BUF_SIZE] = {0,};
a3470f
+        int32_t       zero[1]        = {0};
a3470f
+        dht_conf_t   *conf           = 0;
a3470f
+        dht_layout_t *layout         = NULL;
a3470f
+        dht_local_t  *copy_local     = NULL;
a3470f
+        call_frame_t *xattr_frame    = NULL;
a3470f
+        gf_boolean_t  vol_down       = _gf_false;
a3470f
 
a3470f
+        this = frame->this;
a3470f
 
a3470f
         GF_VALIDATE_OR_GOTO ("dht", frame, out);
a3470f
         GF_VALIDATE_OR_GOTO ("dht", this, out);
a3470f
@@ -825,66 +847,78 @@ dht_mark_mds_subvolume (call_frame_t *frame, xlator_t *this)
a3470f
 
a3470f
         local = frame->local;
a3470f
         conf = this->private;
a3470f
+        layout = local->selfheal.layout;
a3470f
+        local->mds_heal_fresh_lookup = mark_during_fresh_lookup;
a3470f
         gf_uuid_unparse(local->gfid, gfid_local);
a3470f
 
a3470f
-
a3470f
         /* Code to update hashed subvol consider as a mds subvol
a3470f
-           and save on inode ctx if all subvols are up and no internal
a3470f
-           xattr has been set yet
a3470f
+           and wind a setxattr call on hashed subvol to update
a3470f
+           internal xattr
a3470f
         */
a3470f
         if (!dict_get (local->xattr, conf->mds_xattr_key)) {
a3470f
                 /* It means no internal MDS xattr has been set yet
a3470f
                 */
a3470f
-                /* Check the status of all subvol are up
a3470f
+                /* Check the status of all subvol are up while call
a3470f
+                   this function call by lookup code path
a3470f
                 */
a3470f
-                for (i = 0; i < conf->subvolume_cnt; i++) {
a3470f
-                        if (!conf->subvolume_status[i]) {
a3470f
-                                vol_down = _gf_true;
a3470f
-                                break;
a3470f
+                if (mark_during_fresh_lookup) {
a3470f
+                        for (i = 0; i < conf->subvolume_cnt; i++) {
a3470f
+                                if (!conf->subvolume_status[i]) {
a3470f
+                                        vol_down = _gf_true;
a3470f
+                                        break;
a3470f
+                                }
a3470f
+                        }
a3470f
+                        if (vol_down) {
a3470f
+                                gf_msg_debug (this->name, 0,
a3470f
+                                              "subvol %s is down. Unable to "
a3470f
+                                              " save mds subvol on inode for "
a3470f
+                                              " path %s gfid is %s " ,
a3470f
+                                              conf->subvolumes[i]->name,
a3470f
+                                              local->loc.path, gfid_local);
a3470f
+                                goto out;
a3470f
                         }
a3470f
                 }
a3470f
-                if (vol_down) {
a3470f
-                        ret = 0;
a3470f
-                        gf_msg_debug (this->name, 0,
a3470f
-                                      "subvol %s is down. Unable to "
a3470f
-                                      " save mds subvol on inode for "
a3470f
-                                      " path %s gfid is %s " ,
a3470f
-                                      conf->subvolumes[i]->name, local->loc.path,
a3470f
-                                      gfid_local);
a3470f
-                       goto out;
a3470f
-                }
a3470f
-                /* Calculate hashed subvol based on inode and
a3470f
-                   parent inode
a3470f
+
a3470f
+                /* Calculate hashed subvol based on inode and parent node
a3470f
                 */
a3470f
-                hashed_subvol = dht_inode_get_hashed_subvol (local->inode,
a3470f
-                                                             this, &local->loc);
a3470f
+                hashed_subvol = dht_inode_get_hashed_subvol (local->inode, this,
a3470f
+                                                             &local->loc);
a3470f
                 if (!hashed_subvol) {
a3470f
                         gf_msg (this->name, GF_LOG_DEBUG, 0,
a3470f
                                 DHT_MSG_HASHED_SUBVOL_GET_FAILED,
a3470f
                                 "Failed to get hashed subvol for path %s"
a3470f
-                                " gfid is %s ",
a3470f
+                                "gfid is %s ",
a3470f
                                 local->loc.path, gfid_local);
a3470f
-                } else {
a3470f
-                        xattrs = dict_new ();
a3470f
-                        if (!xattrs) {
a3470f
-                                gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
a3470f
-                                        DHT_MSG_NO_MEMORY, "dict_new failed");
a3470f
-                                ret = -1;
a3470f
-                                goto out;
a3470f
-                        }
a3470f
-                        /* Add internal MDS xattr on disk for hashed subvol
a3470f
-                        */
a3470f
-                        ret = dht_dict_set_array (xattrs, conf->mds_xattr_key, zero, 1);
a3470f
-                        if (ret) {
a3470f
-                                gf_msg (this->name, GF_LOG_WARNING, ENOMEM,
a3470f
-                                        DHT_MSG_DICT_SET_FAILED,
a3470f
-                                        "Failed to set dictionary"
a3470f
-                                        "  value:key = %s for "
a3470f
-                                        "path %s", conf->mds_xattr_key,
a3470f
-                                        local->loc.path);
a3470f
-                                ret = -1;
a3470f
-                                goto out;
a3470f
-                        }
a3470f
+                        (*errst) = 1;
a3470f
+                        ret = -1;
a3470f
+                        goto out;
a3470f
+                }
a3470f
+                xattrs = dict_new ();
a3470f
+                if (!xattrs) {
a3470f
+                        gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
a3470f
+                                DHT_MSG_NO_MEMORY, "dict_new failed");
a3470f
+                        ret = -1;
a3470f
+                        goto out;
a3470f
+                }
a3470f
+                /* Add internal MDS xattr on disk for hashed subvol
a3470f
+                */
a3470f
+                ret = dht_dict_set_array (xattrs, conf->mds_xattr_key,
a3470f
+                                          zero, 1);
a3470f
+                if (ret) {
a3470f
+                        gf_msg (this->name, GF_LOG_WARNING, ENOMEM,
a3470f
+                                DHT_MSG_DICT_SET_FAILED,
a3470f
+                                "Failed to set dictionary"
a3470f
+                                "  value:key = %s for "
a3470f
+                                "path %s", conf->mds_xattr_key,
a3470f
+                                local->loc.path);
a3470f
+                        ret = -1;
a3470f
+                        goto out;
a3470f
+                }
a3470f
+                /* Create a new frame to wind the call only when this
a3470f
+                   function is called by the revalidate_cbk code path.
a3470f
+                   Winding the call in parallel requires a new frame.
a3470f
+                */
a3470f
+                if (mark_during_fresh_lookup) {
a3470f
                         xattr_frame = create_frame (this, this->ctx->pool);
a3470f
                         if (!xattr_frame) {
a3470f
                                 ret = -1;
a3470f
@@ -898,32 +932,42 @@ dht_mark_mds_subvolume (call_frame_t *frame, xlator_t *this)
a3470f
                                 goto out;
a3470f
                         }
a3470f
                         copy_local->stbuf = local->stbuf;
a3470f
+                        copy_local->mds_heal_fresh_lookup = mark_during_fresh_lookup;
a3470f
                         if (!copy_local->inode)
a3470f
                                 copy_local->inode = inode_ref (local->inode);
a3470f
                         gf_uuid_copy (copy_local->loc.gfid, local->gfid);
a3470f
-                        STACK_WIND_COOKIE (xattr_frame, dht_mds_internal_setxattr_cbk,
a3470f
+                        FRAME_SU_DO (xattr_frame, dht_local_t);
a3470f
+                        STACK_WIND_COOKIE (xattr_frame, dht_common_mark_mdsxattr_cbk,
a3470f
                                            hashed_subvol, hashed_subvol,
a3470f
                                            hashed_subvol->fops->setxattr,
a3470f
                                            &local->loc, xattrs, 0, NULL);
a3470f
-                        ret = 0;
a3470f
+                } else {
a3470f
+                        STACK_WIND_COOKIE (frame,
a3470f
+                                           dht_common_mark_mdsxattr_cbk,
a3470f
+                                           (void *)hashed_subvol,
a3470f
+                                           hashed_subvol,
a3470f
+                                           hashed_subvol->fops->setxattr,
a3470f
+                                           &local->loc, xattrs, 0,
a3470f
+                                           NULL);
a3470f
                 }
a3470f
         } else {
a3470f
-                ret = 0;
a3470f
                 gf_msg_debug (this->name, 0,
a3470f
                               "internal xattr %s is present on subvol"
a3470f
                               "on path %s gfid is %s " , conf->mds_xattr_key,
a3470f
                                local->loc.path, gfid_local);
a3470f
+                if (!mark_during_fresh_lookup)
a3470f
+                        dht_selfheal_dir_setattr (frame, &local->loc,
a3470f
+                                                  &local->stbuf, 0xffffffff,
a3470f
+                                                  layout);
a3470f
         }
a3470f
 
a3470f
-
a3470f
 out:
a3470f
         if (xattrs)
a3470f
                 dict_unref (xattrs);
a3470f
-       return ret;
a3470f
+        return ret;
a3470f
 }
a3470f
 
a3470f
 
a3470f
-
a3470f
 int
a3470f
 dht_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
a3470f
                   int op_ret, int op_errno,
a3470f
@@ -1646,11 +1690,11 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
a3470f
                         } else {
a3470f
                                 check_mds = dht_dict_get_array (xattr, conf->mds_xattr_key,
a3470f
                                                                 mds_xattr_val, 1, &errst);
a3470f
-                                if (local->mds_subvol == prev) {
a3470f
-                                        local->mds_stbuf.ia_gid = stbuf->ia_gid;
a3470f
-                                        local->mds_stbuf.ia_uid = stbuf->ia_uid;
a3470f
-                                        local->mds_stbuf.ia_prot = stbuf->ia_prot;
a3470f
-                                }
a3470f
+                                local->mds_subvol  = prev;
a3470f
+                                local->mds_stbuf.ia_gid = stbuf->ia_gid;
a3470f
+                                local->mds_stbuf.ia_uid = stbuf->ia_uid;
a3470f
+                                local->mds_stbuf.ia_prot = stbuf->ia_prot;
a3470f
+
a3470f
                                 /* save mds subvol on inode ctx */
a3470f
                                 ret = dht_inode_ctx_mdsvol_set (local->inode, this,
a3470f
                                                                 prev);
a3470f
@@ -1672,7 +1716,6 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
a3470f
                                                       local->loc.path,
a3470f
                                                       prev->name, gfid);
a3470f
                                         local->need_xattr_heal = 1;
a3470f
-                                        local->mds_subvol  = prev;
a3470f
                                 }
a3470f
                         }
a3470f
                         ret = dht_layout_dir_mismatch (this, layout,
a3470f
@@ -1749,31 +1792,35 @@ out:
a3470f
                 if (conf->subvolume_cnt == 1)
a3470f
                         local->need_xattr_heal = 0;
a3470f
 
a3470f
-                /* Code to update all extended attributed from hashed subvol
a3470f
-                   to local->xattr
a3470f
-                */
a3470f
-                if (local->need_xattr_heal && (local->mds_xattr)) {
a3470f
-                        dht_dir_set_heal_xattr (this, local, local->xattr,
a3470f
-                                                local->mds_xattr, NULL, NULL);
a3470f
-                        dict_unref (local->mds_xattr);
a3470f
-                        local->mds_xattr = NULL;
a3470f
-                }
a3470f
-                /* Call function to save hashed subvol on inode ctx if
a3470f
-                   internal mds xattr is not present and all subvols are up
a3470f
-                */
a3470f
-                if (inode && !__is_root_gfid (inode->gfid) &&
a3470f
-                    (!local->op_ret) && (IA_ISDIR (local->stbuf.ia_type)))
a3470f
-                        (void) dht_mark_mds_subvolume (frame, this);
a3470f
-
a3470f
-                if (local->need_xattr_heal) {
a3470f
-                        local->need_xattr_heal = 0;
a3470f
-                        ret =  dht_dir_xattr_heal (this, local);
a3470f
-                        if (ret)
a3470f
-                                gf_msg (this->name, GF_LOG_ERROR,
a3470f
-                                        ret, DHT_MSG_DIR_XATTR_HEAL_FAILED,
a3470f
-                                        "xattr heal failed for directory %s "
a3470f
-                                        " gfid %s ", local->loc.path,
a3470f
-                                        gfid);
a3470f
+                if (IA_ISDIR (local->stbuf.ia_type)) {
a3470f
+                        /* Code to update all extended attributed from hashed
a3470f
+                           subvol to local->xattr and call heal code to heal
a3470f
+                           custom xattr from hashed subvol to non-hashed subvol
a3470f
+                        */
a3470f
+                        if (local->need_xattr_heal && (local->mds_xattr)) {
a3470f
+                                dht_dir_set_heal_xattr (this, local,
a3470f
+                                                        local->xattr,
a3470f
+                                                        local->mds_xattr, NULL,
a3470f
+                                                        NULL);
a3470f
+                                dict_unref (local->mds_xattr);
a3470f
+                                local->mds_xattr = NULL;
a3470f
+                                local->need_xattr_heal = 0;
a3470f
+                                ret =  dht_dir_xattr_heal (this, local);
a3470f
+                                if (ret)
a3470f
+                                        gf_msg (this->name, GF_LOG_ERROR,
a3470f
+                                                ret, DHT_MSG_DIR_XATTR_HEAL_FAILED,
a3470f
+                                                "xattr heal failed for directory %s "
a3470f
+                                                " gfid %s ", local->loc.path,
a3470f
+                                                gfid);
a3470f
+                        } else {
a3470f
+                                /* Call function to save hashed subvol on inode
a3470f
+                                   ctx if internal mds xattr is not present and
a3470f
+                                   all subvols are up
a3470f
+                                */
a3470f
+                                if (inode && !__is_root_gfid (inode->gfid) &&
a3470f
+                                    (!local->op_ret))
a3470f
+                                        (void) dht_common_mark_mdsxattr (frame, NULL, 1);
a3470f
+                        }
a3470f
                 }
a3470f
                 if (local->need_selfheal) {
a3470f
                         local->need_selfheal = 0;
a3470f
@@ -3629,6 +3676,28 @@ int32_t dht_dict_set_array (dict_t *dict, char *key, int32_t value[],
a3470f
         return ret;
a3470f
 }
a3470f
 
a3470f
+int
a3470f
+dht_common_mds_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
a3470f
+                            int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
a3470f
+{
a3470f
+        dht_local_t *local = NULL;
a3470f
+        call_frame_t  *prev  = cookie;
a3470f
+
a3470f
+        local = frame->local;
a3470f
+
a3470f
+        if (op_ret)
a3470f
+                gf_msg_debug (this->name, op_errno,
a3470f
+                              "subvolume %s returned -1",
a3470f
+                              prev->this->name);
a3470f
+
a3470f
+        if (local->fop == GF_FOP_SETXATTR) {
a3470f
+                DHT_STACK_UNWIND (setxattr, frame, 0, op_errno, local->xdata);
a3470f
+        } else {
a3470f
+                DHT_STACK_UNWIND (fsetxattr, frame, 0, op_errno, local->xdata);
a3470f
+        }
a3470f
+        return 0;
a3470f
+}
a3470f
+
a3470f
 /* Code to wind a xattrop call to add 1 on current mds internal xattr
a3470f
    value
a3470f
 */
a3470f
@@ -3682,13 +3751,13 @@ dht_setxattr_non_mds_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
a3470f
                                 goto out;
a3470f
                         }
a3470f
                         if (local->fop == GF_FOP_SETXATTR) {
a3470f
-                                STACK_WIND (frame, dht_common_xattrop_cbk,
a3470f
+                                STACK_WIND (frame, dht_common_mds_xattrop_cbk,
a3470f
                                             local->mds_subvol,
a3470f
                                             local->mds_subvol->fops->xattrop,
a3470f
                                             &local->loc, GF_XATTROP_ADD_ARRAY,
a3470f
                                             xattrop, NULL);
a3470f
                         } else {
a3470f
-                                STACK_WIND (frame, dht_common_xattrop_cbk,
a3470f
+                                STACK_WIND (frame, dht_common_mds_xattrop_cbk,
a3470f
                                             local->mds_subvol,
a3470f
                                             local->mds_subvol->fops->fxattrop,
a3470f
                                             local->fd, GF_XATTROP_ADD_ARRAY,
a3470f
@@ -8822,15 +8891,11 @@ dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie,
a3470f
 
a3470f
         if (gf_uuid_is_null (local->loc.gfid))
a3470f
                 gf_uuid_copy (local->loc.gfid, stbuf->ia_gfid);
a3470f
-        if (local->call_cnt == 0) {
a3470f
-                /*Unlock namespace lock once mkdir is done on all subvols*/
a3470f
-                dht_unlock_namespace (frame, &local->lock[0]);
a3470f
-                FRAME_SU_DO (frame, dht_local_t);
a3470f
-                dht_selfheal_directory (frame, dht_mkdir_selfheal_cbk,
a3470f
-                                        &local->loc, layout);
a3470f
-        }
a3470f
 
a3470f
         /* Set hashed subvol as a mds subvol on inode ctx */
a3470f
+        /*if (!local->inode)
a3470f
+                local->inode  = inode_ref (inode);
a3470f
+        */
a3470f
         ret = dht_inode_ctx_mdsvol_set (local->inode, this, hashed_subvol);
a3470f
         if (ret) {
a3470f
                 gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
a3470f
@@ -8838,6 +8903,15 @@ dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie,
a3470f
                         local->loc.path, hashed_subvol->name);
a3470f
         }
a3470f
 
a3470f
+        if (local->call_cnt == 0) {
a3470f
+                /*Unlock namespace lock once mkdir is done on all subvols*/
a3470f
+                dht_unlock_namespace (frame, &local->lock[0]);
a3470f
+                FRAME_SU_DO (frame, dht_local_t);
a3470f
+                dht_selfheal_directory (frame, dht_mkdir_selfheal_cbk,
a3470f
+                                        &local->loc, layout);
a3470f
+                return 0;
a3470f
+        }
a3470f
+
a3470f
         for (i = 0; i < conf->subvolume_cnt; i++) {
a3470f
                 if (conf->subvolumes[i] == hashed_subvol)
a3470f
                         continue;
a3470f
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
a3470f
index 2aa7251..a785876 100644
a3470f
--- a/xlators/cluster/dht/src/dht-common.h
a3470f
+++ b/xlators/cluster/dht/src/dht-common.h
a3470f
@@ -381,6 +381,7 @@ struct dht_local {
a3470f
         /* This is use only for directory operation */
a3470f
         int32_t valid;
a3470f
         gf_boolean_t heal_layout;
a3470f
+        int32_t mds_heal_fresh_lookup;
a3470f
 };
a3470f
 typedef struct dht_local dht_local_t;
a3470f
 
a3470f
@@ -1463,12 +1464,13 @@ xlator_t *
a3470f
 dht_inode_get_hashed_subvol (inode_t *inode, xlator_t *this, loc_t *loc);
a3470f
 
a3470f
 int
a3470f
-dht_mark_mds_subvolume (call_frame_t *frame, xlator_t *this);
a3470f
+dht_common_mark_mdsxattr (call_frame_t *frame, int *errst, int flag);
a3470f
 
a3470f
 int
a3470f
-dht_mds_internal_setxattr_cbk (call_frame_t *frame, void *cookie,
a3470f
-                               xlator_t *this, int op_ret, int op_errno,
a3470f
-                               dict_t *xdata);
a3470f
+dht_common_mark_mdsxattr_cbk (call_frame_t *frame, void *cookie,
a3470f
+                              xlator_t *this, int op_ret, int op_errno,
a3470f
+                              dict_t *xdata);
a3470f
+
a3470f
 int
a3470f
 dht_inode_ctx_mdsvol_set (inode_t *inode, xlator_t *this,
a3470f
                           xlator_t *mds_subvol);
a3470f
@@ -1476,4 +1478,8 @@ int
a3470f
 dht_inode_ctx_mdsvol_get (inode_t *inode, xlator_t *this,
a3470f
                           xlator_t **mdsvol);
a3470f
 
a3470f
+int
a3470f
+dht_selfheal_dir_setattr (call_frame_t *frame, loc_t *loc, struct iatt *stbuf,
a3470f
+                          int32_t valid, dht_layout_t *layout);
a3470f
+
a3470f
 #endif/* _DHT_H */
a3470f
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
a3470f
index 1707e08..c2c4034 100644
a3470f
--- a/xlators/cluster/dht/src/dht-selfheal.c
a3470f
+++ b/xlators/cluster/dht/src/dht-selfheal.c
a3470f
@@ -1159,141 +1159,6 @@ dht_selfheal_dir_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
a3470f
         return 0;
a3470f
 }
a3470f
 
a3470f
-int
a3470f
-dht_selfheal_dir_check_set_mdsxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
a3470f
-                                         int op_ret, int op_errno, dict_t *xdata)
a3470f
-{
a3470f
-        dht_local_t  *local = NULL;
a3470f
-        xlator_t     *prev  = cookie;
a3470f
-        int           ret   = -1;
a3470f
-        dht_conf_t   *conf  = 0;
a3470f
-
a3470f
-        GF_VALIDATE_OR_GOTO (this->name, frame, out);
a3470f
-        GF_VALIDATE_OR_GOTO (this->name, frame->local, out);
a3470f
-
a3470f
-        local = frame->local;
a3470f
-        conf = this->private;
a3470f
-
a3470f
-        if (op_ret) {
a3470f
-                gf_msg_debug (this->name, op_ret,
a3470f
-                              "internal mds setxattr %s is failed on mds subvol "
a3470f
-                              "at the time of heal on path %s " ,
a3470f
-                               conf->mds_xattr_key, local->loc.path);
a3470f
-        } else {
a3470f
-                /* Save mds subvol on inode ctx */
a3470f
-                ret = dht_inode_ctx_mdsvol_set (local->inode, this, prev);
a3470f
-                if (ret) {
a3470f
-                        gf_msg (this->name, GF_LOG_ERROR, 0,
a3470f
-                                DHT_MSG_SET_INODE_CTX_FAILED,
a3470f
-                                "Failed to set hashed subvol "
a3470f
-                                " %s for %s ", prev->name,
a3470f
-                                local->loc.path);
a3470f
-                }
a3470f
-        }
a3470f
-
a3470f
-out:
a3470f
-        DHT_STACK_DESTROY (frame);
a3470f
-        return 0;
a3470f
-}
a3470f
-
a3470f
-/* Code to set internal mds xattr if it is not present
a3470f
-*/
a3470f
-int
a3470f
-dht_selfheal_dir_check_set_mdsxattr (call_frame_t *frame, loc_t *loc)
a3470f
-{
a3470f
-        dht_local_t  *local          = NULL;
a3470f
-        xlator_t     *this           = NULL;
a3470f
-        xlator_t     *hashed_subvol  = NULL;
a3470f
-        int ret                      = -1;
a3470f
-        dict_t       *xattrs         = NULL;
a3470f
-        char          gfid_local[GF_UUID_BUF_SIZE] = {0,};
a3470f
-        int32_t       zero[1]        = {0};
a3470f
-        call_frame_t *xattr_frame    = NULL;
a3470f
-        dht_local_t  *copy_local     = NULL;
a3470f
-        dht_conf_t   *conf           = 0;
a3470f
-
a3470f
-        local = frame->local;
a3470f
-        this = frame->this;
a3470f
-        conf = this->private;
a3470f
-        gf_uuid_unparse(local->gfid, gfid_local);
a3470f
-
a3470f
-        if (!dict_get (local->xattr, conf->mds_xattr_key)) {
a3470f
-                /* It means no internal MDS xattr has been set yet
a3470f
-                */
a3470f
-                /* Calculate hashed subvol based on inode and
a3470f
-                   parent inode
a3470f
-                */
a3470f
-                hashed_subvol = dht_inode_get_hashed_subvol (local->inode, this,
a3470f
-                                                             loc);
a3470f
-                if (!hashed_subvol) {
a3470f
-                        gf_msg (this->name, GF_LOG_DEBUG, 0,
a3470f
-                                DHT_MSG_HASHED_SUBVOL_GET_FAILED,
a3470f
-                                "Failed to get hashed subvol for path %s"
a3470f
-                                "gfid is %s ",
a3470f
-                                local->loc.path, gfid_local);
a3470f
-                        ret = -1;
a3470f
-                        goto out;
a3470f
-                } else {
a3470f
-                        /* Set internal mds xattr on disk   */
a3470f
-                        xattrs = dict_new ();
a3470f
-                        if (!xattrs) {
a3470f
-                                gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
a3470f
-                                        DHT_MSG_NO_MEMORY, "dict_new failed");
a3470f
-                                ret = -1;
a3470f
-                                goto out;
a3470f
-                        }
a3470f
-                        /* Add internal MDS xattr on disk for hashed subvol
a3470f
-                        */
a3470f
-                        ret = dht_dict_set_array (xattrs, conf->mds_xattr_key, zero, 1);
a3470f
-                        if (ret) {
a3470f
-                                gf_msg (this->name, GF_LOG_WARNING, ENOMEM,
a3470f
-                                        DHT_MSG_DICT_SET_FAILED,
a3470f
-                                        "Failed to set dictionary"
a3470f
-                                        "  value:key = %s for "
a3470f
-                                        "path %s", conf->mds_xattr_key,
a3470f
-                                        local->loc.path);
a3470f
-                                ret = -1;
a3470f
-                                goto out;
a3470f
-                        }
a3470f
-
a3470f
-                        xattr_frame = create_frame (this, this->ctx->pool);
a3470f
-                        if (!xattr_frame) {
a3470f
-                                ret = -1;
a3470f
-                                goto out;
a3470f
-                        }
a3470f
-                        copy_local = dht_local_init (xattr_frame, &(local->loc),
a3470f
-                                                     NULL, 0);
a3470f
-                        if (!copy_local) {
a3470f
-                                ret = -1;
a3470f
-                                DHT_STACK_DESTROY (xattr_frame);
a3470f
-                                goto out;
a3470f
-                        }
a3470f
-
a3470f
-                        copy_local->stbuf = local->stbuf;
a3470f
-                        copy_local->inode = inode_ref (local->inode);
a3470f
-                        gf_uuid_copy (copy_local->loc.gfid, local->gfid);
a3470f
-
a3470f
-                        STACK_WIND_COOKIE (xattr_frame,
a3470f
-                                           dht_selfheal_dir_check_set_mdsxattr_cbk,
a3470f
-                                           (void *)hashed_subvol, hashed_subvol,
a3470f
-                                           hashed_subvol->fops->setxattr,
a3470f
-                                           loc, xattrs, 0, NULL);
a3470f
-                        ret = 0;
a3470f
-                }
a3470f
-        } else {
a3470f
-                ret = 0;
a3470f
-                gf_msg_debug (this->name, 0,
a3470f
-                              "internal xattr %s is present on subvol"
a3470f
-                              "on path %s gfid is %s " , conf->mds_xattr_key,
a3470f
-                               local->loc.path, gfid_local);
a3470f
-        }
a3470f
-
a3470f
-out:
a3470f
-        if (xattrs)
a3470f
-                dict_unref (xattrs);
a3470f
-        return ret;
a3470f
-}
a3470f
-
a3470f
 
a3470f
 int
a3470f
 dht_selfheal_dir_setattr (call_frame_t *frame, loc_t *loc, struct iatt *stbuf,
a3470f
@@ -1313,32 +1178,6 @@ dht_selfheal_dir_setattr (call_frame_t *frame, loc_t *loc, struct iatt *stbuf,
a3470f
                         missing_attr++;
a3470f
         }
a3470f
 
a3470f
-        if (!__is_root_gfid (local->stbuf.ia_gfid)) {
a3470f
-                if (local->need_xattr_heal) {
a3470f
-                        local->need_xattr_heal = 0;
a3470f
-                        ret =  dht_dir_xattr_heal (this, local);
a3470f
-                        if (ret)
a3470f
-                                gf_msg (this->name, GF_LOG_ERROR,
a3470f
-                                        ret,
a3470f
-                                        DHT_MSG_DIR_XATTR_HEAL_FAILED,
a3470f
-                                        "xattr heal failed for "
a3470f
-                                        "directory  %s gfid %s ",
a3470f
-                                        local->loc.path,
a3470f
-                                        local->gfid);
a3470f
-                } else {
a3470f
-                        ret = dht_selfheal_dir_check_set_mdsxattr (frame, loc);
a3470f
-                        if (ret)
a3470f
-                                gf_msg (this->name, GF_LOG_INFO, ret,
a3470f
-                                        DHT_MSG_DIR_XATTR_HEAL_FAILED,
a3470f
-                                        "set mds internal xattr failed for "
a3470f
-                                        "directory  %s gfid %s ", local->loc.path,
a3470f
-                                        local->gfid);
a3470f
-                }
a3470f
-        }
a3470f
-
a3470f
-        if (!gf_uuid_is_null (local->gfid))
a3470f
-                gf_uuid_copy (loc->gfid, local->gfid);
a3470f
-
a3470f
         if (missing_attr == 0) {
a3470f
                 if (!local->heal_layout) {
a3470f
                         gf_msg_trace (this->name, 0,
a3470f
@@ -1789,6 +1628,33 @@ dht_selfheal_dir_mkdir (call_frame_t *frame, loc_t *loc,
a3470f
         }
a3470f
 
a3470f
         if (missing_dirs == 0) {
a3470f
+                if (!__is_root_gfid (local->stbuf.ia_gfid)) {
a3470f
+                        if (local->need_xattr_heal) {
a3470f
+                                local->need_xattr_heal = 0;
a3470f
+                                ret =  dht_dir_xattr_heal (this, local);
a3470f
+                                if (ret)
a3470f
+                                        gf_msg (this->name, GF_LOG_ERROR,
a3470f
+                                                ret,
a3470f
+                                                DHT_MSG_DIR_XATTR_HEAL_FAILED,
a3470f
+                                                "xattr heal failed for "
a3470f
+                                                "directory  %s gfid %s ",
a3470f
+                                                local->loc.path,
a3470f
+                                                local->gfid);
a3470f
+                        } else {
a3470f
+                                if (!gf_uuid_is_null (local->gfid))
a3470f
+                                        gf_uuid_copy (loc->gfid, local->gfid);
a3470f
+
a3470f
+                                ret = dht_common_mark_mdsxattr (frame, NULL, 0);
a3470f
+                                if (!ret)
a3470f
+                                        return 0;
a3470f
+
a3470f
+                                gf_msg (this->name, GF_LOG_INFO, 0,
a3470f
+                                        DHT_MSG_DIR_XATTR_HEAL_FAILED,
a3470f
+                                        "Failed to set mds xattr "
a3470f
+                                        "for directory  %s gfid %s ",
a3470f
+                                        local->loc.path, local->gfid);
a3470f
+                        }
a3470f
+                }
a3470f
                 dht_selfheal_dir_setattr (frame, loc, &local->stbuf,
a3470f
                                           0xffffffff, layout);
a3470f
                 return 0;
a3470f
-- 
a3470f
1.8.3.1
a3470f