e7a346
From edd4d523869cc65c389253a225b02c578ad3af85 Mon Sep 17 00:00:00 2001
e7a346
From: Mohit Agrawal <moagrawa@redhat.com>
e7a346
Date: Fri, 6 Oct 2017 15:13:02 +0530
e7a346
Subject: [PATCH 214/236] cluster/dht: Serialize mds update code path with
e7a346
 lookup unwind in selfheal
e7a346
e7a346
Problem: Sometimes the test case ./tests/bugs/bug-1371806_1.t fails on
e7a346
         CentOS due to a race condition between fresh lookup and setxattr fop.
e7a346
e7a346
Solution: In the selfheal code path we save the mds on inode_ctx, but this
e7a346
          was not serialized with the lookup unwind. Because of this, if the
e7a346
          mds was not yet saved on inode_ctx after the lookup unwind, any
e7a346
          subsequent setxattr fop failed with ENOENT because no mds was
e7a346
          found on the inode ctx. To resolve it, saving the mds on the
e7a346
          inode ctx has been serialized with the lookup unwind.
e7a346
e7a346
> BUG: 1498966
e7a346
> Change-Id: I8d4bb40a6cbf0cec35d181ec0095cc7142b02e29
e7a346
> Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
e7a346
> (cherry picked from commit https://review.gluster.org/#/c/18436/)
e7a346
> (Upstream patch link https://review.gluster.org/#/c/18436/)
e7a346
e7a346
BUG: 1550315
e7a346
Change-Id: I0d3c03cb6ab9a3729f8c4219fd54058d97ed526b
e7a346
Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
e7a346
Reviewed-on: https://code.engineering.redhat.com/gerrit/134282
e7a346
Tested-by: RHGS Build Bot <nigelb@redhat.com>
e7a346
Reviewed-by: Nithya Balachandran <nbalacha@redhat.com>
e7a346
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
e7a346
---
e7a346
 tests/bugs/bug-1371806_1.t             |   1 -
e7a346
 xlators/cluster/dht/src/dht-common.c   | 314 ++++++++++++++++++++-------------
e7a346
 xlators/cluster/dht/src/dht-common.h   |  14 +-
e7a346
 xlators/cluster/dht/src/dht-selfheal.c | 188 +++-----------------
e7a346
 4 files changed, 231 insertions(+), 286 deletions(-)
e7a346
e7a346
diff --git a/tests/bugs/bug-1371806_1.t b/tests/bugs/bug-1371806_1.t
e7a346
index 44a57a9..df19a8c 100644
e7a346
--- a/tests/bugs/bug-1371806_1.t
e7a346
+++ b/tests/bugs/bug-1371806_1.t
e7a346
@@ -46,4 +46,3 @@ EXPECT "abc" get_getfattr ./tmp{1..10}
e7a346
 
e7a346
 cd -
e7a346
 cleanup
e7a346
-exit
e7a346
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
e7a346
index 6319a87..2fd145d 100644
e7a346
--- a/xlators/cluster/dht/src/dht-common.c
e7a346
+++ b/xlators/cluster/dht/src/dht-common.c
e7a346
@@ -579,6 +579,7 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)
e7a346
         uint32_t         vol_commit_hash = 0;
e7a346
         xlator_t        *source          = NULL;
e7a346
         int              heal_path       = 0;
e7a346
+        int              error_while_marking_mds   = 0;
e7a346
         int              i               = 0;
e7a346
         loc_t            loc             = {0 };
e7a346
         int8_t           is_read_only    = 0, layout_anomalies = 0;
e7a346
@@ -684,7 +685,8 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)
e7a346
                    internal mds xattr is not present and all subvols are up
e7a346
                 */
e7a346
                 if (!local->op_ret && !__is_root_gfid (local->stbuf.ia_gfid))
e7a346
-                        (void) dht_mark_mds_subvolume (discover_frame, this);
e7a346
+                        (void) dht_common_mark_mdsxattr (discover_frame,
e7a346
+                                                         &error_while_marking_mds, 1);
e7a346
 
e7a346
                 if (local->need_xattr_heal && !heal_path) {
e7a346
                         local->need_xattr_heal = 0;
e7a346
@@ -699,7 +701,7 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)
e7a346
                 }
e7a346
         }
e7a346
 
e7a346
-        if (source && (heal_path || layout_anomalies)) {
e7a346
+        if (source && (heal_path || layout_anomalies || error_while_marking_mds)) {
e7a346
                 gf_uuid_copy (loc.gfid, local->gfid);
e7a346
                 if (gf_uuid_is_null (loc.gfid)) {
e7a346
                         goto done;
e7a346
@@ -761,62 +763,82 @@ out:
e7a346
 }
e7a346
 
e7a346
 int
e7a346
-dht_mds_internal_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
-                               int op_ret, int op_errno, dict_t *xdata)
e7a346
+dht_common_mark_mdsxattr_cbk (call_frame_t *frame, void *cookie,
e7a346
+                              xlator_t *this, int op_ret, int op_errno,
e7a346
+                              dict_t *xdata)
e7a346
 {
e7a346
-        dht_local_t  *local                   = NULL;
e7a346
-        xlator_t     *hashed_subvol           = NULL;
e7a346
-        dht_conf_t   *conf                    = NULL;
e7a346
-        int           ret                     = 0;
e7a346
+        dht_local_t  *local = NULL;
e7a346
+        xlator_t     *prev  = cookie;
e7a346
+        int           ret   = -1;
e7a346
+        dht_conf_t   *conf  = 0;
e7a346
+        dht_layout_t *layout = NULL;
e7a346
 
e7a346
         GF_VALIDATE_OR_GOTO (this->name, frame, out);
e7a346
         GF_VALIDATE_OR_GOTO (this->name, frame->local, out);
e7a346
 
e7a346
         local = frame->local;
e7a346
-        hashed_subvol  = cookie;
e7a346
         conf = this->private;
e7a346
+        layout = local->selfheal.layout;
e7a346
 
e7a346
         if (op_ret) {
e7a346
                 gf_msg_debug (this->name, op_ret,
e7a346
-                              "Failed to set %s on the MDS for path %s. ",
e7a346
-                              conf->mds_xattr_key, local->loc.path);
e7a346
+                              "Failed to set %s on the MDS %s for path %s. ",
e7a346
+                               conf->mds_xattr_key, prev->name, local->loc.path);
e7a346
         } else {
e7a346
-               /* Save mds subvol on inode ctx */
e7a346
-                ret = dht_inode_ctx_mdsvol_set (local->inode, this,
e7a346
-                                                hashed_subvol);
e7a346
+                /* Save mds subvol on inode ctx */
e7a346
+                ret = dht_inode_ctx_mdsvol_set (local->inode, this, prev);
e7a346
                 if (ret) {
e7a346
                         gf_msg (this->name, GF_LOG_ERROR, 0,
e7a346
                                 DHT_MSG_SET_INODE_CTX_FAILED,
e7a346
                                 "Failed to set mds subvol on inode ctx"
e7a346
-                                " %s for %s", hashed_subvol->name,
e7a346
+                                " %s for %s ", prev->name,
e7a346
                                 local->loc.path);
e7a346
                 }
e7a346
         }
e7a346
+        if (!local->mds_heal_fresh_lookup && layout) {
e7a346
+                dht_selfheal_dir_setattr (frame, &local->loc, &local->stbuf,
e7a346
+                                          0xffffffff, layout);
e7a346
+        }
e7a346
 out:
e7a346
-        DHT_STACK_DESTROY (frame);
e7a346
+        if (local && local->mds_heal_fresh_lookup)
e7a346
+                DHT_STACK_DESTROY (frame);
e7a346
         return 0;
e7a346
 }
e7a346
 
e7a346
 
e7a346
 
e7a346
-/* Code to save hashed subvol on inode ctx only while no
e7a346
-   mds xattr is availble and all subvols are up for fresh
e7a346
+/* Common function called by the revalidate/selfheal code paths to populate
e7a346
+   the internal xattr if it is not present. The mark_during_fresh_lookup
e7a346
+   value tells whether the function is called by revalidate_cbk
e7a346
+   (discover_complete) or by the selfheal code path during a fresh lookup.
e7a346
+   The call is wound serially in case of a fresh lookup and in parallel
e7a346
+   for the other lookup code paths. The reason to wind the call serially
e7a346
+   is that at the time of a fresh lookup the directory is not yet
e7a346
+   discovered, while at the time of a revalidate lookup the directory is
e7a346
+   already discovered. So, the revalidate codepath can race with the setxattr
e7a346
+   codepath and can get into spurious heals because of an ongoing setxattr.
e7a346
+   This can slow down revalidates if healing happens in the foreground.
e7a346
+   However, if healing happens in the background, there is no direct
e7a346
+   performance penalty.
e7a346
 */
e7a346
 int
e7a346
-dht_mark_mds_subvolume (call_frame_t *frame, xlator_t *this)
e7a346
+dht_common_mark_mdsxattr (call_frame_t *frame, int *errst, int mark_during_fresh_lookup)
e7a346
 {
e7a346
-        dht_local_t  *local                   = NULL;
e7a346
-        xlator_t     *hashed_subvol           = NULL;
e7a346
-        int           i                       = 0;
e7a346
-        gf_boolean_t  vol_down                = _gf_false;
e7a346
-        dht_conf_t   *conf                    = 0;
e7a346
-        int           ret                     = -1;
e7a346
-        char          gfid_local[GF_UUID_BUF_SIZE] = {0};
e7a346
-        dict_t       *xattrs                      = NULL;
e7a346
-        dht_local_t  *copy_local                  = NULL;
e7a346
-        call_frame_t *xattr_frame                 = NULL;
e7a346
-        int32_t       zero[1]                     = {0};
e7a346
+        dht_local_t  *local          = NULL;
e7a346
+        xlator_t     *this           = NULL;
e7a346
+        xlator_t     *hashed_subvol  = NULL;
e7a346
+        int           ret            = 0;
e7a346
+        int           i              = 0;
e7a346
+        dict_t       *xattrs         = NULL;
e7a346
+        char          gfid_local[GF_UUID_BUF_SIZE] = {0,};
e7a346
+        int32_t       zero[1]        = {0};
e7a346
+        dht_conf_t   *conf           = 0;
e7a346
+        dht_layout_t *layout         = NULL;
e7a346
+        dht_local_t  *copy_local     = NULL;
e7a346
+        call_frame_t *xattr_frame    = NULL;
e7a346
+        gf_boolean_t  vol_down       = _gf_false;
e7a346
 
e7a346
+        this = frame->this;
e7a346
 
e7a346
         GF_VALIDATE_OR_GOTO ("dht", frame, out);
e7a346
         GF_VALIDATE_OR_GOTO ("dht", this, out);
e7a346
@@ -825,66 +847,78 @@ dht_mark_mds_subvolume (call_frame_t *frame, xlator_t *this)
e7a346
 
e7a346
         local = frame->local;
e7a346
         conf = this->private;
e7a346
+        layout = local->selfheal.layout;
e7a346
+        local->mds_heal_fresh_lookup = mark_during_fresh_lookup;
e7a346
         gf_uuid_unparse(local->gfid, gfid_local);
e7a346
 
e7a346
-
e7a346
         /* Code to update hashed subvol consider as a mds subvol
e7a346
-           and save on inode ctx if all subvols are up and no internal
e7a346
-           xattr has been set yet
e7a346
+           and wind a setxattr call on hashed subvol to update
e7a346
+           internal xattr
e7a346
         */
e7a346
         if (!dict_get (local->xattr, conf->mds_xattr_key)) {
e7a346
                 /* It means no internal MDS xattr has been set yet
e7a346
                 */
e7a346
-                /* Check the status of all subvol are up
e7a346
+                /* Check that all subvols are up when this function
e7a346
+                   is called by the lookup code path
e7a346
                 */
e7a346
-                for (i = 0; i < conf->subvolume_cnt; i++) {
e7a346
-                        if (!conf->subvolume_status[i]) {
e7a346
-                                vol_down = _gf_true;
e7a346
-                                break;
e7a346
+                if (mark_during_fresh_lookup) {
e7a346
+                        for (i = 0; i < conf->subvolume_cnt; i++) {
e7a346
+                                if (!conf->subvolume_status[i]) {
e7a346
+                                        vol_down = _gf_true;
e7a346
+                                        break;
e7a346
+                                }
e7a346
+                        }
e7a346
+                        if (vol_down) {
e7a346
+                                gf_msg_debug (this->name, 0,
e7a346
+                                              "subvol %s is down. Unable to "
e7a346
+                                              " save mds subvol on inode for "
e7a346
+                                              " path %s gfid is %s " ,
e7a346
+                                              conf->subvolumes[i]->name,
e7a346
+                                              local->loc.path, gfid_local);
e7a346
+                                goto out;
e7a346
                         }
e7a346
                 }
e7a346
-                if (vol_down) {
e7a346
-                        ret = 0;
e7a346
-                        gf_msg_debug (this->name, 0,
e7a346
-                                      "subvol %s is down. Unable to "
e7a346
-                                      " save mds subvol on inode for "
e7a346
-                                      " path %s gfid is %s " ,
e7a346
-                                      conf->subvolumes[i]->name, local->loc.path,
e7a346
-                                      gfid_local);
e7a346
-                       goto out;
e7a346
-                }
e7a346
-                /* Calculate hashed subvol based on inode and
e7a346
-                   parent inode
e7a346
+
e7a346
+                /* Calculate hashed subvol based on inode and parent node
e7a346
                 */
e7a346
-                hashed_subvol = dht_inode_get_hashed_subvol (local->inode,
e7a346
-                                                             this, &local->loc);
e7a346
+                hashed_subvol = dht_inode_get_hashed_subvol (local->inode, this,
e7a346
+                                                             &local->loc);
e7a346
                 if (!hashed_subvol) {
e7a346
                         gf_msg (this->name, GF_LOG_DEBUG, 0,
e7a346
                                 DHT_MSG_HASHED_SUBVOL_GET_FAILED,
e7a346
                                 "Failed to get hashed subvol for path %s"
e7a346
-                                " gfid is %s ",
e7a346
+                                "gfid is %s ",
e7a346
                                 local->loc.path, gfid_local);
e7a346
-                } else {
e7a346
-                        xattrs = dict_new ();
e7a346
-                        if (!xattrs) {
e7a346
-                                gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
e7a346
-                                        DHT_MSG_NO_MEMORY, "dict_new failed");
e7a346
-                                ret = -1;
e7a346
-                                goto out;
e7a346
-                        }
e7a346
-                        /* Add internal MDS xattr on disk for hashed subvol
e7a346
-                        */
e7a346
-                        ret = dht_dict_set_array (xattrs, conf->mds_xattr_key, zero, 1);
e7a346
-                        if (ret) {
e7a346
-                                gf_msg (this->name, GF_LOG_WARNING, ENOMEM,
e7a346
-                                        DHT_MSG_DICT_SET_FAILED,
e7a346
-                                        "Failed to set dictionary"
e7a346
-                                        "  value:key = %s for "
e7a346
-                                        "path %s", conf->mds_xattr_key,
e7a346
-                                        local->loc.path);
e7a346
-                                ret = -1;
e7a346
-                                goto out;
e7a346
-                        }
e7a346
+                        (*errst) = 1;
e7a346
+                        ret = -1;
e7a346
+                        goto out;
e7a346
+                }
e7a346
+                xattrs = dict_new ();
e7a346
+                if (!xattrs) {
e7a346
+                        gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
e7a346
+                                DHT_MSG_NO_MEMORY, "dict_new failed");
e7a346
+                        ret = -1;
e7a346
+                        goto out;
e7a346
+                }
e7a346
+                /* Add internal MDS xattr on disk for hashed subvol
e7a346
+                */
e7a346
+                ret = dht_dict_set_array (xattrs, conf->mds_xattr_key,
e7a346
+                                          zero, 1);
e7a346
+                if (ret) {
e7a346
+                        gf_msg (this->name, GF_LOG_WARNING, ENOMEM,
e7a346
+                                DHT_MSG_DICT_SET_FAILED,
e7a346
+                                "Failed to set dictionary"
e7a346
+                                "  value:key = %s for "
e7a346
+                                "path %s", conf->mds_xattr_key,
e7a346
+                                local->loc.path);
e7a346
+                        ret = -1;
e7a346
+                        goto out;
e7a346
+                }
e7a346
+                /* Create a new frame to wind the call only when this
e7a346
+                   function is called by the revalidate_cbk code path.
e7a346
+                   To wind the call in parallel a new frame is needed.
e7a346
+                */
e7a346
+                if (mark_during_fresh_lookup) {
e7a346
                         xattr_frame = create_frame (this, this->ctx->pool);
e7a346
                         if (!xattr_frame) {
e7a346
                                 ret = -1;
e7a346
@@ -898,32 +932,42 @@ dht_mark_mds_subvolume (call_frame_t *frame, xlator_t *this)
e7a346
                                 goto out;
e7a346
                         }
e7a346
                         copy_local->stbuf = local->stbuf;
e7a346
+                        copy_local->mds_heal_fresh_lookup = mark_during_fresh_lookup;
e7a346
                         if (!copy_local->inode)
e7a346
                                 copy_local->inode = inode_ref (local->inode);
e7a346
                         gf_uuid_copy (copy_local->loc.gfid, local->gfid);
e7a346
-                        STACK_WIND_COOKIE (xattr_frame, dht_mds_internal_setxattr_cbk,
e7a346
+                        FRAME_SU_DO (xattr_frame, dht_local_t);
e7a346
+                        STACK_WIND_COOKIE (xattr_frame, dht_common_mark_mdsxattr_cbk,
e7a346
                                            hashed_subvol, hashed_subvol,
e7a346
                                            hashed_subvol->fops->setxattr,
e7a346
                                            &local->loc, xattrs, 0, NULL);
e7a346
-                        ret = 0;
e7a346
+                } else {
e7a346
+                        STACK_WIND_COOKIE (frame,
e7a346
+                                           dht_common_mark_mdsxattr_cbk,
e7a346
+                                           (void *)hashed_subvol,
e7a346
+                                           hashed_subvol,
e7a346
+                                           hashed_subvol->fops->setxattr,
e7a346
+                                           &local->loc, xattrs, 0,
e7a346
+                                           NULL);
e7a346
                 }
e7a346
         } else {
e7a346
-                ret = 0;
e7a346
                 gf_msg_debug (this->name, 0,
e7a346
                               "internal xattr %s is present on subvol"
e7a346
                               "on path %s gfid is %s " , conf->mds_xattr_key,
e7a346
                                local->loc.path, gfid_local);
e7a346
+                if (!mark_during_fresh_lookup)
e7a346
+                        dht_selfheal_dir_setattr (frame, &local->loc,
e7a346
+                                                  &local->stbuf, 0xffffffff,
e7a346
+                                                  layout);
e7a346
         }
e7a346
 
e7a346
-
e7a346
 out:
e7a346
         if (xattrs)
e7a346
                 dict_unref (xattrs);
e7a346
-       return ret;
e7a346
+        return ret;
e7a346
 }
e7a346
 
e7a346
 
e7a346
-
e7a346
 int
e7a346
 dht_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
                   int op_ret, int op_errno,
e7a346
@@ -1646,11 +1690,11 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
                         } else {
e7a346
                                 check_mds = dht_dict_get_array (xattr, conf->mds_xattr_key,
e7a346
                                                                 mds_xattr_val, 1, &errst);
e7a346
-                                if (local->mds_subvol == prev) {
e7a346
-                                        local->mds_stbuf.ia_gid = stbuf->ia_gid;
e7a346
-                                        local->mds_stbuf.ia_uid = stbuf->ia_uid;
e7a346
-                                        local->mds_stbuf.ia_prot = stbuf->ia_prot;
e7a346
-                                }
e7a346
+                                local->mds_subvol  = prev;
e7a346
+                                local->mds_stbuf.ia_gid = stbuf->ia_gid;
e7a346
+                                local->mds_stbuf.ia_uid = stbuf->ia_uid;
e7a346
+                                local->mds_stbuf.ia_prot = stbuf->ia_prot;
e7a346
+
e7a346
                                 /* save mds subvol on inode ctx */
e7a346
                                 ret = dht_inode_ctx_mdsvol_set (local->inode, this,
e7a346
                                                                 prev);
e7a346
@@ -1672,7 +1716,6 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
                                                       local->loc.path,
e7a346
                                                       prev->name, gfid);
e7a346
                                         local->need_xattr_heal = 1;
e7a346
-                                        local->mds_subvol  = prev;
e7a346
                                 }
e7a346
                         }
e7a346
                         ret = dht_layout_dir_mismatch (this, layout,
e7a346
@@ -1749,31 +1792,35 @@ out:
e7a346
                 if (conf->subvolume_cnt == 1)
e7a346
                         local->need_xattr_heal = 0;
e7a346
 
e7a346
-                /* Code to update all extended attributed from hashed subvol
e7a346
-                   to local->xattr
e7a346
-                */
e7a346
-                if (local->need_xattr_heal && (local->mds_xattr)) {
e7a346
-                        dht_dir_set_heal_xattr (this, local, local->xattr,
e7a346
-                                                local->mds_xattr, NULL, NULL);
e7a346
-                        dict_unref (local->mds_xattr);
e7a346
-                        local->mds_xattr = NULL;
e7a346
-                }
e7a346
-                /* Call function to save hashed subvol on inode ctx if
e7a346
-                   internal mds xattr is not present and all subvols are up
e7a346
-                */
e7a346
-                if (inode && !__is_root_gfid (inode->gfid) &&
e7a346
-                    (!local->op_ret) && (IA_ISDIR (local->stbuf.ia_type)))
e7a346
-                        (void) dht_mark_mds_subvolume (frame, this);
e7a346
-
e7a346
-                if (local->need_xattr_heal) {
e7a346
-                        local->need_xattr_heal = 0;
e7a346
-                        ret =  dht_dir_xattr_heal (this, local);
e7a346
-                        if (ret)
e7a346
-                                gf_msg (this->name, GF_LOG_ERROR,
e7a346
-                                        ret, DHT_MSG_DIR_XATTR_HEAL_FAILED,
e7a346
-                                        "xattr heal failed for directory %s "
e7a346
-                                        " gfid %s ", local->loc.path,
e7a346
-                                        gfid);
e7a346
+                if (IA_ISDIR (local->stbuf.ia_type)) {
e7a346
+                        /* Code to update all extended attributed from hashed
e7a346
+                           subvol to local->xattr and call heal code to heal
e7a346
+                           custom xattr from hashed subvol to non-hashed subvol
e7a346
+                        */
e7a346
+                        if (local->need_xattr_heal && (local->mds_xattr)) {
e7a346
+                                dht_dir_set_heal_xattr (this, local,
e7a346
+                                                        local->xattr,
e7a346
+                                                        local->mds_xattr, NULL,
e7a346
+                                                        NULL);
e7a346
+                                dict_unref (local->mds_xattr);
e7a346
+                                local->mds_xattr = NULL;
e7a346
+                                local->need_xattr_heal = 0;
e7a346
+                                ret =  dht_dir_xattr_heal (this, local);
e7a346
+                                if (ret)
e7a346
+                                        gf_msg (this->name, GF_LOG_ERROR,
e7a346
+                                                ret, DHT_MSG_DIR_XATTR_HEAL_FAILED,
e7a346
+                                                "xattr heal failed for directory %s "
e7a346
+                                                " gfid %s ", local->loc.path,
e7a346
+                                                gfid);
e7a346
+                        } else {
e7a346
+                                /* Call function to save hashed subvol on inode
e7a346
+                                   ctx if internal mds xattr is not present and
e7a346
+                                   all subvols are up
e7a346
+                                */
e7a346
+                                if (inode && !__is_root_gfid (inode->gfid) &&
e7a346
+                                    (!local->op_ret))
e7a346
+                                        (void) dht_common_mark_mdsxattr (frame, NULL, 1);
e7a346
+                        }
e7a346
                 }
e7a346
                 if (local->need_selfheal) {
e7a346
                         local->need_selfheal = 0;
e7a346
@@ -3629,6 +3676,28 @@ int32_t dht_dict_set_array (dict_t *dict, char *key, int32_t value[],
e7a346
         return ret;
e7a346
 }
e7a346
 
e7a346
+int
e7a346
+dht_common_mds_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
+                            int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
e7a346
+{
e7a346
+        dht_local_t *local = NULL;
e7a346
+        call_frame_t  *prev  = cookie;
e7a346
+
e7a346
+        local = frame->local;
e7a346
+
e7a346
+        if (op_ret)
e7a346
+                gf_msg_debug (this->name, op_errno,
e7a346
+                              "subvolume %s returned -1",
e7a346
+                              prev->this->name);
e7a346
+
e7a346
+        if (local->fop == GF_FOP_SETXATTR) {
e7a346
+                DHT_STACK_UNWIND (setxattr, frame, 0, op_errno, local->xdata);
e7a346
+        } else {
e7a346
+                DHT_STACK_UNWIND (fsetxattr, frame, 0, op_errno, local->xdata);
e7a346
+        }
e7a346
+        return 0;
e7a346
+}
e7a346
+
e7a346
 /* Code to wind a xattrop call to add 1 on current mds internal xattr
e7a346
    value
e7a346
 */
e7a346
@@ -3682,13 +3751,13 @@ dht_setxattr_non_mds_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
                                 goto out;
e7a346
                         }
e7a346
                         if (local->fop == GF_FOP_SETXATTR) {
e7a346
-                                STACK_WIND (frame, dht_common_xattrop_cbk,
e7a346
+                                STACK_WIND (frame, dht_common_mds_xattrop_cbk,
e7a346
                                             local->mds_subvol,
e7a346
                                             local->mds_subvol->fops->xattrop,
e7a346
                                             &local->loc, GF_XATTROP_ADD_ARRAY,
e7a346
                                             xattrop, NULL);
e7a346
                         } else {
e7a346
-                                STACK_WIND (frame, dht_common_xattrop_cbk,
e7a346
+                                STACK_WIND (frame, dht_common_mds_xattrop_cbk,
e7a346
                                             local->mds_subvol,
e7a346
                                             local->mds_subvol->fops->fxattrop,
e7a346
                                             local->fd, GF_XATTROP_ADD_ARRAY,
e7a346
@@ -8822,15 +8891,11 @@ dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie,
e7a346
 
e7a346
         if (gf_uuid_is_null (local->loc.gfid))
e7a346
                 gf_uuid_copy (local->loc.gfid, stbuf->ia_gfid);
e7a346
-        if (local->call_cnt == 0) {
e7a346
-                /*Unlock namespace lock once mkdir is done on all subvols*/
e7a346
-                dht_unlock_namespace (frame, &local->lock[0]);
e7a346
-                FRAME_SU_DO (frame, dht_local_t);
e7a346
-                dht_selfheal_directory (frame, dht_mkdir_selfheal_cbk,
e7a346
-                                        &local->loc, layout);
e7a346
-        }
e7a346
 
e7a346
         /* Set hashed subvol as a mds subvol on inode ctx */
e7a346
+        /*if (!local->inode)
e7a346
+                local->inode  = inode_ref (inode);
e7a346
+        */
e7a346
         ret = dht_inode_ctx_mdsvol_set (local->inode, this, hashed_subvol);
e7a346
         if (ret) {
e7a346
                 gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
e7a346
@@ -8838,6 +8903,15 @@ dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie,
e7a346
                         local->loc.path, hashed_subvol->name);
e7a346
         }
e7a346
 
e7a346
+        if (local->call_cnt == 0) {
e7a346
+                /*Unlock namespace lock once mkdir is done on all subvols*/
e7a346
+                dht_unlock_namespace (frame, &local->lock[0]);
e7a346
+                FRAME_SU_DO (frame, dht_local_t);
e7a346
+                dht_selfheal_directory (frame, dht_mkdir_selfheal_cbk,
e7a346
+                                        &local->loc, layout);
e7a346
+                return 0;
e7a346
+        }
e7a346
+
e7a346
         for (i = 0; i < conf->subvolume_cnt; i++) {
e7a346
                 if (conf->subvolumes[i] == hashed_subvol)
e7a346
                         continue;
e7a346
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
e7a346
index 2aa7251..a785876 100644
e7a346
--- a/xlators/cluster/dht/src/dht-common.h
e7a346
+++ b/xlators/cluster/dht/src/dht-common.h
e7a346
@@ -381,6 +381,7 @@ struct dht_local {
e7a346
         /* This is use only for directory operation */
e7a346
         int32_t valid;
e7a346
         gf_boolean_t heal_layout;
e7a346
+        int32_t mds_heal_fresh_lookup;
e7a346
 };
e7a346
 typedef struct dht_local dht_local_t;
e7a346
 
e7a346
@@ -1463,12 +1464,13 @@ xlator_t *
e7a346
 dht_inode_get_hashed_subvol (inode_t *inode, xlator_t *this, loc_t *loc);
e7a346
 
e7a346
 int
e7a346
-dht_mark_mds_subvolume (call_frame_t *frame, xlator_t *this);
e7a346
+dht_common_mark_mdsxattr (call_frame_t *frame, int *errst, int flag);
e7a346
 
e7a346
 int
e7a346
-dht_mds_internal_setxattr_cbk (call_frame_t *frame, void *cookie,
e7a346
-                               xlator_t *this, int op_ret, int op_errno,
e7a346
-                               dict_t *xdata);
e7a346
+dht_common_mark_mdsxattr_cbk (call_frame_t *frame, void *cookie,
e7a346
+                              xlator_t *this, int op_ret, int op_errno,
e7a346
+                              dict_t *xdata);
e7a346
+
e7a346
 int
e7a346
 dht_inode_ctx_mdsvol_set (inode_t *inode, xlator_t *this,
e7a346
                           xlator_t *mds_subvol);
e7a346
@@ -1476,4 +1478,8 @@ int
e7a346
 dht_inode_ctx_mdsvol_get (inode_t *inode, xlator_t *this,
e7a346
                           xlator_t **mdsvol);
e7a346
 
e7a346
+int
e7a346
+dht_selfheal_dir_setattr (call_frame_t *frame, loc_t *loc, struct iatt *stbuf,
e7a346
+                          int32_t valid, dht_layout_t *layout);
e7a346
+
e7a346
 #endif/* _DHT_H */
e7a346
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
e7a346
index 1707e08..c2c4034 100644
e7a346
--- a/xlators/cluster/dht/src/dht-selfheal.c
e7a346
+++ b/xlators/cluster/dht/src/dht-selfheal.c
e7a346
@@ -1159,141 +1159,6 @@ dht_selfheal_dir_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
         return 0;
e7a346
 }
e7a346
 
e7a346
-int
e7a346
-dht_selfheal_dir_check_set_mdsxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
-                                         int op_ret, int op_errno, dict_t *xdata)
e7a346
-{
e7a346
-        dht_local_t  *local = NULL;
e7a346
-        xlator_t     *prev  = cookie;
e7a346
-        int           ret   = -1;
e7a346
-        dht_conf_t   *conf  = 0;
e7a346
-
e7a346
-        GF_VALIDATE_OR_GOTO (this->name, frame, out);
e7a346
-        GF_VALIDATE_OR_GOTO (this->name, frame->local, out);
e7a346
-
e7a346
-        local = frame->local;
e7a346
-        conf = this->private;
e7a346
-
e7a346
-        if (op_ret) {
e7a346
-                gf_msg_debug (this->name, op_ret,
e7a346
-                              "internal mds setxattr %s is failed on mds subvol "
e7a346
-                              "at the time of heal on path %s " ,
e7a346
-                               conf->mds_xattr_key, local->loc.path);
e7a346
-        } else {
e7a346
-                /* Save mds subvol on inode ctx */
e7a346
-                ret = dht_inode_ctx_mdsvol_set (local->inode, this, prev);
e7a346
-                if (ret) {
e7a346
-                        gf_msg (this->name, GF_LOG_ERROR, 0,
e7a346
-                                DHT_MSG_SET_INODE_CTX_FAILED,
e7a346
-                                "Failed to set hashed subvol "
e7a346
-                                " %s for %s ", prev->name,
e7a346
-                                local->loc.path);
e7a346
-                }
e7a346
-        }
e7a346
-
e7a346
-out:
e7a346
-        DHT_STACK_DESTROY (frame);
e7a346
-        return 0;
e7a346
-}
e7a346
-
e7a346
-/* Code to set internal mds xattr if it is not present
e7a346
-*/
e7a346
-int
e7a346
-dht_selfheal_dir_check_set_mdsxattr (call_frame_t *frame, loc_t *loc)
e7a346
-{
e7a346
-        dht_local_t  *local          = NULL;
e7a346
-        xlator_t     *this           = NULL;
e7a346
-        xlator_t     *hashed_subvol  = NULL;
e7a346
-        int ret                      = -1;
e7a346
-        dict_t       *xattrs         = NULL;
e7a346
-        char          gfid_local[GF_UUID_BUF_SIZE] = {0,};
e7a346
-        int32_t       zero[1]        = {0};
e7a346
-        call_frame_t *xattr_frame    = NULL;
e7a346
-        dht_local_t  *copy_local     = NULL;
e7a346
-        dht_conf_t   *conf           = 0;
e7a346
-
e7a346
-        local = frame->local;
e7a346
-        this = frame->this;
e7a346
-        conf = this->private;
e7a346
-        gf_uuid_unparse(local->gfid, gfid_local);
e7a346
-
e7a346
-        if (!dict_get (local->xattr, conf->mds_xattr_key)) {
e7a346
-                /* It means no internal MDS xattr has been set yet
e7a346
-                */
e7a346
-                /* Calculate hashed subvol based on inode and
e7a346
-                   parent inode
e7a346
-                */
e7a346
-                hashed_subvol = dht_inode_get_hashed_subvol (local->inode, this,
e7a346
-                                                             loc);
e7a346
-                if (!hashed_subvol) {
e7a346
-                        gf_msg (this->name, GF_LOG_DEBUG, 0,
e7a346
-                                DHT_MSG_HASHED_SUBVOL_GET_FAILED,
e7a346
-                                "Failed to get hashed subvol for path %s"
e7a346
-                                "gfid is %s ",
e7a346
-                                local->loc.path, gfid_local);
e7a346
-                        ret = -1;
e7a346
-                        goto out;
e7a346
-                } else {
e7a346
-                        /* Set internal mds xattr on disk   */
e7a346
-                        xattrs = dict_new ();
e7a346
-                        if (!xattrs) {
e7a346
-                                gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
e7a346
-                                        DHT_MSG_NO_MEMORY, "dict_new failed");
e7a346
-                                ret = -1;
e7a346
-                                goto out;
e7a346
-                        }
e7a346
-                        /* Add internal MDS xattr on disk for hashed subvol
e7a346
-                        */
e7a346
-                        ret = dht_dict_set_array (xattrs, conf->mds_xattr_key, zero, 1);
e7a346
-                        if (ret) {
e7a346
-                                gf_msg (this->name, GF_LOG_WARNING, ENOMEM,
e7a346
-                                        DHT_MSG_DICT_SET_FAILED,
e7a346
-                                        "Failed to set dictionary"
e7a346
-                                        "  value:key = %s for "
e7a346
-                                        "path %s", conf->mds_xattr_key,
e7a346
-                                        local->loc.path);
e7a346
-                                ret = -1;
e7a346
-                                goto out;
e7a346
-                        }
e7a346
-
e7a346
-                        xattr_frame = create_frame (this, this->ctx->pool);
e7a346
-                        if (!xattr_frame) {
e7a346
-                                ret = -1;
e7a346
-                                goto out;
e7a346
-                        }
e7a346
-                        copy_local = dht_local_init (xattr_frame, &(local->loc),
e7a346
-                                                     NULL, 0);
e7a346
-                        if (!copy_local) {
e7a346
-                                ret = -1;
e7a346
-                                DHT_STACK_DESTROY (xattr_frame);
e7a346
-                                goto out;
e7a346
-                        }
e7a346
-
e7a346
-                        copy_local->stbuf = local->stbuf;
e7a346
-                        copy_local->inode = inode_ref (local->inode);
e7a346
-                        gf_uuid_copy (copy_local->loc.gfid, local->gfid);
e7a346
-
e7a346
-                        STACK_WIND_COOKIE (xattr_frame,
e7a346
-                                           dht_selfheal_dir_check_set_mdsxattr_cbk,
e7a346
-                                           (void *)hashed_subvol, hashed_subvol,
e7a346
-                                           hashed_subvol->fops->setxattr,
e7a346
-                                           loc, xattrs, 0, NULL);
e7a346
-                        ret = 0;
e7a346
-                }
e7a346
-        } else {
e7a346
-                ret = 0;
e7a346
-                gf_msg_debug (this->name, 0,
e7a346
-                              "internal xattr %s is present on subvol"
e7a346
-                              "on path %s gfid is %s " , conf->mds_xattr_key,
e7a346
-                               local->loc.path, gfid_local);
e7a346
-        }
e7a346
-
e7a346
-out:
e7a346
-        if (xattrs)
e7a346
-                dict_unref (xattrs);
e7a346
-        return ret;
e7a346
-}
e7a346
-
e7a346
 
e7a346
 int
e7a346
 dht_selfheal_dir_setattr (call_frame_t *frame, loc_t *loc, struct iatt *stbuf,
e7a346
@@ -1313,32 +1178,6 @@ dht_selfheal_dir_setattr (call_frame_t *frame, loc_t *loc, struct iatt *stbuf,
e7a346
                         missing_attr++;
e7a346
         }
e7a346
 
e7a346
-        if (!__is_root_gfid (local->stbuf.ia_gfid)) {
e7a346
-                if (local->need_xattr_heal) {
e7a346
-                        local->need_xattr_heal = 0;
e7a346
-                        ret =  dht_dir_xattr_heal (this, local);
e7a346
-                        if (ret)
e7a346
-                                gf_msg (this->name, GF_LOG_ERROR,
e7a346
-                                        ret,
e7a346
-                                        DHT_MSG_DIR_XATTR_HEAL_FAILED,
e7a346
-                                        "xattr heal failed for "
e7a346
-                                        "directory  %s gfid %s ",
e7a346
-                                        local->loc.path,
e7a346
-                                        local->gfid);
e7a346
-                } else {
e7a346
-                        ret = dht_selfheal_dir_check_set_mdsxattr (frame, loc);
e7a346
-                        if (ret)
e7a346
-                                gf_msg (this->name, GF_LOG_INFO, ret,
e7a346
-                                        DHT_MSG_DIR_XATTR_HEAL_FAILED,
e7a346
-                                        "set mds internal xattr failed for "
e7a346
-                                        "directory  %s gfid %s ", local->loc.path,
e7a346
-                                        local->gfid);
e7a346
-                }
e7a346
-        }
e7a346
-
e7a346
-        if (!gf_uuid_is_null (local->gfid))
e7a346
-                gf_uuid_copy (loc->gfid, local->gfid);
e7a346
-
e7a346
         if (missing_attr == 0) {
e7a346
                 if (!local->heal_layout) {
e7a346
                         gf_msg_trace (this->name, 0,
e7a346
@@ -1789,6 +1628,49 @@ dht_selfheal_dir_mkdir (call_frame_t *frame, loc_t *loc,
e7a346
         }
e7a346
 
e7a346
         if (missing_dirs == 0) {
e7a346
+                /* No directory is missing. For a non-root directory,
e7a346
+                 * heal its xattrs or mark the mds xattr before the
e7a346
+                 * attrs are healed.
e7a346
+                 */
e7a346
+                if (!__is_root_gfid (local->stbuf.ia_gfid)) {
e7a346
+                        char gfid_local[GF_UUID_BUF_SIZE] = {0,};
e7a346
+
e7a346
+                        /* local->gfid is a binary uuid; unparse it so
e7a346
+                         * the log calls below can print it with %s.
e7a346
+                         */
e7a346
+                        gf_uuid_unparse (local->gfid, gfid_local);
e7a346
+
e7a346
+                        if (local->need_xattr_heal) {
e7a346
+                                local->need_xattr_heal = 0;
e7a346
+                                ret =  dht_dir_xattr_heal (this, local);
e7a346
+                                if (ret)
e7a346
+                                        gf_msg (this->name, GF_LOG_ERROR,
e7a346
+                                                ret,
e7a346
+                                                DHT_MSG_DIR_XATTR_HEAL_FAILED,
e7a346
+                                                "xattr heal failed for "
e7a346
+                                                "directory  %s gfid %s ",
e7a346
+                                                local->loc.path,
e7a346
+                                                gfid_local);
e7a346
+                        } else {
e7a346
+                                if (!gf_uuid_is_null (local->gfid))
e7a346
+                                        gf_uuid_copy (loc->gfid, local->gfid);
e7a346
+
e7a346
+                                /* On success return here without calling
e7a346
+                                 * dht_selfheal_dir_setattr; presumably the
e7a346
+                                 * mark-mdsxattr callback continues the
e7a346
+                                 * heal - TODO confirm in dht-common.c.
e7a346
+                                 */
e7a346
+                                ret = dht_common_mark_mdsxattr (frame, NULL, 0);
e7a346
+                                if (!ret)
e7a346
+                                        return 0;
e7a346
+
e7a346
+                                gf_msg (this->name, GF_LOG_INFO, 0,
e7a346
+                                        DHT_MSG_DIR_XATTR_HEAL_FAILED,
e7a346
+                                        "Failed to set mds xattr "
e7a346
+                                        "for directory  %s gfid %s ",
e7a346
+                                        local->loc.path, gfid_local);
e7a346
+                        }
e7a346
+                }
e7a346
                 dht_selfheal_dir_setattr (frame, loc, &local->stbuf,
e7a346
                                           0xffffffff, layout);
e7a346
                 return 0;
e7a346
-- 
e7a346
1.8.3.1
e7a346