7c2869
From 590cf967946fd5195876adb1ab449fd2242b03ed Mon Sep 17 00:00:00 2001
7c2869
From: Krutika Dhananjay <kdhananj@redhat.com>
7c2869
Date: Wed, 6 Dec 2017 16:55:33 +0530
7c2869
Subject: [PATCH 666/675] features/shard: Upon FSYNC from upper layers, wind
7c2869
 fsync on all changed shards
7c2869
7c2869
> Upstream: https://review.gluster.org/19566
7c2869
> BUG: 1468483
7c2869
> Change-Id: Ib74354f57a18569762ad45a51f182822a2537421
7c2869
7c2869
Change-Id: Ib74354f57a18569762ad45a51f182822a2537421
7c2869
BUG: 1583462
7c2869
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
7c2869
Reviewed-on: https://code.engineering.redhat.com/gerrit/140382
7c2869
Tested-by: RHGS Build Bot <nigelb@redhat.com>
7c2869
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
7c2869
---
7c2869
 tests/bugs/shard/bug-1468483.t               |  58 +++
7c2869
 tests/bugs/shard/shard-inode-refcount-test.t |   2 +-
7c2869
 xlators/features/shard/src/shard-messages.h  |   9 +-
7c2869
 xlators/features/shard/src/shard.c           | 534 +++++++++++++++++++++++++--
7c2869
 xlators/features/shard/src/shard.h           |   6 +
7c2869
 5 files changed, 570 insertions(+), 39 deletions(-)
7c2869
 create mode 100644 tests/bugs/shard/bug-1468483.t
7c2869
7c2869
diff --git a/tests/bugs/shard/bug-1468483.t b/tests/bugs/shard/bug-1468483.t
7c2869
new file mode 100644
7c2869
index 0000000..e462b8d
7c2869
--- /dev/null
7c2869
+++ b/tests/bugs/shard/bug-1468483.t
7c2869
@@ -0,0 +1,58 @@
7c2869
+#!/bin/bash
7c2869
+
7c2869
+. $(dirname $0)/../../include.rc
7c2869
+. $(dirname $0)/../../volume.rc
7c2869
+. $(dirname $0)/../../common-utils.rc
7c2869
+
7c2869
+cleanup
7c2869
+
7c2869
+TEST glusterd
7c2869
+TEST pidof glusterd
7c2869
+TEST $CLI volume create $V0 $H0:$B0/${V0}0
7c2869
+TEST $CLI volume set $V0 performance.write-behind off
7c2869
+TEST $CLI volume set $V0 features.shard on
7c2869
+TEST $CLI volume set $V0 features.shard-block-size 16MB
7c2869
+TEST $CLI volume start $V0
7c2869
+TEST $CLI volume profile $V0 start
7c2869
+
7c2869
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
7c2869
+TEST dd if=/dev/zero conv=fsync of=$M0/foo bs=1M count=100
7c2869
+
7c2869
+#This should ensure /.shard is created on the bricks.
7c2869
+TEST stat $B0/${V0}0/.shard
7c2869
+
7c2869
+gfid_foo=$(get_gfid_string $M0/foo)
7c2869
+
7c2869
+TEST stat $B0/${V0}0/.shard/$gfid_foo.1
7c2869
+TEST stat $B0/${V0}0/.shard/$gfid_foo.2
7c2869
+TEST stat $B0/${V0}0/.shard/$gfid_foo.3
7c2869
+TEST stat $B0/${V0}0/.shard/$gfid_foo.4
7c2869
+TEST stat $B0/${V0}0/.shard/$gfid_foo.5
7c2869
+TEST stat $B0/${V0}0/.shard/$gfid_foo.6
7c2869
+
7c2869
+# For a file with 7 shards, there should be 7 fsyncs on the brick. Without this
7c2869
+# fix, I was seeing only 1 fsync (on the base shard alone).
7c2869
+
7c2869
+EXPECT "7" echo `$CLI volume profile $V0 info incremental | grep -w FSYNC | awk '{print $8}'`
7c2869
+
7c2869
+useradd -M test_user 2>/dev/null
7c2869
+
7c2869
+TEST touch $M0/bar
7c2869
+
7c2869
+# Change ownership to non-root on bar.
7c2869
+TEST chown test_user:test_user $M0/bar
7c2869
+
7c2869
+TEST $CLI volume profile $V0 stop
7c2869
+TEST $CLI volume profile $V0 start
7c2869
+
7c2869
+# Write 100M of data on bar as non-root.
7c2869
+TEST run_cmd_as_user test_user "dd if=/dev/zero conv=fsync of=$M0/bar bs=1M count=100"
7c2869
+
7c2869
+EXPECT "7" echo `$CLI volume profile $V0 info incremental | grep -w FSYNC | awk '{print $8}'`
7c2869
+
7c2869
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
7c2869
+userdel test_user
7c2869
+TEST $CLI volume stop $V0
7c2869
+TEST $CLI volume delete $V0
7c2869
+
7c2869
+cleanup
7c2869
diff --git a/tests/bugs/shard/shard-inode-refcount-test.t b/tests/bugs/shard/shard-inode-refcount-test.t
7c2869
index 6358097..03e0cc9 100644
7c2869
--- a/tests/bugs/shard/shard-inode-refcount-test.t
7c2869
+++ b/tests/bugs/shard/shard-inode-refcount-test.t
7c2869
@@ -14,7 +14,7 @@ TEST $CLI volume start $V0
7c2869
 
7c2869
 TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0
7c2869
 
7c2869
-TEST dd if=/dev/zero of=$M0/one-plus-five-shards bs=1M count=23
7c2869
+TEST dd if=/dev/zero conv=fsync of=$M0/one-plus-five-shards bs=1M count=23
7c2869
 
7c2869
 ACTIVE_INODES_BEFORE=$(get_mount_active_size_value $V0)
7c2869
 TEST rm -f $M0/one-plus-five-shards
7c2869
diff --git a/xlators/features/shard/src/shard-messages.h b/xlators/features/shard/src/shard-messages.h
7c2869
index 588cb68..8e61630 100644
7c2869
--- a/xlators/features/shard/src/shard-messages.h
7c2869
+++ b/xlators/features/shard/src/shard-messages.h
7c2869
@@ -40,7 +40,7 @@
7c2869
  */
7c2869
 
7c2869
 #define GLFS_COMP_BASE_SHARD      GLFS_MSGID_COMP_SHARD
7c2869
-#define GLFS_NUM_MESSAGES         18
7c2869
+#define GLFS_NUM_MESSAGES         19
7c2869
 #define GLFS_MSGID_END          (GLFS_COMP_BASE_SHARD + GLFS_NUM_MESSAGES + 1)
7c2869
 
7c2869
 #define glfs_msg_start_x GLFS_COMP_BASE_SHARD, "Invalid: Start of messages"
7c2869
@@ -180,5 +180,12 @@
7c2869
 */
7c2869
 #define SHARD_MSG_INVALID_FOP                        (GLFS_COMP_BASE_SHARD + 18)
7c2869
 
7c2869
+/*!
7c2869
+ * @messageid 133019
7c2869
+ * @diagnosis
7c2869
+ * @recommendedaction
7c2869
+*/
7c2869
+#define SHARD_MSG_MEMALLOC_FAILED                    (GLFS_COMP_BASE_SHARD + 19)
7c2869
+
7c2869
 #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"
7c2869
 #endif /* !_SHARD_MESSAGES_H_ */
7c2869
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
7c2869
index c57a426..68d1a3a 100644
7c2869
--- a/xlators/features/shard/src/shard.c
7c2869
+++ b/xlators/features/shard/src/shard.c
7c2869
@@ -76,6 +76,7 @@ __shard_inode_ctx_get (inode_t *inode, xlator_t *this, shard_inode_ctx_t **ctx)
7c2869
                 return ret;
7c2869
 
7c2869
         INIT_LIST_HEAD (&ctx_p->ilist);
7c2869
+        INIT_LIST_HEAD (&ctx_p->to_fsync_list);
7c2869
 
7c2869
         ret = __inode_ctx_set (inode, this, (uint64_t *)&ctx_p);
7c2869
         if (ret < 0) {
7c2869
@@ -205,6 +206,65 @@ shard_inode_ctx_set_refreshed_flag (inode_t *inode, xlator_t *this)
7c2869
         return ret;
7c2869
 }
7c2869
 
7c2869
+int
7c2869
+__shard_inode_ctx_add_to_fsync_list (inode_t *base_inode, xlator_t *this,
7c2869
+                                     inode_t *shard_inode)
7c2869
+{
7c2869
+        int                 ret           = -1;
7c2869
+        shard_inode_ctx_t  *base_ictx     = NULL;
7c2869
+        shard_inode_ctx_t  *shard_ictx    = NULL;
7c2869
+
7c2869
+        ret = __shard_inode_ctx_get (base_inode, this, &base_ictx);
7c2869
+        if (ret)
7c2869
+                return ret;
7c2869
+
7c2869
+        ret = __shard_inode_ctx_get (shard_inode, this, &shard_ictx);
7c2869
+        if (ret)
7c2869
+                return ret;
7c2869
+
7c2869
+        if (shard_ictx->fsync_needed) {
7c2869
+                shard_ictx->fsync_needed++;
7c2869
+                return 1;
7c2869
+        }
7c2869
+
7c2869
+        list_add_tail (&shard_ictx->to_fsync_list, &base_ictx->to_fsync_list);
7c2869
+        shard_ictx->inode = shard_inode;
7c2869
+        shard_ictx->fsync_needed++;
7c2869
+        base_ictx->fsync_count++;
7c2869
+        shard_ictx->base_inode = base_inode;
7c2869
+
7c2869
+        return 0;
7c2869
+}
7c2869
+
7c2869
+int
7c2869
+shard_inode_ctx_add_to_fsync_list (inode_t *base_inode, xlator_t *this,
7c2869
+                                   inode_t *shard_inode)
7c2869
+{
7c2869
+        int ret = -1;
7c2869
+
7c2869
+        /* This ref acts as a refkeepr on the base inode. We
7c2869
+         * need to keep this inode alive as it holds the head
7c2869
+         * of the to_fsync_list.
7c2869
+         */
7c2869
+        inode_ref (base_inode);
7c2869
+
7c2869
+        LOCK (&base_inode->lock);
7c2869
+        LOCK (&shard_inode->lock);
7c2869
+        {
7c2869
+                ret = __shard_inode_ctx_add_to_fsync_list (base_inode, this,
7c2869
+                                                           shard_inode);
7c2869
+        }
7c2869
+        UNLOCK (&shard_inode->lock);
7c2869
+        UNLOCK (&base_inode->lock);
7c2869
+
7c2869
+        /* Unref the base inode corresponding to the ref above, if the shard is
7c2869
+         * found to be already part of the fsync list.
7c2869
+         */
7c2869
+        if (ret != 0)
7c2869
+                inode_unref (base_inode);
7c2869
+        return ret;
7c2869
+}
7c2869
+
7c2869
 gf_boolean_t
7c2869
 __shard_inode_ctx_needs_lookup (inode_t *inode, xlator_t *this)
7c2869
 {
7c2869
@@ -301,6 +361,40 @@ shard_inode_ctx_get_block_size (inode_t *inode, xlator_t *this,
7c2869
 }
7c2869
 
7c2869
 int
7c2869
+__shard_inode_ctx_get_fsync_count (inode_t *inode, xlator_t *this,
7c2869
+                                   int *fsync_count)
7c2869
+{
7c2869
+        int                 ret      = -1;
7c2869
+        uint64_t            ctx_uint = 0;
7c2869
+        shard_inode_ctx_t  *ctx      = NULL;
7c2869
+
7c2869
+        ret = __inode_ctx_get (inode, this, &ctx_uint);
7c2869
+        if (ret < 0)
7c2869
+                return ret;
7c2869
+
7c2869
+        ctx = (shard_inode_ctx_t *) ctx_uint;
7c2869
+
7c2869
+        *fsync_count = ctx->fsync_needed;
7c2869
+
7c2869
+        return 0;
7c2869
+}
7c2869
+
7c2869
+int
7c2869
+shard_inode_ctx_get_fsync_count (inode_t *inode, xlator_t *this,
7c2869
+                                int *fsync_count)
7c2869
+{
7c2869
+        int ret = -1;
7c2869
+
7c2869
+        LOCK (&inode->lock);
7c2869
+        {
7c2869
+                ret = __shard_inode_ctx_get_fsync_count (inode, this,
7c2869
+                                                         fsync_count);
7c2869
+        }
7c2869
+        UNLOCK (&inode->lock);
7c2869
+
7c2869
+        return ret;
7c2869
+}
7c2869
+int
7c2869
 __shard_inode_ctx_get_all (inode_t *inode, xlator_t *this,
7c2869
                            shard_inode_ctx_t *ctx_out)
7c2869
 {
7c2869
@@ -482,15 +576,19 @@ out:
7c2869
         return ret;
7c2869
 }
7c2869
 
7c2869
-void
7c2869
+inode_t *
7c2869
 __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this,
7c2869
                                   inode_t *base_inode, int block_num)
7c2869
 {
7c2869
-        char                block_bname[256] = {0,};
7c2869
-        inode_t            *lru_inode        = NULL;
7c2869
-        shard_priv_t       *priv             = NULL;
7c2869
-        shard_inode_ctx_t  *ctx              = NULL;
7c2869
-        shard_inode_ctx_t  *lru_inode_ctx    = NULL;
7c2869
+        char                block_bname[256]     = {0,};
7c2869
+        inode_t            *lru_inode            = NULL;
7c2869
+        shard_priv_t       *priv                 = NULL;
7c2869
+        shard_inode_ctx_t  *ctx                  = NULL;
7c2869
+        shard_inode_ctx_t  *lru_inode_ctx        = NULL;
7c2869
+        shard_inode_ctx_t  *lru_base_inode_ctx   = NULL;
7c2869
+        inode_t            *fsync_inode          = NULL;
7c2869
+        inode_t            *lru_base_inode       = NULL;
7c2869
+        gf_boolean_t        do_fsync             = _gf_false;
7c2869
 
7c2869
         priv = this->private;
7c2869
 
7c2869
@@ -510,6 +608,7 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this,
7c2869
                         ctx->block_num = block_num;
7c2869
                         list_add_tail (&ctx->ilist, &priv->ilist_head);
7c2869
                         priv->inode_count++;
7c2869
+                        ctx->base_inode = base_inode;
7c2869
                 } else {
7c2869
                 /*If on the other hand there is no available slot for this inode
7c2869
                  * in the list, delete the lru inode from the head of the list,
7c2869
@@ -519,30 +618,56 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this,
7c2869
                                                           shard_inode_ctx_t,
7c2869
                                                           ilist);
7c2869
                         GF_ASSERT (lru_inode_ctx->block_num > 0);
7c2869
+                        lru_base_inode = lru_inode_ctx->base_inode;
7c2869
                         list_del_init (&lru_inode_ctx->ilist);
7c2869
                         lru_inode = inode_find (linked_inode->table,
7c2869
                                                 lru_inode_ctx->stat.ia_gfid);
7c2869
-                        shard_make_block_bname (lru_inode_ctx->block_num,
7c2869
-                                                lru_inode_ctx->base_gfid,
7c2869
-                                                block_bname,
7c2869
-                                                sizeof (block_bname));
7c2869
-                        inode_unlink (lru_inode, priv->dot_shard_inode,
7c2869
-                                      block_bname);
7c2869
-                        /* The following unref corresponds to the ref held by
7c2869
-                         * inode_find() above.
7c2869
+                        /* If the lru inode was part of the pending-fsync list,
7c2869
+                         * the base inode needs to be unref'd, the lru inode
7c2869
+                         * deleted from fsync list and fsync'd in a new frame,
7c2869
+                         * and then unlinked in memory and forgotten.
7c2869
                          */
7c2869
-                        inode_unref (lru_inode);
7c2869
+                        LOCK (&lru_base_inode->lock);
7c2869
+                        LOCK (&lru_inode->lock);
7c2869
+                        {
7c2869
+                                if (!list_empty(&lru_inode_ctx->to_fsync_list)) {
7c2869
+                                        list_del_init (&lru_inode_ctx->to_fsync_list);
7c2869
+                                        lru_inode_ctx->fsync_needed = 0;
7c2869
+                                        do_fsync = _gf_true;
7c2869
+                                        __shard_inode_ctx_get (lru_base_inode, this, &lru_base_inode_ctx);
7c2869
+                                        lru_base_inode_ctx->fsync_count--;
7c2869
+                                }
7c2869
+                        }
7c2869
+                        UNLOCK (&lru_inode->lock);
7c2869
+                        UNLOCK (&lru_base_inode->lock);
7c2869
+
7c2869
+                        if (!do_fsync) {
7c2869
+                                shard_make_block_bname (lru_inode_ctx->block_num,
7c2869
+                                                        lru_inode_ctx->base_gfid,
7c2869
+                                                        block_bname,
7c2869
+                                                        sizeof (block_bname));
7c2869
                         /* The following unref corresponds to the ref held at
7c2869
-                         * the time the shard was created or looked up
7c2869
+                         * the time the shard was added to the lru list.
7c2869
+                         */
7c2869
+                                inode_unref (lru_inode);
7c2869
+                                inode_unlink (lru_inode, priv->dot_shard_inode,
7c2869
+                                              block_bname);
7c2869
+                                inode_forget (lru_inode, 0);
7c2869
+                        } else {
7c2869
+                                fsync_inode = lru_inode;
7c2869
+                                inode_unref (lru_base_inode);
7c2869
+                        }
7c2869
+                        /* The following unref corresponds to the ref
7c2869
+                         * held by inode_find() above.
7c2869
                          */
7c2869
                         inode_unref (lru_inode);
7c2869
-                        inode_forget (lru_inode, 0);
7c2869
                         /* For as long as an inode is in lru list, we try to
7c2869
                          * keep it alive by holding a ref on it.
7c2869
                          */
7c2869
                         inode_ref (linked_inode);
7c2869
                         gf_uuid_copy (ctx->base_gfid, base_inode->gfid);
7c2869
                         ctx->block_num = block_num;
7c2869
+                        ctx->base_inode = base_inode;
7c2869
                         list_add_tail (&ctx->ilist, &priv->ilist_head);
7c2869
                 }
7c2869
         } else {
7c2869
@@ -551,6 +676,7 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this,
7c2869
          */
7c2869
                 list_move_tail (&ctx->ilist, &priv->ilist_head);
7c2869
         }
7c2869
+        return fsync_inode;
7c2869
 }
7c2869
 
7c2869
 int
7c2869
@@ -617,6 +743,85 @@ shard_common_inode_write_success_unwind (glusterfs_fop_t fop,
7c2869
 }
7c2869
 
7c2869
 int
7c2869
+shard_evicted_inode_fsync_cbk (call_frame_t *frame, void *cookie,
7c2869
+                               xlator_t *this, int32_t op_ret, int32_t op_errno,
7c2869
+                               struct iatt *prebuf, struct iatt *postbuf,
7c2869
+                               dict_t *xdata)
7c2869
+{
7c2869
+        char                  block_bname[256] = {0,};
7c2869
+        fd_t                 *anon_fd          = cookie;
7c2869
+        inode_t              *shard_inode      = NULL;
7c2869
+        shard_inode_ctx_t    *ctx              = NULL;
7c2869
+        shard_priv_t         *priv             = NULL;
7c2869
+
7c2869
+        priv = this->private;
7c2869
+        shard_inode = anon_fd->inode;
7c2869
+
7c2869
+        if (op_ret < 0) {
7c2869
+                gf_msg (this->name, GF_LOG_WARNING, op_errno,
7c2869
+                        SHARD_MSG_MEMALLOC_FAILED, "fsync failed on shard");
7c2869
+                goto out;
7c2869
+        }
7c2869
+
7c2869
+        LOCK (&priv->lock);
7c2869
+        LOCK(&shard_inode->lock);
7c2869
+        {
7c2869
+                __shard_inode_ctx_get (shard_inode, this, &ctx);
7c2869
+                if ((list_empty(&ctx->to_fsync_list)) &&
7c2869
+                    (list_empty(&ctx->ilist))) {
7c2869
+                        shard_make_block_bname (ctx->block_num,
7c2869
+                                                shard_inode->gfid, block_bname,
7c2869
+                                                sizeof (block_bname));
7c2869
+                        inode_unlink (shard_inode, priv->dot_shard_inode,
7c2869
+                                      block_bname);
7c2869
+                        /* The following unref corresponds to the ref held by
7c2869
+                         * inode_link() at the time the shard was created or
7c2869
+                         * looked up
7c2869
+                         */
7c2869
+                        inode_unref (shard_inode);
7c2869
+                        inode_forget (shard_inode, 0);
7c2869
+                }
7c2869
+        }
7c2869
+        UNLOCK(&shard_inode->lock);
7c2869
+        UNLOCK(&priv->lock);
7c2869
+
7c2869
+out:
7c2869
+        if (anon_fd)
7c2869
+                fd_unref (anon_fd);
7c2869
+        STACK_DESTROY (frame->root);
7c2869
+        return 0;
7c2869
+}
7c2869
+
7c2869
+int
7c2869
+shard_initiate_evicted_inode_fsync (xlator_t *this, inode_t *inode)
7c2869
+{
7c2869
+        fd_t             *anon_fd     = NULL;
7c2869
+        call_frame_t     *fsync_frame = NULL;
7c2869
+
7c2869
+        fsync_frame = create_frame (this, this->ctx->pool);
7c2869
+        if (!fsync_frame) {
7c2869
+                gf_msg (this->name, GF_LOG_WARNING, ENOMEM,
7c2869
+                        SHARD_MSG_MEMALLOC_FAILED, "Failed to create new frame "
7c2869
+                        "to fsync shard");
7c2869
+                return -1;
7c2869
+        }
7c2869
+
7c2869
+        anon_fd = fd_anonymous (inode);
7c2869
+        if (!anon_fd) {
7c2869
+                gf_msg (this->name, GF_LOG_WARNING, ENOMEM,
7c2869
+                        SHARD_MSG_MEMALLOC_FAILED, "Failed to create anon fd to"
7c2869
+                        " fsync shard");
7c2869
+                STACK_DESTROY (fsync_frame->root);
7c2869
+                return -1;
7c2869
+        }
7c2869
+
7c2869
+        STACK_WIND_COOKIE (fsync_frame, shard_evicted_inode_fsync_cbk, anon_fd,
7c2869
+                           FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync,
7c2869
+                           anon_fd, 1, NULL);
7c2869
+        return 0;
7c2869
+}
7c2869
+
7c2869
+int
7c2869
 shard_common_resolve_shards (call_frame_t *frame, xlator_t *this,
7c2869
                              shard_post_resolve_fop_handler_t post_res_handler)
7c2869
 {
7c2869
@@ -625,6 +830,7 @@ shard_common_resolve_shards (call_frame_t *frame, xlator_t *this,
7c2869
         char                  path[PATH_MAX] = {0,};
7c2869
         inode_t              *inode          = NULL;
7c2869
         inode_t              *res_inode      = NULL;
7c2869
+        inode_t              *fsync_inode    = NULL;
7c2869
         shard_priv_t         *priv           = NULL;
7c2869
         shard_local_t        *local          = NULL;
7c2869
 
7c2869
@@ -661,20 +867,22 @@ shard_common_resolve_shards (call_frame_t *frame, xlator_t *this,
7c2869
                          */
7c2869
                         LOCK(&priv->lock);
7c2869
                         {
7c2869
-                                __shard_update_shards_inode_list (inode, this,
7c2869
+                                fsync_inode = __shard_update_shards_inode_list (inode,
7c2869
+                                                                  this,
7c2869
                                                                   res_inode,
7c2869
                                                                 shard_idx_iter);
7c2869
                         }
7c2869
                         UNLOCK(&priv->lock);
7c2869
                         shard_idx_iter++;
7c2869
-
7c2869
+                        if (fsync_inode)
7c2869
+                                shard_initiate_evicted_inode_fsync (this,
7c2869
+                                                                    fsync_inode);
7c2869
                          continue;
7c2869
                 } else {
7c2869
                         local->call_count++;
7c2869
                         shard_idx_iter++;
7c2869
                 }
7c2869
         }
7c2869
-
7c2869
 out:
7c2869
         post_res_handler (frame, this);
7c2869
         return 0;
7c2869
@@ -1657,6 +1865,7 @@ shard_link_block_inode (shard_local_t *local, int block_num, inode_t *inode,
7c2869
         char            block_bname[256] = {0,};
7c2869
         inode_t        *linked_inode     = NULL;
7c2869
         xlator_t       *this             = NULL;
7c2869
+        inode_t        *fsync_inode      = NULL;
7c2869
         shard_priv_t   *priv             = NULL;
7c2869
 
7c2869
         this = THIS;
7c2869
@@ -1674,10 +1883,14 @@ shard_link_block_inode (shard_local_t *local, int block_num, inode_t *inode,
7c2869
 
7c2869
         LOCK(&priv->lock);
7c2869
         {
7c2869
-                __shard_update_shards_inode_list (linked_inode, this,
7c2869
-                                                  local->loc.inode, block_num);
7c2869
+                fsync_inode = __shard_update_shards_inode_list (linked_inode,
7c2869
+                                                                this,
7c2869
+                                                                local->loc.inode,
7c2869
+                                                                block_num);
7c2869
         }
7c2869
         UNLOCK(&priv->lock);
7c2869
+        if (fsync_inode)
7c2869
+                shard_initiate_evicted_inode_fsync (this, fsync_inode);
7c2869
 }
7c2869
 
7c2869
 int
7c2869
@@ -2120,6 +2333,7 @@ shard_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
7c2869
         local->xattr_req = (xdata) ? dict_ref (xdata) : dict_new ();
7c2869
         if (!local->xattr_req)
7c2869
                 goto err;
7c2869
+        local->resolver_base_inode = loc->inode;
7c2869
 
7c2869
         shard_lookup_base_file (frame, this, &local->loc,
7c2869
                                 shard_post_lookup_truncate_handler);
7c2869
@@ -2172,6 +2386,7 @@ shard_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
7c2869
 
7c2869
         local->loc.inode = inode_ref (fd->inode);
7c2869
         gf_uuid_copy (local->loc.gfid, fd->inode->gfid);
7c2869
+        local->resolver_base_inode = fd->inode;
7c2869
 
7c2869
         shard_lookup_base_file (frame, this, &local->loc,
7c2869
                                 shard_post_lookup_truncate_handler);
7c2869
@@ -2509,32 +2724,48 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num)
7c2869
 {
7c2869
         char                  block_bname[256]  = {0,};
7c2869
         inode_t              *inode             = NULL;
7c2869
+        inode_t              *base_inode        = NULL;
7c2869
         xlator_t             *this              = NULL;
7c2869
         shard_priv_t         *priv              = NULL;
7c2869
         shard_inode_ctx_t    *ctx               = NULL;
7c2869
+        shard_inode_ctx_t    *base_ictx         = NULL;
7c2869
+        gf_boolean_t          unlink_unref_forget = _gf_false;
7c2869
 
7c2869
         this = THIS;
7c2869
         priv = this->private;
7c2869
 
7c2869
         inode = local->inode_list[shard_block_num - local->first_block];
7c2869
+        base_inode = local->resolver_base_inode;
7c2869
 
7c2869
         shard_make_block_bname (shard_block_num, (local->loc.inode)->gfid,
7c2869
                                 block_bname, sizeof (block_bname));
7c2869
 
7c2869
         LOCK(&priv->lock);
7c2869
+        LOCK(&base_inode->lock);
7c2869
+        LOCK(&inode->lock);
7c2869
         {
7c2869
-                shard_inode_ctx_get (inode, this, &ctx);
7c2869
+                __shard_inode_ctx_get (inode, this, &ctx);
7c2869
                 if (!list_empty (&ctx->ilist)) {
7c2869
                         list_del_init (&ctx->ilist);
7c2869
                         priv->inode_count--;
7c2869
                         GF_ASSERT (priv->inode_count >= 0);
7c2869
-                        inode_unlink (inode, priv->dot_shard_inode, block_bname);
7c2869
-                        inode_unref (inode);
7c2869
-                        inode_forget (inode, 0);
7c2869
+                        unlink_unref_forget = _gf_true;
7c2869
+                }
7c2869
+                if (ctx->fsync_needed) {
7c2869
+                        inode_unref (base_inode);
7c2869
+                        list_del_init (&ctx->to_fsync_list);
7c2869
+                        __shard_inode_ctx_get (base_inode, this, &base_ictx);
7c2869
+                        base_ictx->fsync_count--;
7c2869
                 }
7c2869
         }
7c2869
+        UNLOCK(&inode->lock);
7c2869
+        UNLOCK(&base_inode->lock);
7c2869
+        if (unlink_unref_forget) {
7c2869
+                inode_unlink (inode, priv->dot_shard_inode, block_bname);
7c2869
+                inode_unref (inode);
7c2869
+                inode_forget (inode, 0);
7c2869
+        }
7c2869
         UNLOCK(&priv->lock);
7c2869
-
7c2869
 }
7c2869
 
7c2869
 int
7c2869
@@ -2755,6 +2986,7 @@ shard_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
7c2869
         local->xflag = xflag;
7c2869
         local->xattr_req = (xdata) ? dict_ref (xdata) : dict_new ();
7c2869
         local->block_size = block_size;
7c2869
+        local->resolver_base_inode = loc->inode;
7c2869
         local->fop = GF_FOP_UNLINK;
7c2869
         if (!this->itable)
7c2869
                 this->itable = (local->loc.inode)->table;
7c2869
@@ -2991,6 +3223,7 @@ shard_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
7c2869
         frame->local = local;
7c2869
         loc_copy (&local->loc, oldloc);
7c2869
         loc_copy (&local->loc2, newloc);
7c2869
+        local->resolver_base_inode = newloc->inode;
7c2869
         local->fop = GF_FOP_RENAME;
7c2869
         local->xattr_req = (xdata) ? dict_ref (xdata) : dict_new();
7c2869
         if (!local->xattr_req)
7c2869
@@ -3757,6 +3990,10 @@ shard_common_inode_write_do_cbk (call_frame_t *frame, void *cookie,
7c2869
                         local->delta_size += (post->ia_size - pre->ia_size);
7c2869
                         shard_inode_ctx_set (local->fd->inode, this, post, 0,
7c2869
                                              SHARD_MASK_TIMES);
7c2869
+                        if (local->fd->inode != anon_fd->inode)
7c2869
+                                shard_inode_ctx_add_to_fsync_list (local->fd->inode,
7c2869
+                                                                   this,
7c2869
+                                                                   anon_fd->inode);
7c2869
                 }
7c2869
         }
7c2869
         UNLOCK (&frame->lock);
7c2869
@@ -4207,18 +4444,199 @@ shard_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
7c2869
 }
7c2869
 
7c2869
 int
7c2869
-shard_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
7c2869
-                 int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
7c2869
-                 struct iatt *postbuf, dict_t *xdata)
7c2869
+__shard_get_timestamps_from_inode_ctx (shard_local_t *local, inode_t *inode,
7c2869
+                                       xlator_t *this)
7c2869
 {
7c2869
-        if (op_ret < 0)
7c2869
+        int                   ret      = -1;
7c2869
+        uint64_t              ctx_uint = 0;
7c2869
+        shard_inode_ctx_t    *ctx      = NULL;
7c2869
+
7c2869
+        ret = __inode_ctx_get (inode, this, &ctx_uint);
7c2869
+        if (ret < 0)
7c2869
+                return ret;
7c2869
+
7c2869
+        ctx = (shard_inode_ctx_t *) ctx_uint;
7c2869
+
7c2869
+        local->postbuf.ia_ctime = ctx->stat.ia_ctime;
7c2869
+        local->postbuf.ia_ctime_nsec = ctx->stat.ia_ctime_nsec;
7c2869
+        local->postbuf.ia_atime = ctx->stat.ia_atime;
7c2869
+        local->postbuf.ia_atime_nsec = ctx->stat.ia_atime_nsec;
7c2869
+        local->postbuf.ia_mtime = ctx->stat.ia_mtime;
7c2869
+        local->postbuf.ia_mtime_nsec = ctx->stat.ia_mtime_nsec;
7c2869
+
7c2869
+        return 0;
7c2869
+}
7c2869
+
7c2869
+int
7c2869
+shard_get_timestamps_from_inode_ctx (shard_local_t *local, inode_t *inode,
7c2869
+                                     xlator_t *this)
7c2869
+{
7c2869
+        int ret = 0;
7c2869
+
7c2869
+        LOCK (&inode->lock);
7c2869
+        {
7c2869
+                ret = __shard_get_timestamps_from_inode_ctx (local, inode,
7c2869
+                                                             this);
7c2869
+        }
7c2869
+        UNLOCK (&inode->lock);
7c2869
+
7c2869
+        return ret;
7c2869
+}
7c2869
+
7c2869
+int
7c2869
+shard_fsync_shards_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
7c2869
+                       int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
7c2869
+                       struct iatt *postbuf, dict_t *xdata)
7c2869
+{
7c2869
+        int                    call_count  = 0;
7c2869
+        uint64_t               fsync_count = 0;
7c2869
+        fd_t                  *anon_fd     = cookie;
7c2869
+        shard_local_t         *local       = NULL;
7c2869
+        shard_inode_ctx_t     *ctx         = NULL;
7c2869
+        shard_inode_ctx_t     *base_ictx   = NULL;
7c2869
+        inode_t               *base_inode  = NULL;
7c2869
+
7c2869
+        local = frame->local;
7c2869
+        base_inode = local->fd->inode;
7c2869
+
7c2869
+        if (local->op_ret < 0)
7c2869
                 goto out;
7c2869
 
7c2869
-        /* To-Do: Wind fsync on all shards of the file */
7c2869
-        postbuf->ia_ctime = 0;
7c2869
+        LOCK (&frame->lock);
7c2869
+        {
7c2869
+                if (op_ret < 0) {
7c2869
+                        local->op_ret = op_ret;
7c2869
+                        local->op_errno = op_errno;
7c2869
+                        UNLOCK (&frame->lock);
7c2869
+                        goto out;
7c2869
+                }
7c2869
+                shard_inode_ctx_set (local->fd->inode, this, postbuf, 0,
7c2869
+                                     SHARD_MASK_TIMES);
7c2869
+        }
7c2869
+        UNLOCK (&frame->lock);
7c2869
+        fd_ctx_get (anon_fd, this, &fsync_count);
7c2869
 out:
7c2869
-        SHARD_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf,
7c2869
-                            xdata);
7c2869
+        if (base_inode != anon_fd->inode) {
7c2869
+                LOCK (&base_inode->lock);
7c2869
+                LOCK (&anon_fd->inode->lock);
7c2869
+                {
7c2869
+                        __shard_inode_ctx_get (anon_fd->inode, this, &ctx);
7c2869
+                        __shard_inode_ctx_get (base_inode, this, &base_ictx);
7c2869
+                        if (op_ret == 0)
7c2869
+                                ctx->fsync_needed -= fsync_count;
7c2869
+                        GF_ASSERT (ctx->fsync_needed >= 0);
7c2869
+                        list_del_init (&ctx->to_fsync_list);
7c2869
+                        if (ctx->fsync_needed != 0) {
7c2869
+                                list_add_tail (&ctx->to_fsync_list,
7c2869
+                                               &base_ictx->to_fsync_list);
7c2869
+                                base_ictx->fsync_count++;
7c2869
+                        }
7c2869
+                }
7c2869
+                UNLOCK (&anon_fd->inode->lock);
7c2869
+                UNLOCK (&base_inode->lock);
7c2869
+        }
7c2869
+        if (anon_fd)
7c2869
+                fd_unref (anon_fd);
7c2869
+
7c2869
+        call_count = shard_call_count_return (frame);
7c2869
+        if (call_count != 0)
7c2869
+                return 0;
7c2869
+
7c2869
+        if (local->op_ret < 0) {
7c2869
+                SHARD_STACK_UNWIND (fsync, frame, local->op_ret,
7c2869
+                                    local->op_errno, NULL, NULL, NULL);
7c2869
+        } else {
7c2869
+                shard_get_timestamps_from_inode_ctx (local, base_inode, this);
7c2869
+                SHARD_STACK_UNWIND (fsync, frame, local->op_ret,
7c2869
+                                    local->op_errno, &local->prebuf,
7c2869
+                                    &local->postbuf, local->xattr_rsp);
7c2869
+        }
7c2869
+        return 0;
7c2869
+}
7c2869
+
7c2869
+int
7c2869
+shard_post_lookup_fsync_handler (call_frame_t *frame, xlator_t *this)
7c2869
+{
7c2869
+        int                 ret          = 0;
7c2869
+        int                 call_count   = 0;
7c2869
+        int                 fsync_count  = 0;
7c2869
+        fd_t               *anon_fd      = NULL;
7c2869
+        inode_t            *base_inode   = NULL;
7c2869
+        shard_local_t      *local        = NULL;
7c2869
+        shard_inode_ctx_t  *ctx          = NULL;
7c2869
+        shard_inode_ctx_t  *iter         = NULL;
7c2869
+        struct list_head    copy         = {0,};
7c2869
+        shard_inode_ctx_t  *tmp          = NULL;
7c2869
+
7c2869
+        local = frame->local;
7c2869
+        base_inode = local->fd->inode;
7c2869
+        local->postbuf = local->prebuf;
7c2869
+        INIT_LIST_HEAD (&copy);
7c2869
+
7c2869
+        if (local->op_ret < 0) {
7c2869
+                SHARD_STACK_UNWIND (fsync, frame, local->op_ret,
7c2869
+                                    local->op_errno, NULL, NULL, NULL);
7c2869
+                return 0;
7c2869
+        }
7c2869
+
7c2869
+        LOCK (&base_inode->lock);
7c2869
+        {
7c2869
+                __shard_inode_ctx_get (base_inode, this, &ctx);
7c2869
+                list_splice_init (&ctx->to_fsync_list, &copy);
7c2869
+                call_count = ctx->fsync_count;
7c2869
+                ctx->fsync_count = 0;
7c2869
+        }
7c2869
+        UNLOCK (&base_inode->lock);
7c2869
+
7c2869
+        local->call_count = ++call_count;
7c2869
+
7c2869
+        /* Send fsync() on the base shard first */
7c2869
+        anon_fd = fd_ref (local->fd);
7c2869
+        STACK_WIND_COOKIE (frame, shard_fsync_shards_cbk, anon_fd,
7c2869
+                           FIRST_CHILD(this),
7c2869
+                           FIRST_CHILD(this)->fops->fsync, anon_fd,
7c2869
+                           local->datasync, local->xattr_req);
7c2869
+        call_count--;
7c2869
+        anon_fd = NULL;
7c2869
+
7c2869
+        list_for_each_entry_safe (iter, tmp, &copy, to_fsync_list) {
7c2869
+                fsync_count = 0;
7c2869
+                shard_inode_ctx_get_fsync_count (iter->inode, this,
7c2869
+                                                 &fsync_count);
7c2869
+                GF_ASSERT (fsync_count > 0);
7c2869
+                anon_fd = fd_anonymous (iter->inode);
7c2869
+                if (!anon_fd) {
7c2869
+                        local->op_ret = -1;
7c2869
+                        local->op_errno = ENOMEM;
7c2869
+                        gf_msg (this->name, GF_LOG_WARNING, ENOMEM,
7c2869
+                                SHARD_MSG_MEMALLOC_FAILED, "Failed to create "
7c2869
+                                "anon fd to fsync shard");
7c2869
+                        shard_fsync_shards_cbk (frame, (void *) (long) anon_fd,
7c2869
+                                                this, -1, ENOMEM, NULL, NULL,
7c2869
+                                                NULL);
7c2869
+                        continue;
7c2869
+                }
7c2869
+
7c2869
+                ret = fd_ctx_set (anon_fd, this, fsync_count);
7c2869
+                if (ret) {
7c2869
+                        gf_msg (this->name, GF_LOG_ERROR, 0,
7c2869
+                                SHARD_MSG_FD_CTX_SET_FAILED, "Failed to set fd "
7c2869
+                                "ctx for shard inode gfid=%s",
7c2869
+                                uuid_utoa (iter->inode->gfid));
7c2869
+                        local->op_ret = -1;
7c2869
+                        local->op_errno = ENOMEM;
7c2869
+                        shard_fsync_shards_cbk (frame, (void *) (long) anon_fd,
7c2869
+                                                this, -1, ENOMEM, NULL, NULL,
7c2869
+                                                NULL);
7c2869
+                        continue;
7c2869
+                }
7c2869
+                STACK_WIND_COOKIE (frame, shard_fsync_shards_cbk, anon_fd,
7c2869
+                                   FIRST_CHILD(this),
7c2869
+                                   FIRST_CHILD(this)->fops->fsync, anon_fd,
7c2869
+                                   local->datasync, local->xattr_req);
7c2869
+                call_count--;
7c2869
+        }
7c2869
+
7c2869
         return 0;
7c2869
 }
7c2869
 
7c2869
@@ -4226,8 +4644,50 @@ int
7c2869
 shard_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
7c2869
              dict_t *xdata)
7c2869
 {
7c2869
-        STACK_WIND (frame, shard_fsync_cbk, FIRST_CHILD(this),
7c2869
-                    FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata);
7c2869
+        int              ret        = 0;
7c2869
+        uint64_t         block_size = 0;
7c2869
+        shard_local_t   *local      = NULL;
7c2869
+
7c2869
+        ret = shard_inode_ctx_get_block_size (fd->inode, this, &block_size);
7c2869
+        if (ret) {
7c2869
+                gf_msg (this->name, GF_LOG_ERROR, 0,
7c2869
+                        SHARD_MSG_INODE_CTX_GET_FAILED, "Failed to get block "
7c2869
+                        "size for %s from its inode ctx",
7c2869
+                        uuid_utoa (fd->inode->gfid));
7c2869
+                goto err;
7c2869
+        }
7c2869
+
7c2869
+        if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) {
7c2869
+                STACK_WIND (frame, default_fsync_cbk, FIRST_CHILD(this),
7c2869
+                            FIRST_CHILD(this)->fops->fsync, fd, datasync,
7c2869
+                            xdata);
7c2869
+                return 0;
7c2869
+        }
7c2869
+
7c2869
+        if (!this->itable)
7c2869
+                this->itable = fd->inode->table;
7c2869
+
7c2869
+        local = mem_get0 (this->local_pool);
7c2869
+        if (!local)
7c2869
+                goto err;
7c2869
+
7c2869
+        frame->local = local;
7c2869
+
7c2869
+        local->fd = fd_ref (fd);
7c2869
+        local->fop = GF_FOP_FSYNC;
7c2869
+        local->datasync = datasync;
7c2869
+        local->xattr_req = (xdata) ? dict_ref (xdata) : dict_new ();
7c2869
+        if (!local->xattr_req)
7c2869
+                goto err;
7c2869
+
7c2869
+        local->loc.inode = inode_ref (fd->inode);
7c2869
+        gf_uuid_copy (local->loc.gfid, fd->inode->gfid);
7c2869
+
7c2869
+        shard_lookup_base_file (frame, this, &local->loc,
7c2869
+                                shard_post_lookup_fsync_handler);
7c2869
+        return 0;
7c2869
+err:
7c2869
+        SHARD_STACK_UNWIND (fsync, frame, -1, ENOMEM, NULL, NULL, NULL);
7c2869
         return 0;
7c2869
 }
7c2869
 
7c2869
diff --git a/xlators/features/shard/src/shard.h b/xlators/features/shard/src/shard.h
7c2869
index 7319598..75d39a1 100644
7c2869
--- a/xlators/features/shard/src/shard.h
7c2869
+++ b/xlators/features/shard/src/shard.h
7c2869
@@ -215,6 +215,7 @@ typedef struct shard_local {
7c2869
         uint32_t gid;
7c2869
         uint64_t block_size;
7c2869
         uint64_t dst_block_size;
7c2869
+        int32_t datasync;
7c2869
         off_t offset;
7c2869
         size_t total_size;
7c2869
         size_t written_size;
7c2869
@@ -270,6 +271,11 @@ typedef struct shard_inode_ctx {
7c2869
         uuid_t base_gfid;
7c2869
         int block_num;
7c2869
         gf_boolean_t refreshed;
7c2869
+        struct list_head to_fsync_list;
7c2869
+        int fsync_needed;
7c2869
+        inode_t *inode;
7c2869
+        int fsync_count;
7c2869
+        inode_t *base_inode;
7c2869
 } shard_inode_ctx_t;
7c2869
 
7c2869
 #endif /* __SHARD_H__ */
7c2869
-- 
7c2869
1.8.3.1
7c2869