887953
From a7ade5267ebaf4bf318ee2aebe48000cee583e3b Mon Sep 17 00:00:00 2001
887953
From: Krutika Dhananjay <kdhananj@redhat.com>
887953
Date: Fri, 28 Dec 2018 18:53:15 +0530
887953
Subject: [PATCH 505/506] features/shard: Fix launch of multiple synctasks for
887953
 background deletion
887953
887953
> Upstream: https://review.gluster.org/21957
887953
> BUG: 1662368
887953
> Change-Id: Ib33773d27fb4be463c7a8a5a6a4b63689705324e
887953
887953
PROBLEM:
887953
887953
When multiple sharded files are deleted in quick succession, multiple
887953
issues were observed:
887953
1. misleading logs corresponding to a sharded file where while one log
887953
   message said the shards corresponding to the file were deleted
887953
   successfully, this was followed by multiple logs suggesting the very
887953
   same operation failed. This was because of multiple synctasks
887953
   attempting to clean up shards of the same file and only one of them
887953
   succeeding (the one that gets ENTRYLK successfully), and the rest of
887953
   them logging failure.
887953
887953
2. multiple synctasks to do background deletion would be launched, one
887953
   for each deleted file but all of them could readdir entries from
887953
   .remove_me at the same time could potentially contend for ENTRYLK on
887953
   .shard for each of the entry names. This is undesirable and wasteful.
887953
887953
FIX:
887953
Background deletion will now follow a state machine. In the event that
887953
there are multiple attempts to launch synctask for background deletion,
887953
one for each file deleted, only the first task is launched. And if while
887953
this task is doing the cleanup, more attempts are made to delete other
887953
files, the state of the synctask is adjusted so that it restarts the
887953
crawl even after reaching end-of-directory to pick up any files it may
887953
have missed in the previous iteration.
887953
887953
This patch also fixes uninitialized lk-owner during syncop_entrylk()
887953
which was leading to multiple background deletion synctasks entering
887953
the critical section at the same time and leading to illegal memory access
887953
of base inode in the second syntcask after it was destroyed post shard deletion
887953
by the first synctask.
887953
887953
Change-Id: Ib33773d27fb4be463c7a8a5a6a4b63689705324e
887953
BUG: 1662059
887953
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
887953
Reviewed-on: https://code.engineering.redhat.com/gerrit/160437
887953
Tested-by: RHGS Build Bot <nigelb@redhat.com>
887953
Reviewed-by: Xavi Hernandez <xhernandez@redhat.com>
887953
---
887953
 xlators/features/shard/src/shard.c | 199 +++++++++++++++++++----------
887953
 xlators/features/shard/src/shard.h |  12 +-
887953
 2 files changed, 136 insertions(+), 75 deletions(-)
887953
887953
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
887953
index 5b72399f5..8aed1a386 100644
887953
--- a/xlators/features/shard/src/shard.c
887953
+++ b/xlators/features/shard/src/shard.c
887953
@@ -1465,16 +1465,45 @@ int
887953
 shard_start_background_deletion (xlator_t *this)
887953
 {
887953
         int              ret           = 0;
887953
+        gf_boolean_t     i_cleanup     = _gf_true;
887953
+        shard_priv_t    *priv          = NULL;
887953
         call_frame_t    *cleanup_frame = NULL;
887953
 
887953
+        priv = this->private;
887953
+
887953
+        LOCK(&priv->lock);
887953
+        {
887953
+                switch (priv->bg_del_state) {
887953
+                case SHARD_BG_DELETION_NONE:
887953
+                        i_cleanup = _gf_true;
887953
+                        priv->bg_del_state = SHARD_BG_DELETION_LAUNCHING;
887953
+                        break;
887953
+                case SHARD_BG_DELETION_LAUNCHING:
887953
+                        i_cleanup = _gf_false;
887953
+                        break;
887953
+                case SHARD_BG_DELETION_IN_PROGRESS:
887953
+                        priv->bg_del_state = SHARD_BG_DELETION_LAUNCHING;
887953
+                        i_cleanup = _gf_false;
887953
+                        break;
887953
+                default:
887953
+                        break;
887953
+                }
887953
+        }
887953
+        UNLOCK(&priv->lock);
887953
+        if (!i_cleanup)
887953
+                return 0;
887953
+
887953
         cleanup_frame = create_frame (this, this->ctx->pool);
887953
         if (!cleanup_frame) {
887953
                 gf_msg (this->name, GF_LOG_WARNING, ENOMEM,
887953
                         SHARD_MSG_MEMALLOC_FAILED, "Failed to create "
887953
                         "new frame to delete shards");
887953
-                return -ENOMEM;
887953
+                ret = -ENOMEM;
887953
+                goto err;
887953
         }
887953
 
887953
+        set_lk_owner_from_ptr(&cleanup_frame->root->lk_owner, cleanup_frame->root);
887953
+
887953
         ret = synctask_new (this->ctx->env, shard_delete_shards,
887953
                             shard_delete_shards_cbk, cleanup_frame,
887953
                             cleanup_frame);
887953
@@ -1484,7 +1513,16 @@ shard_start_background_deletion (xlator_t *this)
887953
                         "failed to create task to do background "
887953
                         "cleanup of shards");
887953
                 STACK_DESTROY (cleanup_frame->root);
887953
+                goto err;
887953
         }
887953
+        return 0;
887953
+
887953
+err:
887953
+        LOCK(&priv->lock);
887953
+        {
887953
+                priv->bg_del_state = SHARD_BG_DELETION_NONE;
887953
+        }
887953
+        UNLOCK(&priv->lock);
887953
         return ret;
887953
 }
887953
 
887953
@@ -1493,7 +1531,7 @@ shard_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
887953
                   int32_t op_ret, int32_t op_errno, inode_t *inode,
887953
                   struct iatt *buf, dict_t *xdata, struct iatt *postparent)
887953
 {
887953
-        int             ret             = 0;
887953
+        int             ret             = -1;
887953
         shard_priv_t   *priv            = NULL;
887953
         gf_boolean_t    i_start_cleanup = _gf_false;
887953
 
887953
@@ -1526,22 +1564,23 @@ shard_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
887953
 
887953
         LOCK (&priv->lock);
887953
         {
887953
-                if (priv->first_lookup == SHARD_FIRST_LOOKUP_PENDING) {
887953
-                        priv->first_lookup = SHARD_FIRST_LOOKUP_IN_PROGRESS;
887953
+                if (priv->first_lookup_done == _gf_false) {
887953
+                        priv->first_lookup_done = _gf_true;
887953
                         i_start_cleanup = _gf_true;
887953
                 }
887953
         }
887953
         UNLOCK (&priv->lock);
887953
 
887953
-        if (i_start_cleanup) {
887953
-                ret = shard_start_background_deletion (this);
887953
-                if (ret) {
887953
-                        LOCK (&priv->lock);
887953
-                        {
887953
-                                priv->first_lookup = SHARD_FIRST_LOOKUP_PENDING;
887953
-                        }
887953
-                        UNLOCK (&priv->lock);
887953
+        if (!i_start_cleanup)
887953
+                goto unwind;
887953
+
887953
+        ret = shard_start_background_deletion(this);
887953
+        if (ret < 0) {
887953
+                LOCK(&priv->lock);
887953
+                {
887953
+                        priv->first_lookup_done = _gf_false;
887953
                 }
887953
+                UNLOCK(&priv->lock);
887953
         }
887953
 unwind:
887953
         SHARD_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf,
887953
@@ -2940,9 +2979,10 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num)
887953
                 if (ctx->fsync_needed) {
887953
                         unref_base_inode++;
887953
                         list_del_init (&ctx->to_fsync_list);
887953
-                        if (base_inode)
887953
+                        if (base_inode) {
887953
                                 __shard_inode_ctx_get (base_inode, this, &base_ictx);
887953
-                        base_ictx->fsync_count--;
887953
+                                base_ictx->fsync_count--;
887953
+                        }
887953
                 }
887953
         }
887953
         UNLOCK(&inode->lock);
887953
@@ -3334,10 +3374,15 @@ shard_delete_shards_of_entry (call_frame_t *cleanup_frame, xlator_t *this,
887953
         loc.inode = inode_ref (priv->dot_shard_rm_inode);
887953
 
887953
         ret = syncop_entrylk (FIRST_CHILD(this), this->name, &loc,
887953
-                              entry->d_name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL,
887953
-                              NULL);
887953
-        if (ret)
887953
+                              entry->d_name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK,
887953
+                              NULL, NULL);
887953
+        if (ret < 0) {
887953
+                if (ret == -EAGAIN) {
887953
+                        ret = 0;
887953
+                }
887953
                 goto out;
887953
+        }
887953
+
887953
         {
887953
                 ret = __shard_delete_shards_of_entry (cleanup_frame, this,
887953
                                                       entry, inode);
887953
@@ -3352,20 +3397,6 @@ out:
887953
 int
887953
 shard_delete_shards_cbk (int ret, call_frame_t *frame, void *data)
887953
 {
887953
-        xlator_t     *this          = NULL;
887953
-        shard_priv_t *priv          = NULL;
887953
-
887953
-        this = frame->this;
887953
-        priv = this->private;
887953
-
887953
-        if (ret < 0) {
887953
-                gf_msg (this->name, GF_LOG_WARNING, -ret,
887953
-                        SHARD_MSG_SHARDS_DELETION_FAILED,
887953
-                        "Background deletion of shards failed");
887953
-                priv->first_lookup = SHARD_FIRST_LOOKUP_PENDING;
887953
-        } else {
887953
-                priv->first_lookup = SHARD_FIRST_LOOKUP_DONE;
887953
-        }
887953
         SHARD_STACK_DESTROY (frame);
887953
         return 0;
887953
 }
887953
@@ -3482,6 +3513,7 @@ shard_delete_shards (void *opaque)
887953
         gf_dirent_t      entries;
887953
         gf_dirent_t     *entry                      = NULL;
887953
         call_frame_t    *cleanup_frame              = NULL;
887953
+        gf_boolean_t done = _gf_false;
887953
 
887953
         this = THIS;
887953
         priv = this->private;
887953
@@ -3534,52 +3566,81 @@ shard_delete_shards (void *opaque)
887953
                 goto err;
887953
         }
887953
 
887953
-        while ((ret = syncop_readdirp (FIRST_CHILD(this), local->fd, 131072,
887953
-                                      offset, &entries, local->xattr_req,
887953
-                                      NULL))) {
887953
-                if (ret > 0)
887953
-                        ret = 0;
887953
-                list_for_each_entry (entry, &entries.list, list) {
887953
-                        offset = entry->d_off;
887953
-
887953
-                        if (!strcmp (entry->d_name, ".") ||
887953
-                            !strcmp (entry->d_name, ".."))
887953
-                                continue;
887953
+        for (;;) {
887953
+                offset = 0;
887953
+                LOCK(&priv->lock);
887953
+                {
887953
+                        if (priv->bg_del_state == SHARD_BG_DELETION_LAUNCHING) {
887953
+                                priv->bg_del_state = SHARD_BG_DELETION_IN_PROGRESS;
887953
+                        } else if (priv->bg_del_state == SHARD_BG_DELETION_IN_PROGRESS) {
887953
+                                priv->bg_del_state = SHARD_BG_DELETION_NONE;
887953
+                                done = _gf_true;
887953
+                        }
887953
+                }
887953
+                UNLOCK(&priv->lock);
887953
+                if (done)
887953
+                        break;
887953
+                while ((ret = syncop_readdirp (FIRST_CHILD(this), local->fd,
887953
+                                               131072, offset, &entries,
887953
+                                               local->xattr_req, NULL))) {
887953
+                        if (ret > 0)
887953
+                                ret = 0;
887953
+                        list_for_each_entry (entry, &entries.list, list) {
887953
+                                offset = entry->d_off;
887953
+
887953
+                                if (!strcmp (entry->d_name, ".") ||
887953
+                                    !strcmp (entry->d_name, ".."))
887953
+                                        continue;
887953
 
887953
-                        if (!entry->inode) {
887953
-                                ret = shard_lookup_marker_entry (this, local,
887953
-                                                                 entry);
887953
-                                if (ret < 0)
887953
+                                if (!entry->inode) {
887953
+                                        ret = shard_lookup_marker_entry (this,
887953
+                                                                         local,
887953
+                                                                         entry);
887953
+                                        if (ret < 0)
887953
+                                                continue;
887953
+                                }
887953
+                                link_inode = inode_link (entry->inode,
887953
+                                                         local->fd->inode,
887953
+                                                         entry->d_name,
887953
+                                                         &entry->d_stat);
887953
+
887953
+                                gf_msg_debug (this->name, 0, "Initiating "
887953
+                                              "deletion of shards of gfid %s",
887953
+                                              entry->d_name);
887953
+                                ret = shard_delete_shards_of_entry (cleanup_frame,
887953
+                                                                    this,
887953
+                                                                    entry,
887953
+                                                                    link_inode);
887953
+                                inode_unlink (link_inode, local->fd->inode,
887953
+                                              entry->d_name);
887953
+                                inode_unref (link_inode);
887953
+                                if (ret) {
887953
+                                        gf_msg (this->name, GF_LOG_ERROR, -ret,
887953
+                                                SHARD_MSG_SHARDS_DELETION_FAILED,
887953
+                                                "Failed to clean up shards of "
887953
+                                                "gfid %s", entry->d_name);
887953
                                         continue;
887953
-                        }
887953
-                        link_inode = inode_link (entry->inode, local->fd->inode,
887953
-                                                 entry->d_name, &entry->d_stat);
887953
-
887953
-                        gf_msg_debug (this->name, 0, "Initiating deletion of "
887953
-                                      "shards of gfid %s", entry->d_name);
887953
-                        ret = shard_delete_shards_of_entry (cleanup_frame, this,
887953
-                                                            entry, link_inode);
887953
-                        inode_unlink (link_inode, local->fd->inode,
887953
-                                      entry->d_name);
887953
-                        inode_unref (link_inode);
887953
-                        if (ret) {
887953
-                                gf_msg (this->name, GF_LOG_ERROR, -ret,
887953
-                                        SHARD_MSG_SHARDS_DELETION_FAILED,
887953
-                                        "Failed to clean up shards of gfid %s",
887953
+                                }
887953
+                                gf_msg (this->name, GF_LOG_INFO, 0,
887953
+                                        SHARD_MSG_SHARDS_DELETION_COMPLETED,
887953
+                                        "Deleted shards of gfid=%s from backend",
887953
                                         entry->d_name);
887953
-                                continue;
887953
                         }
887953
-                        gf_msg (this->name, GF_LOG_INFO, 0,
887953
-                                SHARD_MSG_SHARDS_DELETION_COMPLETED, "Deleted "
887953
-                                "shards of gfid=%s from backend",
887953
-                                entry->d_name);
887953
+                        gf_dirent_free (&entries);
887953
+                        if (ret)
887953
+                                break;
887953
                 }
887953
-                gf_dirent_free (&entries);
887953
-                if (ret)
887953
-                        break;
887953
         }
887953
         ret = 0;
887953
+        loc_wipe(&loc;;
887953
+        return ret;
887953
+
887953
 err:
887953
+        LOCK(&priv->lock);
887953
+        {
887953
+                priv->bg_del_state = SHARD_BG_DELETION_NONE;
887953
+        }
887953
+        UNLOCK(&priv->lock);
887953
         loc_wipe (&loc;;
887953
         return ret;
887953
 }
887953
diff --git a/xlators/features/shard/src/shard.h b/xlators/features/shard/src/shard.h
887953
index ac3813c8c..37934f3a2 100644
887953
--- a/xlators/features/shard/src/shard.h
887953
+++ b/xlators/features/shard/src/shard.h
887953
@@ -196,11 +196,10 @@ shard_unlock_entrylk (call_frame_t *frame, xlator_t *this);
887953
         } while (0)
887953
 
887953
 typedef enum {
887953
-        SHARD_FIRST_LOOKUP_PENDING = 0,
887953
-        SHARD_FIRST_LOOKUP_IN_PROGRESS,
887953
-        SHARD_FIRST_LOOKUP_DONE,
887953
-} shard_first_lookup_state_t;
887953
-
887953
+        SHARD_BG_DELETION_NONE = 0,
887953
+        SHARD_BG_DELETION_LAUNCHING,
887953
+        SHARD_BG_DELETION_IN_PROGRESS,
887953
+} shard_bg_deletion_state_t;
887953
 /* rm = "remove me" */
887953
 
887953
 typedef struct shard_priv {
887953
@@ -213,7 +212,8 @@ typedef struct shard_priv {
887953
         int inode_count;
887953
         struct list_head ilist_head;
887953
         uint32_t deletion_rate;
887953
-        shard_first_lookup_state_t first_lookup;
887953
+        shard_bg_deletion_state_t bg_del_state;
887953
+        gf_boolean_t first_lookup_done;
887953
         uint64_t lru_limit;
887953
 } shard_priv_t;
887953
 
887953
-- 
887953
2.20.1
887953