e7a346
From a7ade5267ebaf4bf318ee2aebe48000cee583e3b Mon Sep 17 00:00:00 2001
e7a346
From: Krutika Dhananjay <kdhananj@redhat.com>
e7a346
Date: Fri, 28 Dec 2018 18:53:15 +0530
e7a346
Subject: [PATCH 505/506] features/shard: Fix launch of multiple synctasks for
e7a346
 background deletion
e7a346
e7a346
> Upstream: https://review.gluster.org/21957
e7a346
> BUG: 1662368
e7a346
> Change-Id: Ib33773d27fb4be463c7a8a5a6a4b63689705324e
e7a346
e7a346
PROBLEM:
e7a346
e7a346
When multiple sharded files are deleted in quick succession, multiple
e7a346
issues were observed:
e7a346
1. misleading logs corresponding to a sharded file where while one log
e7a346
   message said the shards corresponding to the file were deleted
e7a346
   successfully, this was followed by multiple logs suggesting the very
e7a346
   same operation failed. This was because of multiple synctasks
e7a346
   attempting to clean up shards of the same file and only one of them
e7a346
   succeeding (the one that gets ENTRYLK successfully), and the rest of
e7a346
   them logging failure.
e7a346
e7a346
2. multiple synctasks to do background deletion would be launched, one
e7a346
   for each deleted file but all of them could readdir entries from
e7a346
   .remove_me at the same time could potentially contend for ENTRYLK on
e7a346
   .shard for each of the entry names. This is undesirable and wasteful.
e7a346
e7a346
FIX:
e7a346
Background deletion will now follow a state machine. In the event that
e7a346
there are multiple attempts to launch synctask for background deletion,
e7a346
one for each file deleted, only the first task is launched. And if while
e7a346
this task is doing the cleanup, more attempts are made to delete other
e7a346
files, the state of the synctask is adjusted so that it restarts the
e7a346
crawl even after reaching end-of-directory to pick up any files it may
e7a346
have missed in the previous iteration.
e7a346
e7a346
This patch also fixes uninitialized lk-owner during syncop_entrylk()
e7a346
which was leading to multiple background deletion synctasks entering
e7a346
the critical section at the same time and leading to illegal memory access
e7a346
of base inode in the second syntcask after it was destroyed post shard deletion
e7a346
by the first synctask.
e7a346
e7a346
Change-Id: Ib33773d27fb4be463c7a8a5a6a4b63689705324e
e7a346
BUG: 1662059
e7a346
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
e7a346
Reviewed-on: https://code.engineering.redhat.com/gerrit/160437
e7a346
Tested-by: RHGS Build Bot <nigelb@redhat.com>
e7a346
Reviewed-by: Xavi Hernandez <xhernandez@redhat.com>
e7a346
---
e7a346
 xlators/features/shard/src/shard.c | 199 +++++++++++++++++++----------
e7a346
 xlators/features/shard/src/shard.h |  12 +-
e7a346
 2 files changed, 136 insertions(+), 75 deletions(-)
e7a346
e7a346
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
e7a346
index 5b72399f5..8aed1a386 100644
e7a346
--- a/xlators/features/shard/src/shard.c
e7a346
+++ b/xlators/features/shard/src/shard.c
e7a346
@@ -1465,16 +1465,45 @@ int
e7a346
 shard_start_background_deletion (xlator_t *this)
e7a346
 {
e7a346
         int              ret           = 0;
e7a346
+        gf_boolean_t     i_cleanup     = _gf_true;
e7a346
+        shard_priv_t    *priv          = NULL;
e7a346
         call_frame_t    *cleanup_frame = NULL;
e7a346
 
e7a346
+        priv = this->private;
e7a346
+
e7a346
+        LOCK(&priv->lock);
e7a346
+        {
e7a346
+                switch (priv->bg_del_state) {
e7a346
+                case SHARD_BG_DELETION_NONE:
e7a346
+                        i_cleanup = _gf_true;
e7a346
+                        priv->bg_del_state = SHARD_BG_DELETION_LAUNCHING;
e7a346
+                        break;
e7a346
+                case SHARD_BG_DELETION_LAUNCHING:
e7a346
+                        i_cleanup = _gf_false;
e7a346
+                        break;
e7a346
+                case SHARD_BG_DELETION_IN_PROGRESS:
e7a346
+                        priv->bg_del_state = SHARD_BG_DELETION_LAUNCHING;
e7a346
+                        i_cleanup = _gf_false;
e7a346
+                        break;
e7a346
+                default:
e7a346
+                        break;
e7a346
+                }
e7a346
+        }
e7a346
+        UNLOCK(&priv->lock);
e7a346
+        if (!i_cleanup)
e7a346
+                return 0;
e7a346
+
e7a346
         cleanup_frame = create_frame (this, this->ctx->pool);
e7a346
         if (!cleanup_frame) {
e7a346
                 gf_msg (this->name, GF_LOG_WARNING, ENOMEM,
e7a346
                         SHARD_MSG_MEMALLOC_FAILED, "Failed to create "
e7a346
                         "new frame to delete shards");
e7a346
-                return -ENOMEM;
e7a346
+                ret = -ENOMEM;
e7a346
+                goto err;
e7a346
         }
e7a346
 
e7a346
+        set_lk_owner_from_ptr(&cleanup_frame->root->lk_owner, cleanup_frame->root);
e7a346
+
e7a346
         ret = synctask_new (this->ctx->env, shard_delete_shards,
e7a346
                             shard_delete_shards_cbk, cleanup_frame,
e7a346
                             cleanup_frame);
e7a346
@@ -1484,7 +1513,16 @@ shard_start_background_deletion (xlator_t *this)
e7a346
                         "failed to create task to do background "
e7a346
                         "cleanup of shards");
e7a346
                 STACK_DESTROY (cleanup_frame->root);
e7a346
+                goto err;
e7a346
         }
e7a346
+        return 0;
e7a346
+
e7a346
+err:
e7a346
+        LOCK(&priv->lock);
e7a346
+        {
e7a346
+                priv->bg_del_state = SHARD_BG_DELETION_NONE;
e7a346
+        }
e7a346
+        UNLOCK(&priv->lock);
e7a346
         return ret;
e7a346
 }
e7a346
 
e7a346
@@ -1493,7 +1531,7 @@ shard_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
                   int32_t op_ret, int32_t op_errno, inode_t *inode,
e7a346
                   struct iatt *buf, dict_t *xdata, struct iatt *postparent)
e7a346
 {
e7a346
-        int             ret             = 0;
e7a346
+        int             ret             = -1;
e7a346
         shard_priv_t   *priv            = NULL;
e7a346
         gf_boolean_t    i_start_cleanup = _gf_false;
e7a346
 
e7a346
@@ -1526,22 +1564,23 @@ shard_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
e7a346
 
e7a346
         LOCK (&priv->lock);
e7a346
         {
e7a346
-                if (priv->first_lookup == SHARD_FIRST_LOOKUP_PENDING) {
e7a346
-                        priv->first_lookup = SHARD_FIRST_LOOKUP_IN_PROGRESS;
e7a346
+                if (priv->first_lookup_done == _gf_false) {
e7a346
+                        priv->first_lookup_done = _gf_true;
e7a346
                         i_start_cleanup = _gf_true;
e7a346
                 }
e7a346
         }
e7a346
         UNLOCK (&priv->lock);
e7a346
 
e7a346
-        if (i_start_cleanup) {
e7a346
-                ret = shard_start_background_deletion (this);
e7a346
-                if (ret) {
e7a346
-                        LOCK (&priv->lock);
e7a346
-                        {
e7a346
-                                priv->first_lookup = SHARD_FIRST_LOOKUP_PENDING;
e7a346
-                        }
e7a346
-                        UNLOCK (&priv->lock);
e7a346
+        if (!i_start_cleanup)
e7a346
+                goto unwind;
e7a346
+
e7a346
+        ret = shard_start_background_deletion(this);
e7a346
+        if (ret < 0) {
e7a346
+                LOCK(&priv->lock);
e7a346
+                {
e7a346
+                        priv->first_lookup_done = _gf_false;
e7a346
                 }
e7a346
+                UNLOCK(&priv->lock);
e7a346
         }
e7a346
 unwind:
e7a346
         SHARD_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf,
e7a346
@@ -2940,9 +2979,10 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num)
e7a346
                 if (ctx->fsync_needed) {
e7a346
                         unref_base_inode++;
e7a346
                         list_del_init (&ctx->to_fsync_list);
e7a346
-                        if (base_inode)
e7a346
+                        if (base_inode) {
e7a346
                                 __shard_inode_ctx_get (base_inode, this, &base_ictx);
e7a346
-                        base_ictx->fsync_count--;
e7a346
+                                base_ictx->fsync_count--;
e7a346
+                        }
e7a346
                 }
e7a346
         }
e7a346
         UNLOCK(&inode->lock);
e7a346
@@ -3334,10 +3374,15 @@ shard_delete_shards_of_entry (call_frame_t *cleanup_frame, xlator_t *this,
e7a346
         loc.inode = inode_ref (priv->dot_shard_rm_inode);
e7a346
 
e7a346
         ret = syncop_entrylk (FIRST_CHILD(this), this->name, &loc,
e7a346
-                              entry->d_name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL,
e7a346
-                              NULL);
e7a346
-        if (ret)
e7a346
+                              entry->d_name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK,
e7a346
+                              NULL, NULL);
e7a346
+        if (ret < 0) {
e7a346
+                if (ret == -EAGAIN) {
e7a346
+                        ret = 0;
e7a346
+                }
e7a346
                 goto out;
e7a346
+        }
e7a346
+
e7a346
         {
e7a346
                 ret = __shard_delete_shards_of_entry (cleanup_frame, this,
e7a346
                                                       entry, inode);
e7a346
@@ -3352,20 +3397,6 @@ out:
e7a346
 int
e7a346
 shard_delete_shards_cbk (int ret, call_frame_t *frame, void *data)
e7a346
 {
e7a346
-        xlator_t     *this          = NULL;
e7a346
-        shard_priv_t *priv          = NULL;
e7a346
-
e7a346
-        this = frame->this;
e7a346
-        priv = this->private;
e7a346
-
e7a346
-        if (ret < 0) {
e7a346
-                gf_msg (this->name, GF_LOG_WARNING, -ret,
e7a346
-                        SHARD_MSG_SHARDS_DELETION_FAILED,
e7a346
-                        "Background deletion of shards failed");
e7a346
-                priv->first_lookup = SHARD_FIRST_LOOKUP_PENDING;
e7a346
-        } else {
e7a346
-                priv->first_lookup = SHARD_FIRST_LOOKUP_DONE;
e7a346
-        }
e7a346
         SHARD_STACK_DESTROY (frame);
e7a346
         return 0;
e7a346
 }
e7a346
@@ -3482,6 +3513,7 @@ shard_delete_shards (void *opaque)
e7a346
         gf_dirent_t      entries;
e7a346
         gf_dirent_t     *entry                      = NULL;
e7a346
         call_frame_t    *cleanup_frame              = NULL;
e7a346
+        gf_boolean_t done = _gf_false;
e7a346
 
e7a346
         this = THIS;
e7a346
         priv = this->private;
e7a346
@@ -3534,52 +3566,81 @@ shard_delete_shards (void *opaque)
e7a346
                 goto err;
e7a346
         }
e7a346
 
e7a346
-        while ((ret = syncop_readdirp (FIRST_CHILD(this), local->fd, 131072,
e7a346
-                                      offset, &entries, local->xattr_req,
e7a346
-                                      NULL))) {
e7a346
-                if (ret > 0)
e7a346
-                        ret = 0;
e7a346
-                list_for_each_entry (entry, &entries.list, list) {
e7a346
-                        offset = entry->d_off;
e7a346
-
e7a346
-                        if (!strcmp (entry->d_name, ".") ||
e7a346
-                            !strcmp (entry->d_name, ".."))
e7a346
-                                continue;
e7a346
+        for (;;) {
e7a346
+                offset = 0;
e7a346
+                LOCK(&priv->lock);
e7a346
+                {
e7a346
+                        if (priv->bg_del_state == SHARD_BG_DELETION_LAUNCHING) {
e7a346
+                                priv->bg_del_state = SHARD_BG_DELETION_IN_PROGRESS;
e7a346
+                        } else if (priv->bg_del_state == SHARD_BG_DELETION_IN_PROGRESS) {
e7a346
+                                priv->bg_del_state = SHARD_BG_DELETION_NONE;
e7a346
+                                done = _gf_true;
e7a346
+                        }
e7a346
+                }
e7a346
+                UNLOCK(&priv->lock);
e7a346
+                if (done)
e7a346
+                        break;
e7a346
+                while ((ret = syncop_readdirp (FIRST_CHILD(this), local->fd,
e7a346
+                                               131072, offset, &entries,
e7a346
+                                               local->xattr_req, NULL))) {
e7a346
+                        if (ret > 0)
e7a346
+                                ret = 0;
e7a346
+                        list_for_each_entry (entry, &entries.list, list) {
e7a346
+                                offset = entry->d_off;
e7a346
+
e7a346
+                                if (!strcmp (entry->d_name, ".") ||
e7a346
+                                    !strcmp (entry->d_name, ".."))
e7a346
+                                        continue;
e7a346
 
e7a346
-                        if (!entry->inode) {
e7a346
-                                ret = shard_lookup_marker_entry (this, local,
e7a346
-                                                                 entry);
e7a346
-                                if (ret < 0)
e7a346
+                                if (!entry->inode) {
e7a346
+                                        ret = shard_lookup_marker_entry (this,
e7a346
+                                                                         local,
e7a346
+                                                                         entry);
e7a346
+                                        if (ret < 0)
e7a346
+                                                continue;
e7a346
+                                }
e7a346
+                                link_inode = inode_link (entry->inode,
e7a346
+                                                         local->fd->inode,
e7a346
+                                                         entry->d_name,
e7a346
+                                                         &entry->d_stat);
e7a346
+
e7a346
+                                gf_msg_debug (this->name, 0, "Initiating "
e7a346
+                                              "deletion of shards of gfid %s",
e7a346
+                                              entry->d_name);
e7a346
+                                ret = shard_delete_shards_of_entry (cleanup_frame,
e7a346
+                                                                    this,
e7a346
+                                                                    entry,
e7a346
+                                                                    link_inode);
e7a346
+                                inode_unlink (link_inode, local->fd->inode,
e7a346
+                                              entry->d_name);
e7a346
+                                inode_unref (link_inode);
e7a346
+                                if (ret) {
e7a346
+                                        gf_msg (this->name, GF_LOG_ERROR, -ret,
e7a346
+                                                SHARD_MSG_SHARDS_DELETION_FAILED,
e7a346
+                                                "Failed to clean up shards of "
e7a346
+                                                "gfid %s", entry->d_name);
e7a346
                                         continue;
e7a346
-                        }
e7a346
-                        link_inode = inode_link (entry->inode, local->fd->inode,
e7a346
-                                                 entry->d_name, &entry->d_stat);
e7a346
-
e7a346
-                        gf_msg_debug (this->name, 0, "Initiating deletion of "
e7a346
-                                      "shards of gfid %s", entry->d_name);
e7a346
-                        ret = shard_delete_shards_of_entry (cleanup_frame, this,
e7a346
-                                                            entry, link_inode);
e7a346
-                        inode_unlink (link_inode, local->fd->inode,
e7a346
-                                      entry->d_name);
e7a346
-                        inode_unref (link_inode);
e7a346
-                        if (ret) {
e7a346
-                                gf_msg (this->name, GF_LOG_ERROR, -ret,
e7a346
-                                        SHARD_MSG_SHARDS_DELETION_FAILED,
e7a346
-                                        "Failed to clean up shards of gfid %s",
e7a346
+                                }
e7a346
+                                gf_msg (this->name, GF_LOG_INFO, 0,
e7a346
+                                        SHARD_MSG_SHARDS_DELETION_COMPLETED,
e7a346
+                                        "Deleted shards of gfid=%s from backend",
e7a346
                                         entry->d_name);
e7a346
-                                continue;
e7a346
                         }
e7a346
-                        gf_msg (this->name, GF_LOG_INFO, 0,
e7a346
-                                SHARD_MSG_SHARDS_DELETION_COMPLETED, "Deleted "
e7a346
-                                "shards of gfid=%s from backend",
e7a346
-                                entry->d_name);
e7a346
+                        gf_dirent_free (&entries);
e7a346
+                        if (ret)
e7a346
+                                break;
e7a346
                 }
e7a346
-                gf_dirent_free (&entries);
e7a346
-                if (ret)
e7a346
-                        break;
e7a346
         }
e7a346
         ret = 0;
e7a346
+        loc_wipe(&loc;;
e7a346
+        return ret;
e7a346
+
e7a346
 err:
e7a346
+        LOCK(&priv->lock);
e7a346
+        {
e7a346
+                priv->bg_del_state = SHARD_BG_DELETION_NONE;
e7a346
+        }
e7a346
+        UNLOCK(&priv->lock);
e7a346
         loc_wipe (&loc;;
e7a346
         return ret;
e7a346
 }
e7a346
diff --git a/xlators/features/shard/src/shard.h b/xlators/features/shard/src/shard.h
e7a346
index ac3813c8c..37934f3a2 100644
e7a346
--- a/xlators/features/shard/src/shard.h
e7a346
+++ b/xlators/features/shard/src/shard.h
e7a346
@@ -196,11 +196,10 @@ shard_unlock_entrylk (call_frame_t *frame, xlator_t *this);
e7a346
         } while (0)
e7a346
 
e7a346
 typedef enum {
e7a346
-        SHARD_FIRST_LOOKUP_PENDING = 0,
e7a346
-        SHARD_FIRST_LOOKUP_IN_PROGRESS,
e7a346
-        SHARD_FIRST_LOOKUP_DONE,
e7a346
-} shard_first_lookup_state_t;
e7a346
-
e7a346
+        SHARD_BG_DELETION_NONE = 0,
e7a346
+        SHARD_BG_DELETION_LAUNCHING,
e7a346
+        SHARD_BG_DELETION_IN_PROGRESS,
e7a346
+} shard_bg_deletion_state_t;
e7a346
 /* rm = "remove me" */
e7a346
 
e7a346
 typedef struct shard_priv {
e7a346
@@ -213,7 +212,8 @@ typedef struct shard_priv {
e7a346
         int inode_count;
e7a346
         struct list_head ilist_head;
e7a346
         uint32_t deletion_rate;
e7a346
-        shard_first_lookup_state_t first_lookup;
e7a346
+        shard_bg_deletion_state_t bg_del_state;
e7a346
+        gf_boolean_t first_lookup_done;
e7a346
         uint64_t lru_limit;
e7a346
 } shard_priv_t;
e7a346
 
e7a346
-- 
e7a346
2.20.1
e7a346