From 4f0aa008ed393d7ce222c4ea4bd0fa6ed52b48f6 Mon Sep 17 00:00:00 2001
From: Krutika Dhananjay <kdhananj@redhat.com>
Date: Fri, 5 Apr 2019 12:29:23 +0530
Subject: [PATCH 177/178] features/shard: Fix extra unref when inode object is
 lru'd out and added back

Long tale of a double unref! But do read...

In cases where a shard inode is evicted from the lru list while still
being part of the fsync list, and is added back shortly before its unlink,
there can be an extra inode_unref() on the base inode, leading to premature
inode destruction and a crash.

One such specific case is the following -

Consider features.shard-deletion-rate = features.shard-lru-limit = 2.
This is an oversimplified example, but it explains the problem clearly.

First, a file is FALLOCATE'd to a size such that the number of shards under
/.shard = 3 > lru-limit.
Shards 1, 2 and 3 need to be resolved. 1 and 2 are resolved first.
Resultant lru list:
                               1 -----> 2
refs on base inode -          (1)  +   (1) = 2
3 needs to be resolved. So 1 is lru'd out. Resultant lru list -
                               2 -----> 3
refs on base inode -          (1)  +   (1) = 2

Note that 1 is inode_unlink()d but not destroyed because there are
non-zero refs on it since it is still participating in this ongoing
FALLOCATE operation.

FALLOCATE is sent on all participant shards. In the cbk, all of them are
added to the fsync list.
Resulting fsync list -
                               1 -----> 2 -----> 3 (order doesn't matter)
refs on base inode -          (1)  +   (1)  +   (1) = 3
Total refs = 3 + 2 = 5
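
For reference, the sketch below shows where those refs come from: a shard's
membership on the lru list and its membership on the fsync list each pin the
base inode with one ref. Helper and field names here are illustrative, not the
actual shard.c functions:

    /* Sketch only -- names are illustrative, not the real shard.c code.
     * Each list membership of a shard takes exactly one ref on the base inode. */
    static void
    lru_list_add(shard_inode_ctx_t *ctx, shard_priv_t *priv, inode_t *base_inode)
    {
        list_add_tail(&ctx->ilist, &priv->ilist_head);
        ctx->base_inode = inode_ref(base_inode); /* +1, dropped on eviction/unlink */
    }

    static void
    fsync_list_add(shard_inode_ctx_t *ctx, shard_inode_ctx_t *base_ictx,
                   inode_t *base_inode)
    {
        list_add_tail(&ctx->to_fsync_list, &base_ictx->to_fsync_list);
        inode_ref(base_inode); /* +1, dropped once the deferred fsync completes */
    }

With two shards on the lru list and three on the fsync list at this point, that
accounts for the 5 refs tallied above.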

Now an attempt is made to unlink this file. Background deletion is triggered.
The first $shard-deletion-rate shards need to be unlinked in the first batch.
So shards 1 and 2 need to be resolved. inode_resolve() fails on 1 but succeeds
on 2, so 2 is moved to the tail of the list.
lru list now -
                              3 -----> 2
No change in refs.

Shard 1 is looked up. In lookup_cbk, it is linked and added back to the lru list
at the cost of evicting shard 3.
lru list now -
                              2 -----> 1
refs on base inode -         (1)  +   (1) = 2
fsync list now -
                              1 -----> 2 (again, order doesn't matter)
refs on base inode -         (1)  +   (1) = 2
Total refs = 2 + 2 = 4
After eviction, it is found that 3 needs fsync. So fsync is wound but not yet
ack'd, which means 3 is still inode_link()d.

Now deletion of shards 1 and 2 completes. The lru list is empty. The base inode
is unref'd and destroyed.
In the next batched deletion, 3 needs to be deleted. It is inode_resolve()able.
It is added back to the lru list, but the base inode passed to
__shard_update_shards_inode_list() is NULL since that inode is already destroyed.
However, shard 3's ctx->base_inode still holds the base inode pointer from its
first addition to the lru list, with no additional ref taken on it.
lru list now -
                              3
refs on base inode -         (0)
Total refs on base inode = 0
Unlink is sent on 3 and completes. Since the ctx still contains a pointer to the
base inode and the shard is part of the lru list, the base inode is unref'd,
leading to a crash.
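
The pre-image of the second hunk in the diff below shows why: on re-addition,
the cached base inode pointer was only refreshed when a non-NULL base inode was
passed in. An annotated sketch of that path for shard 3 (simplified from the
hunk, comments added):

    ctx->block_num = block_num;
    if (base_inode)                              /* NULL here: base inode already destroyed */
        ctx->base_inode = inode_ref(base_inode);
    /* else: ctx->base_inode silently keeps the stale pointer taken on the first
     * addition, with no ref backing it. The deletion path later sees a non-NULL
     * ctx->base_inode and unrefs it once more than it was ever ref'd. */
    list_add_tail(&ctx->ilist, &priv->ilist_head);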

FIX:
When a shard is re-added to the lru list, copy the base inode pointer as-is into
its inode ctx, even if it is NULL. This is needed to prevent a double unref at
the time of deleting it.
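
In code, this is what both hunks below do: the assignment simply becomes
unconditional. A sketch of why that is sufficient, assuming inode_ref() returns
NULL when passed NULL (which is what the fix relies on):

    ctx->base_inode = inode_ref(base_inode);
    /* base_inode != NULL: take a fresh ref, exactly as before.
     * base_inode == NULL: the cached pointer is overwritten with NULL (assumes
     * inode_ref(NULL) == NULL), so the deletion path finds nothing to unref and
     * the extra unref on the already-destroyed base inode can no longer happen. */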

Upstream patch:
> BUG: 1696136
> Upstream patch link: https://review.gluster.org/c/glusterfs/+/22517
> Change-Id: I99a44039da2e10a1aad183e84f644d63ca552462
> Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>

Change-Id: I99a44039da2e10a1aad183e84f644d63ca552462
Updates: bz#1694595
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/172803
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 .../bug-1696136-lru-limit-equals-deletion-rate.t   | 34 ++++++++++++++++++++++
 xlators/features/shard/src/shard.c                 |  6 ++--
 2 files changed, 36 insertions(+), 4 deletions(-)
 create mode 100644 tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t

diff --git a/tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t b/tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t
new file mode 100644
index 0000000..3e4a65a
--- /dev/null
+++ b/tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../fallocate.rc
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 features.shard on
+TEST $CLI volume set $V0 features.shard-block-size 4MB
+TEST $CLI volume set $V0 features.shard-lru-limit 120
+TEST $CLI volume set $V0 features.shard-deletion-rate 120
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume start $V0
+
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
+
+TEST build_tester $(dirname $0)/bug-1696136.c -lgfapi -Wall -O2
+
+# Create a file
+TEST touch $M0/file1
+
+# Fallocate a 512M file. This will make sure the number of participant shards is > lru-limit
+TEST $(dirname $0)/bug-1696136 $H0 $V0 "0" "0" "536870912" /file1 `gluster --print-logdir`/glfs-$V0.log
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+TEST $CLI volume stop $V0
+TEST $CLI volume delete $V0
+rm -f $(dirname $0)/bug-1696136
+
+cleanup
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
index 3c4bcdc..c1799ad 100644
--- a/xlators/features/shard/src/shard.c
+++ b/xlators/features/shard/src/shard.c
@@ -689,8 +689,7 @@ __shard_update_shards_inode_list(inode_t *linked_inode, xlator_t *this,
             ctx->block_num = block_num;
             list_add_tail(&ctx->ilist, &priv->ilist_head);
             priv->inode_count++;
-            if (base_inode)
-                ctx->base_inode = inode_ref(base_inode);
+            ctx->base_inode = inode_ref(base_inode);
         } else {
             /*If on the other hand there is no available slot for this inode
              * in the list, delete the lru inode from the head of the list,
@@ -765,8 +764,7 @@ __shard_update_shards_inode_list(inode_t *linked_inode, xlator_t *this,
             else
                 gf_uuid_copy(ctx->base_gfid, gfid);
             ctx->block_num = block_num;
-            if (base_inode)
-                ctx->base_inode = inode_ref(base_inode);
+            ctx->base_inode = inode_ref(base_inode);
             list_add_tail(&ctx->ilist, &priv->ilist_head);
         }
     } else {
-- 
1.8.3.1