From 4f0aa008ed393d7ce222c4ea4bd0fa6ed52b48f6 Mon Sep 17 00:00:00 2001
From: Krutika Dhananjay <kdhananj@redhat.com>
Date: Fri, 5 Apr 2019 12:29:23 +0530
Subject: [PATCH 177/178] features/shard: Fix extra unref when inode object is
 lru'd out and added back

Long tale of double unref! But do read...

In cases where a shard base inode is evicted from the lru list while still
being part of the fsync list, but is added back soon before its unlink, there
could be an extra inode_unref() leading to premature inode destruction and a
crash.

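For orientation, here is a simplified, illustrative sketch of the per-shard
inode ctx bookkeeping that the walkthrough below tracks. It is not the actual
shard.h definition: field names are taken from the hunks at the end of this
patch plus memory, and the glusterfs types (inode_t, uuid_t, struct list_head)
are assumed to be in scope.

    /* Illustrative sketch only. Each shard inode on the priv->ilist_head lru
     * list holds one ref on the base inode via ctx->base_inode, and each shard
     * inode queued on the base inode's fsync list accounts for one more ref on
     * the base inode. */
    typedef struct shard_inode_ctx_sketch {
        struct list_head ilist;         /* linkage into priv->ilist_head (lru list) */
        struct list_head to_fsync_list; /* linkage into the base inode's fsync list */
        inode_t *base_inode;            /* base inode this shard belongs to (ref'd) */
        uuid_t base_gfid;               /* gfid of the base inode */
        int block_num;                  /* which shard this inode represents */
    } shard_inode_ctx_sketch_t;

The refs counted in the walkthrough below are exactly these: one per shard on
the lru list plus one per shard on the base inode's fsync list.
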
One such specific case is the following -

Consider features.shard-deletion-rate = features.shard-lru-limit = 2.
This is an oversimplified example but it explains the problem clearly.

First, a file is FALLOCATE'd to a size such that the number of shards under
/.shard = 3 > lru-limit.
Shards 1, 2 and 3 need to be resolved. 1 and 2 are resolved first.
Resultant lru list:
                               1 -----> 2
refs on base inode -          (1)  +   (1) = 2
3 needs to be resolved. So 1 is lru'd out. Resultant lru list -
                               2 -----> 3
refs on base inode -          (1)  +   (1) = 2

Note that 1 is inode_unlink()d but not destroyed because there are
non-zero refs on it since it is still participating in this ongoing
FALLOCATE operation.

FALLOCATE is sent on all participant shards. In the cbk, all of them are
added to the fsync list.
Resulting fsync list -
                               1 -----> 2 -----> 3 (order doesn't matter)
refs on base inode -          (1)  +   (1)  +   (1) = 3
Total refs = 3 + 2 = 5

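A minimal sketch of the fsync-side accounting just described: the helper name
is hypothetical (it is not the actual shard.c function), the locking shard.c
performs around this is omitted, and the ctx type is the sketch from above.

    /* Illustrative only: queueing a shard on the base inode's fsync list takes
     * one ref on the base inode, just as adding it to the lru list does. */
    static void
    sketch_add_to_fsync_list(inode_t *base_inode,
                             shard_inode_ctx_sketch_t *base_ctx,
                             shard_inode_ctx_sketch_t *shard_ctx)
    {
        if (!list_empty(&shard_ctx->to_fsync_list))
            return;                /* already queued; no extra ref is taken */
        list_add_tail(&shard_ctx->to_fsync_list, &base_ctx->to_fsync_list);
        inode_ref(base_inode);     /* one ref per fsync-list membership */
    }
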
Now an attempt is made to unlink this file. Background deletion is triggered.
The first $shard-deletion-rate shards need to be unlinked in the first batch.
So shards 1 and 2 need to be resolved. inode_resolve fails on 1 but succeeds
on 2, and so it's moved to the tail of the list.
lru list now -
                              3 -----> 2
No change in refs.

Shard 1 is looked up. In lookup_cbk, it's linked and added back to the lru list
at the cost of evicting shard 3.
lru list now -
                              2 -----> 1
refs on base inode:          (1)  +   (1) = 2
fsync list now -
                              1 -----> 2 (again, order doesn't matter)
refs on base inode -         (1)  +   (1) = 2
Total refs = 2 + 2 = 4
After eviction, it is found that 3 needs fsync. So fsync is wound but yet to be
ack'd, which means 3 is still inode_link()d.

Now deletion of shards 1 and 2 completes. The lru list is empty. The base inode
is unref'd and destroyed.
In the next batched deletion, 3 needs to be deleted. It is inode_resolve()able.
It is added back to the lru list, but the base inode passed to
__shard_update_shards_inode_list() is NULL since the inode is destroyed. Yet its
ctx->base_inode still contains the base inode ptr from its first addition to the
lru list, even though no additional ref is taken on it this time.
lru list now -
                              3
refs on base inode -         (0)
Total refs on base inode = 0
Unlink is sent on 3. It completes. Now, since the ctx contains a ptr to the base
inode and the shard is part of the lru list, the base inode is unref'd, leading
to a crash.

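To make the crash mechanics concrete, here is a minimal sketch of the
unlink-side cleanup under the same assumptions as the sketches above. The
helper name is hypothetical; it only mirrors what shard.c does once a shard
block's unlink completes.

    /* Illustrative only: cleanup after a shard block is unlinked. */
    static void
    sketch_unlink_block_inode_cleanup(shard_inode_ctx_sketch_t *ctx)
    {
        inode_t *base_inode = ctx->base_inode;

        if (!list_empty(&ctx->ilist)) {
            list_del_init(&ctx->ilist);  /* drop lru membership */
            if (base_inode)
                inode_unref(base_inode); /* bug path: after the re-add above,
                                          * this pointer is stale and no ref
                                          * backs it, so this unref crashes */
        }
    }
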
FIX:
When the shard is re-added to the lru list, copy the base inode pointer as is
into its inode ctx, even if it is NULL. This is needed to prevent double unrefs
at the time of deleting it.

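The hunks below drop the NULL guard on both addition paths. A brief sketch of
why that is safe, assuming libglusterfs' inode_ref() simply returns NULL when
passed a NULL inode:

    /* Before: on a re-add with base_inode == NULL, the guard left
     * ctx->base_inode untouched, so a stale pointer from an earlier addition
     * could survive with no ref backing it. */
    if (base_inode)
        ctx->base_inode = inode_ref(base_inode);

    /* After: always record exactly what was passed in. inode_ref(NULL) is a
     * no-op returning NULL, so ctx->base_inode now reflects whether this lru
     * membership really holds a ref on the base inode. */
    ctx->base_inode = inode_ref(base_inode);
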
Upstream patch:
> BUG: 1696136
> Upstream patch link: https://review.gluster.org/c/glusterfs/+/22517
> Change-Id: I99a44039da2e10a1aad183e84f644d63ca552462
> Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>

Change-Id: I99a44039da2e10a1aad183e84f644d63ca552462
Updates: bz#1694595
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/172803
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 .../bug-1696136-lru-limit-equals-deletion-rate.t   | 34 ++++++++++++++++++++++
 xlators/features/shard/src/shard.c                 |  6 ++--
 2 files changed, 36 insertions(+), 4 deletions(-)
 create mode 100644 tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t

diff --git a/tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t b/tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t
new file mode 100644
index 0000000..3e4a65a
--- /dev/null
+++ b/tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../fallocate.rc
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 features.shard on
+TEST $CLI volume set $V0 features.shard-block-size 4MB
+TEST $CLI volume set $V0 features.shard-lru-limit 120
+TEST $CLI volume set $V0 features.shard-deletion-rate 120
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume start $V0
+
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
+
+TEST build_tester $(dirname $0)/bug-1696136.c -lgfapi -Wall -O2
+
+# Create a file
+TEST touch $M0/file1
+
+# Fallocate a 512M file. This will make sure the number of participant shards is > lru-limit
+TEST $(dirname $0)/bug-1696136 $H0 $V0 "0" "0" "536870912" /file1 `gluster --print-logdir`/glfs-$V0.log
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+TEST $CLI volume stop $V0
+TEST $CLI volume delete $V0
+rm -f $(dirname $0)/bug-1696136
+
+cleanup
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
index 3c4bcdc..c1799ad 100644
--- a/xlators/features/shard/src/shard.c
+++ b/xlators/features/shard/src/shard.c
@@ -689,8 +689,7 @@ __shard_update_shards_inode_list(inode_t *linked_inode, xlator_t *this,
             ctx->block_num = block_num;
             list_add_tail(&ctx->ilist, &priv->ilist_head);
             priv->inode_count++;
-            if (base_inode)
-                ctx->base_inode = inode_ref(base_inode);
+            ctx->base_inode = inode_ref(base_inode);
         } else {
             /*If on the other hand there is no available slot for this inode
              * in the list, delete the lru inode from the head of the list,
@@ -765,8 +764,7 @@ __shard_update_shards_inode_list(inode_t *linked_inode, xlator_t *this,
             else
                 gf_uuid_copy(ctx->base_gfid, gfid);
             ctx->block_num = block_num;
-            if (base_inode)
-                ctx->base_inode = inode_ref(base_inode);
+            ctx->base_inode = inode_ref(base_inode);
             list_add_tail(&ctx->ilist, &priv->ilist_head);
         }
     } else {
-- 
1.8.3.1