From 4f0aa008ed393d7ce222c4ea4bd0fa6ed52b48f6 Mon Sep 17 00:00:00 2001
From: Krutika Dhananjay <kdhananj@redhat.com>
Date: Fri, 5 Apr 2019 12:29:23 +0530
Subject: [PATCH 177/178] features/shard: Fix extra unref when inode object is
 lru'd out and added back

Long tale of double unref! But do read...

In cases where a shard base inode is evicted from the lru list while still
being part of the fsync list, and is then added back shortly before its
unlink, there could be an extra inode_unref(), leading to premature inode
destruction and a crash.

One such specific case is the following -

Consider features.shard-deletion-rate = features.shard-lru-limit = 2.
This is an oversimplified example, but it explains the problem clearly.
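
To make the ref arithmetic below easier to follow, here is a deliberately
simplified sketch of the per-shard-inode state involved. It is illustrative
only (the real ctx in shard.c tracks list membership with list heads such as
ctx->ilist, as the diff below shows) and encodes the assumption the
walkthrough relies on: each list membership contributes one ref on the base
inode.

    /* Illustrative stand-ins only; the real types live in libglusterfs
     * and xlators/features/shard/src/shard.h.                           */
    typedef struct inode { int ref; } inode_t;

    typedef struct shard_inode_ctx_sketch {
        inode_t *base_inode;   /* base file's inode                      */
        int block_num;         /* which shard this inode represents      */
        int on_lru_list;       /* 1 while on the shard lru list          */
        int on_fsync_list;     /* 1 while on the fsync list              */
    } shard_inode_ctx_sketch_t;

    /* Each shard inode on the lru list holds one ref on the base inode,
     * and each shard inode on the fsync list holds one more; the "refs
     * on base inode" lines below count exactly these refs.              */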

First, a file is FALLOCATE'd to a size such that the number of shards under
/.shard = 3 > lru-limit.
Shards 1, 2 and 3 need to be resolved. 1 and 2 are resolved first.
Resultant lru list:
                              1 -----> 2
refs on base inode -         (1)  +   (1) = 2
3 needs to be resolved. So 1 is lru'd out. Resultant lru list -
                              2 -----> 3
refs on base inode -         (1)  +   (1) = 2

Note that 1 is inode_unlink()d but not destroyed because there are
non-zero refs on it since it is still participating in this ongoing
FALLOCATE operation.

FALLOCATE is sent on all participant shards. In the cbk, all of them are
added to the fsync list.
Resulting fsync list -
                              1 -----> 2 -----> 3 (order doesn't matter)
refs on base inode -         (1)  +   (1)  +   (1) = 3
Total refs = 3 + 2 = 5

Now an attempt is made to unlink this file. Background deletion is triggered.
The first $shard-deletion-rate shards need to be unlinked in the first batch.
So shards 1 and 2 need to be resolved. inode_resolve fails on 1 but succeeds
on 2, so 2 is moved to the tail of the list.
lru list now -
                              3 -----> 2
No change in refs.

Shard 1 is looked up. In lookup_cbk, it is linked and added back to the lru
list at the cost of evicting shard 3.
lru list now -
                              2 -----> 1
refs on base inode -         (1)  +   (1) = 2
fsync list now -
                              1 -----> 2 (again, order doesn't matter)
refs on base inode -         (1)  +   (1) = 2
Total refs = 2 + 2 = 4
After eviction, it is found that 3 needs fsync. So fsync is wound and is yet
to be ack'd. So 3 is still inode_link()d.

Now deletion of shards 1 and 2 completes. The lru list is now empty. The base
inode is unref'd and destroyed.
In the next batched deletion, 3 needs to be deleted. It is inode_resolve()able.
It is added back to the lru list, but the base inode passed to
__shard_update_shards_inode_list() is NULL since the inode was destroyed.
However, its ctx->base_inode still contains the base inode ptr from the first
addition to the lru list, with no additional ref on it.
lru list now -
                              3
refs on base inode -         (0)
Total refs on base inode = 0
Unlink is sent on 3. It completes. Now, since the ctx contains a ptr to the
base_inode and the shard is part of the lru list, the base inode is unref'd,
leading to a crash.
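
The stale pointer traces back to the guarded assignment in
__shard_update_shards_inode_list() that this patch removes. Condensed from
the second shard.c hunk below (surrounding code elided), the old re-add path
was:

    ctx->block_num = block_num;
    if (base_inode)
        ctx->base_inode = inode_ref(base_inode);
    /* base_inode is NULL at this point because the base inode was
     * already destroyed, so the branch is skipped: ctx->base_inode
     * keeps the stale pointer from the first addition, with no ref
     * backing it.                                                   */
    list_add_tail(&ctx->ilist, &priv->ilist_head);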

FIX:
When a shard is re-added to the lru list, copy the base inode pointer as is
into its inode ctx, even if it is NULL. This is needed to prevent double
unrefs at the time of deleting it.
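
In code terms (both shard.c hunks below), the guarded assignment becomes
unconditional. This relies on inode_ref() being NULL-safe, i.e. returning
NULL for a NULL argument:

    /* Always overwrite the cached pointer. When base_inode is NULL (the
     * base inode was already destroyed), ctx->base_inode becomes NULL
     * instead of staying dangling, so the deletion path later finds
     * nothing to unref and the premature unref can no longer happen.   */
    ctx->base_inode = inode_ref(base_inode);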

Upstream patch:
> BUG: 1696136
> Upstream patch link: https://review.gluster.org/c/glusterfs/+/22517
> Change-Id: I99a44039da2e10a1aad183e84f644d63ca552462
> Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>

Change-Id: I99a44039da2e10a1aad183e84f644d63ca552462
Updates: bz#1694595
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/172803
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 .../bug-1696136-lru-limit-equals-deletion-rate.t   | 34 ++++++++++++++++++++++
 xlators/features/shard/src/shard.c                 |  6 ++--
 2 files changed, 36 insertions(+), 4 deletions(-)
 create mode 100644 tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t

diff --git a/tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t b/tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t
new file mode 100644
index 0000000..3e4a65a
--- /dev/null
+++ b/tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../fallocate.rc
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 features.shard on
+TEST $CLI volume set $V0 features.shard-block-size 4MB
+TEST $CLI volume set $V0 features.shard-lru-limit 120
+TEST $CLI volume set $V0 features.shard-deletion-rate 120
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume start $V0
+
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
+
+TEST build_tester $(dirname $0)/bug-1696136.c -lgfapi -Wall -O2
+
+# Create a file
+TEST touch $M0/file1
+
+# Fallocate a 500M file. This will make sure number of participant shards are > lru-limit
+TEST $(dirname $0)/bug-1696136 $H0 $V0 "0" "0" "536870912" /file1 `gluster --print-logdir`/glfs-$V0.log
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+TEST $CLI volume stop $V0
+TEST $CLI volume delete $V0
+rm -f $(dirname $0)/bug-1696136
+
+cleanup
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
index 3c4bcdc..c1799ad 100644
--- a/xlators/features/shard/src/shard.c
+++ b/xlators/features/shard/src/shard.c
@@ -689,8 +689,7 @@ __shard_update_shards_inode_list(inode_t *linked_inode, xlator_t *this,
             ctx->block_num = block_num;
             list_add_tail(&ctx->ilist, &priv->ilist_head);
             priv->inode_count++;
-            if (base_inode)
-                ctx->base_inode = inode_ref(base_inode);
+            ctx->base_inode = inode_ref(base_inode);
         } else {
             /*If on the other hand there is no available slot for this inode
              * in the list, delete the lru inode from the head of the list,
@@ -765,8 +764,7 @@ __shard_update_shards_inode_list(inode_t *linked_inode, xlator_t *this,
             else
                 gf_uuid_copy(ctx->base_gfid, gfid);
             ctx->block_num = block_num;
-            if (base_inode)
-                ctx->base_inode = inode_ref(base_inode);
+            ctx->base_inode = inode_ref(base_inode);
             list_add_tail(&ctx->ilist, &priv->ilist_head);
         }
     } else {
-- 
1.8.3.1