From 4f0aa008ed393d7ce222c4ea4bd0fa6ed52b48f6 Mon Sep 17 00:00:00 2001
From: Krutika Dhananjay <kdhananj@redhat.com>
Date: Fri, 5 Apr 2019 12:29:23 +0530
Subject: [PATCH 177/178] features/shard: Fix extra unref when inode object is
 lru'd out and added back

Long tale of double unref! But do read...

In cases where a shard inode is evicted from the lru list while still
being part of the fsync list, but is added back soon before its unlink,
there could be an extra inode_unref() on the base inode, leading to
premature inode destruction and a crash.

One such specific case is the following -

Consider features.shard-deletion-rate = features.shard-lru-limit = 2.
This is an oversimplified example but explains the problem clearly.

First, a file is FALLOCATE'd to a size such that the number of shards under
/.shard = 3 > lru-limit.
Shards 1, 2 and 3 need to be resolved. 1 and 2 are resolved first.
Resultant lru list:
                               1 -----> 2
refs on base inode -          (1)  +   (1) = 2
3 needs to be resolved. So 1 is lru'd out. Resultant lru list -
                               2 -----> 3
refs on base inode -          (1)  +   (1) = 2

Note that 1 is inode_unlink()d but not destroyed because there are
non-zero refs on it since it is still participating in this ongoing
FALLOCATE operation.
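
(For the record, the per-shard "(1)" in the accounting above is the ref taken
by the lru-add path itself: every shard inode that gets a slot on the lru list
pins the base inode with exactly one ref. A condensed sketch of that path,
based on the __shard_update_shards_inode_list() hunks further down in this
patch -- the surrounding control flow and the lru_limit field name are
approximations, not verbatim code:)

    /* Sketch: a shard inode gets a free slot on the lru list. */
    if (priv->inode_count + 1 <= priv->lru_limit) {    /* slot available */
        gf_uuid_copy(ctx->base_gfid, gfid);
        ctx->block_num = block_num;
        list_add_tail(&ctx->ilist, &priv->ilist_head);
        priv->inode_count++;
        ctx->base_inode = inode_ref(base_inode);  /* one ref per lru entry */
    } else {
        /* No free slot: the entry at the head of the list is evicted
         * (inode_unlink()'d, but not destroyed while e.g. an in-flight
         * FALLOCATE still holds refs on it) before the new entry is
         * added as above. */
    }
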
FALLOCATE is sent on all participant shards. In the cbk, all of them are
added to the fsync list.
Resulting fsync list -
                               1 -----> 2 -----> 3 (order doesn't matter)
refs on base inode -          (1)  +   (1)  +   (1) = 3
Total refs = 3 + 2 = 5

Now an attempt is made to unlink this file. Background deletion is triggered.
The first $shard-deletion-rate shards need to be unlinked in the first batch.
So shards 1 and 2 need to be resolved. inode_resolve fails on 1 but succeeds
on 2 and so it's moved to the tail of the list.
lru list now -
                              3 -----> 2
No change in refs.

Shard 1 is looked up. In lookup_cbk, it's linked and added back to the lru
list at the cost of evicting shard 3.
lru list now -
                              2 -----> 1
refs on base inode -         (1)  +   (1) = 2
fsync list now -
                              1 -----> 2 (again order doesn't matter)
refs on base inode -         (1)  +   (1) = 2
Total refs = 2 + 2 = 4
After eviction, it is found that 3 needs fsync. So fsync is wound, yet to be
ack'd. So 3 is still inode_link()d.

Now deletion of shards 1 and 2 completes. The lru list is empty. The base
inode is unref'd and destroyed.
In the next batched deletion, 3 needs to be deleted. It is inode_resolve()able.
It is added back to the lru list, but the base inode passed to
__shard_update_shards_inode_list() is NULL since that inode is already
destroyed. However, ctx->base_inode still contains the stale base inode ptr
from the first addition to the lru list, with no additional ref taken on it.
lru list now -
                              3
refs on base inode -         (0)
Total refs on base inode = 0
Unlink is sent on 3. It completes. Now, since the ctx contains a ptr to the
base inode and the shard is part of the lru list, the base inode is unref'd,
leading to a crash.
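
(What makes the stale pointer fatal is the deletion-side cleanup. Roughly --
this is an illustrative paraphrase, not the verbatim shard.c cleanup code --
once the unlink of a shard completes, its ctx is taken off the lru list and
whatever ctx->base_inode points to is unref'd:)

    /* Paraphrased sketch of the cleanup run when a shard's unlink completes. */
    base_inode = ctx->base_inode;
    if (!list_empty(&ctx->ilist)) {
        list_del_init(&ctx->ilist);   /* shard leaves the lru list */
        priv->inode_count--;
    }
    if (base_inode)
        inode_unref(base_inode);      /* meant to drop the ref taken at
                                       * lru-add time; with a stale pointer
                                       * to an already destroyed inode this
                                       * is the fatal extra unref. */
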
FIX:
When a shard is re-added to the lru list, copy the base inode pointer into
its inode ctx as is, even if it is NULL. This is needed to prevent the
double unref at the time of deleting it.
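
(In code terms this is the one-line change visible in both hunks below.
inode_ref() simply returns NULL when handed a NULL inode, so assigning
unconditionally clears any stale pointer instead of preserving it:)

    /* Before: a NULL base_inode left whatever the ctx already held in place,
     * so a pointer to an already destroyed base inode could survive. */
    if (base_inode)
        ctx->base_inode = inode_ref(base_inode);

    /* After: always store exactly what was passed in. A shard re-added after
     * its base inode is gone ends up with ctx->base_inode == NULL, and no
     * unref is attempted on it at deletion time. */
    ctx->base_inode = inode_ref(base_inode);
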
Upstream patch:
> BUG: 1696136
> Upstream patch link: https://review.gluster.org/c/glusterfs/+/22517
> Change-Id: I99a44039da2e10a1aad183e84f644d63ca552462
> Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>

Change-Id: I99a44039da2e10a1aad183e84f644d63ca552462
Updates: bz#1694595
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/172803
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 .../bug-1696136-lru-limit-equals-deletion-rate.t   | 34 ++++++++++++++++++++++
 xlators/features/shard/src/shard.c                 |  6 ++--
 2 files changed, 36 insertions(+), 4 deletions(-)
 create mode 100644 tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t
diff --git a/tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t b/tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t
new file mode 100644
index 0000000..3e4a65a
--- /dev/null
+++ b/tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../fallocate.rc
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 features.shard on
+TEST $CLI volume set $V0 features.shard-block-size 4MB
+TEST $CLI volume set $V0 features.shard-lru-limit 120
+TEST $CLI volume set $V0 features.shard-deletion-rate 120
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume start $V0
+
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
+
+TEST build_tester $(dirname $0)/bug-1696136.c -lgfapi -Wall -O2
+
+# Create a file
+TEST touch $M0/file1
+
+# Fallocate a 512M file. This will make sure the number of participant shards is > lru-limit
+TEST $(dirname $0)/bug-1696136 $H0 $V0 "0" "0" "536870912" /file1 `gluster --print-logdir`/glfs-$V0.log
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+TEST $CLI volume stop $V0
+TEST $CLI volume delete $V0
+rm -f $(dirname $0)/bug-1696136
+
+cleanup
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
index 3c4bcdc..c1799ad 100644
--- a/xlators/features/shard/src/shard.c
+++ b/xlators/features/shard/src/shard.c
@@ -689,8 +689,7 @@ __shard_update_shards_inode_list(inode_t *linked_inode, xlator_t *this,
             ctx->block_num = block_num;
             list_add_tail(&ctx->ilist, &priv->ilist_head);
             priv->inode_count++;
-            if (base_inode)
-                ctx->base_inode = inode_ref(base_inode);
+            ctx->base_inode = inode_ref(base_inode);
         } else {
             /*If on the other hand there is no available slot for this inode
              * in the list, delete the lru inode from the head of the list,
@@ -765,8 +764,7 @@ __shard_update_shards_inode_list(inode_t *linked_inode, xlator_t *this,
             else
                 gf_uuid_copy(ctx->base_gfid, gfid);
             ctx->block_num = block_num;
-            if (base_inode)
-                ctx->base_inode = inode_ref(base_inode);
+            ctx->base_inode = inode_ref(base_inode);
             list_add_tail(&ctx->ilist, &priv->ilist_head);
         }
     } else {
-- 
1.8.3.1