From 4f0aa008ed393d7ce222c4ea4bd0fa6ed52b48f6 Mon Sep 17 00:00:00 2001
From: Krutika Dhananjay <kdhananj@redhat.com>
Date: Fri, 5 Apr 2019 12:29:23 +0530
Subject: [PATCH 177/178] features/shard: Fix extra unref when inode object is
 lru'd out and added back

Long tale of double unref! But do read...

In cases where a shard base inode is evicted from the lru list while
still being part of the fsync list, and is then added back shortly
before its unlink, there could be an extra inode_unref(), leading to
premature inode destruction and a crash.

One such specific case is the following -

Consider features.shard-deletion-rate = features.shard-lru-limit = 2.
This is an oversimplified example but explains the problem clearly.

First, a file is FALLOCATE'd to a size such that the number of shards
under /.shard = 3 > lru-limit.
Shards 1, 2 and 3 need to be resolved. 1 and 2 are resolved first.
Resultant lru list:
                               1 -----> 2
refs on base inode -          (1)  +   (1) = 2
3 needs to be resolved. So 1 is lru'd out. Resultant lru list -
                               2 -----> 3
refs on base inode -          (1)  +   (1) = 2

Note that 1 is inode_unlink()d but not destroyed because there are
non-zero refs on it since it is still participating in this ongoing
FALLOCATE operation.

FALLOCATE is sent on all participant shards. In the cbk, all of them
are added to the fsync list.
Resulting fsync list -
                               1 -----> 2 -----> 3 (order doesn't matter)
refs on base inode -          (1)  +   (1)  +   (1) = 3
Total refs = 3 + 2 = 5

Now an attempt is made to unlink this file. Background deletion is
triggered. The first $shard-deletion-rate shards need to be unlinked
in the first batch. So shards 1 and 2 need to be resolved.
inode_resolve() fails on 1 but succeeds on 2, so 2 is moved to the
tail of the list.
lru list now -
                              3 -----> 2
No change in refs.

Shard 1 is looked up. In lookup_cbk, it is linked and added back to
the lru list at the cost of evicting shard 3.
lru list now -
                              2 -----> 1
refs on base inode -         (1)  +   (1) = 2
fsync list now -
                              1 -----> 2 (again order doesn't matter)
refs on base inode -         (1)  +   (1) = 2
Total refs = 2 + 2 = 4
After the eviction, it is found that 3 needs an fsync. So an fsync is
wound on it, yet to be ack'd. So 3 is still inode_link()d.

Now deletion of shards 1 and 2 completes. The lru list is empty. The
base inode is unref'd and destroyed.
In the next batched deletion, 3 needs to be deleted. It is
inode_resolve()able. It is added back to the lru list, but the base
inode passed to __shard_update_shards_inode_list() is NULL since the
inode is destroyed. However, its ctx->base_inode still contains the
base inode ptr from the first addition to the lru list, with no
additional ref on it.
lru list now -
                              3
refs on base inode -         (0)
Total refs on base inode = 0
Unlink is sent on 3. It completes. Now, since the ctx contains a ptr
to the base inode and the shard is part of the lru list, the base
inode is unref'd, leading to a crash.
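
To see the double unref concretely, here is a minimal standalone C
sketch of the same pattern (obj_t, ctx_t, obj_ref() and obj_unref()
are invented stand-ins for inode_t, the shard inode ctx, inode_ref()
and inode_unref(); illustrative only, not GlusterFS code):

    #include <stdlib.h>

    typedef struct obj { int refs; } obj_t;
    typedef struct ctx { obj_t *base; } ctx_t; /* like ctx->base_inode */

    static obj_t *obj_ref(obj_t *o)
    {
        if (!o)
            return NULL; /* NULL in -> NULL out */
        o->refs++;
        return o;
    }

    static void obj_unref(obj_t *o)
    {
        if (!o)
            return;
        if (--o->refs == 0)
            free(o); /* "inode destruction" */
    }

    int main(void)
    {
        obj_t *base = calloc(1, sizeof(*base));
        ctx_t ctx = {0};

        /* First addition to the lru list: ctx takes a ref (refs = 1). */
        ctx.base = obj_ref(base);

        /* Deletion of the earlier batch drops the last ref; base is
         * destroyed, but ctx.base still holds the dangling pointer. */
        obj_unref(base);

        /* Re-add with a NULL base inode. The buggy code did
         *     if (new_base) ctx.base = obj_ref(new_base);
         * i.e. it kept the dangling pointer. The fixed code assigns
         * unconditionally, resetting ctx.base to NULL: */
        ctx.base = obj_ref(NULL);

        /* Deletion of the re-added shard: the symmetric unref is now
         * a harmless no-op instead of an unref on freed memory. */
        obj_unref(ctx.base);
        return 0;
    }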

FIX:
When a shard is re-added to the lru list, copy the base inode pointer
as-is into its inode ctx, even if it is NULL. This is needed to
prevent a double unref at the time of deleting it.
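
This works because inode_ref() is NULL-safe and returns NULL when
passed NULL (which is what lets the change below drop the
if (base_inode) guard): on re-add, ctx->base_inode is reset to NULL
rather than left holding the dangling pointer, so the later unref on
the deletion path becomes a no-op.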

Upstream patch:
> BUG: 1696136
> Upstream patch link: https://review.gluster.org/c/glusterfs/+/22517
> Change-Id: I99a44039da2e10a1aad183e84f644d63ca552462
> Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>

Change-Id: I99a44039da2e10a1aad183e84f644d63ca552462
Updates: bz#1694595
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/172803
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 .../bug-1696136-lru-limit-equals-deletion-rate.t   | 34 ++++++++++++++++++++++
 xlators/features/shard/src/shard.c                 |  6 ++--
 2 files changed, 36 insertions(+), 4 deletions(-)
 create mode 100644 tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t

diff --git a/tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t b/tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t
new file mode 100644
index 0000000..3e4a65a
--- /dev/null
+++ b/tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../fallocate.rc
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 features.shard on
+TEST $CLI volume set $V0 features.shard-block-size 4MB
+TEST $CLI volume set $V0 features.shard-lru-limit 120
+TEST $CLI volume set $V0 features.shard-deletion-rate 120
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume start $V0
+
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
+
+TEST build_tester $(dirname $0)/bug-1696136.c -lgfapi -Wall -O2
+
+# Create a file
+TEST touch $M0/file1
+
+# Fallocate a 512M file. This will make sure the number of participant shards is > lru-limit
+TEST $(dirname $0)/bug-1696136 $H0 $V0 "0" "0" "536870912" /file1 `gluster --print-logdir`/glfs-$V0.log
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+TEST $CLI volume stop $V0
+TEST $CLI volume delete $V0
+rm -f $(dirname $0)/bug-1696136
+
+cleanup
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
index 3c4bcdc..c1799ad 100644
--- a/xlators/features/shard/src/shard.c
+++ b/xlators/features/shard/src/shard.c
@@ -689,8 +689,7 @@ __shard_update_shards_inode_list(inode_t *linked_inode, xlator_t *this,
             ctx->block_num = block_num;
             list_add_tail(&ctx->ilist, &priv->ilist_head);
             priv->inode_count++;
-            if (base_inode)
-                ctx->base_inode = inode_ref(base_inode);
+            ctx->base_inode = inode_ref(base_inode);
         } else {
             /*If on the other hand there is no available slot for this inode
              * in the list, delete the lru inode from the head of the list,
@@ -765,8 +764,7 @@ __shard_update_shards_inode_list(inode_t *linked_inode, xlator_t *this,
             else
                 gf_uuid_copy(ctx->base_gfid, gfid);
             ctx->block_num = block_num;
-            if (base_inode)
-                ctx->base_inode = inode_ref(base_inode);
+            ctx->base_inode = inode_ref(base_inode);
             list_add_tail(&ctx->ilist, &priv->ilist_head);
         }
     } else {
-- 
1.8.3.1