|
|
e3c68b |
From 4f0aa008ed393d7ce222c4ea4bd0fa6ed52b48f6 Mon Sep 17 00:00:00 2001
|
|
|
e3c68b |
From: Krutika Dhananjay <kdhananj@redhat.com>
|
|
|
e3c68b |
Date: Fri, 5 Apr 2019 12:29:23 +0530
|
|
|
e3c68b |
Subject: [PATCH 177/178] features/shard: Fix extra unref when inode object is
|
|
|
e3c68b |
lru'd out and added back
|
|
|
e3c68b |
|
|
|
e3c68b |
Long tale of double unref! But do read...
|
|
|
e3c68b |
|
|
|
e3c68b |
In cases where a shard base inode is evicted from lru list while still
|
|
|
e3c68b |
being part of fsync list but added back soon before its unlink, there
|
|
|
e3c68b |
could be an extra inode_unref() leading to premature inode destruction
|
|
|
e3c68b |
leading to crash.
|
|
|
e3c68b |
|
|
|
e3c68b |
One such specific case is the following -
|
|
|
e3c68b |
|
|
|
e3c68b |
Consider features.shard-deletion-rate = features.shard-lru-limit = 2.
|
|
|
e3c68b |
This is an oversimplified example but explains the problem clearly.
|
|
|
e3c68b |
|
|
|
e3c68b |
First, a file is FALLOCATE'd to a size so that number of shards under
|
|
|
e3c68b |
/.shard = 3 > lru-limit.
|
|
|
e3c68b |
Shards 1, 2 and 3 need to be resolved. 1 and 2 are resolved first.
|
|
|
e3c68b |
Resultant lru list:
|
|
|
e3c68b |
1 -----> 2
|
|
|
e3c68b |
refs on base inode - (1) + (1) = 2
|
|
|
e3c68b |
3 needs to be resolved. So 1 is lru'd out. Resultant lru list -
|
|
|
e3c68b |
2 -----> 3
|
|
|
e3c68b |
refs on base inode - (1) + (1) = 2
|
|
|
e3c68b |
|
|
|
e3c68b |
Note that 1 is inode_unlink()d but not destroyed because there are
|
|
|
e3c68b |
non-zero refs on it since it is still participating in this ongoing
|
|
|
e3c68b |
FALLOCATE operation.
|
|
|
e3c68b |
|
|
|
e3c68b |
FALLOCATE is sent on all participant shards. In the cbk, all of them are
|
|
|
e3c68b |
added to fync_list.
|
|
|
e3c68b |
Resulting fsync list -
|
|
|
e3c68b |
1 -----> 2 -----> 3 (order doesn't matter)
|
|
|
e3c68b |
refs on base inode - (1) + (1) + (1) = 3
|
|
|
e3c68b |
Total refs = 3 + 2 = 5
|
|
|
e3c68b |
|
|
|
e3c68b |
Now an attempt is made to unlink this file. Background deletion is triggered.
|
|
|
e3c68b |
The first $shard-deletion-rate shards need to be unlinked in the first batch.
|
|
|
e3c68b |
So shards 1 and 2 need to be resolved. inode_resolve fails on 1 but succeeds
|
|
|
e3c68b |
on 2 and so it's moved to tail of list.
|
|
|
e3c68b |
lru list now -
|
|
|
e3c68b |
3 -----> 2
|
|
|
e3c68b |
No change in refs.
|
|
|
e3c68b |
|
|
|
e3c68b |
shard 1 is looked up. In lookup_cbk, it's linked and added back to lru list
|
|
|
e3c68b |
at the cost of evicting shard 3.
|
|
|
e3c68b |
lru list now -
|
|
|
e3c68b |
2 -----> 1
|
|
|
e3c68b |
refs on base inode: (1) + (1) = 2
|
|
|
e3c68b |
fsync list now -
|
|
|
e3c68b |
1 -----> 2 (again order doesn't matter)
|
|
|
e3c68b |
refs on base inode - (1) + (1) = 2
|
|
|
e3c68b |
Total refs = 2 + 2 = 4
|
|
|
e3c68b |
After eviction, it is found 3 needs fsync. So fsync is wound, yet to be ack'd.
|
|
|
e3c68b |
So it is still inode_link()d.
|
|
|
e3c68b |
|
|
|
e3c68b |
Now deletion of shards 1 and 2 completes. lru list is empty. Base inode unref'd and
|
|
|
e3c68b |
destroyed.
|
|
|
e3c68b |
In the next batched deletion, 3 needs to be deleted. It is inode_resolve()able.
|
|
|
e3c68b |
It is added back to lru list but base inode passed to __shard_update_shards_inode_list()
|
|
|
e3c68b |
is NULL since the inode is destroyed. But its ctx->inode still contains base inode ptr
|
|
|
e3c68b |
from first addition to lru list for no additional ref on it.
|
|
|
e3c68b |
lru list now -
|
|
|
e3c68b |
3
|
|
|
e3c68b |
refs on base inode - (0)
|
|
|
e3c68b |
Total refs on base inode = 0
|
|
|
e3c68b |
Unlink is sent on 3. It completes. Now since the ctx contains ptr to base_inode and the
|
|
|
e3c68b |
shard is part of lru list, base shard is unref'd leading to a crash.
|
|
|
e3c68b |
|
|
|
e3c68b |
FIX:
|
|
|
e3c68b |
When shard is readded back to lru list, copy the base inode pointer as is into its inode ctx,
|
|
|
e3c68b |
even if it is NULL. This is needed to prevent double unrefs at the time of deleting it.
|
|
|
e3c68b |
|
|
|
e3c68b |
Upstream patch:
|
|
|
e3c68b |
> BUG: 1696136
|
|
|
e3c68b |
> Upstream patch link: https://review.gluster.org/c/glusterfs/+/22517
|
|
|
e3c68b |
> Change-Id: I99a44039da2e10a1aad183e84f644d63ca552462
|
|
|
e3c68b |
> Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
|
|
|
e3c68b |
|
|
|
e3c68b |
Change-Id: I99a44039da2e10a1aad183e84f644d63ca552462
|
|
|
e3c68b |
Updates: bz#1694595
|
|
|
e3c68b |
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
|
|
|
e3c68b |
Reviewed-on: https://code.engineering.redhat.com/gerrit/172803
|
|
|
e3c68b |
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
|
|
e3c68b |
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
|
|
|
e3c68b |
---
|
|
|
e3c68b |
.../bug-1696136-lru-limit-equals-deletion-rate.t | 34 ++++++++++++++++++++++
|
|
|
e3c68b |
xlators/features/shard/src/shard.c | 6 ++--
|
|
|
e3c68b |
2 files changed, 36 insertions(+), 4 deletions(-)
|
|
|
e3c68b |
create mode 100644 tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t
|
|
|
e3c68b |
|
|
|
e3c68b |
diff --git a/tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t b/tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t
|
|
|
e3c68b |
new file mode 100644
|
|
|
e3c68b |
index 0000000..3e4a65a
|
|
|
e3c68b |
--- /dev/null
|
|
|
e3c68b |
+++ b/tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t
|
|
|
e3c68b |
@@ -0,0 +1,34 @@
|
|
|
e3c68b |
+#!/bin/bash
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+. $(dirname $0)/../../include.rc
|
|
|
e3c68b |
+. $(dirname $0)/../../volume.rc
|
|
|
e3c68b |
+. $(dirname $0)/../../fallocate.rc
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+cleanup
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+TEST glusterd
|
|
|
e3c68b |
+TEST pidof glusterd
|
|
|
e3c68b |
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
|
|
|
e3c68b |
+TEST $CLI volume set $V0 features.shard on
|
|
|
e3c68b |
+TEST $CLI volume set $V0 features.shard-block-size 4MB
|
|
|
e3c68b |
+TEST $CLI volume set $V0 features.shard-lru-limit 120
|
|
|
e3c68b |
+TEST $CLI volume set $V0 features.shard-deletion-rate 120
|
|
|
e3c68b |
+TEST $CLI volume set $V0 performance.write-behind off
|
|
|
e3c68b |
+TEST $CLI volume start $V0
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+TEST build_tester $(dirname $0)/bug-1696136.c -lgfapi -Wall -O2
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+# Create a file
|
|
|
e3c68b |
+TEST touch $M0/file1
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+# Fallocate a 500M file. This will make sure number of participant shards are > lru-limit
|
|
|
e3c68b |
+TEST $(dirname $0)/bug-1696136 $H0 $V0 "0" "0" "536870912" /file1 `gluster --print-logdir`/glfs-$V0.log
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
|
|
|
e3c68b |
+TEST $CLI volume stop $V0
|
|
|
e3c68b |
+TEST $CLI volume delete $V0
|
|
|
e3c68b |
+rm -f $(dirname $0)/bug-1696136
|
|
|
e3c68b |
+
|
|
|
e3c68b |
+cleanup
|
|
|
e3c68b |
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
|
|
|
e3c68b |
index 3c4bcdc..c1799ad 100644
|
|
|
e3c68b |
--- a/xlators/features/shard/src/shard.c
|
|
|
e3c68b |
+++ b/xlators/features/shard/src/shard.c
|
|
|
e3c68b |
@@ -689,8 +689,7 @@ __shard_update_shards_inode_list(inode_t *linked_inode, xlator_t *this,
|
|
|
e3c68b |
ctx->block_num = block_num;
|
|
|
e3c68b |
list_add_tail(&ctx->ilist, &priv->ilist_head);
|
|
|
e3c68b |
priv->inode_count++;
|
|
|
e3c68b |
- if (base_inode)
|
|
|
e3c68b |
- ctx->base_inode = inode_ref(base_inode);
|
|
|
e3c68b |
+ ctx->base_inode = inode_ref(base_inode);
|
|
|
e3c68b |
} else {
|
|
|
e3c68b |
/*If on the other hand there is no available slot for this inode
|
|
|
e3c68b |
* in the list, delete the lru inode from the head of the list,
|
|
|
e3c68b |
@@ -765,8 +764,7 @@ __shard_update_shards_inode_list(inode_t *linked_inode, xlator_t *this,
|
|
|
e3c68b |
else
|
|
|
e3c68b |
gf_uuid_copy(ctx->base_gfid, gfid);
|
|
|
e3c68b |
ctx->block_num = block_num;
|
|
|
e3c68b |
- if (base_inode)
|
|
|
e3c68b |
- ctx->base_inode = inode_ref(base_inode);
|
|
|
e3c68b |
+ ctx->base_inode = inode_ref(base_inode);
|
|
|
e3c68b |
list_add_tail(&ctx->ilist, &priv->ilist_head);
|
|
|
e3c68b |
}
|
|
|
e3c68b |
} else {
|
|
|
e3c68b |
--
|
|
|
e3c68b |
1.8.3.1
|
|
|
e3c68b |
|