|
|
12a457 |
From 0031be017dea4d908a8b4a34e809bec88e8b1acc Mon Sep 17 00:00:00 2001
|
|
|
12a457 |
From: Pranith Kumar K <pkarampu@redhat.com>
|
|
|
12a457 |
Date: Tue, 8 Mar 2016 23:05:08 +0530
|
|
|
12a457 |
Subject: [PATCH 66/80] cluster/ec: Do not ref dictionary in lookup
|
|
|
12a457 |
|
|
|
12a457 |
Problem:
|
|
|
12a457 |
1) dict_for_each loops over the elements without any locks, so the members of
|
|
|
12a457 |
the dictionary can be ref/unrefed while dict_for_each is executed by another
|
|
|
12a457 |
thread leading to crashes.
|
|
|
12a457 |
|
|
|
12a457 |
Basically with distributed ec + disctributed replicate as cold, hot tiers. tier
|
|
|
12a457 |
sends a lookup which fails on ec. (By this time dict already contains ec
|
|
|
12a457 |
xattrs) After this lookup_everywhere code path is hit in tier which triggers
|
|
|
12a457 |
lookup on each of distribute's hash lookup but fails which leads to the cold,
|
|
|
12a457 |
hot dht's lookup_everywhere in two parallel epoll threads where in ec when it
|
|
|
12a457 |
tries to set trusted.ec.version/dirty/size as keys in the dictionary, the older
|
|
|
12a457 |
values against the same key get erased. While this erasing is going on if the
|
|
|
12a457 |
thread that is doing lookup on afr's subvolume accesses these keys either in
|
|
|
12a457 |
dict_copy_with_ref or client xlator trying to serialize, that can either lead
|
|
|
12a457 |
to crash or hang based on if the spin/mutex lock is called on invalid memory.
|
|
|
12a457 |
|
|
|
12a457 |
2) EC deletes GF_CONTENT_KEY from the dictionary, this may lead to extra reads
|
|
|
12a457 |
in case of lookup-everwhere for tiered volumes.
|
|
|
12a457 |
|
|
|
12a457 |
Fix:
|
|
|
12a457 |
Do dict_copy_with_ref() for the lookup-dictionary.
|
|
|
12a457 |
This is avoiding the problem and is not actually fixing the 1st problem.
|
|
|
12a457 |
2nd problem will be fixed.
|
|
|
12a457 |
|
|
|
12a457 |
>Change-Id: I5427aa14c48cb7572977d4de9a28c5ffff2b4b95
|
|
|
12a457 |
>BUG: 1315560
|
|
|
12a457 |
>Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
|
|
|
12a457 |
>Reviewed-on: http://review.gluster.org/13680
|
|
|
12a457 |
>Smoke: Gluster Build System <jenkins@build.gluster.com>
|
|
|
12a457 |
>NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
|
|
|
12a457 |
>CentOS-regression: Gluster Build System <jenkins@build.gluster.com>
|
|
|
12a457 |
>Reviewed-by: Xavier Hernandez <xhernandez@datalab.es>
|
|
|
12a457 |
|
|
|
12a457 |
BUG: 1318428
|
|
|
12a457 |
Change-Id: I91401d978f7bb9d00b7a9c832526f79ad0a30532
|
|
|
12a457 |
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
|
|
|
12a457 |
Reviewed-on: https://code.engineering.redhat.com/gerrit/71972
|
|
|
12a457 |
---
|
|
|
12a457 |
xlators/cluster/ec/src/ec-generic.c | 11 ++++-------
|
|
|
12a457 |
1 files changed, 4 insertions(+), 7 deletions(-)
|
|
|
12a457 |
|
|
|
12a457 |
diff --git a/xlators/cluster/ec/src/ec-generic.c b/xlators/cluster/ec/src/ec-generic.c
|
|
|
12a457 |
index 3f5856e..375e49e 100644
|
|
|
12a457 |
--- a/xlators/cluster/ec/src/ec-generic.c
|
|
|
12a457 |
+++ b/xlators/cluster/ec/src/ec-generic.c
|
|
|
12a457 |
@@ -906,14 +906,11 @@ void ec_lookup(call_frame_t * frame, xlator_t * this, uintptr_t target,
|
|
|
12a457 |
}
|
|
|
12a457 |
}
|
|
|
12a457 |
if (xdata != NULL) {
|
|
|
12a457 |
- fop->xdata = dict_ref(xdata);
|
|
|
12a457 |
- if (fop->xdata == NULL) {
|
|
|
12a457 |
- gf_msg (this->name, GF_LOG_ERROR, 0,
|
|
|
12a457 |
- EC_MSG_DICT_REF_FAIL, "Failed to reference a "
|
|
|
12a457 |
- "dictionary.");
|
|
|
12a457 |
-
|
|
|
12a457 |
+ fop->xdata = dict_copy_with_ref (xdata, NULL);
|
|
|
12a457 |
+ /* Do not log failures here as a memory problem would have already
|
|
|
12a457 |
+ * been logged by the corresponding alloc functions */
|
|
|
12a457 |
+ if (fop->xdata == NULL)
|
|
|
12a457 |
goto out;
|
|
|
12a457 |
- }
|
|
|
12a457 |
}
|
|
|
12a457 |
|
|
|
12a457 |
error = 0;
|
|
|
12a457 |
--
|
|
|
12a457 |
1.7.1
|
|
|
12a457 |
|