From f2d3866e617d25ea62cda01afddc81ef0db3356e Mon Sep 17 00:00:00 2001
From: Xavi Hernandez
Date: Tue, 4 May 2021 22:39:03 +0200
Subject: [PATCH 555/584] geo-rep: Improve handling of gfid mismatches

In some circumstances geo-replication can detect mismatching gfids
between primary and secondary. These entries are fixed in an iterative
way, assuming that after a fix, a previously failing entry could
succeed.

Previous code was trying to fix them in a loop that can be executed
up to 10 times. If some entry cannot be fixed after 10 attempts, it's
discarded. These fixes are very slow, so trying to do them many times
causes geo-replication to get out of sync.

To minimize the number of iterations done, this patch checks if the
number of entries and failures remains constant after each iteration.
If they are constant, it means that nothing else can be fixed, so it
makes no sense to do more iterations. This reduces the number of
iterations to 2 or 3 in most of the cases, improving geo-replication
performance.

Backport of: > Upstream-patch: https://github.com/gluster/glusterfs/pull/2389 > Fixes: #2388 > Change-Id: I6d9a623a60045694e1a832195e1dc1fb9e88ae54 > Signed-off-by: Xavi Hernandez BUG: 1957191 Change-Id: I6d9a623a60045694e1a832195e1dc1fb9e88ae54 Signed-off-by: Xavi Hernandez Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244550 Tested-by: RHGS Build Bot Reviewed-by: Sunil Kumar Heggodu Gopala Acharya --- geo-replication/syncdaemon/master.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/geo-replication/syncdaemon/master.py b/geo-replication/syncdaemon/master.py index 98637e7..aef9373 100644 --- a/geo-replication/syncdaemon/master.py +++ b/geo-replication/syncdaemon/master.py @@ -1224,9 +1224,11 @@ class GMasterChangelogMixin(GMasterCommon): if gconf.get("gfid-conflict-resolution"): count = 0 + num_entries = len(entries) + num_failures = len(failures) if failures: logging.info(lf('Entry ops failed with gfid mismatch', - count=len(failures))) + count=num_failures)) while failures and count < self.MAX_OE_RETRIES: count += 1 self.handle_entry_failures(failures, entries) @@ -1237,6 +1239,20 @@ class GMasterChangelogMixin(GMasterCommon): "gfid mismatch") break + # If this iteration has not removed any entry or reduced + # the number of failures compared to the previous one, we + # don't need to keep iterating because we'll get the same + # result in all other attempts. + if ((num_entries == len(entries)) and + (num_failures == len(failures))): + logging.info(lf("No more gfid mismatches can be fixed", + entries=num_entries, + failures=num_failures)) + break + + num_entries = len(entries) + num_failures = len(failures) + self.log_failures(failures, 'gfid', gauxpfx(), 'ENTRY') self.status.dec_value("entry", len(entries)) -- 1.8.3.1