d1681e
From 81b5fbe38a022b738aa817444d7564534e0a886e Mon Sep 17 00:00:00 2001
d1681e
From: Kotresh HR <khiremat@redhat.com>
d1681e
Date: Fri, 24 Aug 2018 08:30:54 -0400
d1681e
Subject: [PATCH 360/362] geo-rep: Make automatic gfid conflict resolution
d1681e
 optional
d1681e
d1681e
Automatic gfid conflict resolution needs to be disabled
d1681e
during failover/failback as it might lead to data loss
d1681e
in the following scenario.
d1681e
d1681e
1. Master went down without syncing directory "dir1" to slave.
d1681e
2. When slave is failed over to master, if a new file
d1681e
   is written inside "dir1", creating dir1 again if not
d1681e
   present, "dir1" ends up with different gfid on original
d1681e
   slave.
d1681e
3. When original master is up and failed back, due to
d1681e
   automatic gfid conflict resolution, "dir1" present in
d1681e
   original master is deleted losing all files and only
d1681e
   new file created on original slave is restored.
d1681e
d1681e
Hence during failover/failback, automatic gfid conflict
d1681e
resolution should be disabled. So in these cases, appropriate
d1681e
decision is taken.
d1681e
d1681e
Backport of:
d1681e
 > Patch: https://review.gluster.org/20986/
d1681e
 > BUG: 1622076
d1681e
 > Change-Id: I433616f5d3e13d4b6eb675475bd554ca34928573
d1681e
d1681e
BUG: 1622029
d1681e
Signed-off-by: Kotresh HR <khiremat@redhat.com>
d1681e
Change-Id: I433616f5d3e13d4b6eb675475bd554ca34928573
d1681e
Reviewed-on: https://code.engineering.redhat.com/gerrit/148004
d1681e
Tested-by: RHGS Build Bot <nigelb@redhat.com>
d1681e
Reviewed-by: Aravinda Vishwanathapura Krishna Murthy <avishwan@redhat.com>
d1681e
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
d1681e
---
d1681e
 geo-replication/syncdaemon/gsyncd.py |  2 ++
d1681e
 geo-replication/syncdaemon/master.py | 22 ++++++++++++----------
d1681e
 2 files changed, 14 insertions(+), 10 deletions(-)
d1681e
d1681e
diff --git a/geo-replication/syncdaemon/gsyncd.py b/geo-replication/syncdaemon/gsyncd.py
d1681e
index fff193b..23d588e 100644
d1681e
--- a/geo-replication/syncdaemon/gsyncd.py
d1681e
+++ b/geo-replication/syncdaemon/gsyncd.py
d1681e
@@ -323,6 +323,8 @@ def main_i():
d1681e
     op.add_option('--changelog-archive-format', metavar='N',
d1681e
                   type=str, default="%Y%m")
d1681e
     op.add_option('--use-meta-volume', default=False, action='store_true')
d1681e
+    op.add_option('--gfid-conflict-resolution', default=True,
d1681e
+                  action='store_true')
d1681e
     op.add_option('--meta-volume-mnt', metavar='N',
d1681e
                   type=str, default="/var/run/gluster/shared_storage")
d1681e
     op.add_option(
d1681e
diff --git a/geo-replication/syncdaemon/master.py b/geo-replication/syncdaemon/master.py
d1681e
index 1399378..6de2c77 100644
d1681e
--- a/geo-replication/syncdaemon/master.py
d1681e
+++ b/geo-replication/syncdaemon/master.py
d1681e
@@ -1163,16 +1163,18 @@ class GMasterChangelogMixin(GMasterCommon):
d1681e
             self.status.inc_value("entry", len(entries))
d1681e
 
d1681e
             failures = self.slave.server.entry_ops(entries)
d1681e
-            count = 0
d1681e
-            while failures and count < self.MAX_OE_RETRIES:
d1681e
-                count += 1
d1681e
-                self.handle_entry_failures(failures, entries)
d1681e
-                logging.info("Retry original entries. count = %s" % count)
d1681e
-                failures = self.slave.server.entry_ops(entries)
d1681e
-                if not failures:
d1681e
-                    logging.info("Sucessfully fixed all entry ops with gfid "
d1681e
-                                 "mismatch")
d1681e
-                    break
d1681e
+
d1681e
+            if boolify(gconf.gfid_conflict_resolution):
d1681e
+                count = 0
d1681e
+                while failures and count < self.MAX_OE_RETRIES:
d1681e
+                    count += 1
d1681e
+                    self.handle_entry_failures(failures, entries)
d1681e
+                    logging.info("Retry original entries. count = %s" % count)
d1681e
+                    failures = self.slave.server.entry_ops(entries)
d1681e
+                    if not failures:
d1681e
+                        logging.info("Sucessfully fixed all entry ops with "
d1681e
+                                     "gfid mismatch")
d1681e
+                        break
d1681e
 
d1681e
             self.log_failures(failures, 'gfid', gauxpfx(), 'ENTRY')
d1681e
 
d1681e
-- 
d1681e
1.8.3.1
d1681e