cb8e9e
From 679e4ed49d453899e6700e3fc1f4fa1e39900793 Mon Sep 17 00:00:00 2001
cb8e9e
From: Kotresh HR <khiremat@redhat.com>
cb8e9e
Date: Fri, 3 Jul 2015 16:32:56 +0530
cb8e9e
Subject: [PATCH 253/279] geo-rep: Fix history failure
cb8e9e
cb8e9e
Both ACTIVE and PASSIVE workers register to changelog
cb8e9e
at almost the same time. When a PASSIVE worker becomes ACTIVE,
cb8e9e
the start and end time would be current stime and register_time
cb8e9e
respectively for history API. Hence register_time would be less
cb8e9e
than stime, for which history obviously fails. But it will
cb8e9e
be successful for the next restart as new register_time > stime.
cb8e9e
cb8e9e
Fix is to pass current time as the end time to history call
cb8e9e
instead of the register_time.
cb8e9e
cb8e9e
Also improved the logging for ACTIVE/PASSIVE switching.
cb8e9e
cb8e9e
BUG: 1236546
cb8e9e
Change-Id: I5d7f38200980914bd0b2ee34da4d2e6674b3a67e
cb8e9e
Reviewed-on: http://review.gluster.org/11524
cb8e9e
Reviewed-on: http://review.gluster.org/11784
cb8e9e
Tested-by: Gluster Build System <jenkins@build.gluster.com>
cb8e9e
Tested-by: NetBSD Build System <jenkins@build.gluster.org>
cb8e9e
Reviewed-by: Aravinda VK <avishwan@redhat.com>
cb8e9e
Reviewed-by: Venky Shankar <vshankar@redhat.com>
cb8e9e
Reviewed-by: Milind Changire <mchangir@redhat.com>
cb8e9e
Signed-off-by: Kotresh HR <khiremat@redhat.com>
cb8e9e
Reviewed-on: https://code.engineering.redhat.com/gerrit/55051
cb8e9e
Reviewed-by: Saravanakumar Arumugam <sarumuga@redhat.com>
cb8e9e
Reviewed-by: Aravinda Vishwanathapura Krishna Murthy <avishwan@redhat.com>
cb8e9e
Tested-by: Aravinda Vishwanathapura Krishna Murthy <avishwan@redhat.com>
cb8e9e
---
cb8e9e
 geo-replication/syncdaemon/gconf.py  |    6 ++++++
cb8e9e
 geo-replication/syncdaemon/master.py |   18 +++++++++++++-----
cb8e9e
 2 files changed, 19 insertions(+), 5 deletions(-)
cb8e9e
cb8e9e
diff --git a/geo-replication/syncdaemon/gconf.py b/geo-replication/syncdaemon/gconf.py
cb8e9e
index 1fc7c38..39a70a6 100644
cb8e9e
--- a/geo-replication/syncdaemon/gconf.py
cb8e9e
+++ b/geo-replication/syncdaemon/gconf.py
cb8e9e
@@ -21,5 +21,11 @@ class GConf(object):
cb8e9e
     log_exit = False
cb8e9e
     permanent_handles = []
cb8e9e
     log_metadata = {}
cb8e9e
+    """One variable is sufficient to track the
cb8e9e
+       switching of worker to ACTIVE. Two variables
cb8e9e
+       are intentionally used to track worker going
cb8e9e
+       to PASSIVE as well mainly for debugging"""
cb8e9e
+    active_earlier = False
cb8e9e
+    passive_earlier = False
cb8e9e
 
cb8e9e
 gconf = GConf()
cb8e9e
diff --git a/geo-replication/syncdaemon/master.py b/geo-replication/syncdaemon/master.py
cb8e9e
index 5b8abc5..1bc2450 100644
cb8e9e
--- a/geo-replication/syncdaemon/master.py
cb8e9e
+++ b/geo-replication/syncdaemon/master.py
cb8e9e
@@ -441,6 +441,7 @@ class GMasterCommon(object):
cb8e9e
             t.start()
cb8e9e
 
cb8e9e
     def mgmt_lock(self):
cb8e9e
+
cb8e9e
         """Take management volume lock """
cb8e9e
         fd = None
cb8e9e
         bname = str(self.uuid) + "_" + str(gconf.slave_id) + "_subvol_" \
cb8e9e
@@ -473,10 +474,16 @@ class GMasterCommon(object):
cb8e9e
                 os.close(fd)
cb8e9e
             if isinstance(ex, IOError) and ex.errno in (EACCES, EAGAIN):
cb8e9e
                 # cannot grab, it's taken
cb8e9e
-                logging.debug("Lock held by someother worker process")
cb8e9e
+                if not gconf.passive_earlier:
cb8e9e
+                    gconf.passive_earlier = True
cb8e9e
+                    logging.info("Didn't get lock : %s : Becoming PASSIVE"
cb8e9e
+                                 % gconf.local_path)
cb8e9e
                 return False
cb8e9e
             raise
cb8e9e
-        logging.debug("Got the lock")
cb8e9e
+
cb8e9e
+        if not gconf.active_earlier:
cb8e9e
+            gconf.active_earlier = True
cb8e9e
+            logging.info("Got lock : %s : Becoming ACTIVE" % gconf.local_path)
cb8e9e
         return True
cb8e9e
 
cb8e9e
     def should_crawl(self):
cb8e9e
@@ -1123,8 +1130,9 @@ class GMasterChangeloghistoryMixin(GMasterChangelogMixin):
cb8e9e
         self.status.set_worker_crawl_status("History Crawl")
cb8e9e
         purge_time = self.get_purge_time()
cb8e9e
 
cb8e9e
-        logging.info('starting history crawl... turns: %s, stime: %s'
cb8e9e
-                     % (self.history_turns, repr(purge_time)))
cb8e9e
+        end_time = int(time.time())
cb8e9e
+        logging.info('starting history crawl... turns: %s, stime: %s, etime: %s'
cb8e9e
+                     % (self.history_turns, repr(purge_time), repr(end_time)))
cb8e9e
 
cb8e9e
         if not purge_time or purge_time == URXTIME:
cb8e9e
             logging.info("stime not available, abandoning history crawl")
cb8e9e
@@ -1138,7 +1146,7 @@ class GMasterChangeloghistoryMixin(GMasterChangelogMixin):
cb8e9e
         ret, actual_end = self.changelog_agent.history(
cb8e9e
             changelog_path,
cb8e9e
             purge_time[0],
cb8e9e
-            self.changelog_register_time,
cb8e9e
+            end_time,
cb8e9e
             int(gconf.sync_jobs))
cb8e9e
 
cb8e9e
         # scan followed by getchanges till scan returns zero.
cb8e9e
-- 
cb8e9e
1.7.1
cb8e9e