Blob Blame History Raw
From 679e4ed49d453899e6700e3fc1f4fa1e39900793 Mon Sep 17 00:00:00 2001
From: Kotresh HR <khiremat@redhat.com>
Date: Fri, 3 Jul 2015 16:32:56 +0530
Subject: [PATCH 253/279] geo-rep: Fix history failure

Both ACTIVE and PASSIVE workers register to changelog
at almost the same time. When a PASSIVE worker becomes ACTIVE,
the start and end times passed to the history API would be the
current stime and register_time respectively. Hence register_time
would be less than stime, for which history obviously fails. But it
will succeed on the next restart, as the new register_time > stime.

Fix is to pass current time as the end time to history call
instead of the register_time.

Also improved the logging for ACTIVE/PASSIVE switching.

BUG: 1236546
Change-Id: I5d7f38200980914bd0b2ee34da4d2e6674b3a67e
Reviewed-on: http://review.gluster.org/11524
Reviewed-on: http://review.gluster.org/11784
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Tested-by: NetBSD Build System <jenkins@build.gluster.org>
Reviewed-by: Aravinda VK <avishwan@redhat.com>
Reviewed-by: Venky Shankar <vshankar@redhat.com>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Kotresh HR <khiremat@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/55051
Reviewed-by: Saravanakumar Arumugam <sarumuga@redhat.com>
Reviewed-by: Aravinda Vishwanathapura Krishna Murthy <avishwan@redhat.com>
Tested-by: Aravinda Vishwanathapura Krishna Murthy <avishwan@redhat.com>
---
 geo-replication/syncdaemon/gconf.py  |    6 ++++++
 geo-replication/syncdaemon/master.py |   18 +++++++++++++-----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/geo-replication/syncdaemon/gconf.py b/geo-replication/syncdaemon/gconf.py
index 1fc7c38..39a70a6 100644
--- a/geo-replication/syncdaemon/gconf.py
+++ b/geo-replication/syncdaemon/gconf.py
@@ -21,5 +21,11 @@ class GConf(object):
     log_exit = False
     permanent_handles = []
     log_metadata = {}
+    """One variable is sufficient to track the
+       switching of worker to ACTIVE. Two variables
+       are intentionally used to track worker going
+       to PASSIVE as well mainly for debugging"""
+    active_earlier = False
+    passive_earlier = False
 
 gconf = GConf()
diff --git a/geo-replication/syncdaemon/master.py b/geo-replication/syncdaemon/master.py
index 5b8abc5..1bc2450 100644
--- a/geo-replication/syncdaemon/master.py
+++ b/geo-replication/syncdaemon/master.py
@@ -441,6 +441,7 @@ class GMasterCommon(object):
             t.start()
 
     def mgmt_lock(self):
+
         """Take management volume lock """
         fd = None
         bname = str(self.uuid) + "_" + str(gconf.slave_id) + "_subvol_" \
@@ -473,10 +474,16 @@ class GMasterCommon(object):
                 os.close(fd)
             if isinstance(ex, IOError) and ex.errno in (EACCES, EAGAIN):
                 # cannot grab, it's taken
-                logging.debug("Lock held by someother worker process")
+                if not gconf.passive_earlier:
+                    gconf.passive_earlier = True
+                    logging.info("Didn't get lock : %s : Becoming PASSIVE"
+                                 % gconf.local_path)
                 return False
             raise
-        logging.debug("Got the lock")
+
+        if not gconf.active_earlier:
+            gconf.active_earlier = True
+            logging.info("Got lock : %s : Becoming ACTIVE" % gconf.local_path)
         return True
 
     def should_crawl(self):
@@ -1123,8 +1130,9 @@ class GMasterChangeloghistoryMixin(GMasterChangelogMixin):
         self.status.set_worker_crawl_status("History Crawl")
         purge_time = self.get_purge_time()
 
-        logging.info('starting history crawl... turns: %s, stime: %s'
-                     % (self.history_turns, repr(purge_time)))
+        end_time = int(time.time())
+        logging.info('starting history crawl... turns: %s, stime: %s, etime: %s'
+                     % (self.history_turns, repr(purge_time), repr(end_time)))
 
         if not purge_time or purge_time == URXTIME:
             logging.info("stime not available, abandoning history crawl")
@@ -1138,7 +1146,7 @@ class GMasterChangeloghistoryMixin(GMasterChangelogMixin):
         ret, actual_end = self.changelog_agent.history(
             changelog_path,
             purge_time[0],
-            self.changelog_register_time,
+            end_time,
             int(gconf.sync_jobs))
 
         # scan followed by getchanges till scan returns zero.
-- 
1.7.1