|
|
cb8e9e |
From 679e4ed49d453899e6700e3fc1f4fa1e39900793 Mon Sep 17 00:00:00 2001
|
|
|
cb8e9e |
From: Kotresh HR <khiremat@redhat.com>
|
|
|
cb8e9e |
Date: Fri, 3 Jul 2015 16:32:56 +0530
|
|
|
cb8e9e |
Subject: [PATCH 253/279] geo-rep: Fix history failure
|
|
|
cb8e9e |
|
|
|
cb8e9e |
Both ACTIVE and PASSIVE workers register to changelog
|
|
|
cb8e9e |
at almost the same time. When PASSIVE worker becomes ACTIVE,
|
|
|
cb8e9e |
the start and end time would be current stime and register_time
|
|
|
cb8e9e |
respectively for history API. Hence register_time would be less
|
|
|
cb8e9e |
than stime for which history obviously fails. But it will
|
|
|
cb8e9e |
be successful for the next restart as new register_time > stime.
|
|
|
cb8e9e |
|
|
|
cb8e9e |
Fix is to pass current time as the end time to history call
|
|
|
cb8e9e |
instead of the register_time.
|
|
|
cb8e9e |
|
|
|
cb8e9e |
Also improved the logging for ACTIVE/PASSIVE switching.
|
|
|
cb8e9e |
|
|
|
cb8e9e |
BUG: 1236546
|
|
|
cb8e9e |
Change-Id: I5d7f38200980914bd0b2ee34da4d2e6674b3a67e
|
|
|
cb8e9e |
Reviewed-on: http://review.gluster.org/11524
|
|
|
cb8e9e |
Reviewed-on: http://review.gluster.org/11784
|
|
|
cb8e9e |
Tested-by: Gluster Build System <jenkins@build.gluster.com>
|
|
|
cb8e9e |
Tested-by: NetBSD Build System <jenkins@build.gluster.org>
|
|
|
cb8e9e |
Reviewed-by: Aravinda VK <avishwan@redhat.com>
|
|
|
cb8e9e |
Reviewed-by: Venky Shankar <vshankar@redhat.com>
|
|
|
cb8e9e |
Reviewed-by: Milind Changire <mchangir@redhat.com>
|
|
|
cb8e9e |
Signed-off-by: Kotresh HR <khiremat@redhat.com>
|
|
|
cb8e9e |
Reviewed-on: https://code.engineering.redhat.com/gerrit/55051
|
|
|
cb8e9e |
Reviewed-by: Saravanakumar Arumugam <sarumuga@redhat.com>
|
|
|
cb8e9e |
Reviewed-by: Aravinda Vishwanathapura Krishna Murthy <avishwan@redhat.com>
|
|
|
cb8e9e |
Tested-by: Aravinda Vishwanathapura Krishna Murthy <avishwan@redhat.com>
|
|
|
cb8e9e |
---
|
|
|
cb8e9e |
geo-replication/syncdaemon/gconf.py | 6 ++++++
|
|
|
cb8e9e |
geo-replication/syncdaemon/master.py | 18 +++++++++++++-----
|
|
|
cb8e9e |
2 files changed, 19 insertions(+), 5 deletions(-)
|
|
|
cb8e9e |
|
|
|
cb8e9e |
diff --git a/geo-replication/syncdaemon/gconf.py b/geo-replication/syncdaemon/gconf.py
|
|
|
cb8e9e |
index 1fc7c38..39a70a6 100644
|
|
|
cb8e9e |
--- a/geo-replication/syncdaemon/gconf.py
|
|
|
cb8e9e |
+++ b/geo-replication/syncdaemon/gconf.py
|
|
|
cb8e9e |
@@ -21,5 +21,11 @@ class GConf(object):
|
|
|
cb8e9e |
log_exit = False
|
|
|
cb8e9e |
permanent_handles = []
|
|
|
cb8e9e |
log_metadata = {}
|
|
|
cb8e9e |
+ """One variable is sufficient to track the
|
|
|
cb8e9e |
+ switching of worker to ACTIVE. Two variables
|
|
|
cb8e9e |
+ are intentionally used to track worker going
|
|
|
cb8e9e |
+ to PASSIVE as well mainly for debugging"""
|
|
|
cb8e9e |
+ active_earlier = False
|
|
|
cb8e9e |
+ passive_earlier = False
|
|
|
cb8e9e |
|
|
|
cb8e9e |
gconf = GConf()
|
|
|
cb8e9e |
diff --git a/geo-replication/syncdaemon/master.py b/geo-replication/syncdaemon/master.py
|
|
|
cb8e9e |
index 5b8abc5..1bc2450 100644
|
|
|
cb8e9e |
--- a/geo-replication/syncdaemon/master.py
|
|
|
cb8e9e |
+++ b/geo-replication/syncdaemon/master.py
|
|
|
cb8e9e |
@@ -441,6 +441,7 @@ class GMasterCommon(object):
|
|
|
cb8e9e |
t.start()
|
|
|
cb8e9e |
|
|
|
cb8e9e |
def mgmt_lock(self):
|
|
|
cb8e9e |
+
|
|
|
cb8e9e |
"""Take management volume lock """
|
|
|
cb8e9e |
fd = None
|
|
|
cb8e9e |
bname = str(self.uuid) + "_" + str(gconf.slave_id) + "_subvol_" \
|
|
|
cb8e9e |
@@ -473,10 +474,16 @@ class GMasterCommon(object):
|
|
|
cb8e9e |
os.close(fd)
|
|
|
cb8e9e |
if isinstance(ex, IOError) and ex.errno in (EACCES, EAGAIN):
|
|
|
cb8e9e |
# cannot grab, it's taken
|
|
|
cb8e9e |
- logging.debug("Lock held by someother worker process")
|
|
|
cb8e9e |
+ if not gconf.passive_earlier:
|
|
|
cb8e9e |
+ gconf.passive_earlier = True
|
|
|
cb8e9e |
+ logging.info("Didn't get lock : %s : Becoming PASSIVE"
|
|
|
cb8e9e |
+ % gconf.local_path)
|
|
|
cb8e9e |
return False
|
|
|
cb8e9e |
raise
|
|
|
cb8e9e |
- logging.debug("Got the lock")
|
|
|
cb8e9e |
+
|
|
|
cb8e9e |
+ if not gconf.active_earlier:
|
|
|
cb8e9e |
+ gconf.active_earlier = True
|
|
|
cb8e9e |
+ logging.info("Got lock : %s : Becoming ACTIVE" % gconf.local_path)
|
|
|
cb8e9e |
return True
|
|
|
cb8e9e |
|
|
|
cb8e9e |
def should_crawl(self):
|
|
|
cb8e9e |
@@ -1123,8 +1130,9 @@ class GMasterChangeloghistoryMixin(GMasterChangelogMixin):
|
|
|
cb8e9e |
self.status.set_worker_crawl_status("History Crawl")
|
|
|
cb8e9e |
purge_time = self.get_purge_time()
|
|
|
cb8e9e |
|
|
|
cb8e9e |
- logging.info('starting history crawl... turns: %s, stime: %s'
|
|
|
cb8e9e |
- % (self.history_turns, repr(purge_time)))
|
|
|
cb8e9e |
+ end_time = int(time.time())
|
|
|
cb8e9e |
+ logging.info('starting history crawl... turns: %s, stime: %s, etime: %s'
|
|
|
cb8e9e |
+ % (self.history_turns, repr(purge_time), repr(end_time)))
|
|
|
cb8e9e |
|
|
|
cb8e9e |
if not purge_time or purge_time == URXTIME:
|
|
|
cb8e9e |
logging.info("stime not available, abandoning history crawl")
|
|
|
cb8e9e |
@@ -1138,7 +1146,7 @@ class GMasterChangeloghistoryMixin(GMasterChangelogMixin):
|
|
|
cb8e9e |
ret, actual_end = self.changelog_agent.history(
|
|
|
cb8e9e |
changelog_path,
|
|
|
cb8e9e |
purge_time[0],
|
|
|
cb8e9e |
- self.changelog_register_time,
|
|
|
cb8e9e |
+ end_time,
|
|
|
cb8e9e |
int(gconf.sync_jobs))
|
|
|
cb8e9e |
|
|
|
cb8e9e |
# scan followed by getchanges till scan returns zero.
|
|
|
cb8e9e |
--
|
|
|
cb8e9e |
1.7.1
|
|
|
cb8e9e |
|