From 8295424dcd1c28e80fa576bc5ae0febc43f58744 Mon Sep 17 00:00:00 2001
From: Aravinda VK <avishwan@redhat.com>
Date: Tue, 24 May 2016 14:13:29 +0530
Subject: [PATCH 177/178] geo-rep: Handle Worker kill gracefully if worker already died
If the Agent dies for any reason, the monitor tries to kill the Worker as
well. But if the worker has already died, the kill command raises the
error ESRCH: No such process.
[2016-05-23 16:49:33.903965] I [monitor(monitor):326:monitor] Monitor:
Changelog Agent died, Aborting Worker(/bricks/brick0/master_brick0)
[2016-05-23 16:49:33.904535] E [syncdutils(monitor):276:log_raise_exception]
<top>: FAIL:
Traceback (most recent call last):
File "/usr/libexec/glusterfs/python/syncdaemon/syncdutils.py", line 306, in
twrap
tf(*aa)
File "/usr/libexec/glusterfs/python/syncdaemon/monitor.py", line 393, in
wmon
slave_host, master)
File "/usr/libexec/glusterfs/python/syncdaemon/monitor.py", line 327, in
monitor
os.kill(cpid, signal.SIGKILL)
OSError: [Errno 3] No such process
With this patch, the monitor gracefully handles the case where the worker has already died.
Change-Id: I3ae5f816a3a197343b64540cf46f5453167fb660
Signed-off-by: Aravinda VK <avishwan@redhat.com>
BUG: 1339163
Reviewed-on: http://review.gluster.org/14512
Reviewed-on: http://review.gluster.org/14562
Reviewed-on: http://review.gluster.org/14563
Reviewed-on: https://code.engineering.redhat.com/gerrit/75474
Reviewed-by: Milind Changire <mchangir@redhat.com>
Reviewed-by: Kotresh Hiremath Ravishankar <khiremat@redhat.com>
---
geo-replication/syncdaemon/monitor.py | 18 +++++++++---------
1 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py
index 2b570a9..050218b 100644
--- a/geo-replication/syncdaemon/monitor.py
+++ b/geo-replication/syncdaemon/monitor.py
@@ -18,11 +18,11 @@ import xml.etree.ElementTree as XET
from subprocess import PIPE
from resource import Popen, FILE, GLUSTER, SSH
from threading import Lock
-from errno import ECHILD
+from errno import ECHILD, ESRCH
import re
import random
from gconf import gconf
-from syncdutils import select, waitpid
+from syncdutils import select, waitpid, errno_wrap
from syncdutils import set_term_handler, is_host_local, GsyncdError
from syncdutils import escape, Thread, finalize, memoize
@@ -187,7 +187,7 @@ class Monitor(object):
# standard handler
set_term_handler(lambda *a: set_term_handler())
# give a chance to graceful exit
- os.kill(-os.getpid(), signal.SIGTERM)
+ errno_wrap(os.kill, [-os.getpid(), signal.SIGTERM], [ESRCH])
def monitor(self, w, argv, cpids, agents, slave_vol, slave_host, master):
"""the monitor loop
@@ -324,7 +324,7 @@ class Monitor(object):
# Agent is died Kill Worker
logging.info("Changelog Agent died, "
"Aborting Worker(%s)" % w[0])
- os.kill(cpid, signal.SIGKILL)
+ errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
nwait(cpid)
nwait(apid)
@@ -348,7 +348,7 @@ class Monitor(object):
# Agent is died Kill Worker
logging.info("Changelog Agent died, Aborting "
"Worker(%s)" % w[0])
- os.kill(cpid, signal.SIGKILL)
+ errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
nwait(cpid)
nwait(apid)
break
@@ -357,7 +357,7 @@ class Monitor(object):
else:
logging.info("worker(%s) not confirmed in %d sec, "
"aborting it" % (w[0], conn_timeout))
- os.kill(cpid, signal.SIGKILL)
+ errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
nwait(apid) # wait for agent
ret = nwait(cpid)
if ret is None:
@@ -394,9 +394,9 @@ class Monitor(object):
time.sleep(1)
self.lock.acquire()
for cpid in cpids:
- os.kill(cpid, signal.SIGKILL)
+ errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
for apid in agents:
- os.kill(apid, signal.SIGKILL)
+ errno_wrap(os.kill, [apid, signal.SIGKILL], [ESRCH])
self.lock.release()
finalize(exval=1)
t = Thread(target=wmon, args=[wx])
@@ -464,7 +464,7 @@ def monitor(*resources):
# yes, send SIGSTOP to negative of monitor pid
# to go back to pause state.
if gconf.pause_on_start:
- os.kill(-os.getpid(), signal.SIGSTOP)
+ errno_wrap(os.kill, [-os.getpid(), signal.SIGSTOP], [ESRCH])
"""oh yeah, actually Monitor is used as singleton, too"""
return Monitor().multiplex(*distribute(*resources))
--
1.7.1