From 8295424dcd1c28e80fa576bc5ae0febc43f58744 Mon Sep 17 00:00:00 2001 From: Aravinda VK Date: Tue, 24 May 2016 14:13:29 +0530 Subject: [PATCH 177/178] geo-rep: Handle Worker kill gracefully if worker already died If the Agent dies for any reason, the monitor tries to kill the Worker as well. But if the Worker has already died, the kill command raises the error ESRCH: No such process. [2016-05-23 16:49:33.903965] I [monitor(monitor):326:monitor] Monitor: Changelog Agent died, Aborting Worker(/bricks/brick0/master_brick0) [2016-05-23 16:49:33.904535] E [syncdutils(monitor):276:log_raise_exception] : FAIL: Traceback (most recent call last): File "/usr/libexec/glusterfs/python/syncdaemon/syncdutils.py", line 306 in twrap tf(*aa) File "/usr/libexec/glusterfs/python/syncdaemon/monitor.py", line 393, in wmon slave_host, master) File "/usr/libexec/glusterfs/python/syncdaemon/monitor.py", line 327, in monitor os.kill(cpid, signal.SIGKILL) OSError: [Errno 3] No such process With this patch, the monitor gracefully handles the case where the Worker has already died. 
Change-Id: I3ae5f816a3a197343b64540cf46f5453167fb660 Signed-off-by: Aravinda VK BUG: 1339163 Reviewed-on: http://review.gluster.org/14512 Reviewed-on: http://review.gluster.org/14562 Reviewed-on: http://review.gluster.org/14563 Reviewed-on: https://code.engineering.redhat.com/gerrit/75474 Reviewed-by: Milind Changire Reviewed-by: Kotresh Hiremath Ravishankar --- geo-replication/syncdaemon/monitor.py | 18 +++++++++--------- 1 files changed, 9 insertions(+), 9 deletions(-) diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py index 2b570a9..050218b 100644 --- a/geo-replication/syncdaemon/monitor.py +++ b/geo-replication/syncdaemon/monitor.py @@ -18,11 +18,11 @@ import xml.etree.ElementTree as XET from subprocess import PIPE from resource import Popen, FILE, GLUSTER, SSH from threading import Lock -from errno import ECHILD +from errno import ECHILD, ESRCH import re import random from gconf import gconf -from syncdutils import select, waitpid +from syncdutils import select, waitpid, errno_wrap from syncdutils import set_term_handler, is_host_local, GsyncdError from syncdutils import escape, Thread, finalize, memoize @@ -187,7 +187,7 @@ class Monitor(object): # standard handler set_term_handler(lambda *a: set_term_handler()) # give a chance to graceful exit - os.kill(-os.getpid(), signal.SIGTERM) + errno_wrap(os.kill, [-os.getpid(), signal.SIGTERM], [ESRCH]) def monitor(self, w, argv, cpids, agents, slave_vol, slave_host, master): """the monitor loop @@ -324,7 +324,7 @@ class Monitor(object): # Agent is died Kill Worker logging.info("Changelog Agent died, " "Aborting Worker(%s)" % w[0]) - os.kill(cpid, signal.SIGKILL) + errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH]) nwait(cpid) nwait(apid) @@ -348,7 +348,7 @@ class Monitor(object): # Agent is died Kill Worker logging.info("Changelog Agent died, Aborting " "Worker(%s)" % w[0]) - os.kill(cpid, signal.SIGKILL) + errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH]) nwait(cpid) 
nwait(apid) break @@ -357,7 +357,7 @@ class Monitor(object): else: logging.info("worker(%s) not confirmed in %d sec, " "aborting it" % (w[0], conn_timeout)) - os.kill(cpid, signal.SIGKILL) + errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH]) nwait(apid) # wait for agent ret = nwait(cpid) if ret is None: @@ -394,9 +394,9 @@ class Monitor(object): time.sleep(1) self.lock.acquire() for cpid in cpids: - os.kill(cpid, signal.SIGKILL) + errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH]) for apid in agents: - os.kill(apid, signal.SIGKILL) + errno_wrap(os.kill, [apid, signal.SIGKILL], [ESRCH]) self.lock.release() finalize(exval=1) t = Thread(target=wmon, args=[wx]) @@ -464,7 +464,7 @@ def monitor(*resources): # yes, send SIGSTOP to negative of monitor pid # to go back to pause state. if gconf.pause_on_start: - os.kill(-os.getpid(), signal.SIGSTOP) + errno_wrap(os.kill, [-os.getpid(), signal.SIGSTOP], [ESRCH]) """oh yeah, actually Monitor is used as singleton, too""" return Monitor().multiplex(*distribute(*resources)) -- 1.7.1