Blob Blame History Raw
From 8295424dcd1c28e80fa576bc5ae0febc43f58744 Mon Sep 17 00:00:00 2001
From: Aravinda VK <avishwan@redhat.com>
Date: Tue, 24 May 2016 14:13:29 +0530
Subject: [PATCH 177/178] geo-rep: Handle Worker kill gracefully if worker already died

If the Agent dies for any reason, the monitor tries to kill the Worker
as well. But if the Worker has already died, the os.kill call raises
OSError with errno ESRCH (No such process).

[2016-05-23 16:49:33.903965] I [monitor(monitor):326:monitor] Monitor:
    Changelog Agent died, Aborting Worker(/bricks/brick0/master_brick0)
[2016-05-23 16:49:33.904535] E [syncdutils(monitor):276:log_raise_exception]
    <top>: FAIL:
Traceback (most recent call last):
  File "/usr/libexec/glusterfs/python/syncdaemon/syncdutils.py", line 306 in
  twrap
    tf(*aa)
  File "/usr/libexec/glusterfs/python/syncdaemon/monitor.py", line 393, in
  wmon
     slave_host, master)
  File "/usr/libexec/glusterfs/python/syncdaemon/monitor.py", line 327, in
  monitor
     os.kill(cpid, signal.SIGKILL)
     OSError: [Errno 3] No such process

With this patch, the monitor gracefully handles the case where the
worker has already died.

Change-Id: I3ae5f816a3a197343b64540cf46f5453167fb660
Signed-off-by: Aravinda VK <avishwan@redhat.com>
BUG: 1339163
Reviewed-on: http://review.gluster.org/14512
Reviewed-on: http://review.gluster.org/14562
Reviewed-on: http://review.gluster.org/14563
Reviewed-on: https://code.engineering.redhat.com/gerrit/75474
Reviewed-by: Milind Changire <mchangir@redhat.com>
Reviewed-by: Kotresh Hiremath Ravishankar <khiremat@redhat.com>
---
 geo-replication/syncdaemon/monitor.py |   18 +++++++++---------
 1 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py
index 2b570a9..050218b 100644
--- a/geo-replication/syncdaemon/monitor.py
+++ b/geo-replication/syncdaemon/monitor.py
@@ -18,11 +18,11 @@ import xml.etree.ElementTree as XET
 from subprocess import PIPE
 from resource import Popen, FILE, GLUSTER, SSH
 from threading import Lock
-from errno import ECHILD
+from errno import ECHILD, ESRCH
 import re
 import random
 from gconf import gconf
-from syncdutils import select, waitpid
+from syncdutils import select, waitpid, errno_wrap
 from syncdutils import set_term_handler, is_host_local, GsyncdError
 from syncdutils import escape, Thread, finalize, memoize
 
@@ -187,7 +187,7 @@ class Monitor(object):
         # standard handler
         set_term_handler(lambda *a: set_term_handler())
         # give a chance to graceful exit
-        os.kill(-os.getpid(), signal.SIGTERM)
+        errno_wrap(os.kill, [-os.getpid(), signal.SIGTERM], [ESRCH])
 
     def monitor(self, w, argv, cpids, agents, slave_vol, slave_host, master):
         """the monitor loop
@@ -324,7 +324,7 @@ class Monitor(object):
                     # Agent is died Kill Worker
                     logging.info("Changelog Agent died, "
                                  "Aborting Worker(%s)" % w[0])
-                    os.kill(cpid, signal.SIGKILL)
+                    errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                     nwait(cpid)
                     nwait(apid)
 
@@ -348,7 +348,7 @@ class Monitor(object):
                             # Agent is died Kill Worker
                             logging.info("Changelog Agent died, Aborting "
                                          "Worker(%s)" % w[0])
-                            os.kill(cpid, signal.SIGKILL)
+                            errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                             nwait(cpid)
                             nwait(apid)
                             break
@@ -357,7 +357,7 @@ class Monitor(object):
             else:
                 logging.info("worker(%s) not confirmed in %d sec, "
                              "aborting it" % (w[0], conn_timeout))
-                os.kill(cpid, signal.SIGKILL)
+                errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                 nwait(apid)  # wait for agent
                 ret = nwait(cpid)
             if ret is None:
@@ -394,9 +394,9 @@ class Monitor(object):
                 time.sleep(1)
                 self.lock.acquire()
                 for cpid in cpids:
-                    os.kill(cpid, signal.SIGKILL)
+                    errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                 for apid in agents:
-                    os.kill(apid, signal.SIGKILL)
+                    errno_wrap(os.kill, [apid, signal.SIGKILL], [ESRCH])
                 self.lock.release()
                 finalize(exval=1)
             t = Thread(target=wmon, args=[wx])
@@ -464,7 +464,7 @@ def monitor(*resources):
     # yes, send SIGSTOP to negative of monitor pid
     # to go back to pause state.
     if gconf.pause_on_start:
-        os.kill(-os.getpid(), signal.SIGSTOP)
+        errno_wrap(os.kill, [-os.getpid(), signal.SIGSTOP], [ESRCH])
 
     """oh yeah, actually Monitor is used as singleton, too"""
     return Monitor().multiplex(*distribute(*resources))
-- 
1.7.1