|
|
12a457 |
From 8295424dcd1c28e80fa576bc5ae0febc43f58744 Mon Sep 17 00:00:00 2001
|
|
|
12a457 |
From: Aravinda VK <avishwan@redhat.com>
|
|
|
12a457 |
Date: Tue, 24 May 2016 14:13:29 +0530
|
|
|
12a457 |
Subject: [PATCH 177/178] geo-rep: Handle Worker kill gracefully if worker already died
|
|
|
12a457 |
|
|
|
12a457 |
If the Agent dies for any reason, the monitor also tries to kill the Worker. But
|
|
|
12a457 |
if the worker has also died, the kill command raises error ESRCH: No such
|
|
|
12a457 |
process.
|
|
|
12a457 |
|
|
|
12a457 |
[2016-05-23 16:49:33.903965] I [monitor(monitor):326:monitor] Monitor:
|
|
|
12a457 |
Changelog Agent died, Aborting Worker(/bricks/brick0/master_brick0)
|
|
|
12a457 |
[2016-05-23 16:49:33.904535] E [syncdutils(monitor):276:log_raise_exception]
|
|
|
12a457 |
<top>: FAIL:
|
|
|
12a457 |
Traceback (most recent call last):
|
|
|
12a457 |
File "/usr/libexec/glusterfs/python/syncdaemon/syncdutils.py", line 306 in
|
|
|
12a457 |
twrap
|
|
|
12a457 |
tf(*aa)
|
|
|
12a457 |
File "/usr/libexec/glusterfs/python/syncdaemon/monitor.py", line 393, in
|
|
|
12a457 |
wmon
|
|
|
12a457 |
slave_host, master)
|
|
|
12a457 |
File "/usr/libexec/glusterfs/python/syncdaemon/monitor.py", line 327, in
|
|
|
12a457 |
monitor
|
|
|
12a457 |
os.kill(cpid, signal.SIGKILL)
|
|
|
12a457 |
OSError: [Errno 3] No such process
|
|
|
12a457 |
|
|
|
12a457 |
With this patch, the monitor gracefully handles the case where the worker has already died.
|
|
|
12a457 |
|
|
|
12a457 |
Change-Id: I3ae5f816a3a197343b64540cf46f5453167fb660
|
|
|
12a457 |
Signed-off-by: Aravinda VK <avishwan@redhat.com>
|
|
|
12a457 |
BUG: 1339163
|
|
|
12a457 |
Reviewed-on: http://review.gluster.org/14512
|
|
|
12a457 |
Reviewed-on: http://review.gluster.org/14562
|
|
|
12a457 |
Reviewed-on: http://review.gluster.org/14563
|
|
|
12a457 |
Reviewed-on: https://code.engineering.redhat.com/gerrit/75474
|
|
|
12a457 |
Reviewed-by: Milind Changire <mchangir@redhat.com>
|
|
|
12a457 |
Reviewed-by: Kotresh Hiremath Ravishankar <khiremat@redhat.com>
|
|
|
12a457 |
---
|
|
|
12a457 |
geo-replication/syncdaemon/monitor.py | 18 +++++++++---------
|
|
|
12a457 |
1 files changed, 9 insertions(+), 9 deletions(-)
|
|
|
12a457 |
|
|
|
12a457 |
diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py
|
|
|
12a457 |
index 2b570a9..050218b 100644
|
|
|
12a457 |
--- a/geo-replication/syncdaemon/monitor.py
|
|
|
12a457 |
+++ b/geo-replication/syncdaemon/monitor.py
|
|
|
12a457 |
@@ -18,11 +18,11 @@ import xml.etree.ElementTree as XET
|
|
|
12a457 |
from subprocess import PIPE
|
|
|
12a457 |
from resource import Popen, FILE, GLUSTER, SSH
|
|
|
12a457 |
from threading import Lock
|
|
|
12a457 |
-from errno import ECHILD
|
|
|
12a457 |
+from errno import ECHILD, ESRCH
|
|
|
12a457 |
import re
|
|
|
12a457 |
import random
|
|
|
12a457 |
from gconf import gconf
|
|
|
12a457 |
-from syncdutils import select, waitpid
|
|
|
12a457 |
+from syncdutils import select, waitpid, errno_wrap
|
|
|
12a457 |
from syncdutils import set_term_handler, is_host_local, GsyncdError
|
|
|
12a457 |
from syncdutils import escape, Thread, finalize, memoize
|
|
|
12a457 |
|
|
|
12a457 |
@@ -187,7 +187,7 @@ class Monitor(object):
|
|
|
12a457 |
# standard handler
|
|
|
12a457 |
set_term_handler(lambda *a: set_term_handler())
|
|
|
12a457 |
# give a chance to graceful exit
|
|
|
12a457 |
- os.kill(-os.getpid(), signal.SIGTERM)
|
|
|
12a457 |
+ errno_wrap(os.kill, [-os.getpid(), signal.SIGTERM], [ESRCH])
|
|
|
12a457 |
|
|
|
12a457 |
def monitor(self, w, argv, cpids, agents, slave_vol, slave_host, master):
|
|
|
12a457 |
"""the monitor loop
|
|
|
12a457 |
@@ -324,7 +324,7 @@ class Monitor(object):
|
|
|
12a457 |
# Agent is died Kill Worker
|
|
|
12a457 |
logging.info("Changelog Agent died, "
|
|
|
12a457 |
"Aborting Worker(%s)" % w[0])
|
|
|
12a457 |
- os.kill(cpid, signal.SIGKILL)
|
|
|
12a457 |
+ errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
|
|
|
12a457 |
nwait(cpid)
|
|
|
12a457 |
nwait(apid)
|
|
|
12a457 |
|
|
|
12a457 |
@@ -348,7 +348,7 @@ class Monitor(object):
|
|
|
12a457 |
# Agent is died Kill Worker
|
|
|
12a457 |
logging.info("Changelog Agent died, Aborting "
|
|
|
12a457 |
"Worker(%s)" % w[0])
|
|
|
12a457 |
- os.kill(cpid, signal.SIGKILL)
|
|
|
12a457 |
+ errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
|
|
|
12a457 |
nwait(cpid)
|
|
|
12a457 |
nwait(apid)
|
|
|
12a457 |
break
|
|
|
12a457 |
@@ -357,7 +357,7 @@ class Monitor(object):
|
|
|
12a457 |
else:
|
|
|
12a457 |
logging.info("worker(%s) not confirmed in %d sec, "
|
|
|
12a457 |
"aborting it" % (w[0], conn_timeout))
|
|
|
12a457 |
- os.kill(cpid, signal.SIGKILL)
|
|
|
12a457 |
+ errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
|
|
|
12a457 |
nwait(apid) # wait for agent
|
|
|
12a457 |
ret = nwait(cpid)
|
|
|
12a457 |
if ret is None:
|
|
|
12a457 |
@@ -394,9 +394,9 @@ class Monitor(object):
|
|
|
12a457 |
time.sleep(1)
|
|
|
12a457 |
self.lock.acquire()
|
|
|
12a457 |
for cpid in cpids:
|
|
|
12a457 |
- os.kill(cpid, signal.SIGKILL)
|
|
|
12a457 |
+ errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
|
|
|
12a457 |
for apid in agents:
|
|
|
12a457 |
- os.kill(apid, signal.SIGKILL)
|
|
|
12a457 |
+ errno_wrap(os.kill, [apid, signal.SIGKILL], [ESRCH])
|
|
|
12a457 |
self.lock.release()
|
|
|
12a457 |
finalize(exval=1)
|
|
|
12a457 |
t = Thread(target=wmon, args=[wx])
|
|
|
12a457 |
@@ -464,7 +464,7 @@ def monitor(*resources):
|
|
|
12a457 |
# yes, send SIGSTOP to negative of monitor pid
|
|
|
12a457 |
# to go back to pause state.
|
|
|
12a457 |
if gconf.pause_on_start:
|
|
|
12a457 |
- os.kill(-os.getpid(), signal.SIGSTOP)
|
|
|
12a457 |
+ errno_wrap(os.kill, [-os.getpid(), signal.SIGSTOP], [ESRCH])
|
|
|
12a457 |
|
|
|
12a457 |
"""oh yeah, actually Monitor is used as singleton, too"""
|
|
|
12a457 |
return Monitor().multiplex(*distribute(*resources))
|
|
|
12a457 |
--
|
|
|
12a457 |
1.7.1
|
|
|
12a457 |
|