60de42
From 142169da59518ee17bbf885dfbf42525fd2c1d0b Mon Sep 17 00:00:00 2001
60de42
From: =?UTF-8?q?Jan=20Pokorn=C3=BD?= <jpokorny@redhat.com>
60de42
Date: Tue, 31 Jan 2017 20:06:21 +0100
60de42
Subject: [PATCH 1/2] Low: libservices (sync): ensure no zombie is left behind
60de42
60de42
It could happen because this parent that is to wait for its
60de42
not-well-behaved child is knowlingly not blocking any signal (beside
60de42
SIGCHLD explicitly in case of using signalfd facility) and delivery of
60de42
such signal can interrupt waitpid, at least on paper, so protect
60de42
against this accordingly.
60de42
60de42
Speaking of SIGCHLD in plain self-pipe (not signalfd one) context, while
60de42
there's no clash with other synchronous actions, it may be the case with
60de42
asynchronous ones (or for that matter, arbitrary other fork-related
60de42
activities in the main program a library can have no idea about), and
60de42
this is exactly what could interrupt waitpid for real.
60de42
---
60de42
 lib/services/services_linux.c | 4 ++--
60de42
 1 file changed, 2 insertions(+), 2 deletions(-)
60de42
60de42
diff --git a/lib/services/services_linux.c b/lib/services/services_linux.c
60de42
index cd7fd3f..ffb74ef 100644
60de42
--- a/lib/services/services_linux.c
60de42
+++ b/lib/services/services_linux.c
60de42
@@ -517,7 +517,7 @@ action_synced_wait(svc_action_t * op, sigset_t *mask)
60de42
                 if (1) {
60de42
                     /* Clear out the sigchld pipe. */
60de42
                     char ch;
60de42
-                    while (read(sfd, &ch, 1) == 1);
60de42
+                    while (read(sfd, &ch, 1) == 1) /*omit*/;
60de42
 #endif
60de42
                     wait_rc = waitpid(op->pid, &status, WNOHANG);
60de42
 
60de42
@@ -567,7 +567,7 @@ action_synced_wait(svc_action_t * op, sigset_t *mask)
60de42
          *
60de42
          * This makes it safe to skip WNOHANG here
60de42
          */
60de42
-        waitpid(op->pid, &status, 0);
60de42
+        while (waitpid(op->pid, &status, 0) == (pid_t) -1 && errno == EINTR) /*omit*/;
60de42
 
60de42
     } else if (WIFEXITED(status)) {
60de42
         op->status = PCMK_LRM_OP_DONE;
60de42
-- 
60de42
1.8.3.1
60de42
60de42
60de42
From 9acc32260562df262a768bd2259372d16d90283b Mon Sep 17 00:00:00 2001
60de42
From: =?UTF-8?q?Jan=20Pokorn=C3=BD?= <jpokorny@redhat.com>
60de42
Date: Tue, 31 Jan 2017 20:49:04 +0100
60de42
Subject: [PATCH 2/2] Low: libservices(sync): partially prevent killing foreign
60de42
 process
60de42
60de42
Do not attempt sending SIGKILL to process for which previous
60de42
waitpid(pid, &st, WNOHANG) indicated ECHILD issue.  This might
60de42
be a result of clashing with another thread or an orthogonal
60de42
signal handler getting to reap the child first.
60de42
60de42
For good measure, repeat non-hanging waitpid test right before
60de42
killing -- it won't prevent illustrated race, but will limit it
60de42
a bit more.  That's unlikely, sad path, hence without regularly
60de42
imposed performance penalty.  And when we can pay more attention
60de42
to prevent killing innocent processes (the code in question is
60de42
commonly run as root so nothing will prevent that), we should.
60de42
---
60de42
 lib/services/services_linux.c | 36 +++++++++++++++++++++---------------
60de42
 1 file changed, 21 insertions(+), 15 deletions(-)
60de42
60de42
diff --git a/lib/services/services_linux.c b/lib/services/services_linux.c
60de42
index ffb74ef..16f25f3 100644
60de42
--- a/lib/services/services_linux.c
60de42
+++ b/lib/services/services_linux.c
60de42
@@ -521,11 +521,19 @@ action_synced_wait(svc_action_t * op, sigset_t *mask)
60de42
 #endif
60de42
                     wait_rc = waitpid(op->pid, &status, WNOHANG);
60de42
 
60de42
-                    if (wait_rc < 0){
60de42
-                        crm_perror(LOG_ERR, "waitpid() for %d failed", op->pid);
60de42
-
60de42
-                    } else if (wait_rc > 0) {
60de42
+                    if (wait_rc > 0) {
60de42
                         break;
60de42
+
60de42
+                    } else if (wait_rc < 0){
60de42
+                        if (errno == ECHILD) {
60de42
+                                /* Here, don't dare to kill and bail out... */
60de42
+                                break;
60de42
+
60de42
+                        } else {
60de42
+                                /* ...otherwise pretend process still runs. */
60de42
+                                wait_rc = 0;
60de42
+                        }
60de42
+                        crm_perror(LOG_ERR, "waitpid() for %d failed", op->pid);
60de42
                     }
60de42
                 }
60de42
             }
60de42
@@ -547,9 +555,8 @@ action_synced_wait(svc_action_t * op, sigset_t *mask)
60de42
 
60de42
     crm_trace("Child done: %d", op->pid);
60de42
     if (wait_rc <= 0) {
60de42
-        int killrc = kill(op->pid, SIGKILL);
60de42
-
60de42
         op->rc = PCMK_OCF_UNKNOWN_ERROR;
60de42
+
60de42
         if (op->timeout > 0 && timeout <= 0) {
60de42
             op->status = PCMK_LRM_OP_TIMEOUT;
60de42
             crm_warn("%s:%d - timed out after %dms", op->id, op->pid, op->timeout);
60de42
@@ -558,16 +565,15 @@ action_synced_wait(svc_action_t * op, sigset_t *mask)
60de42
             op->status = PCMK_LRM_OP_ERROR;
60de42
         }
60de42
 
60de42
-        if (killrc && errno != ESRCH) {
60de42
-            crm_err("kill(%d, KILL) failed: %d", op->pid, errno);
60de42
+        /* If only child hasn't been successfully waited for, yet.
60de42
+           This is to limit killing wrong target a bit more. */
60de42
+        if (wait_rc == 0 && waitpid(op->pid, &status, WNOHANG) == 0) {
60de42
+            if (kill(op->pid, SIGKILL)) {
60de42
+                crm_err("kill(%d, KILL) failed: %d", op->pid, errno);
60de42
+            }
60de42
+            /* Safe to skip WNOHANG here as we sent non-ignorable signal. */
60de42
+            while (waitpid(op->pid, &status, 0) == (pid_t) -1 && errno == EINTR) /*omit*/;
60de42
         }
60de42
-        /*
60de42
-         * From sigprocmask(2):
60de42
-         * It is not possible to block SIGKILL or SIGSTOP.  Attempts to do so are silently ignored.
60de42
-         *
60de42
-         * This makes it safe to skip WNOHANG here
60de42
-         */
60de42
-        while (waitpid(op->pid, &status, 0) == (pid_t) -1 && errno == EINTR) /*omit*/;
60de42
 
60de42
     } else if (WIFEXITED(status)) {
60de42
         op->status = PCMK_LRM_OP_DONE;
60de42
-- 
60de42
1.8.3.1
60de42