Blob Blame History Raw
From 142169da59518ee17bbf885dfbf42525fd2c1d0b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20Pokorn=C3=BD?= <jpokorny@redhat.com>
Date: Tue, 31 Jan 2017 20:06:21 +0100
Subject: [PATCH 1/2] Low: libservices (sync): ensure no zombie is left behind

It could happen because this parent that is to wait for its
not-well-behaved child is knowlingly not blocking any signal (beside
SIGCHLD explicitly in case of using signalfd facility) and delivery of
such signal can interrupt waitpid, at least on paper, so protect
against this accordingly.

Speaking of SIGCHLD in plain self-pipe (not signalfd one) context, while
there's no clash with other synchronous actions, it may be the case with
asynchronous ones (or for that matter, arbitrary other fork-related
activities in the main program a library can have no idea about), and
this is exactly what could interrupt waitpid for real.
---
 lib/services/services_linux.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/services/services_linux.c b/lib/services/services_linux.c
index cd7fd3f..ffb74ef 100644
--- a/lib/services/services_linux.c
+++ b/lib/services/services_linux.c
@@ -517,7 +517,7 @@ action_synced_wait(svc_action_t * op, sigset_t *mask)
                 if (1) {
                     /* Clear out the sigchld pipe. */
                     char ch;
-                    while (read(sfd, &ch, 1) == 1);
+                    while (read(sfd, &ch, 1) == 1) /*omit*/;
 #endif
                     wait_rc = waitpid(op->pid, &status, WNOHANG);
 
@@ -567,7 +567,7 @@ action_synced_wait(svc_action_t * op, sigset_t *mask)
          *
          * This makes it safe to skip WNOHANG here
          */
-        waitpid(op->pid, &status, 0);
+        while (waitpid(op->pid, &status, 0) == (pid_t) -1 && errno == EINTR) /*omit*/;
 
     } else if (WIFEXITED(status)) {
         op->status = PCMK_LRM_OP_DONE;
-- 
1.8.3.1


From 9acc32260562df262a768bd2259372d16d90283b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20Pokorn=C3=BD?= <jpokorny@redhat.com>
Date: Tue, 31 Jan 2017 20:49:04 +0100
Subject: [PATCH 2/2] Low: libservices(sync): partially prevent killing foreign
 process

Do not attempt sending SIGKILL to process for which previous
waitpid(pid, &st, WNOHANG) indicated ECHILD issue.  This might
be a result of clashing with another thread or an orthogonal
signal handler getting to reap the child first.

For good measure, repeat non-hanging waitpid test right before
killing -- it won't prevent illustrated race, but will limit it
a bit more.  That's unlikely, sad path, hence without regularly
imposed performance penalty.  And when we can pay more attention
to prevent killing innocent processes (the code in question is
commonly run as root so nothing will prevent that), we should.
---
 lib/services/services_linux.c | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/lib/services/services_linux.c b/lib/services/services_linux.c
index ffb74ef..16f25f3 100644
--- a/lib/services/services_linux.c
+++ b/lib/services/services_linux.c
@@ -521,11 +521,19 @@ action_synced_wait(svc_action_t * op, sigset_t *mask)
 #endif
                     wait_rc = waitpid(op->pid, &status, WNOHANG);
 
-                    if (wait_rc < 0){
-                        crm_perror(LOG_ERR, "waitpid() for %d failed", op->pid);
-
-                    } else if (wait_rc > 0) {
+                    if (wait_rc > 0) {
                         break;
+
+                    } else if (wait_rc < 0){
+                        if (errno == ECHILD) {
+                                /* Here, don't dare to kill and bail out... */
+                                break;
+
+                        } else {
+                                /* ...otherwise pretend process still runs. */
+                                wait_rc = 0;
+                        }
+                        crm_perror(LOG_ERR, "waitpid() for %d failed", op->pid);
                     }
                 }
             }
@@ -547,9 +555,8 @@ action_synced_wait(svc_action_t * op, sigset_t *mask)
 
     crm_trace("Child done: %d", op->pid);
     if (wait_rc <= 0) {
-        int killrc = kill(op->pid, SIGKILL);
-
         op->rc = PCMK_OCF_UNKNOWN_ERROR;
+
         if (op->timeout > 0 && timeout <= 0) {
             op->status = PCMK_LRM_OP_TIMEOUT;
             crm_warn("%s:%d - timed out after %dms", op->id, op->pid, op->timeout);
@@ -558,16 +565,15 @@ action_synced_wait(svc_action_t * op, sigset_t *mask)
             op->status = PCMK_LRM_OP_ERROR;
         }
 
-        if (killrc && errno != ESRCH) {
-            crm_err("kill(%d, KILL) failed: %d", op->pid, errno);
+        /* If only child hasn't been successfully waited for, yet.
+           This is to limit killing wrong target a bit more. */
+        if (wait_rc == 0 && waitpid(op->pid, &status, WNOHANG) == 0) {
+            if (kill(op->pid, SIGKILL)) {
+                crm_err("kill(%d, KILL) failed: %d", op->pid, errno);
+            }
+            /* Safe to skip WNOHANG here as we sent non-ignorable signal. */
+            while (waitpid(op->pid, &status, 0) == (pid_t) -1 && errno == EINTR) /*omit*/;
         }
-        /*
-         * From sigprocmask(2):
-         * It is not possible to block SIGKILL or SIGSTOP.  Attempts to do so are silently ignored.
-         *
-         * This makes it safe to skip WNOHANG here
-         */
-        while (waitpid(op->pid, &status, 0) == (pid_t) -1 && errno == EINTR) /*omit*/;
 
     } else if (WIFEXITED(status)) {
         op->status = PCMK_LRM_OP_DONE;
-- 
1.8.3.1