From 142169da59518ee17bbf885dfbf42525fd2c1d0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Pokorn=C3=BD?= Date: Tue, 31 Jan 2017 20:06:21 +0100 Subject: [PATCH 1/2] Low: libservices (sync): ensure no zombie is left behind It could happen because this parent that is to wait for its not-well-behaved child is knowlingly not blocking any signal (beside SIGCHLD explicitly in case of using signalfd facility) and delivery of such signal can interrupt waitpid, at least on paper, so protect against this accordingly. Speaking of SIGCHLD in plain self-pipe (not signalfd one) context, while there's no clash with other synchronous actions, it may be the case with asynchronous ones (or for that matter, arbitrary other fork-related activities in the main program a library can have no idea about), and this is exactly what could interrupt waitpid for real. --- lib/services/services_linux.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/services/services_linux.c b/lib/services/services_linux.c index cd7fd3f..ffb74ef 100644 --- a/lib/services/services_linux.c +++ b/lib/services/services_linux.c @@ -517,7 +517,7 @@ action_synced_wait(svc_action_t * op, sigset_t *mask) if (1) { /* Clear out the sigchld pipe. */ char ch; - while (read(sfd, &ch, 1) == 1); + while (read(sfd, &ch, 1) == 1) /*omit*/; #endif wait_rc = waitpid(op->pid, &status, WNOHANG); @@ -567,7 +567,7 @@ action_synced_wait(svc_action_t * op, sigset_t *mask) * * This makes it safe to skip WNOHANG here */ - waitpid(op->pid, &status, 0); + while (waitpid(op->pid, &status, 0) == (pid_t) -1 && errno == EINTR) /*omit*/; } else if (WIFEXITED(status)) { op->status = PCMK_LRM_OP_DONE; -- 1.8.3.1 From 9acc32260562df262a768bd2259372d16d90283b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Pokorn=C3=BD?= Date: Tue, 31 Jan 2017 20:49:04 +0100 Subject: [PATCH 2/2] Low: libservices(sync): partially prevent killing foreign process Do not attempt sending SIGKILL to process for which previous waitpid(pid, &st, WNOHANG) indicated ECHILD issue. This might be a result of clashing with another thread or an orthogonal signal handler getting to reap the child first. For good measure, repeat non-hanging waitpid test right before killing -- it won't prevent illustrated race, but will limit it a bit more. That's unlikely, sad path, hence without regularly imposed performance penalty. And when we can pay more attention to prevent killing innocent processes (the code in question is commonly run as root so nothing will prevent that), we should. --- lib/services/services_linux.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/lib/services/services_linux.c b/lib/services/services_linux.c index ffb74ef..16f25f3 100644 --- a/lib/services/services_linux.c +++ b/lib/services/services_linux.c @@ -521,11 +521,19 @@ action_synced_wait(svc_action_t * op, sigset_t *mask) #endif wait_rc = waitpid(op->pid, &status, WNOHANG); - if (wait_rc < 0){ - crm_perror(LOG_ERR, "waitpid() for %d failed", op->pid); - - } else if (wait_rc > 0) { + if (wait_rc > 0) { break; + + } else if (wait_rc < 0){ + if (errno == ECHILD) { + /* Here, don't dare to kill and bail out... */ + break; + + } else { + /* ...otherwise pretend process still runs. */ + wait_rc = 0; + } + crm_perror(LOG_ERR, "waitpid() for %d failed", op->pid); } } } @@ -547,9 +555,8 @@ action_synced_wait(svc_action_t * op, sigset_t *mask) crm_trace("Child done: %d", op->pid); if (wait_rc <= 0) { - int killrc = kill(op->pid, SIGKILL); - op->rc = PCMK_OCF_UNKNOWN_ERROR; + if (op->timeout > 0 && timeout <= 0) { op->status = PCMK_LRM_OP_TIMEOUT; crm_warn("%s:%d - timed out after %dms", op->id, op->pid, op->timeout); @@ -558,16 +565,15 @@ action_synced_wait(svc_action_t * op, sigset_t *mask) op->status = PCMK_LRM_OP_ERROR; } - if (killrc && errno != ESRCH) { - crm_err("kill(%d, KILL) failed: %d", op->pid, errno); + /* If only child hasn't been successfully waited for, yet. + This is to limit killing wrong target a bit more. */ + if (wait_rc == 0 && waitpid(op->pid, &status, WNOHANG) == 0) { + if (kill(op->pid, SIGKILL)) { + crm_err("kill(%d, KILL) failed: %d", op->pid, errno); + } + /* Safe to skip WNOHANG here as we sent non-ignorable signal. */ + while (waitpid(op->pid, &status, 0) == (pid_t) -1 && errno == EINTR) /*omit*/; } - /* - * From sigprocmask(2): - * It is not possible to block SIGKILL or SIGSTOP. Attempts to do so are silently ignored. - * - * This makes it safe to skip WNOHANG here - */ - while (waitpid(op->pid, &status, 0) == (pid_t) -1 && errno == EINTR) /*omit*/; } else if (WIFEXITED(status)) { op->status = PCMK_LRM_OP_DONE; -- 1.8.3.1