Blob Blame History Raw
From c67c643418b1df5b5705b3a72eba1e6755830dc5 Mon Sep 17 00:00:00 2001
From: Jan Synacek <jsynacek@redhat.com>
Date: Tue, 21 Nov 2017 12:46:28 +0100
Subject: [PATCH] journald: never block when sending messages on NOTIFY_SOCKET
 socket

Otherwise we might run into deadlocks, when journald blocks on the
notify socket on PID 1, and PID 1 blocks on IPC to dbus-daemon and
dbus-daemon blocks on logging to journald. Break this cycle by making
sure that journald never ever blocks on PID 1.

Note that this change disables support for event loop watchdog support,
as these messages are sent in blocking style by sd-event. That should
not be a big loss though, as people reported frequent problems with the
watchdog hitting journald on excessively slow IO.

Fixes: #1505.
(cherry-picked from commit e22aa3d3284709234f086ebebc13a905a295b7a7)

Resolves: #1511565
---
 src/journal/journald-server.c     | 130 +++++++++++++++++++++++++++++-
 src/journal/journald-server.h     |  13 ++-
 src/journal/journald-stream.c     |  68 ++++++++++++++--
 src/journal/journald-stream.h     |   3 +
 src/journal/journald.c            |   8 --
 units/systemd-journald.service.in |   1 -
 6 files changed, 201 insertions(+), 22 deletions(-)

diff --git a/src/journal/journald-server.c b/src/journal/journald-server.c
index daeecd5191..a810829b24 100644
--- a/src/journal/journald-server.c
+++ b/src/journal/journald-server.c
@@ -67,6 +67,8 @@
 
 #define RECHECK_AVAILABLE_SPACE_USEC (30*USEC_PER_SEC)
 
+#define NOTIFY_SNDBUF_SIZE (8*1024*1024)
+
 /* Pick a good default that is likely to fit into AF_UNIX and AF_INET SOCK_DGRAM datagrams, and even leaves some room
 + * for a bit of additional metadata. */
 #define DEFAULT_LINE_MAX (48*1024)
@@ -1556,6 +1558,126 @@ static int server_open_hostname(Server *s) {
         return 0;
 }
 
+static int dispatch_notify_event(sd_event_source *es, int fd, uint32_t revents, void *userdata) {
+        Server *s = userdata;
+        int r;
+
+        assert(s);
+        assert(s->notify_event_source == es);
+        assert(s->notify_fd == fd);
+
+        if (revents != EPOLLOUT) {
+                log_error("Invalid events on notify file descriptor.");
+                return -EINVAL;
+        }
+
+        /* The $NOTIFY_SOCKET is writable again, now send exactly one
+         * message on it. Either it's the initial READY=1 event or an
+         * stdout stream event. If there's nothing to write anymore,
+         * turn our event source off. The next time there's something
+         * to send it will be turned on again. */
+
+        if (!s->sent_notify_ready) {
+                static const char p[] =
+                        "READY=1\n"
+                        "STATUS=Processing requests...";
+                ssize_t l;
+
+                l = send(s->notify_fd, p, strlen(p), MSG_DONTWAIT);
+                if (l < 0) {
+                        if (errno == EAGAIN)
+                                return 0;
+
+                        return log_error_errno(errno, "Failed to send READY=1 notification message: %m");
+                }
+
+                s->sent_notify_ready = true;
+                log_debug("Sent READY=1 notification.");
+
+        } else if (s->stdout_streams_notify_queue)
+                /* Dispatch one stream notification event */
+                stdout_stream_send_notify(s->stdout_streams_notify_queue);
+
+        /* Leave us enabled if there's still more to to do. */
+        if (s->stdout_streams_notify_queue)
+                return 0;
+
+        /* There was nothing to do anymore, let's turn ourselves off. */
+        r = sd_event_source_set_enabled(es, SD_EVENT_OFF);
+        if (r < 0)
+                return log_error_errno(r, "Failed to turn off notify event source: %m");
+
+        return 0;
+}
+
+static int server_connect_notify(Server *s) {
+        union sockaddr_union sa = {
+                .un.sun_family = AF_UNIX,
+        };
+        const char *e;
+        int r;
+
+        assert(s);
+        assert(s->notify_fd < 0);
+        assert(!s->notify_event_source);
+
+        /*
+          So here's the problem: we'd like to send notification
+          messages to PID 1, but we cannot do that via sd_notify(),
+          since that's synchronous, and we might end up blocking on
+          it. Specifically: given that PID 1 might block on
+          dbus-daemon during IPC, and dbus-daemon is logging to us,
+          and might hence block on us, we might end up in a deadlock
+          if we block on sending PID 1 notification messages -- by
+          generating a full blocking circle. To avoid this, let's
+          create a non-blocking socket, and connect it to the
+          notification socket, and then wait for POLLOUT before we
+          send anything. This should efficiently avoid any deadlocks,
+          as we'll never block on PID 1, hence PID 1 can safely block
+          on dbus-daemon which can safely block on us again.
+
+          Don't think that this issue is real? It is, see:
+          https://github.com/systemd/systemd/issues/1505
+        */
+
+        e = getenv("NOTIFY_SOCKET");
+        if (!e)
+                return 0;
+
+        if ((e[0] != '@' && e[0] != '/') || e[1] == 0) {
+                log_error("NOTIFY_SOCKET set to an invalid value: %s", e);
+                return -EINVAL;
+        }
+
+        if (strlen(e) > sizeof(sa.un.sun_path)) {
+                log_error("NOTIFY_SOCKET path too long: %s", e);
+                return -EINVAL;
+        }
+
+        s->notify_fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+        if (s->notify_fd < 0)
+                return log_error_errno(errno, "Failed to create notify socket: %m");
+
+        (void) fd_inc_sndbuf(s->notify_fd, NOTIFY_SNDBUF_SIZE);
+
+        strncpy(sa.un.sun_path, e, sizeof(sa.un.sun_path));
+        if (sa.un.sun_path[0] == '@')
+                sa.un.sun_path[0] = 0;
+
+        r = connect(s->notify_fd, &sa.sa, offsetof(struct sockaddr_un, sun_path) + strlen(e));
+        if (r < 0)
+                return log_error_errno(errno, "Failed to connect to notify socket: %m");
+
+        r = sd_event_add_io(s->event, &s->notify_event_source, s->notify_fd, EPOLLOUT, dispatch_notify_event, s);
+        if (r < 0)
+                return log_error_errno(r, "Failed to watch notification socket: %m");
+
+        /* This should fire pretty soon, which we'll use to send the
+         * READY=1 event. */
+
+        return 0;
+}
+
 int server_init(Server *s) {
         _cleanup_fdset_free_ FDSet *fds = NULL;
         int n, r, fd;
@@ -1563,7 +1685,7 @@ int server_init(Server *s) {
         assert(s);
 
         zero(*s);
-        s->syslog_fd = s->native_fd = s->stdout_fd = s->dev_kmsg_fd = s->audit_fd = s->hostname_fd = -1;
+        s->syslog_fd = s->native_fd = s->stdout_fd = s->dev_kmsg_fd = s->audit_fd = s->hostname_fd = s->notify_fd = -1;
         s->compress = true;
         s->seal = true;
 
@@ -1611,8 +1733,6 @@ int server_init(Server *s) {
         if (r < 0)
                 return log_error_errno(r, "Failed to create event loop: %m");
 
-        sd_event_set_watchdog(s->event, true);
-
         n = sd_listen_fds(true);
         if (n < 0)
                 return log_error_errno(n, "Failed to read listening file descriptors from environment: %m");
@@ -1718,6 +1838,8 @@ int server_init(Server *s) {
         server_cache_boot_id(s);
         server_cache_machine_id(s);
 
+        (void) server_connect_notify(s);
+
         r = system_journal_open(s, false);
         if (r < 0)
                 return r;
@@ -1770,6 +1892,7 @@ void server_done(Server *s) {
         sd_event_source_unref(s->sigterm_event_source);
         sd_event_source_unref(s->sigint_event_source);
         sd_event_source_unref(s->hostname_event_source);
+        sd_event_source_unref(s->notify_event_source);
         sd_event_unref(s->event);
 
         safe_close(s->syslog_fd);
@@ -1778,6 +1901,7 @@ void server_done(Server *s) {
         safe_close(s->dev_kmsg_fd);
         safe_close(s->audit_fd);
         safe_close(s->hostname_fd);
+        safe_close(s->notify_fd);
 
         if (s->rate_limit)
                 journal_rate_limit_free(s->rate_limit);
diff --git a/src/journal/journald-server.h b/src/journal/journald-server.h
index b294107788..e59ff35e22 100644
--- a/src/journal/journald-server.h
+++ b/src/journal/journald-server.h
@@ -27,12 +27,15 @@
 #include <sys/types.h>
 #include <sys/socket.h>
 
+typedef struct Server Server;
+
 #include "sd-event.h"
 #include "journal-file.h"
 #include "hashmap.h"
 #include "util.h"
 #include "audit.h"
 #include "journald-rate-limit.h"
+#include "journald-stream.h"
 #include "list.h"
 
 typedef enum Storage {
@@ -52,15 +55,14 @@ typedef enum SplitMode {
         _SPLIT_INVALID = -1
 } SplitMode;
 
-typedef struct StdoutStream StdoutStream;
-
-typedef struct Server {
+struct Server {
         int syslog_fd;
         int native_fd;
         int stdout_fd;
         int dev_kmsg_fd;
         int audit_fd;
         int hostname_fd;
+        int notify_fd;
 
         sd_event *event;
 
@@ -75,6 +77,7 @@ typedef struct Server {
         sd_event_source *sigterm_event_source;
         sd_event_source *sigint_event_source;
         sd_event_source *hostname_event_source;
+        sd_event_source *notify_event_source;
 
         JournalFile *runtime_journal;
         JournalFile *system_journal;
@@ -114,6 +117,7 @@ typedef struct Server {
         usec_t oldest_file_usec;
 
         LIST_HEAD(StdoutStream, stdout_streams);
+        LIST_HEAD(StdoutStream, stdout_streams_notify_queue);
         unsigned n_stdout_streams;
 
         char *tty_path;
@@ -135,6 +139,7 @@ typedef struct Server {
 
         struct udev *udev;
 
+        bool sent_notify_ready;
         bool sync_scheduled;
 
         char machine_id_field[sizeof("_MACHINE_ID=") + 32];
@@ -145,7 +150,7 @@ typedef struct Server {
         char *cgroup_root;
 
         size_t line_max;
-} Server;
+};
 
 #define N_IOVEC_META_FIELDS 20
 #define N_IOVEC_KERNEL_FIELDS 64
diff --git a/src/journal/journald-stream.c b/src/journal/journald-stream.c
index 4d6b7ad189..9118d1a318 100644
--- a/src/journal/journald-stream.c
+++ b/src/journal/journald-stream.c
@@ -80,6 +80,7 @@ struct StdoutStream {
         bool forward_to_console:1;
 
         bool fdstore:1;
+        bool in_notify_queue:1;
 
         char *buffer;
         size_t length;
@@ -90,6 +91,7 @@ struct StdoutStream {
         char *state_file;
 
         LIST_FIELDS(StdoutStream, stdout_stream);
+        LIST_FIELDS(StdoutStream, stdout_stream_notify_queue);
 
         char id_field[sizeof("_STREAM_ID=")-1 + SD_ID128_STRING_MAX];
 };
@@ -102,6 +104,9 @@ void stdout_stream_free(StdoutStream *s) {
                 assert(s->server->n_stdout_streams > 0);
                 s->server->n_stdout_streams --;
                 LIST_REMOVE(stdout_stream, s->server->stdout_streams, s);
+
+                if (s->in_notify_queue)
+                        LIST_REMOVE(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s);
         }
 
         if (s->event_source) {
@@ -126,7 +131,7 @@ static void stdout_stream_destroy(StdoutStream *s) {
                 return;
 
         if (s->state_file)
-                unlink(s->state_file);
+                (void) unlink(s->state_file);
 
         stdout_stream_free(s);
 }
@@ -210,11 +215,15 @@ static int stdout_stream_save(StdoutStream *s) {
         free(temp_path);
         temp_path = NULL;
 
-        /* Store the connection fd in PID 1, so that we get it passed
-         * in again on next start */
-        if (!s->fdstore) {
-                sd_pid_notify_with_fds(0, false, "FDSTORE=1", &s->fd, 1);
-                s->fdstore = true;
+        if (!s->fdstore && !s->in_notify_queue) {
+                LIST_PREPEND(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s);
+                s->in_notify_queue = true;
+
+                if (s->server->notify_event_source) {
+                        r = sd_event_source_set_enabled(s->server->notify_event_source, SD_EVENT_ON);
+                        if (r < 0)
+                                log_warning_errno(r, "Failed to enable notify event source: %m");
+                }
         }
 
 finish:
@@ -801,3 +810,50 @@ int server_open_stdout_socket(Server *s, FDSet *fds) {
 
         return 0;
 }
+
+void stdout_stream_send_notify(StdoutStream *s) {
+        struct iovec iovec = {
+                .iov_base = (char*) "FDSTORE=1",
+                .iov_len = strlen("FDSTORE=1"),
+        };
+        struct msghdr msghdr = {
+                .msg_iov = &iovec,
+                .msg_iovlen = 1,
+        };
+        struct cmsghdr *cmsg;
+        ssize_t l;
+
+        assert(s);
+        assert(!s->fdstore);
+        assert(s->in_notify_queue);
+        assert(s->server);
+        assert(s->server->notify_fd >= 0);
+
+        /* Store the connection fd in PID 1, so that we get it passed
+         * in again on next start */
+
+        msghdr.msg_controllen = CMSG_SPACE(sizeof(int));
+        msghdr.msg_control = alloca0(msghdr.msg_controllen);
+
+        cmsg = CMSG_FIRSTHDR(&msghdr);
+        cmsg->cmsg_level = SOL_SOCKET;
+        cmsg->cmsg_type = SCM_RIGHTS;
+        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+
+        memcpy(CMSG_DATA(cmsg), &s->fd, sizeof(int));
+
+        l = sendmsg(s->server->notify_fd, &msghdr, MSG_DONTWAIT|MSG_NOSIGNAL);
+        if (l < 0) {
+                if (errno == EAGAIN)
+                        return;
+
+                log_error_errno(errno, "Failed to send stream file descriptor to service manager: %m");
+        } else {
+                log_debug("Successfully sent stream file descriptor to service manager.");
+                s->fdstore = 1;
+        }
+
+        LIST_REMOVE(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s);
+        s->in_notify_queue = false;
+
+}
diff --git a/src/journal/journald-stream.h b/src/journal/journald-stream.h
index 94bf955d78..fd8c94fb65 100644
--- a/src/journal/journald-stream.h
+++ b/src/journal/journald-stream.h
@@ -21,9 +21,12 @@
   along with systemd; If not, see <http://www.gnu.org/licenses/>.
 ***/
 
+typedef struct StdoutStream StdoutStream;
+
 #include "fdset.h"
 #include "journald-server.h"
 
 int server_open_stdout_socket(Server *s, FDSet *fds);
 
 void stdout_stream_free(StdoutStream *s);
+void stdout_stream_send_notify(StdoutStream *s);
diff --git a/src/journal/journald.c b/src/journal/journald.c
index 15bbcbe3de..b7ba2b6ec5 100644
--- a/src/journal/journald.c
+++ b/src/journal/journald.c
@@ -64,10 +64,6 @@ int main(int argc, char *argv[]) {
         log_debug("systemd-journald running as pid "PID_FMT, getpid());
         server_driver_message(&server, SD_MESSAGE_JOURNAL_START, "Journal started");
 
-        sd_notify(false,
-                  "READY=1\n"
-                  "STATUS=Processing requests...");
-
         for (;;) {
                 usec_t t = USEC_INFINITY, n;
 
@@ -120,10 +116,6 @@ int main(int argc, char *argv[]) {
         server_driver_message(&server, SD_MESSAGE_JOURNAL_STOP, "Journal stopped");
 
 finish:
-        sd_notify(false,
-                  "STOPPING=1\n"
-                  "STATUS=Shutting down...");
-
         server_done(&server);
 
         return r < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
diff --git a/units/systemd-journald.service.in b/units/systemd-journald.service.in
index 0d1ea61fe8..c94c0bfba1 100644
--- a/units/systemd-journald.service.in
+++ b/units/systemd-journald.service.in
@@ -22,7 +22,6 @@ RestartSec=0
 StandardOutput=null
 FileDescriptorStoreMax=4224
 CapabilityBoundingSet=CAP_SYS_ADMIN CAP_DAC_OVERRIDE CAP_SYS_PTRACE CAP_SYSLOG CAP_AUDIT_CONTROL CAP_AUDIT_READ CAP_CHOWN CAP_DAC_READ_SEARCH CAP_FOWNER CAP_SETUID CAP_SETGID CAP_MAC_OVERRIDE
-WatchdogSec=3min
 
 # Increase the default a bit in order to allow many simultaneous
 # services being run since we keep one fd open per service. Also, when