ecbff1
From c67c643418b1df5b5705b3a72eba1e6755830dc5 Mon Sep 17 00:00:00 2001
ecbff1
From: Jan Synacek <jsynacek@redhat.com>
ecbff1
Date: Tue, 21 Nov 2017 12:46:28 +0100
ecbff1
Subject: [PATCH] journald: never block when sending messages on NOTIFY_SOCKET
ecbff1
 socket
ecbff1
ecbff1
Otherwise we might run into deadlocks, when journald blocks on the
ecbff1
notify socket on PID 1, and PID 1 blocks on IPC to dbus-daemon and
ecbff1
dbus-daemon blocks on logging to journald. Break this cycle by making
ecbff1
sure that journald never ever blocks on PID 1.
ecbff1
ecbff1
Note that this change disables support for event loop watchdog support,
ecbff1
as these messages are sent in blocking style by sd-event. That should
ecbff1
not be a big loss though, as people reported frequent problems with the
ecbff1
watchdog hitting journald on excessively slow IO.
ecbff1
ecbff1
Fixes: #1505.
ecbff1
(cherry-picked from commit e22aa3d3284709234f086ebebc13a905a295b7a7)
ecbff1
ecbff1
Resolves: #1511565
ecbff1
---
de8967
 src/journal/journald-server.c     | 130 +++++++++++++++++++++++++++++-
de8967
 src/journal/journald-server.h     |  13 ++-
de8967
 src/journal/journald-stream.c     |  68 ++++++++++++++--
ecbff1
 src/journal/journald-stream.h     |   3 +
de8967
 src/journal/journald.c            |   8 --
ecbff1
 units/systemd-journald.service.in |   1 -
ecbff1
 6 files changed, 201 insertions(+), 22 deletions(-)
ecbff1
ecbff1
diff --git a/src/journal/journald-server.c b/src/journal/journald-server.c
ecbff1
index daeecd519..a810829b2 100644
ecbff1
--- a/src/journal/journald-server.c
ecbff1
+++ b/src/journal/journald-server.c
ecbff1
@@ -67,6 +67,8 @@
ecbff1
 
ecbff1
 #define RECHECK_AVAILABLE_SPACE_USEC (30*USEC_PER_SEC)
ecbff1
 
ecbff1
+#define NOTIFY_SNDBUF_SIZE (8*1024*1024)
ecbff1
+
ecbff1
 /* Pick a good default that is likely to fit into AF_UNIX and AF_INET SOCK_DGRAM datagrams, and even leaves some room
ecbff1
 + * for a bit of additional metadata. */
ecbff1
 #define DEFAULT_LINE_MAX (48*1024)
ecbff1
@@ -1556,6 +1558,126 @@ static int server_open_hostname(Server *s) {
ecbff1
         return 0;
ecbff1
 }
ecbff1
 
ecbff1
+static int dispatch_notify_event(sd_event_source *es, int fd, uint32_t revents, void *userdata) {
ecbff1
+        Server *s = userdata;
ecbff1
+        int r;
ecbff1
+
ecbff1
+        assert(s);
ecbff1
+        assert(s->notify_event_source == es);
ecbff1
+        assert(s->notify_fd == fd);
ecbff1
+
ecbff1
+        if (revents != EPOLLOUT) {
ecbff1
+                log_error("Invalid events on notify file descriptor.");
ecbff1
+                return -EINVAL;
ecbff1
+        }
ecbff1
+
ecbff1
+        /* The $NOTIFY_SOCKET is writable again, now send exactly one
ecbff1
+         * message on it. Either it's the initial READY=1 event or an
ecbff1
+         * stdout stream event. If there's nothing to write anymore,
ecbff1
+         * turn our event source off. The next time there's something
ecbff1
+         * to send it will be turned on again. */
ecbff1
+
ecbff1
+        if (!s->sent_notify_ready) {
ecbff1
+                static const char p[] =
ecbff1
+                        "READY=1\n"
ecbff1
+                        "STATUS=Processing requests...";
ecbff1
+                ssize_t l;
ecbff1
+
ecbff1
+                l = send(s->notify_fd, p, strlen(p), MSG_DONTWAIT);
ecbff1
+                if (l < 0) {
ecbff1
+                        if (errno == EAGAIN)
ecbff1
+                                return 0;
ecbff1
+
ecbff1
+                        return log_error_errno(errno, "Failed to send READY=1 notification message: %m");
ecbff1
+                }
ecbff1
+
ecbff1
+                s->sent_notify_ready = true;
ecbff1
+                log_debug("Sent READY=1 notification.");
ecbff1
+
ecbff1
+        } else if (s->stdout_streams_notify_queue)
ecbff1
+                /* Dispatch one stream notification event */
ecbff1
+                stdout_stream_send_notify(s->stdout_streams_notify_queue);
ecbff1
+
ecbff1
+        /* Leave us enabled if there's still more to to do. */
ecbff1
+        if (s->stdout_streams_notify_queue)
ecbff1
+                return 0;
ecbff1
+
ecbff1
+        /* There was nothing to do anymore, let's turn ourselves off. */
ecbff1
+        r = sd_event_source_set_enabled(es, SD_EVENT_OFF);
ecbff1
+        if (r < 0)
ecbff1
+                return log_error_errno(r, "Failed to turn off notify event source: %m");
ecbff1
+
ecbff1
+        return 0;
ecbff1
+}
ecbff1
+
ecbff1
+static int server_connect_notify(Server *s) {
ecbff1
+        union sockaddr_union sa = {
ecbff1
+                .un.sun_family = AF_UNIX,
ecbff1
+        };
ecbff1
+        const char *e;
ecbff1
+        int r;
ecbff1
+
ecbff1
+        assert(s);
ecbff1
+        assert(s->notify_fd < 0);
ecbff1
+        assert(!s->notify_event_source);
ecbff1
+
ecbff1
+        /*
ecbff1
+          So here's the problem: we'd like to send notification
ecbff1
+          messages to PID 1, but we cannot do that via sd_notify(),
ecbff1
+          since that's synchronous, and we might end up blocking on
ecbff1
+          it. Specifically: given that PID 1 might block on
ecbff1
+          dbus-daemon during IPC, and dbus-daemon is logging to us,
ecbff1
+          and might hence block on us, we might end up in a deadlock
ecbff1
+          if we block on sending PID 1 notification messages -- by
ecbff1
+          generating a full blocking circle. To avoid this, let's
ecbff1
+          create a non-blocking socket, and connect it to the
ecbff1
+          notification socket, and then wait for POLLOUT before we
ecbff1
+          send anything. This should efficiently avoid any deadlocks,
ecbff1
+          as we'll never block on PID 1, hence PID 1 can safely block
ecbff1
+          on dbus-daemon which can safely block on us again.
ecbff1
+
ecbff1
+          Don't think that this issue is real? It is, see:
ecbff1
+          https://github.com/systemd/systemd/issues/1505
ecbff1
+        */
ecbff1
+
ecbff1
+        e = getenv("NOTIFY_SOCKET");
ecbff1
+        if (!e)
ecbff1
+                return 0;
ecbff1
+
ecbff1
+        if ((e[0] != '@' && e[0] != '/') || e[1] == 0) {
ecbff1
+                log_error("NOTIFY_SOCKET set to an invalid value: %s", e);
ecbff1
+                return -EINVAL;
ecbff1
+        }
ecbff1
+
ecbff1
+        if (strlen(e) > sizeof(sa.un.sun_path)) {
ecbff1
+                log_error("NOTIFY_SOCKET path too long: %s", e);
ecbff1
+                return -EINVAL;
ecbff1
+        }
ecbff1
+
ecbff1
+        s->notify_fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
ecbff1
+        if (s->notify_fd < 0)
ecbff1
+                return log_error_errno(errno, "Failed to create notify socket: %m");
ecbff1
+
ecbff1
+        (void) fd_inc_sndbuf(s->notify_fd, NOTIFY_SNDBUF_SIZE);
ecbff1
+
ecbff1
+        strncpy(sa.un.sun_path, e, sizeof(sa.un.sun_path));
ecbff1
+        if (sa.un.sun_path[0] == '@')
ecbff1
+                sa.un.sun_path[0] = 0;
ecbff1
+
ecbff1
+        r = connect(s->notify_fd, &sa.sa, offsetof(struct sockaddr_un, sun_path) + strlen(e));
ecbff1
+        if (r < 0)
ecbff1
+                return log_error_errno(errno, "Failed to connect to notify socket: %m");
ecbff1
+
ecbff1
+        r = sd_event_add_io(s->event, &s->notify_event_source, s->notify_fd, EPOLLOUT, dispatch_notify_event, s);
ecbff1
+        if (r < 0)
ecbff1
+                return log_error_errno(r, "Failed to watch notification socket: %m");
ecbff1
+
ecbff1
+        /* This should fire pretty soon, which we'll use to send the
ecbff1
+         * READY=1 event. */
ecbff1
+
ecbff1
+        return 0;
ecbff1
+}
ecbff1
+
ecbff1
 int server_init(Server *s) {
ecbff1
         _cleanup_fdset_free_ FDSet *fds = NULL;
ecbff1
         int n, r, fd;
ecbff1
@@ -1563,7 +1685,7 @@ int server_init(Server *s) {
ecbff1
         assert(s);
ecbff1
 
ecbff1
         zero(*s);
ecbff1
-        s->syslog_fd = s->native_fd = s->stdout_fd = s->dev_kmsg_fd = s->audit_fd = s->hostname_fd = -1;
ecbff1
+        s->syslog_fd = s->native_fd = s->stdout_fd = s->dev_kmsg_fd = s->audit_fd = s->hostname_fd = s->notify_fd = -1;
ecbff1
         s->compress = true;
ecbff1
         s->seal = true;
ecbff1
 
ecbff1
@@ -1611,8 +1733,6 @@ int server_init(Server *s) {
ecbff1
         if (r < 0)
ecbff1
                 return log_error_errno(r, "Failed to create event loop: %m");
ecbff1
 
ecbff1
-        sd_event_set_watchdog(s->event, true);
ecbff1
-
ecbff1
         n = sd_listen_fds(true);
ecbff1
         if (n < 0)
ecbff1
                 return log_error_errno(n, "Failed to read listening file descriptors from environment: %m");
ecbff1
@@ -1718,6 +1838,8 @@ int server_init(Server *s) {
ecbff1
         server_cache_boot_id(s);
ecbff1
         server_cache_machine_id(s);
ecbff1
 
ecbff1
+        (void) server_connect_notify(s);
ecbff1
+
ecbff1
         r = system_journal_open(s, false);
ecbff1
         if (r < 0)
ecbff1
                 return r;
ecbff1
@@ -1770,6 +1892,7 @@ void server_done(Server *s) {
ecbff1
         sd_event_source_unref(s->sigterm_event_source);
ecbff1
         sd_event_source_unref(s->sigint_event_source);
ecbff1
         sd_event_source_unref(s->hostname_event_source);
ecbff1
+        sd_event_source_unref(s->notify_event_source);
ecbff1
         sd_event_unref(s->event);
ecbff1
 
ecbff1
         safe_close(s->syslog_fd);
ecbff1
@@ -1778,6 +1901,7 @@ void server_done(Server *s) {
ecbff1
         safe_close(s->dev_kmsg_fd);
ecbff1
         safe_close(s->audit_fd);
ecbff1
         safe_close(s->hostname_fd);
ecbff1
+        safe_close(s->notify_fd);
ecbff1
 
ecbff1
         if (s->rate_limit)
ecbff1
                 journal_rate_limit_free(s->rate_limit);
ecbff1
diff --git a/src/journal/journald-server.h b/src/journal/journald-server.h
ecbff1
index b29410778..e59ff35e2 100644
ecbff1
--- a/src/journal/journald-server.h
ecbff1
+++ b/src/journal/journald-server.h
ecbff1
@@ -27,12 +27,15 @@
ecbff1
 #include <sys/types.h>
ecbff1
 #include <sys/socket.h>
ecbff1
 
ecbff1
+typedef struct Server Server;
ecbff1
+
ecbff1
 #include "sd-event.h"
ecbff1
 #include "journal-file.h"
ecbff1
 #include "hashmap.h"
ecbff1
 #include "util.h"
ecbff1
 #include "audit.h"
ecbff1
 #include "journald-rate-limit.h"
ecbff1
+#include "journald-stream.h"
ecbff1
 #include "list.h"
ecbff1
 
ecbff1
 typedef enum Storage {
ecbff1
@@ -52,15 +55,14 @@ typedef enum SplitMode {
ecbff1
         _SPLIT_INVALID = -1
ecbff1
 } SplitMode;
ecbff1
 
ecbff1
-typedef struct StdoutStream StdoutStream;
ecbff1
-
ecbff1
-typedef struct Server {
ecbff1
+struct Server {
ecbff1
         int syslog_fd;
ecbff1
         int native_fd;
ecbff1
         int stdout_fd;
ecbff1
         int dev_kmsg_fd;
ecbff1
         int audit_fd;
ecbff1
         int hostname_fd;
ecbff1
+        int notify_fd;
ecbff1
 
ecbff1
         sd_event *event;
ecbff1
 
ecbff1
@@ -75,6 +77,7 @@ typedef struct Server {
ecbff1
         sd_event_source *sigterm_event_source;
ecbff1
         sd_event_source *sigint_event_source;
ecbff1
         sd_event_source *hostname_event_source;
ecbff1
+        sd_event_source *notify_event_source;
ecbff1
 
ecbff1
         JournalFile *runtime_journal;
ecbff1
         JournalFile *system_journal;
ecbff1
@@ -114,6 +117,7 @@ typedef struct Server {
ecbff1
         usec_t oldest_file_usec;
ecbff1
 
ecbff1
         LIST_HEAD(StdoutStream, stdout_streams);
ecbff1
+        LIST_HEAD(StdoutStream, stdout_streams_notify_queue);
ecbff1
         unsigned n_stdout_streams;
ecbff1
 
ecbff1
         char *tty_path;
ecbff1
@@ -135,6 +139,7 @@ typedef struct Server {
ecbff1
 
ecbff1
         struct udev *udev;
ecbff1
 
ecbff1
+        bool sent_notify_ready;
ecbff1
         bool sync_scheduled;
ecbff1
 
ecbff1
         char machine_id_field[sizeof("_MACHINE_ID=") + 32];
ecbff1
@@ -145,7 +150,7 @@ typedef struct Server {
ecbff1
         char *cgroup_root;
ecbff1
 
ecbff1
         size_t line_max;
ecbff1
-} Server;
ecbff1
+};
ecbff1
 
ecbff1
 #define N_IOVEC_META_FIELDS 20
ecbff1
 #define N_IOVEC_KERNEL_FIELDS 64
ecbff1
diff --git a/src/journal/journald-stream.c b/src/journal/journald-stream.c
ecbff1
index 4d6b7ad18..9118d1a31 100644
ecbff1
--- a/src/journal/journald-stream.c
ecbff1
+++ b/src/journal/journald-stream.c
ecbff1
@@ -80,6 +80,7 @@ struct StdoutStream {
ecbff1
         bool forward_to_console:1;
ecbff1
 
ecbff1
         bool fdstore:1;
ecbff1
+        bool in_notify_queue:1;
ecbff1
 
ecbff1
         char *buffer;
ecbff1
         size_t length;
ecbff1
@@ -90,6 +91,7 @@ struct StdoutStream {
ecbff1
         char *state_file;
ecbff1
 
ecbff1
         LIST_FIELDS(StdoutStream, stdout_stream);
ecbff1
+        LIST_FIELDS(StdoutStream, stdout_stream_notify_queue);
ecbff1
 
ecbff1
         char id_field[sizeof("_STREAM_ID=")-1 + SD_ID128_STRING_MAX];
ecbff1
 };
ecbff1
@@ -102,6 +104,9 @@ void stdout_stream_free(StdoutStream *s) {
ecbff1
                 assert(s->server->n_stdout_streams > 0);
ecbff1
                 s->server->n_stdout_streams --;
ecbff1
                 LIST_REMOVE(stdout_stream, s->server->stdout_streams, s);
ecbff1
+
ecbff1
+                if (s->in_notify_queue)
ecbff1
+                        LIST_REMOVE(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s);
ecbff1
         }
ecbff1
 
ecbff1
         if (s->event_source) {
ecbff1
@@ -126,7 +131,7 @@ static void stdout_stream_destroy(StdoutStream *s) {
ecbff1
                 return;
ecbff1
 
ecbff1
         if (s->state_file)
ecbff1
-                unlink(s->state_file);
ecbff1
+                (void) unlink(s->state_file);
ecbff1
 
ecbff1
         stdout_stream_free(s);
ecbff1
 }
ecbff1
@@ -210,11 +215,15 @@ static int stdout_stream_save(StdoutStream *s) {
ecbff1
         free(temp_path);
ecbff1
         temp_path = NULL;
ecbff1
 
ecbff1
-        /* Store the connection fd in PID 1, so that we get it passed
ecbff1
-         * in again on next start */
ecbff1
-        if (!s->fdstore) {
ecbff1
-                sd_pid_notify_with_fds(0, false, "FDSTORE=1", &s->fd, 1);
ecbff1
-                s->fdstore = true;
ecbff1
+        if (!s->fdstore && !s->in_notify_queue) {
ecbff1
+                LIST_PREPEND(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s);
ecbff1
+                s->in_notify_queue = true;
ecbff1
+
ecbff1
+                if (s->server->notify_event_source) {
ecbff1
+                        r = sd_event_source_set_enabled(s->server->notify_event_source, SD_EVENT_ON);
ecbff1
+                        if (r < 0)
ecbff1
+                                log_warning_errno(r, "Failed to enable notify event source: %m");
ecbff1
+                }
ecbff1
         }
ecbff1
 
ecbff1
 finish:
ecbff1
@@ -801,3 +810,50 @@ int server_open_stdout_socket(Server *s, FDSet *fds) {
ecbff1
 
ecbff1
         return 0;
ecbff1
 }
ecbff1
+
ecbff1
+void stdout_stream_send_notify(StdoutStream *s) {
ecbff1
+        struct iovec iovec = {
ecbff1
+                .iov_base = (char*) "FDSTORE=1",
ecbff1
+                .iov_len = strlen("FDSTORE=1"),
ecbff1
+        };
ecbff1
+        struct msghdr msghdr = {
ecbff1
+                .msg_iov = &iovec,
ecbff1
+                .msg_iovlen = 1,
ecbff1
+        };
ecbff1
+        struct cmsghdr *cmsg;
ecbff1
+        ssize_t l;
ecbff1
+
ecbff1
+        assert(s);
ecbff1
+        assert(!s->fdstore);
ecbff1
+        assert(s->in_notify_queue);
ecbff1
+        assert(s->server);
ecbff1
+        assert(s->server->notify_fd >= 0);
ecbff1
+
ecbff1
+        /* Store the connection fd in PID 1, so that we get it passed
ecbff1
+         * in again on next start */
ecbff1
+
ecbff1
+        msghdr.msg_controllen = CMSG_SPACE(sizeof(int));
ecbff1
+        msghdr.msg_control = alloca0(msghdr.msg_controllen);
ecbff1
+
ecbff1
+        cmsg = CMSG_FIRSTHDR(&msghdr);
ecbff1
+        cmsg->cmsg_level = SOL_SOCKET;
ecbff1
+        cmsg->cmsg_type = SCM_RIGHTS;
ecbff1
+        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
ecbff1
+
ecbff1
+        memcpy(CMSG_DATA(cmsg), &s->fd, sizeof(int));
ecbff1
+
ecbff1
+        l = sendmsg(s->server->notify_fd, &msghdr, MSG_DONTWAIT|MSG_NOSIGNAL);
ecbff1
+        if (l < 0) {
ecbff1
+                if (errno == EAGAIN)
ecbff1
+                        return;
ecbff1
+
ecbff1
+                log_error_errno(errno, "Failed to send stream file descriptor to service manager: %m");
ecbff1
+        } else {
ecbff1
+                log_debug("Successfully sent stream file descriptor to service manager.");
ecbff1
+                s->fdstore = 1;
ecbff1
+        }
ecbff1
+
ecbff1
+        LIST_REMOVE(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s);
ecbff1
+        s->in_notify_queue = false;
ecbff1
+
ecbff1
+}
ecbff1
diff --git a/src/journal/journald-stream.h b/src/journal/journald-stream.h
ecbff1
index 94bf955d7..fd8c94fb6 100644
ecbff1
--- a/src/journal/journald-stream.h
ecbff1
+++ b/src/journal/journald-stream.h
ecbff1
@@ -21,9 +21,12 @@
ecbff1
   along with systemd; If not, see <http://www.gnu.org/licenses/>.
ecbff1
 ***/
ecbff1
 
ecbff1
+typedef struct StdoutStream StdoutStream;
ecbff1
+
ecbff1
 #include "fdset.h"
ecbff1
 #include "journald-server.h"
ecbff1
 
ecbff1
 int server_open_stdout_socket(Server *s, FDSet *fds);
ecbff1
 
ecbff1
 void stdout_stream_free(StdoutStream *s);
ecbff1
+void stdout_stream_send_notify(StdoutStream *s);
ecbff1
diff --git a/src/journal/journald.c b/src/journal/journald.c
ecbff1
index 15bbcbe3d..b7ba2b6ec 100644
ecbff1
--- a/src/journal/journald.c
ecbff1
+++ b/src/journal/journald.c
ecbff1
@@ -64,10 +64,6 @@ int main(int argc, char *argv[]) {
ecbff1
         log_debug("systemd-journald running as pid "PID_FMT, getpid());
ecbff1
         server_driver_message(&server, SD_MESSAGE_JOURNAL_START, "Journal started");
ecbff1
 
ecbff1
-        sd_notify(false,
ecbff1
-                  "READY=1\n"
ecbff1
-                  "STATUS=Processing requests...");
ecbff1
-
ecbff1
         for (;;) {
ecbff1
                 usec_t t = USEC_INFINITY, n;
ecbff1
 
ecbff1
@@ -120,10 +116,6 @@ int main(int argc, char *argv[]) {
ecbff1
         server_driver_message(&server, SD_MESSAGE_JOURNAL_STOP, "Journal stopped");
ecbff1
 
ecbff1
 finish:
ecbff1
-        sd_notify(false,
ecbff1
-                  "STOPPING=1\n"
ecbff1
-                  "STATUS=Shutting down...");
ecbff1
-
ecbff1
         server_done(&server);
ecbff1
 
ecbff1
         return r < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
ecbff1
diff --git a/units/systemd-journald.service.in b/units/systemd-journald.service.in
ecbff1
index 0d1ea61fe..c94c0bfba 100644
ecbff1
--- a/units/systemd-journald.service.in
ecbff1
+++ b/units/systemd-journald.service.in
ecbff1
@@ -22,7 +22,6 @@ RestartSec=0
ecbff1
 StandardOutput=null
ecbff1
 FileDescriptorStoreMax=4224
ecbff1
 CapabilityBoundingSet=CAP_SYS_ADMIN CAP_DAC_OVERRIDE CAP_SYS_PTRACE CAP_SYSLOG CAP_AUDIT_CONTROL CAP_AUDIT_READ CAP_CHOWN CAP_DAC_READ_SEARCH CAP_FOWNER CAP_SETUID CAP_SETGID CAP_MAC_OVERRIDE
ecbff1
-WatchdogSec=3min
ecbff1
 
ecbff1
 # Increase the default a bit in order to allow many simultaneous
ecbff1
 # services being run since we keep one fd open per service. Also, when