dd65c9
From c67c643418b1df5b5705b3a72eba1e6755830dc5 Mon Sep 17 00:00:00 2001
dd65c9
From: Jan Synacek <jsynacek@redhat.com>
dd65c9
Date: Tue, 21 Nov 2017 12:46:28 +0100
dd65c9
Subject: [PATCH] journald: never block when sending messages on NOTIFY_SOCKET
dd65c9
 socket
dd65c9
dd65c9
Otherwise we might run into deadlocks, when journald blocks on the
dd65c9
notify socket on PID 1, and PID 1 blocks on IPC to dbus-daemon and
dd65c9
dbus-daemon blocks on logging to journald. Break this cycle by making
dd65c9
sure that journald never ever blocks on PID 1.
dd65c9
dd65c9
Note that this change disables support for event loop watchdog support,
dd65c9
as these messages are sent in blocking style by sd-event. That should
dd65c9
not be a big loss though, as people reported frequent problems with the
dd65c9
watchdog hitting journald on excessively slow IO.
dd65c9
dd65c9
Fixes: #1505.
dd65c9
(cherry-picked from commit e22aa3d3284709234f086ebebc13a905a295b7a7)
dd65c9
dd65c9
Resolves: #1511565
dd65c9
---
dd65c9
 src/journal/journald-server.c     | 130 +++++++++++++++++++++++++++++++++++++-
dd65c9
 src/journal/journald-server.h     |  13 ++--
dd65c9
 src/journal/journald-stream.c     |  68 ++++++++++++++++++--
dd65c9
 src/journal/journald-stream.h     |   3 +
dd65c9
 src/journal/journald.c            |   8 ---
dd65c9
 units/systemd-journald.service.in |   1 -
dd65c9
 6 files changed, 201 insertions(+), 22 deletions(-)
dd65c9
dd65c9
diff --git a/src/journal/journald-server.c b/src/journal/journald-server.c
dd65c9
index daeecd519..a810829b2 100644
dd65c9
--- a/src/journal/journald-server.c
dd65c9
+++ b/src/journal/journald-server.c
dd65c9
@@ -67,6 +67,8 @@
dd65c9
 
dd65c9
 #define RECHECK_AVAILABLE_SPACE_USEC (30*USEC_PER_SEC)
dd65c9
 
dd65c9
+#define NOTIFY_SNDBUF_SIZE (8*1024*1024)
dd65c9
+
dd65c9
 /* Pick a good default that is likely to fit into AF_UNIX and AF_INET SOCK_DGRAM datagrams, and even leaves some room
dd65c9
 + * for a bit of additional metadata. */
dd65c9
 #define DEFAULT_LINE_MAX (48*1024)
dd65c9
@@ -1556,6 +1558,126 @@ static int server_open_hostname(Server *s) {
dd65c9
         return 0;
dd65c9
 }
dd65c9
 
dd65c9
+static int dispatch_notify_event(sd_event_source *es, int fd, uint32_t revents, void *userdata) {
dd65c9
+        Server *s = userdata;
dd65c9
+        int r;
dd65c9
+
dd65c9
+        assert(s);
dd65c9
+        assert(s->notify_event_source == es);
dd65c9
+        assert(s->notify_fd == fd);
dd65c9
+
dd65c9
+        if (revents != EPOLLOUT) {
dd65c9
+                log_error("Invalid events on notify file descriptor.");
dd65c9
+                return -EINVAL;
dd65c9
+        }
dd65c9
+
dd65c9
+        /* The $NOTIFY_SOCKET is writable again, now send exactly one
dd65c9
+         * message on it. Either it's the initial READY=1 event or an
dd65c9
+         * stdout stream event. If there's nothing to write anymore,
dd65c9
+         * turn our event source off. The next time there's something
dd65c9
+         * to send it will be turned on again. */
dd65c9
+
dd65c9
+        if (!s->sent_notify_ready) {
dd65c9
+                static const char p[] =
dd65c9
+                        "READY=1\n"
dd65c9
+                        "STATUS=Processing requests...";
dd65c9
+                ssize_t l;
dd65c9
+
dd65c9
+                l = send(s->notify_fd, p, strlen(p), MSG_DONTWAIT);
dd65c9
+                if (l < 0) {
dd65c9
+                        if (errno == EAGAIN)
dd65c9
+                                return 0;
dd65c9
+
dd65c9
+                        return log_error_errno(errno, "Failed to send READY=1 notification message: %m");
dd65c9
+                }
dd65c9
+
dd65c9
+                s->sent_notify_ready = true;
dd65c9
+                log_debug("Sent READY=1 notification.");
dd65c9
+
dd65c9
+        } else if (s->stdout_streams_notify_queue)
dd65c9
+                /* Dispatch one stream notification event */
dd65c9
+                stdout_stream_send_notify(s->stdout_streams_notify_queue);
dd65c9
+
dd65c9
+        /* Leave us enabled if there's still more to to do. */
dd65c9
+        if (s->stdout_streams_notify_queue)
dd65c9
+                return 0;
dd65c9
+
dd65c9
+        /* There was nothing to do anymore, let's turn ourselves off. */
dd65c9
+        r = sd_event_source_set_enabled(es, SD_EVENT_OFF);
dd65c9
+        if (r < 0)
dd65c9
+                return log_error_errno(r, "Failed to turn off notify event source: %m");
dd65c9
+
dd65c9
+        return 0;
dd65c9
+}
dd65c9
+
dd65c9
+static int server_connect_notify(Server *s) {
dd65c9
+        union sockaddr_union sa = {
dd65c9
+                .un.sun_family = AF_UNIX,
dd65c9
+        };
dd65c9
+        const char *e;
dd65c9
+        int r;
dd65c9
+
dd65c9
+        assert(s);
dd65c9
+        assert(s->notify_fd < 0);
dd65c9
+        assert(!s->notify_event_source);
dd65c9
+
dd65c9
+        /*
dd65c9
+          So here's the problem: we'd like to send notification
dd65c9
+          messages to PID 1, but we cannot do that via sd_notify(),
dd65c9
+          since that's synchronous, and we might end up blocking on
dd65c9
+          it. Specifically: given that PID 1 might block on
dd65c9
+          dbus-daemon during IPC, and dbus-daemon is logging to us,
dd65c9
+          and might hence block on us, we might end up in a deadlock
dd65c9
+          if we block on sending PID 1 notification messages -- by
dd65c9
+          generating a full blocking circle. To avoid this, let's
dd65c9
+          create a non-blocking socket, and connect it to the
dd65c9
+          notification socket, and then wait for POLLOUT before we
dd65c9
+          send anything. This should efficiently avoid any deadlocks,
dd65c9
+          as we'll never block on PID 1, hence PID 1 can safely block
dd65c9
+          on dbus-daemon which can safely block on us again.
dd65c9
+
dd65c9
+          Don't think that this issue is real? It is, see:
dd65c9
+          https://github.com/systemd/systemd/issues/1505
dd65c9
+        */
dd65c9
+
dd65c9
+        e = getenv("NOTIFY_SOCKET");
dd65c9
+        if (!e)
dd65c9
+                return 0;
dd65c9
+
dd65c9
+        if ((e[0] != '@' && e[0] != '/') || e[1] == 0) {
dd65c9
+                log_error("NOTIFY_SOCKET set to an invalid value: %s", e);
dd65c9
+                return -EINVAL;
dd65c9
+        }
dd65c9
+
dd65c9
+        if (strlen(e) > sizeof(sa.un.sun_path)) {
dd65c9
+                log_error("NOTIFY_SOCKET path too long: %s", e);
dd65c9
+                return -EINVAL;
dd65c9
+        }
dd65c9
+
dd65c9
+        s->notify_fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
dd65c9
+        if (s->notify_fd < 0)
dd65c9
+                return log_error_errno(errno, "Failed to create notify socket: %m");
dd65c9
+
dd65c9
+        (void) fd_inc_sndbuf(s->notify_fd, NOTIFY_SNDBUF_SIZE);
dd65c9
+
dd65c9
+        strncpy(sa.un.sun_path, e, sizeof(sa.un.sun_path));
dd65c9
+        if (sa.un.sun_path[0] == '@')
dd65c9
+                sa.un.sun_path[0] = 0;
dd65c9
+
dd65c9
+        r = connect(s->notify_fd, &sa.sa, offsetof(struct sockaddr_un, sun_path) + strlen(e));
dd65c9
+        if (r < 0)
dd65c9
+                return log_error_errno(errno, "Failed to connect to notify socket: %m");
dd65c9
+
dd65c9
+        r = sd_event_add_io(s->event, &s->notify_event_source, s->notify_fd, EPOLLOUT, dispatch_notify_event, s);
dd65c9
+        if (r < 0)
dd65c9
+                return log_error_errno(r, "Failed to watch notification socket: %m");
dd65c9
+
dd65c9
+        /* This should fire pretty soon, which we'll use to send the
dd65c9
+         * READY=1 event. */
dd65c9
+
dd65c9
+        return 0;
dd65c9
+}
dd65c9
+
dd65c9
 int server_init(Server *s) {
dd65c9
         _cleanup_fdset_free_ FDSet *fds = NULL;
dd65c9
         int n, r, fd;
dd65c9
@@ -1563,7 +1685,7 @@ int server_init(Server *s) {
dd65c9
         assert(s);
dd65c9
 
dd65c9
         zero(*s);
dd65c9
-        s->syslog_fd = s->native_fd = s->stdout_fd = s->dev_kmsg_fd = s->audit_fd = s->hostname_fd = -1;
dd65c9
+        s->syslog_fd = s->native_fd = s->stdout_fd = s->dev_kmsg_fd = s->audit_fd = s->hostname_fd = s->notify_fd = -1;
dd65c9
         s->compress = true;
dd65c9
         s->seal = true;
dd65c9
 
dd65c9
@@ -1611,8 +1733,6 @@ int server_init(Server *s) {
dd65c9
         if (r < 0)
dd65c9
                 return log_error_errno(r, "Failed to create event loop: %m");
dd65c9
 
dd65c9
-        sd_event_set_watchdog(s->event, true);
dd65c9
-
dd65c9
         n = sd_listen_fds(true);
dd65c9
         if (n < 0)
dd65c9
                 return log_error_errno(n, "Failed to read listening file descriptors from environment: %m");
dd65c9
@@ -1718,6 +1838,8 @@ int server_init(Server *s) {
dd65c9
         server_cache_boot_id(s);
dd65c9
         server_cache_machine_id(s);
dd65c9
 
dd65c9
+        (void) server_connect_notify(s);
dd65c9
+
dd65c9
         r = system_journal_open(s, false);
dd65c9
         if (r < 0)
dd65c9
                 return r;
dd65c9
@@ -1770,6 +1892,7 @@ void server_done(Server *s) {
dd65c9
         sd_event_source_unref(s->sigterm_event_source);
dd65c9
         sd_event_source_unref(s->sigint_event_source);
dd65c9
         sd_event_source_unref(s->hostname_event_source);
dd65c9
+        sd_event_source_unref(s->notify_event_source);
dd65c9
         sd_event_unref(s->event);
dd65c9
 
dd65c9
         safe_close(s->syslog_fd);
dd65c9
@@ -1778,6 +1901,7 @@ void server_done(Server *s) {
dd65c9
         safe_close(s->dev_kmsg_fd);
dd65c9
         safe_close(s->audit_fd);
dd65c9
         safe_close(s->hostname_fd);
dd65c9
+        safe_close(s->notify_fd);
dd65c9
 
dd65c9
         if (s->rate_limit)
dd65c9
                 journal_rate_limit_free(s->rate_limit);
dd65c9
diff --git a/src/journal/journald-server.h b/src/journal/journald-server.h
dd65c9
index b29410778..e59ff35e2 100644
dd65c9
--- a/src/journal/journald-server.h
dd65c9
+++ b/src/journal/journald-server.h
dd65c9
@@ -27,12 +27,15 @@
dd65c9
 #include <sys/types.h>
dd65c9
 #include <sys/socket.h>
dd65c9
 
dd65c9
+typedef struct Server Server;
dd65c9
+
dd65c9
 #include "sd-event.h"
dd65c9
 #include "journal-file.h"
dd65c9
 #include "hashmap.h"
dd65c9
 #include "util.h"
dd65c9
 #include "audit.h"
dd65c9
 #include "journald-rate-limit.h"
dd65c9
+#include "journald-stream.h"
dd65c9
 #include "list.h"
dd65c9
 
dd65c9
 typedef enum Storage {
dd65c9
@@ -52,15 +55,14 @@ typedef enum SplitMode {
dd65c9
         _SPLIT_INVALID = -1
dd65c9
 } SplitMode;
dd65c9
 
dd65c9
-typedef struct StdoutStream StdoutStream;
dd65c9
-
dd65c9
-typedef struct Server {
dd65c9
+struct Server {
dd65c9
         int syslog_fd;
dd65c9
         int native_fd;
dd65c9
         int stdout_fd;
dd65c9
         int dev_kmsg_fd;
dd65c9
         int audit_fd;
dd65c9
         int hostname_fd;
dd65c9
+        int notify_fd;
dd65c9
 
dd65c9
         sd_event *event;
dd65c9
 
dd65c9
@@ -75,6 +77,7 @@ typedef struct Server {
dd65c9
         sd_event_source *sigterm_event_source;
dd65c9
         sd_event_source *sigint_event_source;
dd65c9
         sd_event_source *hostname_event_source;
dd65c9
+        sd_event_source *notify_event_source;
dd65c9
 
dd65c9
         JournalFile *runtime_journal;
dd65c9
         JournalFile *system_journal;
dd65c9
@@ -114,6 +117,7 @@ typedef struct Server {
dd65c9
         usec_t oldest_file_usec;
dd65c9
 
dd65c9
         LIST_HEAD(StdoutStream, stdout_streams);
dd65c9
+        LIST_HEAD(StdoutStream, stdout_streams_notify_queue);
dd65c9
         unsigned n_stdout_streams;
dd65c9
 
dd65c9
         char *tty_path;
dd65c9
@@ -135,6 +139,7 @@ typedef struct Server {
dd65c9
 
dd65c9
         struct udev *udev;
dd65c9
 
dd65c9
+        bool sent_notify_ready;
dd65c9
         bool sync_scheduled;
dd65c9
 
dd65c9
         char machine_id_field[sizeof("_MACHINE_ID=") + 32];
dd65c9
@@ -145,7 +150,7 @@ typedef struct Server {
dd65c9
         char *cgroup_root;
dd65c9
 
dd65c9
         size_t line_max;
dd65c9
-} Server;
dd65c9
+};
dd65c9
 
dd65c9
 #define N_IOVEC_META_FIELDS 20
dd65c9
 #define N_IOVEC_KERNEL_FIELDS 64
dd65c9
diff --git a/src/journal/journald-stream.c b/src/journal/journald-stream.c
dd65c9
index 4d6b7ad18..9118d1a31 100644
dd65c9
--- a/src/journal/journald-stream.c
dd65c9
+++ b/src/journal/journald-stream.c
dd65c9
@@ -80,6 +80,7 @@ struct StdoutStream {
dd65c9
         bool forward_to_console:1;
dd65c9
 
dd65c9
         bool fdstore:1;
dd65c9
+        bool in_notify_queue:1;
dd65c9
 
dd65c9
         char *buffer;
dd65c9
         size_t length;
dd65c9
@@ -90,6 +91,7 @@ struct StdoutStream {
dd65c9
         char *state_file;
dd65c9
 
dd65c9
         LIST_FIELDS(StdoutStream, stdout_stream);
dd65c9
+        LIST_FIELDS(StdoutStream, stdout_stream_notify_queue);
dd65c9
 
dd65c9
         char id_field[sizeof("_STREAM_ID=")-1 + SD_ID128_STRING_MAX];
dd65c9
 };
dd65c9
@@ -102,6 +104,9 @@ void stdout_stream_free(StdoutStream *s) {
dd65c9
                 assert(s->server->n_stdout_streams > 0);
dd65c9
                 s->server->n_stdout_streams --;
dd65c9
                 LIST_REMOVE(stdout_stream, s->server->stdout_streams, s);
dd65c9
+
dd65c9
+                if (s->in_notify_queue)
dd65c9
+                        LIST_REMOVE(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s);
dd65c9
         }
dd65c9
 
dd65c9
         if (s->event_source) {
dd65c9
@@ -126,7 +131,7 @@ static void stdout_stream_destroy(StdoutStream *s) {
dd65c9
                 return;
dd65c9
 
dd65c9
         if (s->state_file)
dd65c9
-                unlink(s->state_file);
dd65c9
+                (void) unlink(s->state_file);
dd65c9
 
dd65c9
         stdout_stream_free(s);
dd65c9
 }
dd65c9
@@ -210,11 +215,15 @@ static int stdout_stream_save(StdoutStream *s) {
dd65c9
         free(temp_path);
dd65c9
         temp_path = NULL;
dd65c9
 
dd65c9
-        /* Store the connection fd in PID 1, so that we get it passed
dd65c9
-         * in again on next start */
dd65c9
-        if (!s->fdstore) {
dd65c9
-                sd_pid_notify_with_fds(0, false, "FDSTORE=1", &s->fd, 1);
dd65c9
-                s->fdstore = true;
dd65c9
+        if (!s->fdstore && !s->in_notify_queue) {
dd65c9
+                LIST_PREPEND(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s);
dd65c9
+                s->in_notify_queue = true;
dd65c9
+
dd65c9
+                if (s->server->notify_event_source) {
dd65c9
+                        r = sd_event_source_set_enabled(s->server->notify_event_source, SD_EVENT_ON);
dd65c9
+                        if (r < 0)
dd65c9
+                                log_warning_errno(r, "Failed to enable notify event source: %m");
dd65c9
+                }
dd65c9
         }
dd65c9
 
dd65c9
 finish:
dd65c9
@@ -801,3 +810,50 @@ int server_open_stdout_socket(Server *s, FDSet *fds) {
dd65c9
 
dd65c9
         return 0;
dd65c9
 }
dd65c9
+
dd65c9
+void stdout_stream_send_notify(StdoutStream *s) {
dd65c9
+        struct iovec iovec = {
dd65c9
+                .iov_base = (char*) "FDSTORE=1",
dd65c9
+                .iov_len = strlen("FDSTORE=1"),
dd65c9
+        };
dd65c9
+        struct msghdr msghdr = {
dd65c9
+                .msg_iov = &iovec,
dd65c9
+                .msg_iovlen = 1,
dd65c9
+        };
dd65c9
+        struct cmsghdr *cmsg;
dd65c9
+        ssize_t l;
dd65c9
+
dd65c9
+        assert(s);
dd65c9
+        assert(!s->fdstore);
dd65c9
+        assert(s->in_notify_queue);
dd65c9
+        assert(s->server);
dd65c9
+        assert(s->server->notify_fd >= 0);
dd65c9
+
dd65c9
+        /* Store the connection fd in PID 1, so that we get it passed
dd65c9
+         * in again on next start */
dd65c9
+
dd65c9
+        msghdr.msg_controllen = CMSG_SPACE(sizeof(int));
dd65c9
+        msghdr.msg_control = alloca0(msghdr.msg_controllen);
dd65c9
+
dd65c9
+        cmsg = CMSG_FIRSTHDR(&msghdr);
dd65c9
+        cmsg->cmsg_level = SOL_SOCKET;
dd65c9
+        cmsg->cmsg_type = SCM_RIGHTS;
dd65c9
+        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
dd65c9
+
dd65c9
+        memcpy(CMSG_DATA(cmsg), &s->fd, sizeof(int));
dd65c9
+
dd65c9
+        l = sendmsg(s->server->notify_fd, &msghdr, MSG_DONTWAIT|MSG_NOSIGNAL);
dd65c9
+        if (l < 0) {
dd65c9
+                if (errno == EAGAIN)
dd65c9
+                        return;
dd65c9
+
dd65c9
+                log_error_errno(errno, "Failed to send stream file descriptor to service manager: %m");
dd65c9
+        } else {
dd65c9
+                log_debug("Successfully sent stream file descriptor to service manager.");
dd65c9
+                s->fdstore = 1;
dd65c9
+        }
dd65c9
+
dd65c9
+        LIST_REMOVE(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s);
dd65c9
+        s->in_notify_queue = false;
dd65c9
+
dd65c9
+}
dd65c9
diff --git a/src/journal/journald-stream.h b/src/journal/journald-stream.h
dd65c9
index 94bf955d7..fd8c94fb6 100644
dd65c9
--- a/src/journal/journald-stream.h
dd65c9
+++ b/src/journal/journald-stream.h
dd65c9
@@ -21,9 +21,12 @@
dd65c9
   along with systemd; If not, see <http://www.gnu.org/licenses/>.
dd65c9
 ***/
dd65c9
 
dd65c9
+typedef struct StdoutStream StdoutStream;
dd65c9
+
dd65c9
 #include "fdset.h"
dd65c9
 #include "journald-server.h"
dd65c9
 
dd65c9
 int server_open_stdout_socket(Server *s, FDSet *fds);
dd65c9
 
dd65c9
 void stdout_stream_free(StdoutStream *s);
dd65c9
+void stdout_stream_send_notify(StdoutStream *s);
dd65c9
diff --git a/src/journal/journald.c b/src/journal/journald.c
dd65c9
index 15bbcbe3d..b7ba2b6ec 100644
dd65c9
--- a/src/journal/journald.c
dd65c9
+++ b/src/journal/journald.c
dd65c9
@@ -64,10 +64,6 @@ int main(int argc, char *argv[]) {
dd65c9
         log_debug("systemd-journald running as pid "PID_FMT, getpid());
dd65c9
         server_driver_message(&server, SD_MESSAGE_JOURNAL_START, "Journal started");
dd65c9
 
dd65c9
-        sd_notify(false,
dd65c9
-                  "READY=1\n"
dd65c9
-                  "STATUS=Processing requests...");
dd65c9
-
dd65c9
         for (;;) {
dd65c9
                 usec_t t = USEC_INFINITY, n;
dd65c9
 
dd65c9
@@ -120,10 +116,6 @@ int main(int argc, char *argv[]) {
dd65c9
         server_driver_message(&server, SD_MESSAGE_JOURNAL_STOP, "Journal stopped");
dd65c9
 
dd65c9
 finish:
dd65c9
-        sd_notify(false,
dd65c9
-                  "STOPPING=1\n"
dd65c9
-                  "STATUS=Shutting down...");
dd65c9
-
dd65c9
         server_done(&server);
dd65c9
 
dd65c9
         return r < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
dd65c9
diff --git a/units/systemd-journald.service.in b/units/systemd-journald.service.in
dd65c9
index 0d1ea61fe..c94c0bfba 100644
dd65c9
--- a/units/systemd-journald.service.in
dd65c9
+++ b/units/systemd-journald.service.in
dd65c9
@@ -22,7 +22,6 @@ RestartSec=0
dd65c9
 StandardOutput=null
dd65c9
 FileDescriptorStoreMax=4224
dd65c9
 CapabilityBoundingSet=CAP_SYS_ADMIN CAP_DAC_OVERRIDE CAP_SYS_PTRACE CAP_SYSLOG CAP_AUDIT_CONTROL CAP_AUDIT_READ CAP_CHOWN CAP_DAC_READ_SEARCH CAP_FOWNER CAP_SETUID CAP_SETGID CAP_MAC_OVERRIDE
dd65c9
-WatchdogSec=3min
dd65c9
 
dd65c9
 # Increase the default a bit in order to allow many simultaneous
dd65c9
 # services being run since we keep one fd open per service. Also, when