Blob Blame History Raw
From 92b12c7dc013c95bd0d35bae99ff6df023ce0e1f Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Wed, 4 May 2016 20:43:23 +0200
Subject: [PATCH] core: use an AF_UNIX/SOCK_DGRAM socket for cgroup agent
 notification

dbus-daemon currently uses a backlog of 30 on its D-bus system bus socket. On
overloaded systems this means that only 30 connections may be queued without
dbus-daemon processing them before further connection attempts fail. Our
cgroups-agent binary so far used D-Bus for its messaging, and hitting this
limit hence may result in us losing cgroup empty messages.

This patch adds a seperate cgroup agent socket of type AF_UNIX/SOCK_DGRAM.
Since sockets of these types need no connection set up, no listen() backlog
applies. Our cgroup-agent binary will hence simply block as long as it can't
enqueue its datagram message, so that we won't lose cgroup empty messages as
likely anymore.

This also rearranges the ordering of the processing of SIGCHLD signals, service
notification messages (sd_notify()...) and the two types of cgroup
notifications (inotify for the unified hierarchy support, and agent for the
classic hierarchy support). We now always process events for these in the
following order:

  1. service notification messages  (SD_EVENT_PRIORITY_NORMAL-7)
  2. SIGCHLD signals (SD_EVENT_PRIORITY_NORMAL-6)
  3. cgroup inotify and cgroup agent (SD_EVENT_PRIORITY_NORMAL-5)

This is because when receiving SIGCHLD we invalidate PID information, which we
need to process the service notification messages which are bound to PIDs.
Hence the order between the first two items. And we want to process SIGCHLD
metadata to detect whether a service is gone, before using cgroup
notifications, to decide when a service is gone, since the former carries more
useful metadata.

Related to this:
https://bugs.freedesktop.org/show_bug.cgi?id=95264
https://github.com/systemd/systemd/issues/1961

Cherry-picked from: d8fdc62037b5b0a9fd603ad5efd6b49f956f86b5
Resolves: #1305608
---
 src/cgroups-agent/cgroups-agent.c |  48 +++++-----
 src/core/cgroup.c                 |   2 +
 src/core/dbus.c                   |  56 ++++++-----
 src/core/dbus.h                   |   2 +
 src/core/manager.c                | 149 ++++++++++++++++++++++++++++--
 src/core/manager.h                |   3 +
 6 files changed, 198 insertions(+), 62 deletions(-)

diff --git a/src/cgroups-agent/cgroups-agent.c b/src/cgroups-agent/cgroups-agent.c
index 529e843030..2fe65830e7 100644
--- a/src/cgroups-agent/cgroups-agent.c
+++ b/src/cgroups-agent/cgroups-agent.c
@@ -1,5 +1,3 @@
-/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
-
 /***
   This file is part of systemd.
 
@@ -20,14 +18,21 @@
 ***/
 
 #include <stdlib.h>
+#include <sys/socket.h>
 
-#include "sd-bus.h"
 #include "log.h"
-#include "bus-util.h"
+#include "socket-util.h"
 
 int main(int argc, char *argv[]) {
-        _cleanup_bus_close_unref_ sd_bus *bus = NULL;
-        int r;
+
+        static const union sockaddr_union sa = {
+                .un.sun_family = AF_UNIX,
+                .un.sun_path = "/run/systemd/cgroups-agent",
+        };
+
+        _cleanup_close_ int fd = -1;
+        ssize_t n;
+        size_t l;
 
         if (argc != 2) {
                 log_error("Incorrect number of arguments.");
@@ -38,27 +43,22 @@ int main(int argc, char *argv[]) {
         log_parse_environment();
         log_open();
 
-        /* We send this event to the private D-Bus socket and then the
-         * system instance will forward this to the system bus. We do
-         * this to avoid an activation loop when we start dbus when we
-         * are called when the dbus service is shut down. */
-
-        r = bus_open_system_systemd(&bus);
-        if (r < 0) {
-                /* If we couldn't connect we assume this was triggered
-                 * while systemd got restarted/transitioned from
-                 * initrd to the system, so let's ignore this */
-                log_debug_errno(r, "Failed to get D-Bus connection: %m");
+        fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0);
+        if (fd < 0) {
+                log_debug_errno(errno, "Failed to allocate socket: %m");
+                return EXIT_FAILURE;
+        }
+
+        l = strlen(argv[1]);
+
+        n = sendto(fd, argv[1], l, 0, &sa.sa, offsetof(struct sockaddr_un, sun_path) + strlen(sa.un.sun_path));
+        if (n < 0) {
+                log_debug_errno(errno, "Failed to send cgroups agent message: %m");
                 return EXIT_FAILURE;
         }
 
-        r = sd_bus_emit_signal(bus,
-                               "/org/freedesktop/systemd1/agent",
-                               "org.freedesktop.systemd1.Agent",
-                               "Released",
-                               "s", argv[1]);
-        if (r < 0) {
-                log_debug_errno(r, "Failed to send signal message on private connection: %m");
+        if ((size_t) n != l) {
+                log_debug("Datagram size mismatch");
                 return EXIT_FAILURE;
         }
 
diff --git a/src/core/cgroup.c b/src/core/cgroup.c
index 10fdcc9984..b7f08fb420 100644
--- a/src/core/cgroup.c
+++ b/src/core/cgroup.c
@@ -1028,6 +1028,8 @@ int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
         assert(m);
         assert(cgroup);
 
+        log_debug("Got cgroup empty notification for: %s", cgroup);
+
         u = manager_get_unit_by_cgroup(m, cgroup);
         if (u) {
                 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
diff --git a/src/core/dbus.c b/src/core/dbus.c
index 85b5174868..29524d49a3 100644
--- a/src/core/dbus.c
+++ b/src/core/dbus.c
@@ -72,12 +72,37 @@ int bus_send_queued_message(Manager *m) {
         return 0;
 }
 
+int bus_forward_agent_released(Manager *m, const char *path) {
+        int r;
+
+        assert(m);
+        assert(path);
+
+        if (!m->running_as == SYSTEMD_SYSTEM)
+                return 0;
+
+        if (!m->system_bus)
+                return 0;
+
+        /* If we are running a system instance we forward the agent message on the system bus, so that the user
+         * instances get notified about this, too */
+
+        r = sd_bus_emit_signal(m->system_bus,
+                               "/org/freedesktop/systemd1/agent",
+                               "org.freedesktop.systemd1.Agent",
+                               "Released",
+                               "s", path);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to propagate agent release message: %m");
+
+        return 1;
+}
+
 static int signal_agent_released(sd_bus *bus, sd_bus_message *message, void *userdata, sd_bus_error *error) {
         Manager *m = userdata;
         const char *cgroup;
         int r;
 
-        assert(bus);
         assert(message);
         assert(m);
 
@@ -88,16 +113,6 @@ static int signal_agent_released(sd_bus *bus, sd_bus_message *message, void *use
         }
 
         manager_notify_cgroup_empty(m, cgroup);
-
-        if (m->running_as == SYSTEMD_SYSTEM && m->system_bus) {
-                /* If we are running as system manager, forward the
-                 * message to the system bus */
-
-                r = sd_bus_send(m->system_bus, message, NULL);
-                if (r < 0)
-                        log_warning_errno(r, "Failed to forward Released message: %m");
-        }
-
         return 0;
 }
 
@@ -679,25 +694,6 @@ static int bus_on_connection(sd_event_source *s, int fd, uint32_t revents, void
                 return 0;
         }
 
-        if (m->running_as == SYSTEMD_SYSTEM) {
-                /* When we run as system instance we get the Released
-                 * signal via a direct connection */
-
-                r = sd_bus_add_match(
-                                bus,
-                                NULL,
-                                "type='signal',"
-                                "interface='org.freedesktop.systemd1.Agent',"
-                                "member='Released',"
-                                "path='/org/freedesktop/systemd1/agent'",
-                                signal_agent_released, m);
-
-                if (r < 0) {
-                        log_warning_errno(r, "Failed to register Released match on new connection bus: %m");
-                        return 0;
-                }
-        }
-
         r = bus_setup_disconnected_match(m, bus);
         if (r < 0)
                 return 0;
diff --git a/src/core/dbus.h b/src/core/dbus.h
index d04f5326c6..c27d136e30 100644
--- a/src/core/dbus.h
+++ b/src/core/dbus.h
@@ -40,3 +40,5 @@ int bus_verify_manage_unit_async(Manager *m, sd_bus_message *call, sd_bus_error
 int bus_verify_manage_unit_async_for_kill(Manager *m, sd_bus_message *call, sd_bus_error *error);
 int bus_verify_manage_unit_files_async(Manager *m, sd_bus_message *call, sd_bus_error *error);
 int bus_verify_reload_daemon_async(Manager *m, sd_bus_message *call, sd_bus_error *error);
+
+int bus_forward_agent_released(Manager *m, const char *path);
diff --git a/src/core/manager.c b/src/core/manager.c
index ee456fb790..370c8cbbed 100644
--- a/src/core/manager.c
+++ b/src/core/manager.c
@@ -83,8 +83,10 @@
 #define JOBS_IN_PROGRESS_WAIT_USEC (5*USEC_PER_SEC)
 #define JOBS_IN_PROGRESS_PERIOD_USEC (USEC_PER_SEC / 3)
 #define JOBS_IN_PROGRESS_PERIOD_DIVISOR 3
+#define CGROUPS_AGENT_RCVBUF_SIZE (8*1024*1024)
 
 static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
+static int manager_dispatch_cgroups_agent_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
 static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
 static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
 static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
@@ -456,11 +458,11 @@ static int manager_setup_signals(Manager *m) {
         if (r < 0)
                 return r;
 
-        /* Process signals a bit earlier than the rest of things, but
-         * later than notify_fd processing, so that the notify
-         * processing can still figure out to which process/service a
-         * message belongs, before we reap the process. */
-        r = sd_event_source_set_priority(m->signal_event_source, -5);
+        /* Process signals a bit earlier than the rest of things, but later than notify_fd processing, so that the
+         * notify processing can still figure out to which process/service a message belongs, before we reap the
+         * process. Also, process this before handling cgroup notifications, so that we always collect child exit
+         * status information before detecting that there's no process in a cgroup. */
+        r = sd_event_source_set_priority(m->signal_event_source, -6);
         if (r < 0)
                 return r;
 
@@ -541,7 +543,7 @@ int manager_new(SystemdRunningAs running_as, bool test_run, Manager **_m) {
 
         m->idle_pipe[0] = m->idle_pipe[1] = m->idle_pipe[2] = m->idle_pipe[3] = -1;
 
-        m->pin_cgroupfs_fd = m->notify_fd = m->signal_fd = m->time_change_fd = m->dev_autofs_fd = m->private_listen_fd = m->kdbus_fd = m->utab_inotify_fd = -1;
+        m->pin_cgroupfs_fd = m->notify_fd = m->cgroups_agent_fd = m->signal_fd = m->time_change_fd = m->dev_autofs_fd = m->private_listen_fd = m->kdbus_fd = m->utab_inotify_fd = -1;
         m->current_job_id = 1; /* start as id #1, so that we can leave #0 around as "null-like" value */
 
         m->ask_password_inotify_fd = -1;
@@ -689,8 +691,8 @@ static int manager_setup_notify(Manager *m) {
                 if (r < 0)
                         return log_error_errno(r, "Failed to allocate notify event source: %m");
 
-                /* Process signals a bit earlier than SIGCHLD, so that we can
-                 * still identify to which service an exit message belongs */
+                /* Process notification messages a bit earlier than SIGCHLD, so that we can still identify to which
+                 * service an exit message belongs. */
                 r = sd_event_source_set_priority(m->notify_event_source, -7);
                 if (r < 0)
                         return log_error_errno(r, "Failed to set priority of notify event source: %m");
@@ -699,6 +701,77 @@ static int manager_setup_notify(Manager *m) {
         return 0;
 }
 
+static int manager_setup_cgroups_agent(Manager *m) {
+
+        static const union sockaddr_union sa = {
+                .un.sun_family = AF_UNIX,
+                .un.sun_path = "/run/systemd/cgroups-agent",
+        };
+        int r;
+
+        /* This creates a listening socket we receive cgroups agent messages on. We do not use D-Bus for delivering
+         * these messages from the cgroups agent binary to PID 1, as the cgroups agent binary is very short-living, and
+         * each instance of it needs a new D-Bus connection. Since D-Bus connections are SOCK_STREAM/AF_UNIX, on
+         * overloaded systems the backlog of the D-Bus socket becomes relevant, as not more than the configured number
+         * of D-Bus connections may be queued until the kernel will start dropping further incoming connections,
+         * possibly resulting in lost cgroups agent messages. To avoid this, we'll use a private SOCK_DGRAM/AF_UNIX
+         * socket, where no backlog is relevant as communication may take place without an actual connect() cycle, and
+         * we thus won't lose messages.
+         *
+         * Note that PID 1 will forward the agent message to system bus, so that the user systemd instance may listen
+         * to it. The system instance hence listens on this special socket, but the user instances listen on the system
+         * bus for these messages. */
+
+        if (m->test_run)
+                return 0;
+
+        if (!m->running_as == SYSTEMD_SYSTEM)
+                return 0;
+
+        if (m->cgroups_agent_fd < 0) {
+                _cleanup_close_ int fd = -1;
+
+                /* First free all secondary fields */
+                m->cgroups_agent_event_source = sd_event_source_unref(m->cgroups_agent_event_source);
+
+                fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+                if (fd < 0)
+                        return log_error_errno(errno, "Failed to allocate cgroups agent socket: %m");
+
+                fd_inc_rcvbuf(fd, CGROUPS_AGENT_RCVBUF_SIZE);
+
+                (void) unlink(sa.un.sun_path);
+
+                /* Only allow root to connect to this socket */
+                RUN_WITH_UMASK(0077)
+                        r = bind(fd, &sa.sa, offsetof(struct sockaddr_un, sun_path) + strlen(sa.un.sun_path));
+                if (r < 0)
+                        return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
+
+                m->cgroups_agent_fd = fd;
+                fd = -1;
+        }
+
+        if (!m->cgroups_agent_event_source) {
+                r = sd_event_add_io(m->event, &m->cgroups_agent_event_source, m->cgroups_agent_fd, EPOLLIN, manager_dispatch_cgroups_agent_fd, m);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to allocate cgroups agent event source: %m");
+
+                /* Process cgroups notifications early, but after having processed service notification messages or
+                 * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of notification,
+                 * and we collected the metadata the notification and SIGCHLD stuff offers first. Also see handling of
+                 * cgroup inotify for the unified cgroup stuff. */
+                r = sd_event_source_set_priority(m->cgroups_agent_event_source, SD_EVENT_PRIORITY_NORMAL-5);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to set priority of cgroups agent event source: %m");
+
+                (void) sd_event_source_set_description(m->cgroups_agent_event_source, "manager-cgroups-agent");
+        }
+
+        return 0;
+}
+
+
 static int manager_setup_kdbus(Manager *m) {
 #ifdef ENABLE_KDBUS
         _cleanup_free_ char *p = NULL;
@@ -912,6 +985,7 @@ Manager* manager_free(Manager *m) {
 
         sd_event_source_unref(m->signal_event_source);
         sd_event_source_unref(m->notify_event_source);
+        sd_event_source_unref(m->cgroups_agent_event_source);
         sd_event_source_unref(m->time_change_event_source);
         sd_event_source_unref(m->jobs_in_progress_event_source);
         sd_event_source_unref(m->idle_pipe_event_source);
@@ -919,6 +993,7 @@ Manager* manager_free(Manager *m) {
 
         safe_close(m->signal_fd);
         safe_close(m->notify_fd);
+        safe_close(m->cgroups_agent_fd);
         safe_close(m->time_change_fd);
         safe_close(m->kdbus_fd);
 
@@ -1167,6 +1242,10 @@ int manager_startup(Manager *m, FILE *serialization, FDSet *fds) {
         if (q < 0 && r == 0)
                 r = q;
 
+        q = manager_setup_cgroups_agent(m);
+        if (q < 0 && r == 0)
+                r = q;
+
         /* We might have deserialized the kdbus control fd, but if we
          * didn't, then let's create the bus now. */
         manager_setup_kdbus(m);
@@ -1492,6 +1571,35 @@ static unsigned manager_dispatch_dbus_queue(Manager *m) {
         return n;
 }
 
+static int manager_dispatch_cgroups_agent_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+        Manager *m = userdata;
+        char buf[PATH_MAX+1];
+        ssize_t n;
+
+        n = recv(fd, buf, sizeof(buf), 0);
+        if (n < 0)
+                return log_error_errno(errno, "Failed to read cgroups agent message: %m");
+        if (n == 0) {
+                log_error("Got zero-length cgroups agent message, ignoring.");
+                return 0;
+        }
+        if ((size_t) n >= sizeof(buf)) {
+                log_error("Got overly long cgroups agent message, ignoring.");
+                return 0;
+        }
+
+        if (memchr(buf, 0, n)) {
+                log_error("Got cgroups agent message with embedded NUL byte, ignoring.");
+                return 0;
+        }
+        buf[n] = 0;
+
+        manager_notify_cgroup_empty(m, buf);
+        bus_forward_agent_released(m, buf);
+
+        return 0;
+}
+
 static void manager_invoke_notify_message(Manager *m, Unit *u, pid_t pid, char *buf, size_t n, FDSet *fds) {
         _cleanup_strv_free_ char **tags = NULL;
 
@@ -2304,6 +2412,16 @@ int manager_serialize(Manager *m, FILE *f, FDSet *fds, bool switching_root) {
                 fprintf(f, "notify-socket=%s\n", m->notify_socket);
         }
 
+        if (m->cgroups_agent_fd >= 0) {
+                int copy;
+
+                copy = fdset_put_dup(fds, m->cgroups_agent_fd);
+                if (copy < 0)
+                        return copy;
+
+                fprintf(f, "cgroups-agent-fd=%i\n", copy);
+        }
+
         if (m->kdbus_fd >= 0) {
                 int copy;
 
@@ -2473,6 +2591,17 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) {
                         free(m->notify_socket);
                         m->notify_socket = n;
 
+                } else if (startswith(l, "cgroups-agent-fd=")) {
+                        int fd;
+
+                        if (safe_atoi(l + 17, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd))
+                                log_debug("Failed to parse cgroups agent fd: %s", l + 10);
+                        else {
+                                m->cgroups_agent_event_source = sd_event_source_unref(m->cgroups_agent_event_source);
+                                safe_close(m->cgroups_agent_fd);
+                                m->cgroups_agent_fd = fdset_remove(fds, fd);
+                        }
+
                 } else if (startswith(l, "kdbus-fd=")) {
                         int fd;
 
@@ -2599,6 +2728,10 @@ int manager_reload(Manager *m) {
         if (q < 0 && r >= 0)
                 r = q;
 
+        q = manager_setup_cgroups_agent(m);
+        if (q < 0 && r >= 0)
+                r = q;
+
         /* Third, fire things up! */
         q = manager_coldplug(m);
         if (q < 0 && r >= 0)
diff --git a/src/core/manager.h b/src/core/manager.h
index d3971f1684..3e855db466 100644
--- a/src/core/manager.h
+++ b/src/core/manager.h
@@ -137,6 +137,9 @@ struct Manager {
         int notify_fd;
         sd_event_source *notify_event_source;
 
+        int cgroups_agent_fd;
+        sd_event_source *cgroups_agent_event_source;
+
         int signal_fd;
         sd_event_source *signal_event_source;