diff --git a/SOURCES/0279-core-use-an-AF_UNIX-SOCK_DGRAM-socket-for-cgroup-age.patch b/SOURCES/0279-core-use-an-AF_UNIX-SOCK_DGRAM-socket-for-cgroup-age.patch new file mode 100644 index 0000000..7690756 --- /dev/null +++ b/SOURCES/0279-core-use-an-AF_UNIX-SOCK_DGRAM-socket-for-cgroup-age.patch @@ -0,0 +1,489 @@ +From 72696594d40d538e26f6522e064d79f70c8b2f9e Mon Sep 17 00:00:00 2001 +From: Lennart Poettering +Date: Wed, 4 May 2016 20:43:23 +0200 +Subject: [PATCH] core: use an AF_UNIX/SOCK_DGRAM socket for cgroup agent + notification + +dbus-daemon currently uses a backlog of 30 on its D-bus system bus socket. On +overloaded systems this means that only 30 connections may be queued without +dbus-daemon processing them before further connection attempts fail. Our +cgroups-agent binary so far used D-Bus for its messaging, and hitting this +limit hence may result in us losing cgroup empty messages. + +This patch adds a seperate cgroup agent socket of type AF_UNIX/SOCK_DGRAM. +Since sockets of these types need no connection set up, no listen() backlog +applies. Our cgroup-agent binary will hence simply block as long as it can't +enqueue its datagram message, so that we won't lose cgroup empty messages as +likely anymore. + +This also rearranges the ordering of the processing of SIGCHLD signals, service +notification messages (sd_notify()...) and the two types of cgroup +notifications (inotify for the unified hierarchy support, and agent for the +classic hierarchy support). We now always process events for these in the +following order: + + 1. service notification messages (SD_EVENT_PRIORITY_NORMAL-7) + 2. SIGCHLD signals (SD_EVENT_PRIORITY_NORMAL-6) + 3. cgroup inotify and cgroup agent (SD_EVENT_PRIORITY_NORMAL-5) + +This is because when receiving SIGCHLD we invalidate PID information, which we +need to process the service notification messages which are bound to PIDs. +Hence the order between the first two items. 
And we want to process SIGCHLD +metadata to detect whether a service is gone, before using cgroup +notifications, to decide when a service is gone, since the former carries more +useful metadata. + +Related to this: +https://bugs.freedesktop.org/show_bug.cgi?id=95264 +https://github.com/systemd/systemd/issues/1961 + +Cherry-picked from: d8fdc62037b5b0a9fd603ad5efd6b49f956f86b5 +Resolves: #1305608 +--- + src/cgroups-agent/cgroups-agent.c | 48 ++++++------ + src/core/cgroup.c | 2 + + src/core/dbus.c | 56 +++++++------- + src/core/dbus.h | 2 + + src/core/manager.c | 149 ++++++++++++++++++++++++++++++++++++-- + src/core/manager.h | 3 + + 6 files changed, 198 insertions(+), 62 deletions(-) + +diff --git a/src/cgroups-agent/cgroups-agent.c b/src/cgroups-agent/cgroups-agent.c +index 529e843..2fe6583 100644 +--- a/src/cgroups-agent/cgroups-agent.c ++++ b/src/cgroups-agent/cgroups-agent.c +@@ -1,5 +1,3 @@ +-/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ +- + /*** + This file is part of systemd. + +@@ -20,14 +18,21 @@ + ***/ + + #include ++#include + +-#include "sd-bus.h" + #include "log.h" +-#include "bus-util.h" ++#include "socket-util.h" + + int main(int argc, char *argv[]) { +- _cleanup_bus_close_unref_ sd_bus *bus = NULL; +- int r; ++ ++ static const union sockaddr_union sa = { ++ .un.sun_family = AF_UNIX, ++ .un.sun_path = "/run/systemd/cgroups-agent", ++ }; ++ ++ _cleanup_close_ int fd = -1; ++ ssize_t n; ++ size_t l; + + if (argc != 2) { + log_error("Incorrect number of arguments."); +@@ -38,27 +43,22 @@ int main(int argc, char *argv[]) { + log_parse_environment(); + log_open(); + +- /* We send this event to the private D-Bus socket and then the +- * system instance will forward this to the system bus. We do +- * this to avoid an activation loop when we start dbus when we +- * are called when the dbus service is shut down. 
*/ +- +- r = bus_open_system_systemd(&bus); +- if (r < 0) { +- /* If we couldn't connect we assume this was triggered +- * while systemd got restarted/transitioned from +- * initrd to the system, so let's ignore this */ +- log_debug_errno(r, "Failed to get D-Bus connection: %m"); ++ fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0); ++ if (fd < 0) { ++ log_debug_errno(errno, "Failed to allocate socket: %m"); ++ return EXIT_FAILURE; ++ } ++ ++ l = strlen(argv[1]); ++ ++ n = sendto(fd, argv[1], l, 0, &sa.sa, offsetof(struct sockaddr_un, sun_path) + strlen(sa.un.sun_path)); ++ if (n < 0) { ++ log_debug_errno(errno, "Failed to send cgroups agent message: %m"); + return EXIT_FAILURE; + } + +- r = sd_bus_emit_signal(bus, +- "/org/freedesktop/systemd1/agent", +- "org.freedesktop.systemd1.Agent", +- "Released", +- "s", argv[1]); +- if (r < 0) { +- log_debug_errno(r, "Failed to send signal message on private connection: %m"); ++ if ((size_t) n != l) { ++ log_debug("Datagram size mismatch"); + return EXIT_FAILURE; + } + +diff --git a/src/core/cgroup.c b/src/core/cgroup.c +index 10fdcc9..b7f08fb 100644 +--- a/src/core/cgroup.c ++++ b/src/core/cgroup.c +@@ -1028,6 +1028,8 @@ int manager_notify_cgroup_empty(Manager *m, const char *cgroup) { + assert(m); + assert(cgroup); + ++ log_debug("Got cgroup empty notification for: %s", cgroup); ++ + u = manager_get_unit_by_cgroup(m, cgroup); + if (u) { + r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true); +diff --git a/src/core/dbus.c b/src/core/dbus.c +index 85b5174..29524d4 100644 +--- a/src/core/dbus.c ++++ b/src/core/dbus.c +@@ -72,12 +72,37 @@ int bus_send_queued_message(Manager *m) { + return 0; + } + ++int bus_forward_agent_released(Manager *m, const char *path) { ++ int r; ++ ++ assert(m); ++ assert(path); ++ ++ if (m->running_as != SYSTEMD_SYSTEM) ++ return 0; ++ ++ if (!m->system_bus) ++ return 0; ++ ++ /* If we are running a system instance we forward the agent message on the system bus, so that the 
user ++ * instances get notified about this, too */ ++ ++ r = sd_bus_emit_signal(m->system_bus, ++ "/org/freedesktop/systemd1/agent", ++ "org.freedesktop.systemd1.Agent", ++ "Released", ++ "s", path); ++ if (r < 0) ++ return log_warning_errno(r, "Failed to propagate agent release message: %m"); ++ ++ return 1; ++} ++ + static int signal_agent_released(sd_bus *bus, sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *cgroup; + int r; + +- assert(bus); + assert(message); + assert(m); + +@@ -88,16 +113,6 @@ static int signal_agent_released(sd_bus *bus, sd_bus_message *message, void *use + } + + manager_notify_cgroup_empty(m, cgroup); +- +- if (m->running_as == SYSTEMD_SYSTEM && m->system_bus) { +- /* If we are running as system manager, forward the +- * message to the system bus */ +- +- r = sd_bus_send(m->system_bus, message, NULL); +- if (r < 0) +- log_warning_errno(r, "Failed to forward Released message: %m"); +- } +- + return 0; + } + +@@ -679,25 +694,6 @@ static int bus_on_connection(sd_event_source *s, int fd, uint32_t revents, void + return 0; + } + +- if (m->running_as == SYSTEMD_SYSTEM) { +- /* When we run as system instance we get the Released +- * signal via a direct connection */ +- +- r = sd_bus_add_match( +- bus, +- NULL, +- "type='signal'," +- "interface='org.freedesktop.systemd1.Agent'," +- "member='Released'," +- "path='/org/freedesktop/systemd1/agent'", +- signal_agent_released, m); +- +- if (r < 0) { +- log_warning_errno(r, "Failed to register Released match on new connection bus: %m"); +- return 0; +- } +- } +- + r = bus_setup_disconnected_match(m, bus); + if (r < 0) + return 0; +diff --git a/src/core/dbus.h b/src/core/dbus.h +index d04f532..c27d136 100644 +--- a/src/core/dbus.h ++++ b/src/core/dbus.h +@@ -40,3 +40,5 @@ int bus_verify_manage_unit_async(Manager *m, sd_bus_message *call, sd_bus_error + int bus_verify_manage_unit_async_for_kill(Manager *m, sd_bus_message *call, sd_bus_error *error); + 
int bus_verify_manage_unit_files_async(Manager *m, sd_bus_message *call, sd_bus_error *error); + int bus_verify_reload_daemon_async(Manager *m, sd_bus_message *call, sd_bus_error *error); ++ ++int bus_forward_agent_released(Manager *m, const char *path); +diff --git a/src/core/manager.c b/src/core/manager.c +index 45ca540..b6d76ca 100644 +--- a/src/core/manager.c ++++ b/src/core/manager.c +@@ -83,8 +83,10 @@ + #define JOBS_IN_PROGRESS_WAIT_USEC (5*USEC_PER_SEC) + #define JOBS_IN_PROGRESS_PERIOD_USEC (USEC_PER_SEC / 3) + #define JOBS_IN_PROGRESS_PERIOD_DIVISOR 3 ++#define CGROUPS_AGENT_RCVBUF_SIZE (8*1024*1024) + + static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); ++static int manager_dispatch_cgroups_agent_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); + static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); + static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); + static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +@@ -456,11 +458,11 @@ static int manager_setup_signals(Manager *m) { + if (r < 0) + return r; + +- /* Process signals a bit earlier than the rest of things, but +- * later than notify_fd processing, so that the notify +- * processing can still figure out to which process/service a +- * message belongs, before we reap the process. */ +- r = sd_event_source_set_priority(m->signal_event_source, -5); ++ /* Process signals a bit earlier than the rest of things, but later than notify_fd processing, so that the ++ * notify processing can still figure out to which process/service a message belongs, before we reap the ++ * process. Also, process this before handling cgroup notifications, so that we always collect child exit ++ * status information before detecting that there's no process in a cgroup. 
*/ ++ r = sd_event_source_set_priority(m->signal_event_source, -6); + if (r < 0) + return r; + +@@ -541,7 +543,7 @@ int manager_new(SystemdRunningAs running_as, bool test_run, Manager **_m) { + + m->idle_pipe[0] = m->idle_pipe[1] = m->idle_pipe[2] = m->idle_pipe[3] = -1; + +- m->pin_cgroupfs_fd = m->notify_fd = m->signal_fd = m->time_change_fd = m->dev_autofs_fd = m->private_listen_fd = m->kdbus_fd = m->utab_inotify_fd = -1; ++ m->pin_cgroupfs_fd = m->notify_fd = m->cgroups_agent_fd = m->signal_fd = m->time_change_fd = m->dev_autofs_fd = m->private_listen_fd = m->kdbus_fd = m->utab_inotify_fd = -1; + m->current_job_id = 1; /* start as id #1, so that we can leave #0 around as "null-like" value */ + + m->ask_password_inotify_fd = -1; +@@ -689,8 +691,8 @@ static int manager_setup_notify(Manager *m) { + if (r < 0) + return log_error_errno(r, "Failed to allocate notify event source: %m"); + +- /* Process signals a bit earlier than SIGCHLD, so that we can +- * still identify to which service an exit message belongs */ ++ /* Process notification messages a bit earlier than SIGCHLD, so that we can still identify to which ++ * service an exit message belongs. */ + r = sd_event_source_set_priority(m->notify_event_source, -7); + if (r < 0) + return log_error_errno(r, "Failed to set priority of notify event source: %m"); +@@ -699,6 +701,77 @@ static int manager_setup_notify(Manager *m) { + return 0; + } + ++static int manager_setup_cgroups_agent(Manager *m) { ++ ++ static const union sockaddr_union sa = { ++ .un.sun_family = AF_UNIX, ++ .un.sun_path = "/run/systemd/cgroups-agent", ++ }; ++ int r; ++ ++ /* This creates a listening socket we receive cgroups agent messages on. We do not use D-Bus for delivering ++ * these messages from the cgroups agent binary to PID 1, as the cgroups agent binary is very short-living, and ++ * each instance of it needs a new D-Bus connection. 
Since D-Bus connections are SOCK_STREAM/AF_UNIX, on ++ * overloaded systems the backlog of the D-Bus socket becomes relevant, as not more than the configured number ++ * of D-Bus connections may be queued until the kernel will start dropping further incoming connections, ++ * possibly resulting in lost cgroups agent messages. To avoid this, we'll use a private SOCK_DGRAM/AF_UNIX ++ * socket, where no backlog is relevant as communication may take place without an actual connect() cycle, and ++ * we thus won't lose messages. ++ * ++ * Note that PID 1 will forward the agent message to system bus, so that the user systemd instance may listen ++ * to it. The system instance hence listens on this special socket, but the user instances listen on the system ++ * bus for these messages. */ ++ ++ if (m->test_run) ++ return 0; ++ ++ if (m->running_as != SYSTEMD_SYSTEM) ++ return 0; ++ ++ if (m->cgroups_agent_fd < 0) { ++ _cleanup_close_ int fd = -1; ++ ++ /* First free all secondary fields */ ++ m->cgroups_agent_event_source = sd_event_source_unref(m->cgroups_agent_event_source); ++ ++ fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); ++ if (fd < 0) ++ return log_error_errno(errno, "Failed to allocate cgroups agent socket: %m"); ++ ++ fd_inc_rcvbuf(fd, CGROUPS_AGENT_RCVBUF_SIZE); ++ ++ (void) unlink(sa.un.sun_path); ++ ++ /* Only allow root to connect to this socket */ ++ RUN_WITH_UMASK(0077) ++ r = bind(fd, &sa.sa, offsetof(struct sockaddr_un, sun_path) + strlen(sa.un.sun_path)); ++ if (r < 0) ++ return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path); ++ ++ m->cgroups_agent_fd = fd; ++ fd = -1; ++ } ++ ++ if (!m->cgroups_agent_event_source) { ++ r = sd_event_add_io(m->event, &m->cgroups_agent_event_source, m->cgroups_agent_fd, EPOLLIN, manager_dispatch_cgroups_agent_fd, m); ++ if (r < 0) ++ return log_error_errno(r, "Failed to allocate cgroups agent event source: %m"); ++ ++ /* Process cgroups notifications early, but after having processed 
service notification messages or ++ * SIGCHLD signals, so that a cgroup running empty is always just the last safety net of notification, ++ * and we collected the metadata the notification and SIGCHLD stuff offers first. Also see handling of ++ * cgroup inotify for the unified cgroup stuff. */ ++ r = sd_event_source_set_priority(m->cgroups_agent_event_source, SD_EVENT_PRIORITY_NORMAL-5); ++ if (r < 0) ++ return log_error_errno(r, "Failed to set priority of cgroups agent event source: %m"); ++ ++ (void) sd_event_source_set_description(m->cgroups_agent_event_source, "manager-cgroups-agent"); ++ } ++ ++ return 0; ++} ++ ++ + static int manager_setup_kdbus(Manager *m) { + #ifdef ENABLE_KDBUS + _cleanup_free_ char *p = NULL; +@@ -926,6 +999,7 @@ Manager* manager_free(Manager *m) { + + sd_event_source_unref(m->signal_event_source); + sd_event_source_unref(m->notify_event_source); ++ sd_event_source_unref(m->cgroups_agent_event_source); + sd_event_source_unref(m->time_change_event_source); + sd_event_source_unref(m->jobs_in_progress_event_source); + sd_event_source_unref(m->idle_pipe_event_source); +@@ -933,6 +1007,7 @@ Manager* manager_free(Manager *m) { + + safe_close(m->signal_fd); + safe_close(m->notify_fd); ++ safe_close(m->cgroups_agent_fd); + safe_close(m->time_change_fd); + safe_close(m->kdbus_fd); + +@@ -1181,6 +1256,10 @@ int manager_startup(Manager *m, FILE *serialization, FDSet *fds) { + if (q < 0 && r == 0) + r = q; + ++ q = manager_setup_cgroups_agent(m); ++ if (q < 0 && r == 0) ++ r = q; ++ + /* We might have deserialized the kdbus control fd, but if we + * didn't, then let's create the bus now. 
*/ + manager_setup_kdbus(m); +@@ -1502,6 +1581,35 @@ static unsigned manager_dispatch_dbus_queue(Manager *m) { + return n; + } + ++static int manager_dispatch_cgroups_agent_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { ++ Manager *m = userdata; ++ char buf[PATH_MAX+1]; ++ ssize_t n; ++ ++ n = recv(fd, buf, sizeof(buf), 0); ++ if (n < 0) ++ return log_error_errno(errno, "Failed to read cgroups agent message: %m"); ++ if (n == 0) { ++ log_error("Got zero-length cgroups agent message, ignoring."); ++ return 0; ++ } ++ if ((size_t) n >= sizeof(buf)) { ++ log_error("Got overly long cgroups agent message, ignoring."); ++ return 0; ++ } ++ ++ if (memchr(buf, 0, n)) { ++ log_error("Got cgroups agent message with embedded NUL byte, ignoring."); ++ return 0; ++ } ++ buf[n] = 0; ++ ++ manager_notify_cgroup_empty(m, buf); ++ bus_forward_agent_released(m, buf); ++ ++ return 0; ++} ++ + static void manager_invoke_notify_message(Manager *m, Unit *u, pid_t pid, char *buf, size_t n, FDSet *fds) { + _cleanup_strv_free_ char **tags = NULL; + +@@ -2314,6 +2422,16 @@ int manager_serialize(Manager *m, FILE *f, FDSet *fds, bool switching_root) { + fprintf(f, "notify-socket=%s\n", m->notify_socket); + } + ++ if (m->cgroups_agent_fd >= 0) { ++ int copy; ++ ++ copy = fdset_put_dup(fds, m->cgroups_agent_fd); ++ if (copy < 0) ++ return copy; ++ ++ fprintf(f, "cgroups-agent-fd=%i\n", copy); ++ } ++ + if (m->kdbus_fd >= 0) { + int copy; + +@@ -2483,6 +2601,17 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) { + free(m->notify_socket); + m->notify_socket = n; + ++ } else if (startswith(l, "cgroups-agent-fd=")) { ++ int fd; ++ ++ if (safe_atoi(l + 17, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) ++ log_debug("Failed to parse cgroups agent fd: %s", l + 17); ++ else { ++ m->cgroups_agent_event_source = sd_event_source_unref(m->cgroups_agent_event_source); ++ safe_close(m->cgroups_agent_fd); ++ m->cgroups_agent_fd = fdset_remove(fds, fd); ++ } ++ + } else if 
(startswith(l, "kdbus-fd=")) { + int fd; + +@@ -2609,6 +2738,10 @@ int manager_reload(Manager *m) { + if (q < 0 && r >= 0) + r = q; + ++ q = manager_setup_cgroups_agent(m); ++ if (q < 0 && r >= 0) ++ r = q; ++ + /* Third, fire things up! */ + q = manager_coldplug(m); + if (q < 0 && r >= 0) +diff --git a/src/core/manager.h b/src/core/manager.h +index d3971f1..3e855db 100644 +--- a/src/core/manager.h ++++ b/src/core/manager.h +@@ -137,6 +137,9 @@ struct Manager { + int notify_fd; + sd_event_source *notify_event_source; + ++ int cgroups_agent_fd; ++ sd_event_source *cgroups_agent_event_source; ++ + int signal_fd; + sd_event_source *signal_event_source; + diff --git a/SOURCES/0280-logind-process-session-inhibitor-fds-at-higher-prior.patch b/SOURCES/0280-logind-process-session-inhibitor-fds-at-higher-prior.patch new file mode 100644 index 0000000..a396534 --- /dev/null +++ b/SOURCES/0280-logind-process-session-inhibitor-fds-at-higher-prior.patch @@ -0,0 +1,59 @@ +From 5eeac990b887d7cbe8a4b234918f19f64a57079a Mon Sep 17 00:00:00 2001 +From: Lennart Poettering +Date: Wed, 4 May 2016 19:01:56 +0200 +Subject: [PATCH] logind: process session/inhibitor fds at higher priority + +Let's make sure we process session and inhibitor pipe fds (that signal +sessions/inhibitors going away) at a higher priority +than new bus calls that might create new sessions or inhibitors. This helps +ensure that the number of open sessions stays minimal. 
+ +Cherry-picked from: e11544a8305ab9dea097c74bb16e296150c9cc10 +Resolves: #1305608 +--- + src/login/logind-inhibit.c | 2 +- + src/login/logind-session.c | 4 +++- + src/login/logind.c | 2 +- + 3 files changed, 5 insertions(+), 3 deletions(-) + +diff --git a/src/login/logind-inhibit.c b/src/login/logind-inhibit.c +index 84fee0e..bf96898 100644 +--- a/src/login/logind-inhibit.c ++++ b/src/login/logind-inhibit.c +@@ -303,7 +303,7 @@ int inhibitor_create_fifo(Inhibitor *i) { + if (r < 0) + return r; + +- r = sd_event_source_set_priority(i->event_source, SD_EVENT_PRIORITY_IDLE); ++ r = sd_event_source_set_priority(i->event_source, SD_EVENT_PRIORITY_IDLE-10); + if (r < 0) + return r; + } +diff --git a/src/login/logind-session.c b/src/login/logind-session.c +index d2e7b40..f39eb78 100644 +--- a/src/login/logind-session.c ++++ b/src/login/logind-session.c +@@ -888,7 +888,9 @@ int session_create_fifo(Session *s) { + if (r < 0) + return r; + +- r = sd_event_source_set_priority(s->fifo_event_source, SD_EVENT_PRIORITY_IDLE); ++ /* Let's make sure we noticed dead sessions before we process new bus requests (which might create new ++ * sessions). 
*/ ++ r = sd_event_source_set_priority(s->fifo_event_source, SD_EVENT_PRIORITY_NORMAL-10); + if (r < 0) + return r; + } +diff --git a/src/login/logind.c b/src/login/logind.c +index 3afbf34..e8d0669 100644 +--- a/src/login/logind.c ++++ b/src/login/logind.c +@@ -685,7 +685,7 @@ static int manager_connect_bus(Manager *m) { + if (r < 0) + return log_error_errno(r, "Failed to register name: %m"); + +- r = sd_bus_attach_event(m->bus, m->event, 0); ++ r = sd_bus_attach_event(m->bus, m->event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach bus to event loop: %m"); + diff --git a/SOURCES/0281-sd-event-expose-the-event-loop-iteration-counter-via.patch b/SOURCES/0281-sd-event-expose-the-event-loop-iteration-counter-via.patch new file mode 100644 index 0000000..98ff349 --- /dev/null +++ b/SOURCES/0281-sd-event-expose-the-event-loop-iteration-counter-via.patch @@ -0,0 +1,69 @@ +From 782a6f41f36317fe837f8c85d2fc7b9b5de98200 Mon Sep 17 00:00:00 2001 +From: Lennart Poettering +Date: Wed, 29 Jun 2016 19:03:26 -0700 +Subject: [PATCH] sd-event: expose the event loop iteration counter via + sd_event_get_iteration() + +This extends the existing event loop iteration counter to 64bit, and exposes it +via a new function sd_event_get_iteration(). This is helpful for cases like +issue #3612. After all, since we maintain the counter anyway, we might as well +expose it. 
+ +(This also fixes an unrelated issue in the man page for sd_event_wait() where +micro and milliseconds got mixed up) + +Cherry-picked from: 7486322b99da5b4d2d00d35b310b035f936f7964 +Related: #1342173 +--- + src/libsystemd/sd-event/sd-event.c | 14 +++++++++++--- + src/systemd/sd-event.h | 1 + + 2 files changed, 12 insertions(+), 3 deletions(-) + +diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c +index 1f1e6fe..9d48e5a 100644 +--- a/src/libsystemd/sd-event/sd-event.c ++++ b/src/libsystemd/sd-event/sd-event.c +@@ -76,8 +76,8 @@ struct sd_event_source { + int64_t priority; + unsigned pending_index; + unsigned prepare_index; +- unsigned pending_iteration; +- unsigned prepare_iteration; ++ uint64_t pending_iteration; ++ uint64_t prepare_iteration; + + LIST_FIELDS(sd_event_source, sources); + +@@ -169,7 +169,7 @@ struct sd_event { + + pid_t original_pid; + +- unsigned iteration; ++ uint64_t iteration; + dual_timestamp timestamp; + usec_t timestamp_boottime; + int state; +@@ -2689,3 +2689,11 @@ _public_ int sd_event_get_watchdog(sd_event *e) { + + return e->watchdog; + } ++ ++_public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) { ++ assert_return(e, -EINVAL); ++ assert_return(!event_pid_changed(e), -ECHILD); ++ ++ *ret = e->iteration; ++ return 0; ++} +diff --git a/src/systemd/sd-event.h b/src/systemd/sd-event.h +index 25a10f9..4957f3a 100644 +--- a/src/systemd/sd-event.h ++++ b/src/systemd/sd-event.h +@@ -101,6 +101,7 @@ int sd_event_get_tid(sd_event *e, pid_t *tid); + int sd_event_get_exit_code(sd_event *e, int *code); + int sd_event_set_watchdog(sd_event *e, int b); + int sd_event_get_watchdog(sd_event *e); ++int sd_event_get_iteration(sd_event *e, uint64_t *ret); + + sd_event_source* sd_event_source_ref(sd_event_source *s); + sd_event_source* sd_event_source_unref(sd_event_source *s); diff --git a/SOURCES/0282-manager-Only-invoke-a-single-sigchld-per-unit-within.patch 
b/SOURCES/0282-manager-Only-invoke-a-single-sigchld-per-unit-within.patch new file mode 100644 index 0000000..ba20e3a --- /dev/null +++ b/SOURCES/0282-manager-Only-invoke-a-single-sigchld-per-unit-within.patch @@ -0,0 +1,85 @@ +From f84b01e4d13d155bd0a8a429c5d18e855414de4b Mon Sep 17 00:00:00 2001 +From: Kyle Walker +Date: Thu, 30 Jun 2016 15:12:18 -0400 +Subject: [PATCH] manager: Only invoke a single sigchld per unit within a + cleanup cycle + +By default, each iteration of manager_dispatch_sigchld() results in a unit level +sigchld event being invoked. For scope units, this results in a scope_sigchld_event() +which can seemingly stall for workloads that have a large number of PIDs within the +scope. The stall exhibits itself as a SIG_0 being initiated for each u->pids entry +as a result of pid_is_unwaited(). + +v2: +This patch resolves this condition by only paying the cost of a sigchld in the underlying +scope unit once per sigchld iteration. A new "sigchldgen" member resides within the +Unit struct. The Manager is incremented via the sd event loop, accessed via +sd_event_get_iteration, and the Unit member is set to the same value as the manager each +time that a sigchld event is invoked. If the Manager iteration value and Unit member +match, the sigchld event is not invoked for that iteration. 
+ +Cherry-picked from: 36f20ae3b2975e44b6ef17e453ae06a289e9a122 +Resolves: #1342173 +--- + src/core/manager.c | 13 ++++++++++++- + src/core/unit.c | 1 + + src/core/unit.h | 3 +++ + 3 files changed, 16 insertions(+), 1 deletion(-) + +diff --git a/src/core/manager.c b/src/core/manager.c +index b6d76ca..f2f5c6a 100644 +--- a/src/core/manager.c ++++ b/src/core/manager.c +@@ -1743,14 +1743,25 @@ static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t + } + + static void invoke_sigchld_event(Manager *m, Unit *u, siginfo_t *si) { ++ uint64_t iteration; ++ + assert(m); + assert(u); + assert(si); + ++ sd_event_get_iteration(m->event, &iteration); ++ + log_unit_debug(u->id, "Child "PID_FMT" belongs to %s", si->si_pid, u->id); + + unit_unwatch_pid(u, si->si_pid); +- UNIT_VTABLE(u)->sigchld_event(u, si->si_pid, si->si_code, si->si_status); ++ ++ if (UNIT_VTABLE(u)->sigchld_event) { ++ if (set_size(u->pids) <= 1 || iteration != u->sigchldgen) { ++ UNIT_VTABLE(u)->sigchld_event(u, si->si_pid, si->si_code, si->si_status); ++ u->sigchldgen = iteration; ++ } else ++ log_debug("%s already issued a sigchld this iteration %llu, skipping. 
Pids still being watched %d", u->id, iteration, set_size(u->pids)); ++ } + } + + static int manager_dispatch_sigchld(Manager *m) { +diff --git a/src/core/unit.c b/src/core/unit.c +index 4fb2fd3..5b2becc 100644 +--- a/src/core/unit.c ++++ b/src/core/unit.c +@@ -94,6 +94,7 @@ Unit *unit_new(Manager *m, size_t size) { + u->unit_file_state = _UNIT_FILE_STATE_INVALID; + u->unit_file_preset = -1; + u->on_failure_job_mode = JOB_REPLACE; ++ u->sigchldgen = 0; + + return u; + } +diff --git a/src/core/unit.h b/src/core/unit.h +index 0eebc0b..d936457 100644 +--- a/src/core/unit.h ++++ b/src/core/unit.h +@@ -167,6 +167,9 @@ struct Unit { + * process SIGCHLD for */ + Set *pids; + ++ /* Used in sigchld event invocation to avoid repeat events being invoked */ ++ uint64_t sigchldgen; ++ + /* Used during GC sweeps */ + unsigned gc_marker; + diff --git a/SOURCES/0283-manager-Fixing-a-debug-printf-formatting-mistake.patch b/SOURCES/0283-manager-Fixing-a-debug-printf-formatting-mistake.patch new file mode 100644 index 0000000..007e1ab --- /dev/null +++ b/SOURCES/0283-manager-Fixing-a-debug-printf-formatting-mistake.patch @@ -0,0 +1,28 @@ +From 69ecc270c614acdb5469551ac3dbe41bd43c44ae Mon Sep 17 00:00:00 2001 +From: Kyle Walker +Date: Fri, 1 Jul 2016 10:04:40 -0400 +Subject: [PATCH] manager: Fixing a debug printf formatting mistake + +A 'llu' formatting statement was used in a debugging printf statement +instead of a 'PRIu64'. Correcting that mistake here. 
+ +Cherry-picked from: 72b0c3f59695239c51b719576f625e789bd00a66 +Related: #1342173 +--- + src/core/manager.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/src/core/manager.c b/src/core/manager.c +index f2f5c6a..b63d929 100644 +--- a/src/core/manager.c ++++ b/src/core/manager.c +@@ -1760,7 +1760,8 @@ static void invoke_sigchld_event(Manager *m, Unit *u, siginfo_t *si) { + UNIT_VTABLE(u)->sigchld_event(u, si->si_pid, si->si_code, si->si_status); + u->sigchldgen = iteration; + } else +- log_debug("%s already issued a sigchld this iteration %llu, skipping. Pids still being watched %d", u->id, iteration, set_size(u->pids)); ++ log_debug("%s already issued a sigchld this iteration %" PRIu64 ", skipping. Pids still being watched %d", u->id, iteration, set_size(u->pids)); ++ + } + } + diff --git a/SOURCES/0284-manager-don-t-skip-sigchld-handler-for-main-and-cont.patch b/SOURCES/0284-manager-don-t-skip-sigchld-handler-for-main-and-cont.patch new file mode 100644 index 0000000..d4bc7c1 --- /dev/null +++ b/SOURCES/0284-manager-don-t-skip-sigchld-handler-for-main-and-cont.patch @@ -0,0 +1,248 @@ +From 35bf3bd6a276fa58fa6ed5a258a7fbc4ba8bce05 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Nykr=C3=BDn?= +Date: Sat, 16 Jul 2016 21:04:13 +0200 +Subject: [PATCH] manager: don't skip sigchld handler for main and control pid + for services (#3738) + +During stop when service has one "regular" pid one main pid and one +control pid and the sigchld for the regular one is processed first the +unit_tidy_watch_pids will skip the main and control pid and does not +remove them from u->pids. But then we skip the sigchld event because we +already did one in the iteration and there are two pids in u->pids. + +v2: Use general unit_main_pid() and unit_control_pid() instead of +reaching directly to service structure. 
+Cherry-picked from: ccc2c98e1b0c06861577632440b996ca16cefd53 +Resolves: #1342173 +--- + src/core/busname.c | 10 ++++++++++ + src/core/manager.c | 5 ++++- + src/core/mount.c | 10 ++++++++++ + src/core/service.c | 19 +++++++++++++++++++ + src/core/socket.c | 10 ++++++++++ + src/core/swap.c | 10 ++++++++++ + src/core/unit.c | 18 ++++++++++++++++++ + src/core/unit.h | 9 +++++++++ + 8 files changed, 90 insertions(+), 1 deletion(-) + +diff --git a/src/core/busname.c b/src/core/busname.c +index 43d7607..f626ba9 100644 +--- a/src/core/busname.c ++++ b/src/core/busname.c +@@ -997,6 +997,14 @@ static const char* const busname_state_table[_BUSNAME_STATE_MAX] = { + + DEFINE_STRING_TABLE_LOOKUP(busname_state, BusNameState); + ++static int busname_control_pid(Unit *u) { ++ BusName *n = BUSNAME(u); ++ ++ assert(n); ++ ++ return n->control_pid; ++} ++ + static const char* const busname_result_table[_BUSNAME_RESULT_MAX] = { + [BUSNAME_SUCCESS] = "success", + [BUSNAME_FAILURE_RESOURCES] = "resources", +@@ -1047,6 +1055,8 @@ const UnitVTable busname_vtable = { + + .supported = busname_supported, + ++ .control_pid = busname_control_pid, ++ + .bus_interface = "org.freedesktop.systemd1.BusName", + .bus_vtable = bus_busname_vtable, + +diff --git a/src/core/manager.c b/src/core/manager.c +index b63d929..87b5e57 100644 +--- a/src/core/manager.c ++++ b/src/core/manager.c +@@ -1756,7 +1756,10 @@ static void invoke_sigchld_event(Manager *m, Unit *u, siginfo_t *si) { + unit_unwatch_pid(u, si->si_pid); + + if (UNIT_VTABLE(u)->sigchld_event) { +- if (set_size(u->pids) <= 1 || iteration != u->sigchldgen) { ++ if (set_size(u->pids) <= 1 || ++ iteration != u->sigchldgen || ++ unit_main_pid(u) == si->si_pid || ++ unit_control_pid(u) == si->si_pid) { + UNIT_VTABLE(u)->sigchld_event(u, si->si_pid, si->si_code, si->si_status); + u->sigchldgen = iteration; + } else +diff --git a/src/core/mount.c b/src/core/mount.c +index 23f63ce..4dc9f2e 100644 +--- a/src/core/mount.c ++++ b/src/core/mount.c +@@ 
-1883,6 +1883,14 @@ static const char* const mount_state_table[_MOUNT_STATE_MAX] = { + + DEFINE_STRING_TABLE_LOOKUP(mount_state, MountState); + ++static int mount_control_pid(Unit *u) { ++ Mount *m = MOUNT(u); ++ ++ assert(m); ++ ++ return m->control_pid; ++} ++ + static const char* const mount_exec_command_table[_MOUNT_EXEC_COMMAND_MAX] = { + [MOUNT_EXEC_MOUNT] = "ExecMount", + [MOUNT_EXEC_UNMOUNT] = "ExecUnmount", +@@ -1944,6 +1952,8 @@ const UnitVTable mount_vtable = { + + .reset_failed = mount_reset_failed, + ++ .control_pid = mount_control_pid, ++ + .bus_interface = "org.freedesktop.systemd1.Mount", + .bus_vtable = bus_mount_vtable, + .bus_set_property = bus_mount_set_property, +diff --git a/src/core/service.c b/src/core/service.c +index ae5e610..f102ef3 100644 +--- a/src/core/service.c ++++ b/src/core/service.c +@@ -3028,6 +3028,22 @@ static const char* const service_state_table[_SERVICE_STATE_MAX] = { + + DEFINE_STRING_TABLE_LOOKUP(service_state, ServiceState); + ++static int service_main_pid(Unit *u) { ++ Service *s = SERVICE(u); ++ ++ assert(s); ++ ++ return s->main_pid; ++} ++ ++static int service_control_pid(Unit *u) { ++ Service *s = SERVICE(u); ++ ++ assert(s); ++ ++ return s->control_pid; ++} ++ + static const char* const service_restart_table[_SERVICE_RESTART_MAX] = { + [SERVICE_RESTART_NO] = "no", + [SERVICE_RESTART_ON_SUCCESS] = "on-success", +@@ -3138,6 +3154,9 @@ const UnitVTable service_vtable = { + .notify_cgroup_empty = service_notify_cgroup_empty_event, + .notify_message = service_notify_message, + ++ .main_pid = service_main_pid, ++ .control_pid = service_control_pid, ++ + .bus_name_owner_change = service_bus_name_owner_change, + + .bus_interface = "org.freedesktop.systemd1.Service", +diff --git a/src/core/socket.c b/src/core/socket.c +index bc677a2..771af0d 100644 +--- a/src/core/socket.c ++++ b/src/core/socket.c +@@ -2648,6 +2648,14 @@ static const char* const socket_state_table[_SOCKET_STATE_MAX] = { + + 
DEFINE_STRING_TABLE_LOOKUP(socket_state, SocketState); + ++static int socket_control_pid(Unit *u) { ++ Socket *s = SOCKET(u); ++ ++ assert(s); ++ ++ return s->control_pid; ++} ++ + static const char* const socket_exec_command_table[_SOCKET_EXEC_COMMAND_MAX] = { + [SOCKET_EXEC_START_PRE] = "StartPre", + [SOCKET_EXEC_START_CHOWN] = "StartChown", +@@ -2713,6 +2721,8 @@ const UnitVTable socket_vtable = { + + .reset_failed = socket_reset_failed, + ++ .control_pid = socket_control_pid, ++ + .bus_interface = "org.freedesktop.systemd1.Socket", + .bus_vtable = bus_socket_vtable, + .bus_set_property = bus_socket_set_property, +diff --git a/src/core/swap.c b/src/core/swap.c +index 34a2c40..42f9959 100644 +--- a/src/core/swap.c ++++ b/src/core/swap.c +@@ -1426,6 +1426,14 @@ static const char* const swap_state_table[_SWAP_STATE_MAX] = { + + DEFINE_STRING_TABLE_LOOKUP(swap_state, SwapState); + ++static int swap_control_pid(Unit *u) { ++ Swap *s = SWAP(u); ++ ++ assert(s); ++ ++ return s->control_pid; ++} ++ + static const char* const swap_exec_command_table[_SWAP_EXEC_COMMAND_MAX] = { + [SWAP_EXEC_ACTIVATE] = "ExecActivate", + [SWAP_EXEC_DEACTIVATE] = "ExecDeactivate", +@@ -1487,6 +1495,8 @@ const UnitVTable swap_vtable = { + + .reset_failed = swap_reset_failed, + ++ .control_pid = swap_control_pid, ++ + .bus_interface = "org.freedesktop.systemd1.Swap", + .bus_vtable = bus_swap_vtable, + .bus_set_property = bus_swap_set_property, +diff --git a/src/core/unit.c b/src/core/unit.c +index 5b2becc..f03f185 100644 +--- a/src/core/unit.c ++++ b/src/core/unit.c +@@ -3667,6 +3667,24 @@ int unit_setup_exec_runtime(Unit *u) { + return exec_runtime_make(rt, unit_get_exec_context(u), u->id); + } + ++pid_t unit_control_pid(Unit *u) { ++ assert(u); ++ ++ if (UNIT_VTABLE(u)->control_pid) ++ return UNIT_VTABLE(u)->control_pid(u); ++ ++ return 0; ++} ++ ++pid_t unit_main_pid(Unit *u) { ++ assert(u); ++ ++ if (UNIT_VTABLE(u)->main_pid) ++ return UNIT_VTABLE(u)->main_pid(u); ++ ++ return 0; ++} ++ + 
static const char* const unit_active_state_table[_UNIT_ACTIVE_STATE_MAX] = { + [UNIT_ACTIVE] = "active", + [UNIT_RELOADING] = "reloading", +diff --git a/src/core/unit.h b/src/core/unit.h +index d936457..35287a5 100644 +--- a/src/core/unit.h ++++ b/src/core/unit.h +@@ -399,6 +399,12 @@ struct UnitVTable { + + int (*get_timeout)(Unit *u, uint64_t *timeout); + ++ /* Returns the main PID if there is any defined, or 0. */ ++ pid_t (*main_pid)(Unit *u); ++ ++ /* Returns the control PID if there is any defined, or 0. */ ++ pid_t (*control_pid)(Unit *u); ++ + /* This is called for each unit type and should be used to + * enumerate existing devices and load them. However, + * everything that is loaded here should still stay in +@@ -610,6 +616,9 @@ int unit_make_transient(Unit *u); + + int unit_require_mounts_for(Unit *u, const char *path); + ++pid_t unit_control_pid(Unit *u); ++pid_t unit_main_pid(Unit *u); ++ + const char *unit_active_state_to_string(UnitActiveState i) _const_; + UnitActiveState unit_active_state_from_string(const char *s) _pure_; + diff --git a/SPECS/systemd.spec b/SPECS/systemd.spec index 2942307..debe9b3 100644 --- a/SPECS/systemd.spec +++ b/SPECS/systemd.spec @@ -7,7 +7,7 @@ Name: systemd Url: http://www.freedesktop.org/wiki/Software/systemd Version: 219 -Release: 19%{?dist}.12 +Release: 19%{?dist}.13 # For a breakdown of the licensing, see README License: LGPLv2+ and MIT and GPLv2+ Summary: A System and Service Manager @@ -306,6 +306,12 @@ Patch0275: 0275-journal-fix-error-handling-when-compressing-journal-.patch Patch0276: 0276-journal-irrelevant-coding-style-fixes.patch Patch0277: 0277-fstab-generator-cescape-device-name-in-root-fsck-ser.patch Patch0278: 0278-manager-reduce-complexity-of-unit_gc_sweep-3507.patch +Patch0279: 0279-core-use-an-AF_UNIX-SOCK_DGRAM-socket-for-cgroup-age.patch +Patch0280: 0280-logind-process-session-inhibitor-fds-at-higher-prior.patch +Patch0281: 0281-sd-event-expose-the-event-loop-iteration-counter-via.patch +Patch0282: 
0282-manager-Only-invoke-a-single-sigchld-per-unit-within.patch +Patch0283: 0283-manager-Fixing-a-debug-printf-formatting-mistake.patch +Patch0284: 0284-manager-don-t-skip-sigchld-handler-for-main-and-cont.patch %global num_patches %{lua: c=0; for i,p in ipairs(patches) do c=c+1; end; print(c);} @@ -1266,6 +1272,14 @@ getent passwd systemd-resolve >/dev/null 2>&1 || useradd -r -l -g systemd-resolv %{_mandir}/man8/systemd-resolved.* %changelog +* Wed Jul 27 2016 Lukas Nykryn - 219-19.13 +- core: use an AF_UNIX/SOCK_DGRAM socket for cgroup agent notification (#1305608) +- logind: process session/inhibitor fds at higher priority (#1305608) +- sd-event: expose the event loop iteration counter via sd_event_get_iteration() (#1342173) +- manager: Only invoke a single sigchld per unit within a cleanup cycle (#1342173) +- manager: Fixing a debug printf formatting mistake (#1342173) +- manager: don't skip sigchld handler for main and control pid for services (#3738) (#1342173) + * Tue Jun 14 2016 Lukas Nykryn - 219-19.12 - manager: reduce complexity of unit_gc_sweep (#3507) (#1344556)