9fc0f6
From 1386240aee3f78a9101a118f11a7028571d33a71 Mon Sep 17 00:00:00 2001
9fc0f6
From: Michal Sekletar <msekleta@redhat.com>
9fc0f6
Date: Thu, 27 Feb 2014 18:16:19 +0100
9fc0f6
Subject: [PATCH] core: watch SIGCHLD more closely to track processes of units
9fc0f6
 with no reliable cgroup empty notifier
9fc0f6
9fc0f6
When a process dies that we can associate with a specific unit, start
9fc0f6
watching all other processes of that unit, so that we can associate
9fc0f6
those processes with the unit too.
9fc0f6
9fc0f6
Also, for service units start doing this as soon as we get the first
9fc0f6
SIGCHLD for either control or main process, so that we can follow the
9fc0f6
processes of the service from one to the other, as long as the processes that
9fc0f6
remain are processes of the ones we watched that died and got reassigned
9fc0f6
to us as parent.
9fc0f6
9fc0f6
Similarly, for scope units start doing this as soon as the scope
9fc0f6
controller abandons the unit, and thus management entirely reverts to
9fc0f6
systemd. To abandon a unit, introduce a new Abandon() scope unit method
9fc0f6
call.
9fc0f6
9fc0f6
Based-on: a911bb9ab27ac0eb3bbf4e8b4109e5da9b88eee3
9fc0f6
---
9fc0f6
 src/core/dbus-scope.c |  36 +++++++++----
9fc0f6
 src/core/manager.c    |   2 +-
9fc0f6
 src/core/scope.c      |  87 ++++++++++++++++++++++---------
9fc0f6
 src/core/scope.h      |   5 +-
9fc0f6
 src/core/service.c    | 140 ++++++++++++++++++++++++++++++--------------------
9fc0f6
 src/core/unit.c       | 112 +++++++++++++++++++++++++++++++++++++++-
9fc0f6
 src/core/unit.h       |   9 ++++
9fc0f6
 7 files changed, 298 insertions(+), 93 deletions(-)
9fc0f6
9fc0f6
diff --git a/src/core/dbus-scope.c b/src/core/dbus-scope.c
9fc0f6
index b576f76..58dd9ff 100644
9fc0f6
--- a/src/core/dbus-scope.c
9fc0f6
+++ b/src/core/dbus-scope.c
9fc0f6
@@ -30,6 +30,7 @@
9fc0f6
 
9fc0f6
 #define BUS_SCOPE_INTERFACE                                             \
9fc0f6
         " <interface name=\"org.freedesktop.systemd1.Scope\">\n"        \
9fc0f6
+        "  <method name=\"Abandon\"/>\n"                                \
9fc0f6
         BUS_UNIT_CGROUP_INTERFACE                                       \
9fc0f6
         "  <property name=\"Controller\" type=\"s\" access=\"read\"/>\n"\
9fc0f6
         "  <property name=\"TimeoutStopUSec\" type=\"t\" access=\"read\"/>\n" \
9fc0f6
@@ -66,19 +67,40 @@ static const BusProperty bus_scope_properties[] = {
9fc0f6
 
9fc0f6
 DBusHandlerResult bus_scope_message_handler(Unit *u, DBusConnection *c, DBusMessage *message) {
9fc0f6
         Scope *s = SCOPE(u);
9fc0f6
+        _cleanup_dbus_message_unref_ DBusMessage *reply = NULL;
9fc0f6
 
9fc0f6
-        const BusBoundProperties bps[] = {
9fc0f6
+        SELINUX_UNIT_ACCESS_CHECK(u, c, message, "status");
9fc0f6
+
9fc0f6
+        if (dbus_message_is_method_call(message, "org.freedesktop.systemd1.Scope", "Abandon")) {
9fc0f6
+                int r;
9fc0f6
+
9fc0f6
+                r = scope_abandon(s);
9fc0f6
+                if (r < 0)
9fc0f6
+                        log_error("Failed to mark scope %s as abandoned : %s", UNIT(s)->id, strerror(-r));
9fc0f6
+
9fc0f6
+                reply = dbus_message_new_method_return(message);
9fc0f6
+                if (!reply)
9fc0f6
+                        goto oom;
9fc0f6
+        } else {
9fc0f6
+                const BusBoundProperties bps[] = {
9fc0f6
                 { "org.freedesktop.systemd1.Unit",  bus_unit_properties,           u },
9fc0f6
                 { "org.freedesktop.systemd1.Scope", bus_unit_cgroup_properties,    u },
9fc0f6
                 { "org.freedesktop.systemd1.Scope", bus_scope_properties,          s },
9fc0f6
                 { "org.freedesktop.systemd1.Scope", bus_cgroup_context_properties, &s->cgroup_context },
9fc0f6
                 { "org.freedesktop.systemd1.Scope", bus_kill_context_properties,   &s->kill_context   },
9fc0f6
                 {}
9fc0f6
-        };
9fc0f6
+                };
9fc0f6
 
9fc0f6
-        SELINUX_UNIT_ACCESS_CHECK(u, c, message, "status");
9fc0f6
+               return  bus_default_message_handler(c, message, INTROSPECTION, INTERFACES_LIST, bps);
9fc0f6
+        }
9fc0f6
+
9fc0f6
+        if (reply)
9fc0f6
+                if (!bus_maybe_send_reply(c, message, reply))
9fc0f6
+                        goto oom;
9fc0f6
 
9fc0f6
-        return bus_default_message_handler(c, message, INTROSPECTION, INTERFACES_LIST, bps);
9fc0f6
+        return DBUS_HANDLER_RESULT_HANDLED;
9fc0f6
+oom:
9fc0f6
+        return DBUS_HANDLER_RESULT_NEED_MEMORY;
9fc0f6
 }
9fc0f6
 
9fc0f6
 static int bus_scope_set_transient_property(
9fc0f6
@@ -102,10 +124,6 @@ static int bus_scope_set_transient_property(
9fc0f6
                     dbus_message_iter_get_element_type(i) != DBUS_TYPE_UINT32)
9fc0f6
                         return -EINVAL;
9fc0f6
 
9fc0f6
-                r = set_ensure_allocated(&s->pids, trivial_hash_func, trivial_compare_func);
9fc0f6
-                if (r < 0)
9fc0f6
-                        return r;
9fc0f6
-
9fc0f6
                 dbus_message_iter_recurse(i, &sub);
9fc0f6
                 while (dbus_message_iter_get_arg_type(&sub) == DBUS_TYPE_UINT32) {
9fc0f6
                         uint32_t pid;
9fc0f6
@@ -116,7 +134,7 @@ static int bus_scope_set_transient_property(
9fc0f6
                                 return -EINVAL;
9fc0f6
 
9fc0f6
                         if (mode != UNIT_CHECK) {
9fc0f6
-                                r = set_put(s->pids, LONG_TO_PTR(pid));
9fc0f6
+                                r = unit_watch_pid(UNIT(s), pid);
9fc0f6
                                 if (r < 0 && r != -EEXIST)
9fc0f6
                                         return r;
9fc0f6
                         }
9fc0f6
diff --git a/src/core/manager.c b/src/core/manager.c
9fc0f6
index a34a3c6..db5094f 100644
9fc0f6
--- a/src/core/manager.c
9fc0f6
+++ b/src/core/manager.c
9fc0f6
@@ -1389,7 +1389,7 @@ static int manager_dispatch_sigchld(Manager *m) {
9fc0f6
                 log_debug_unit(u->id,
9fc0f6
                                "Child %lu belongs to %s", (long unsigned) si.si_pid, u->id);
9fc0f6
 
9fc0f6
-                hashmap_remove(m->watch_pids, LONG_TO_PTR(si.si_pid));
9fc0f6
+                unit_unwatch_pid(u, si.si_pid);
9fc0f6
                 UNIT_VTABLE(u)->sigchld_event(u, si.si_pid, si.si_code, si.si_status);
9fc0f6
         }
9fc0f6
 
9fc0f6
diff --git a/src/core/scope.c b/src/core/scope.c
9fc0f6
index e75fc2b..22bdfb2 100644
9fc0f6
--- a/src/core/scope.c
9fc0f6
+++ b/src/core/scope.c
9fc0f6
@@ -35,6 +35,7 @@
9fc0f6
 static const UnitActiveState state_translation_table[_SCOPE_STATE_MAX] = {
9fc0f6
         [SCOPE_DEAD] = UNIT_INACTIVE,
9fc0f6
         [SCOPE_RUNNING] = UNIT_ACTIVE,
9fc0f6
+        [SCOPE_ABANDONED] = UNIT_ACTIVE,
9fc0f6
         [SCOPE_STOP_SIGTERM] = UNIT_DEACTIVATING,
9fc0f6
         [SCOPE_STOP_SIGKILL] = UNIT_DEACTIVATING,
9fc0f6
         [SCOPE_FAILED] = UNIT_FAILED
9fc0f6
@@ -67,9 +68,6 @@ static void scope_done(Unit *u) {
9fc0f6
         free(s->controller);
9fc0f6
         s->controller = NULL;
9fc0f6
 
9fc0f6
-        set_free(s->pids);
9fc0f6
-        s->pids = NULL;
9fc0f6
-
9fc0f6
         unit_unwatch_timer(u, &s->timer_watch);
9fc0f6
 }
9fc0f6
 
9fc0f6
@@ -84,6 +82,9 @@ static void scope_set_state(Scope *s, ScopeState state) {
9fc0f6
             state != SCOPE_STOP_SIGKILL)
9fc0f6
                 unit_unwatch_timer(UNIT(s), &s->timer_watch);
9fc0f6
 
9fc0f6
+        if (state == SCOPE_DEAD || state == SCOPE_FAILED)
9fc0f6
+                unit_unwatch_all_pids(UNIT(s));
9fc0f6
+
9fc0f6
         if (state != old_state)
9fc0f6
                 log_debug("%s changed %s -> %s",
9fc0f6
                           UNIT(s)->id,
9fc0f6
@@ -115,7 +116,7 @@ static int scope_verify(Scope *s) {
9fc0f6
         if (UNIT(s)->load_state != UNIT_LOADED)
9fc0f6
                 return 0;
9fc0f6
 
9fc0f6
-        if (set_size(s->pids) <= 0 && UNIT(s)->manager->n_reloading <= 0) {
9fc0f6
+        if (set_size(UNIT(s)->pids) <= 0 && UNIT(s)->manager->n_reloading <= 0) {
9fc0f6
                 log_error_unit(UNIT(s)->id, "Scope %s has no PIDs. Refusing.", UNIT(s)->id);
9fc0f6
                 return -EINVAL;
9fc0f6
         }
9fc0f6
@@ -169,6 +170,9 @@ static int scope_coldplug(Unit *u) {
9fc0f6
                                 return r;
9fc0f6
                 }
9fc0f6
 
9fc0f6
+                if (s->deserialized_state != SCOPE_DEAD && s->deserialized_state != SCOPE_FAILED)
9fc0f6
+                        unit_watch_all_pids(UNIT(s));
9fc0f6
+
9fc0f6
                 scope_set_state(s, s->deserialized_state);
9fc0f6
         }
9fc0f6
 
9fc0f6
@@ -209,6 +213,8 @@ static void scope_enter_signal(Scope *s, ScopeState state, ScopeResult f) {
9fc0f6
         if (f != SCOPE_SUCCESS)
9fc0f6
                 s->result = f;
9fc0f6
 
9fc0f6
+        unit_watch_all_pids(UNIT(s));
9fc0f6
+
9fc0f6
         /* If we have a controller set let's ask the controller nicely
9fc0f6
          * to terminate the scope, instead of us going directly into
9fc0f6
          * SIGTERM beserk mode */
9fc0f6
@@ -271,13 +277,10 @@ static int scope_start(Unit *u) {
9fc0f6
                 return r;
9fc0f6
         }
9fc0f6
 
9fc0f6
-        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, s->pids);
9fc0f6
+        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, UNIT(s)->pids);
9fc0f6
         if (r < 0)
9fc0f6
                 return r;
9fc0f6
 
9fc0f6
-        set_free(s->pids);
9fc0f6
-        s->pids = NULL;
9fc0f6
-
9fc0f6
         s->result = SCOPE_SUCCESS;
9fc0f6
 
9fc0f6
         scope_set_state(s, SCOPE_RUNNING);
9fc0f6
@@ -288,13 +291,13 @@ static int scope_stop(Unit *u) {
9fc0f6
         Scope *s = SCOPE(u);
9fc0f6
 
9fc0f6
         assert(s);
9fc0f6
-        assert(s->state == SCOPE_RUNNING);
9fc0f6
 
9fc0f6
         if (s->state == SCOPE_STOP_SIGTERM ||
9fc0f6
             s->state == SCOPE_STOP_SIGKILL)
9fc0f6
                 return 0;
9fc0f6
 
9fc0f6
-        assert(s->state == SCOPE_RUNNING);
9fc0f6
+        assert(s->state == SCOPE_RUNNING ||
9fc0f6
+               s->state == SCOPE_ABANDONED);
9fc0f6
 
9fc0f6
         scope_enter_signal(s, SCOPE_STOP_SIGTERM, SCOPE_SUCCESS);
9fc0f6
         return 0;
9fc0f6
@@ -358,7 +361,7 @@ static bool scope_check_gc(Unit *u) {
9fc0f6
         /* Never clean up scopes that still have a process around,
9fc0f6
          * even if the scope is formally dead. */
9fc0f6
 
9fc0f6
-        if (UNIT(s)->cgroup_path) {
9fc0f6
+        if (u->cgroup_path) {
9fc0f6
                 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, UNIT(s)->cgroup_path, true);
9fc0f6
                 if (r <= 0)
9fc0f6
                         return true;
9fc0f6
@@ -367,6 +370,33 @@ static bool scope_check_gc(Unit *u) {
9fc0f6
         return false;
9fc0f6
 }
9fc0f6
 
9fc0f6
+static void scope_notify_cgroup_empty_event(Unit *u) {
9fc0f6
+        Scope *s = SCOPE(u);
9fc0f6
+
9fc0f6
+        assert(u);
9fc0f6
+
9fc0f6
+        log_debug_unit(u->id, "%s: cgroup is empty", u->id);
9fc0f6
+        /* Only live scopes are finished off; compare each state explicitly, a bare enum constant is always true */
9fc0f6
+        if (s->state == SCOPE_RUNNING || s->state == SCOPE_ABANDONED ||
9fc0f6
+            s->state == SCOPE_STOP_SIGTERM || s->state == SCOPE_STOP_SIGKILL)
9fc0f6
+                scope_enter_dead(s, SCOPE_SUCCESS);
9fc0f6
+}
9fc0f6
+
9fc0f6
+static void scope_sigchld_event(Unit *u, pid_t pid, int code, int status) {
9fc0f6
+        /* If we get a SIGCHLD event for one of the processes we were
9fc0f6
+           interested in, then we look for others to watch, under the
9fc0f6
+           assumption that we'll sooner or later get a SIGCHLD for
9fc0f6
+           them, as the original process we watched was probably the
9fc0f6
+           parent of them, and they are hence now our children. */
9fc0f6
+
9fc0f6
+        unit_tidy_watch_pids(u, 0, 0);
9fc0f6
+        unit_watch_all_pids(u);
9fc0f6
+
9fc0f6
+        /* If the PID set is empty now, then let's finish this off */
9fc0f6
+        if (set_isempty(u->pids))
9fc0f6
+                scope_notify_cgroup_empty_event(u);
9fc0f6
+}
9fc0f6
+
9fc0f6
 static void scope_timer_event(Unit *u, uint64_t elapsed, Watch*w) {
9fc0f6
         Scope *s = SCOPE(u);
9fc0f6
 
9fc0f6
@@ -397,24 +427,30 @@ static void scope_timer_event(Unit *u, uint64_t elapsed, Watch*w) {
9fc0f6
         }
9fc0f6
 }
9fc0f6
 
9fc0f6
-static void scope_notify_cgroup_empty_event(Unit *u) {
9fc0f6
-        Scope *s = SCOPE(u);
9fc0f6
-        assert(u);
9fc0f6
+int scope_abandon(Scope *s) {
9fc0f6
+        assert(s);
9fc0f6
 
9fc0f6
-        log_debug_unit(u->id, "%s: cgroup is empty", u->id);
9fc0f6
+        if (s->state != SCOPE_RUNNING && s->state != SCOPE_ABANDONED)
9fc0f6
+                return -ESTALE;
9fc0f6
 
9fc0f6
-        switch (s->state) {
9fc0f6
+        free(s->controller);
9fc0f6
+        s->controller = NULL;
9fc0f6
 
9fc0f6
-        case SCOPE_RUNNING:
9fc0f6
-        case SCOPE_STOP_SIGTERM:
9fc0f6
-        case SCOPE_STOP_SIGKILL:
9fc0f6
-                scope_enter_dead(s, SCOPE_SUCCESS);
9fc0f6
+        /* The client is no longer watching the remaining processes,
9fc0f6
+         * so let's step in here, under the assumption that the
9fc0f6
+         * remaining processes will be sooner or later reassigned to
9fc0f6
+         * us as parent. */
9fc0f6
 
9fc0f6
-                break;
9fc0f6
+        unit_tidy_watch_pids(UNIT(s), 0, 0);
9fc0f6
+        unit_watch_all_pids(UNIT(s));
9fc0f6
 
9fc0f6
-        default:
9fc0f6
-                ;
9fc0f6
-        }
9fc0f6
+        /* If the PID set is empty now, then let's finish this off */
9fc0f6
+        if (set_isempty(UNIT(s)->pids))
9fc0f6
+                scope_notify_cgroup_empty_event(UNIT(s));
9fc0f6
+        else
9fc0f6
+                scope_set_state(s, SCOPE_ABANDONED);
9fc0f6
+
9fc0f6
+        return 0;
9fc0f6
 }
9fc0f6
 
9fc0f6
 _pure_ static UnitActiveState scope_active_state(Unit *u) {
9fc0f6
@@ -432,6 +468,7 @@ _pure_ static const char *scope_sub_state_to_string(Unit *u) {
9fc0f6
 static const char* const scope_state_table[_SCOPE_STATE_MAX] = {
9fc0f6
         [SCOPE_DEAD] = "dead",
9fc0f6
         [SCOPE_RUNNING] = "running",
9fc0f6
+        [SCOPE_ABANDONED] = "abandoned",
9fc0f6
         [SCOPE_STOP_SIGTERM] = "stop-sigterm",
9fc0f6
         [SCOPE_STOP_SIGKILL] = "stop-sigkill",
9fc0f6
         [SCOPE_FAILED] = "failed",
9fc0f6
@@ -481,6 +518,8 @@ const UnitVTable scope_vtable = {
9fc0f6
 
9fc0f6
         .check_gc = scope_check_gc,
9fc0f6
 
9fc0f6
+        .sigchld_event = scope_sigchld_event,
9fc0f6
+
9fc0f6
         .timer_event = scope_timer_event,
9fc0f6
 
9fc0f6
         .reset_failed = scope_reset_failed,
9fc0f6
diff --git a/src/core/scope.h b/src/core/scope.h
9fc0f6
index b4bafa7..1e9f201 100644
9fc0f6
--- a/src/core/scope.h
9fc0f6
+++ b/src/core/scope.h
9fc0f6
@@ -29,6 +29,7 @@ typedef struct Scope Scope;
9fc0f6
 typedef enum ScopeState {
9fc0f6
         SCOPE_DEAD,
9fc0f6
         SCOPE_RUNNING,
9fc0f6
+        SCOPE_ABANDONED,
9fc0f6
         SCOPE_STOP_SIGTERM,
9fc0f6
         SCOPE_STOP_SIGKILL,
9fc0f6
         SCOPE_FAILED,
9fc0f6
@@ -57,13 +58,13 @@ struct Scope {
9fc0f6
 
9fc0f6
         char *controller;
9fc0f6
 
9fc0f6
-        Set *pids;
9fc0f6
-
9fc0f6
         Watch timer_watch;
9fc0f6
 };
9fc0f6
 
9fc0f6
 extern const UnitVTable scope_vtable;
9fc0f6
 
9fc0f6
+int scope_abandon(Scope *s);
9fc0f6
+
9fc0f6
 const char* scope_state_to_string(ScopeState i) _const_;
9fc0f6
 ScopeState scope_state_from_string(const char *s) _pure_;
9fc0f6
 
9fc0f6
diff --git a/src/core/service.c b/src/core/service.c
9fc0f6
index f0acda1..41e5cb5 100644
9fc0f6
--- a/src/core/service.c
9fc0f6
+++ b/src/core/service.c
9fc0f6
@@ -1546,6 +1546,11 @@ static void service_set_state(Service *s, ServiceState state) {
9fc0f6
                 s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID;
9fc0f6
         }
9fc0f6
 
9fc0f6
+        if (state == SERVICE_DEAD ||
9fc0f6
+            state == SERVICE_FAILED ||
9fc0f6
+            state == SERVICE_AUTO_RESTART)
9fc0f6
+                unit_unwatch_all_pids(UNIT(s));
9fc0f6
+
9fc0f6
         if (state != SERVICE_START_PRE &&
9fc0f6
             state != SERVICE_START &&
9fc0f6
             state != SERVICE_START_POST &&
9fc0f6
@@ -1661,8 +1666,14 @@ static int service_coldplug(Unit *u) {
9fc0f6
                                         return r;
9fc0f6
                         }
9fc0f6
 
9fc0f6
+                if (s->deserialized_state != SERVICE_DEAD &&
9fc0f6
+                    s->deserialized_state != SERVICE_FAILED &&
9fc0f6
+                    s->deserialized_state != SERVICE_AUTO_RESTART)
9fc0f6
+                        unit_watch_all_pids(UNIT(s));
9fc0f6
+
9fc0f6
                 if (s->deserialized_state == SERVICE_START_POST ||
9fc0f6
-                    s->deserialized_state == SERVICE_RUNNING)
9fc0f6
+                    s->deserialized_state == SERVICE_RUNNING ||
9fc0f6
+                    s->deserialized_state == SERVICE_RELOAD)
9fc0f6
                         service_handle_watchdog(s);
9fc0f6
 
9fc0f6
                 service_set_state(s, s->deserialized_state);
9fc0f6
@@ -1970,6 +1981,7 @@ static void service_enter_stop_post(Service *s, ServiceResult f) {
9fc0f6
                 s->result = f;
9fc0f6
 
9fc0f6
         service_unwatch_control_pid(s);
9fc0f6
+        unit_watch_all_pids(UNIT(s));
9fc0f6
 
9fc0f6
         s->control_command = s->exec_command[SERVICE_EXEC_STOP_POST];
9fc0f6
         if (s->control_command) {
9fc0f6
@@ -2010,6 +2022,8 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f
9fc0f6
         if (f != SERVICE_SUCCESS)
9fc0f6
                 s->result = f;
9fc0f6
 
9fc0f6
+        unit_watch_all_pids(UNIT(s));
9fc0f6
+
9fc0f6
         r = unit_kill_context(
9fc0f6
                         UNIT(s),
9fc0f6
                         &s->kill_context,
9fc0f6
@@ -2055,6 +2069,7 @@ static void service_enter_stop(Service *s, ServiceResult f) {
9fc0f6
                 s->result = f;
9fc0f6
 
9fc0f6
         service_unwatch_control_pid(s);
9fc0f6
+        unit_watch_all_pids(UNIT(s));
9fc0f6
 
9fc0f6
         s->control_command = s->exec_command[SERVICE_EXEC_STOP];
9fc0f6
         if (s->control_command) {
9fc0f6
@@ -2961,6 +2976,62 @@ fail:
9fc0f6
         service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES);
9fc0f6
 }
9fc0f6
 
9fc0f6
+static void service_notify_cgroup_empty_event(Unit *u) {
9fc0f6
+        Service *s = SERVICE(u);
9fc0f6
+
9fc0f6
+        assert(u);
9fc0f6
+
9fc0f6
+        log_debug_unit(u->id, "%s: cgroup is empty", u->id);
9fc0f6
+
9fc0f6
+        switch (s->state) {
9fc0f6
+
9fc0f6
+                /* Waiting for SIGCHLD is usually more interesting,
9fc0f6
+                 * because it includes return codes/signals. Which is
9fc0f6
+                 * why we ignore the cgroup events for most cases,
9fc0f6
+                 * except when we don't know pid which to expect the
9fc0f6
+                 * SIGCHLD for. */
9fc0f6
+
9fc0f6
+        case SERVICE_START:
9fc0f6
+        case SERVICE_START_POST:
9fc0f6
+                /* If we were hoping for the daemon to write its PID file,
9fc0f6
+                 * we can give up now. */
9fc0f6
+                if (s->pid_file_pathspec) {
9fc0f6
+                        log_warning_unit(u->id,
9fc0f6
+                                         "%s never wrote its PID file. Failing.", UNIT(s)->id);
9fc0f6
+                        service_unwatch_pid_file(s);
9fc0f6
+                        if (s->state == SERVICE_START)
9fc0f6
+                                service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_FAILURE_RESOURCES);
9fc0f6
+                        else
9fc0f6
+                                service_enter_stop(s, SERVICE_FAILURE_RESOURCES);
9fc0f6
+                }
9fc0f6
+                break;
9fc0f6
+
9fc0f6
+        case SERVICE_RUNNING:
9fc0f6
+                /* service_enter_running() will figure out what to do */
9fc0f6
+                service_enter_running(s, SERVICE_SUCCESS);
9fc0f6
+                break;
9fc0f6
+
9fc0f6
+        case SERVICE_STOP_SIGTERM:
9fc0f6
+        case SERVICE_STOP_SIGKILL:
9fc0f6
+
9fc0f6
+                if (main_pid_good(s) <= 0 && !control_pid_good(s))
9fc0f6
+                        service_enter_stop_post(s, SERVICE_SUCCESS);
9fc0f6
+
9fc0f6
+                break;
9fc0f6
+
9fc0f6
+        case SERVICE_STOP_POST:
9fc0f6
+        case SERVICE_FINAL_SIGTERM:
9fc0f6
+        case SERVICE_FINAL_SIGKILL:
9fc0f6
+                if (main_pid_good(s) <= 0 && !control_pid_good(s))
9fc0f6
+                        service_enter_dead(s, SERVICE_SUCCESS, true);
9fc0f6
+
9fc0f6
+                break;
9fc0f6
+
9fc0f6
+        default:
9fc0f6
+                ;
9fc0f6
+        }
9fc0f6
+}
9fc0f6
+
9fc0f6
 static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
9fc0f6
         Service *s = SERVICE(u);
9fc0f6
         ServiceResult f;
9fc0f6
@@ -3229,6 +3300,18 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
9fc0f6
 
9fc0f6
         /* Notify clients about changed exit status */
9fc0f6
         unit_add_to_dbus_queue(u);
9fc0f6
+
9fc0f6
+        /* We got one SIGCHLD for the service, let's watch all
9fc0f6
+         * processes that are now running of the service, and watch
9fc0f6
+         * that. Among the PIDs we then watch will be children
9fc0f6
+         * reassigned to us, which hopefully allows us to identify
9fc0f6
+         * when all children are gone */
9fc0f6
+        unit_tidy_watch_pids(u, s->main_pid, s->control_pid);
9fc0f6
+        unit_watch_all_pids(u);
9fc0f6
+
9fc0f6
+        /* If the PID set is empty now, then let's finish this off */
9fc0f6
+        if (set_isempty(u->pids))
9fc0f6
+                service_notify_cgroup_empty_event(u);
9fc0f6
 }
9fc0f6
 
9fc0f6
 static void service_timer_event(Unit *u, uint64_t elapsed, Watch* w) {
9fc0f6
@@ -3332,61 +3415,6 @@ static void service_timer_event(Unit *u, uint64_t elapsed, Watch* w) {
9fc0f6
         }
9fc0f6
 }
9fc0f6
 
9fc0f6
-static void service_notify_cgroup_empty_event(Unit *u) {
9fc0f6
-        Service *s = SERVICE(u);
9fc0f6
-
9fc0f6
-        assert(u);
9fc0f6
-
9fc0f6
-        log_debug_unit(u->id, "%s: cgroup is empty", u->id);
9fc0f6
-
9fc0f6
-        switch (s->state) {
9fc0f6
-
9fc0f6
-                /* Waiting for SIGCHLD is usually more interesting,
9fc0f6
-                 * because it includes return codes/signals. Which is
9fc0f6
-                 * why we ignore the cgroup events for most cases,
9fc0f6
-                 * except when we don't know pid which to expect the
9fc0f6
-                 * SIGCHLD for. */
9fc0f6
-
9fc0f6
-        case SERVICE_START:
9fc0f6
-        case SERVICE_START_POST:
9fc0f6
-                /* If we were hoping for the daemon to write its PID file,
9fc0f6
-                 * we can give up now. */
9fc0f6
-                if (s->pid_file_pathspec) {
9fc0f6
-                        log_warning_unit(u->id,
9fc0f6
-                                         "%s never wrote its PID file. Failing.", UNIT(s)->id);
9fc0f6
-                        service_unwatch_pid_file(s);
9fc0f6
-                        if (s->state == SERVICE_START)
9fc0f6
-                                service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_FAILURE_RESOURCES);
9fc0f6
-                        else
9fc0f6
-                                service_enter_stop(s, SERVICE_FAILURE_RESOURCES);
9fc0f6
-                }
9fc0f6
-                break;
9fc0f6
-
9fc0f6
-        case SERVICE_RUNNING:
9fc0f6
-                /* service_enter_running() will figure out what to do */
9fc0f6
-                service_enter_running(s, SERVICE_SUCCESS);
9fc0f6
-                break;
9fc0f6
-
9fc0f6
-        case SERVICE_STOP_SIGTERM:
9fc0f6
-        case SERVICE_STOP_SIGKILL:
9fc0f6
-
9fc0f6
-                if (main_pid_good(s) <= 0 && !control_pid_good(s))
9fc0f6
-                        service_enter_stop_post(s, SERVICE_SUCCESS);
9fc0f6
-
9fc0f6
-                break;
9fc0f6
-
9fc0f6
-        case SERVICE_FINAL_SIGTERM:
9fc0f6
-        case SERVICE_FINAL_SIGKILL:
9fc0f6
-                if (main_pid_good(s) <= 0 && !control_pid_good(s))
9fc0f6
-                        service_enter_dead(s, SERVICE_SUCCESS, true);
9fc0f6
-
9fc0f6
-                break;
9fc0f6
-
9fc0f6
-        default:
9fc0f6
-                ;
9fc0f6
-        }
9fc0f6
-}
9fc0f6
-
9fc0f6
 static void service_notify_message(Unit *u, pid_t pid, char **tags) {
9fc0f6
         Service *s = SERVICE(u);
9fc0f6
         const char *e;
9fc0f6
diff --git a/src/core/unit.c b/src/core/unit.c
9fc0f6
index 6c2c4a0..0332094 100644
9fc0f6
--- a/src/core/unit.c
9fc0f6
+++ b/src/core/unit.c
9fc0f6
@@ -472,6 +472,8 @@ void unit_free(Unit *u) {
9fc0f6
 
9fc0f6
         set_free_free(u->names);
9fc0f6
 
9fc0f6
+        unit_unwatch_all_pids(u);
9fc0f6
+
9fc0f6
         condition_free_list(u->conditions);
9fc0f6
 
9fc0f6
         unit_ref_unset(&u->slice);
9fc0f6
@@ -1658,13 +1660,25 @@ void unit_unwatch_fd(Unit *u, Watch *w) {
9fc0f6
 }
9fc0f6
 
9fc0f6
 int unit_watch_pid(Unit *u, pid_t pid) {
9fc0f6
+        int q, r;
9fc0f6
+
9fc0f6
         assert(u);
9fc0f6
         assert(pid >= 1);
9fc0f6
 
9fc0f6
+        r = set_ensure_allocated(&u->pids, trivial_hash_func, trivial_compare_func);
9fc0f6
+        if (r < 0)
9fc0f6
+                return r;
9fc0f6
+
9fc0f6
         /* Watch a specific PID. We only support one unit watching
9fc0f6
          * each PID for now. */
9fc0f6
 
9fc0f6
-        return hashmap_put(u->manager->watch_pids, LONG_TO_PTR(pid), u);
9fc0f6
+        r = set_put(u->pids, LONG_TO_PTR(pid));
9fc0f6
+
9fc0f6
+        q = hashmap_put(u->manager->watch_pids, LONG_TO_PTR(pid), u);
9fc0f6
+        if (q < 0)
9fc0f6
+                return q;
9fc0f6
+
9fc0f6
+        return r;
9fc0f6
 }
9fc0f6
 
9fc0f6
 void unit_unwatch_pid(Unit *u, pid_t pid) {
9fc0f6
@@ -1672,6 +1686,102 @@ void unit_unwatch_pid(Unit *u, pid_t pid) {
9fc0f6
         assert(pid >= 1);
9fc0f6
 
9fc0f6
         hashmap_remove_value(u->manager->watch_pids, LONG_TO_PTR(pid), u);
9fc0f6
+        set_remove(u->pids, LONG_TO_PTR(pid));
9fc0f6
+}
9fc0f6
+
9fc0f6
+static int watch_pids_in_path(Unit *u, const char *path) {
9fc0f6
+        _cleanup_closedir_ DIR *d = NULL;
9fc0f6
+        _cleanup_fclose_ FILE *f = NULL;
9fc0f6
+        int ret = 0, r;
9fc0f6
+
9fc0f6
+        assert(u);
9fc0f6
+        assert(path);
9fc0f6
+
9fc0f6
+        /* Adds all PIDs from a specific cgroup path to the set of PIDs we watch. */
9fc0f6
+
9fc0f6
+        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
9fc0f6
+        if (r >= 0) {
9fc0f6
+                pid_t pid;
9fc0f6
+
9fc0f6
+                while ((r = cg_read_pid(f, &pid)) > 0) {
9fc0f6
+                        r = unit_watch_pid(u, pid);
9fc0f6
+                        if (r < 0 && ret >= 0)
9fc0f6
+                                ret = r;
9fc0f6
+                }
9fc0f6
+                if (r < 0 && ret >= 0)
9fc0f6
+                        ret = r;
9fc0f6
+
9fc0f6
+        } else if (ret >= 0)
9fc0f6
+                ret = r;
9fc0f6
+
9fc0f6
+        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
9fc0f6
+        if (r >= 0) {
9fc0f6
+                char *fn;
9fc0f6
+
9fc0f6
+                while ((r = cg_read_subgroup(d, &fn)) > 0) {
9fc0f6
+                        _cleanup_free_ char *p = NULL;
9fc0f6
+
9fc0f6
+                        p = strjoin(path, "/", fn, NULL);
9fc0f6
+                        free(fn);
9fc0f6
+
9fc0f6
+                        if (!p)
9fc0f6
+                                return -ENOMEM;
9fc0f6
+
9fc0f6
+                        r = watch_pids_in_path(u, p);
9fc0f6
+                        if (r < 0 && ret >= 0)
9fc0f6
+                                ret = r;
9fc0f6
+                }
9fc0f6
+                if (r < 0 && ret >= 0)
9fc0f6
+                        ret = r;
9fc0f6
+
9fc0f6
+        } else if (ret >= 0)
9fc0f6
+                ret = r;
9fc0f6
+
9fc0f6
+        return ret;
9fc0f6
+}
9fc0f6
+
9fc0f6
+
9fc0f6
+int unit_watch_all_pids(Unit *u) {
9fc0f6
+        assert(u);
9fc0f6
+
9fc0f6
+        if (!u->cgroup_path)
9fc0f6
+                return -ENOENT;
9fc0f6
+
9fc0f6
+        /* Adds all PIDs from our cgroup to the set of PIDs we watch */
9fc0f6
+
9fc0f6
+        return watch_pids_in_path(u, u->cgroup_path);
9fc0f6
+}
9fc0f6
+
9fc0f6
+void unit_unwatch_all_pids(Unit *u) {
9fc0f6
+        Iterator i;
9fc0f6
+        void *e;
9fc0f6
+
9fc0f6
+        assert(u);
9fc0f6
+
9fc0f6
+        SET_FOREACH(e, u->pids, i)
9fc0f6
+                hashmap_remove_value(u->manager->watch_pids, e, u);
9fc0f6
+
9fc0f6
+        set_free(u->pids);
9fc0f6
+        u->pids = NULL;
9fc0f6
+}
9fc0f6
+
9fc0f6
+void unit_tidy_watch_pids(Unit *u, pid_t except1, pid_t except2) {
9fc0f6
+        Iterator i;
9fc0f6
+        void *e;
9fc0f6
+
9fc0f6
+        assert(u);
9fc0f6
+
9fc0f6
+        /* Drops PIDs that no longer exist (except1/except2 are always kept) from our set and the manager's watch table */
9fc0f6
+
9fc0f6
+        SET_FOREACH(e, u->pids, i) {
9fc0f6
+                pid_t pid = PTR_TO_LONG(e);
9fc0f6
+
9fc0f6
+                if (pid == except1 || pid == except2)
9fc0f6
+                        continue;
9fc0f6
+
9fc0f6
+                if (kill(pid, 0) < 0 && errno == ESRCH)
9fc0f6
+                        unit_unwatch_pid(u, pid);
9fc0f6
+        }
9fc0f6
 }
9fc0f6
 
9fc0f6
 int unit_watch_timer(Unit *u, clockid_t clock_id, bool relative, usec_t usec, Watch *w) {
9fc0f6
diff --git a/src/core/unit.h b/src/core/unit.h
9fc0f6
index 6dd750f..6dff25e 100644
9fc0f6
--- a/src/core/unit.h
9fc0f6
+++ b/src/core/unit.h
9fc0f6
@@ -198,6 +198,11 @@ struct Unit {
9fc0f6
         /* CGroup realize members queue */
9fc0f6
         LIST_FIELDS(Unit, cgroup_queue);
9fc0f6
 
9fc0f6
+        /* PIDs we keep an eye on. Note that a unit might have many
9fc0f6
+         * more, but these are the ones we care enough about to
9fc0f6
+         * process SIGCHLD for */
9fc0f6
+        Set *pids;
9fc0f6
+
9fc0f6
         /* Used during GC sweeps */
9fc0f6
         unsigned gc_marker;
9fc0f6
 
9fc0f6
@@ -531,6 +536,10 @@ void unit_unwatch_fd(Unit *u, Watch *w);
9fc0f6
 
9fc0f6
 int unit_watch_pid(Unit *u, pid_t pid);
9fc0f6
 void unit_unwatch_pid(Unit *u, pid_t pid);
9fc0f6
+int unit_watch_all_pids(Unit *u);
9fc0f6
+void unit_unwatch_all_pids(Unit *u);
9fc0f6
+
9fc0f6
+void unit_tidy_watch_pids(Unit *u, pid_t except1, pid_t except2);
9fc0f6
 
9fc0f6
 int unit_watch_timer(Unit *u, clockid_t, bool relative, usec_t usec, Watch *w);
9fc0f6
 void unit_unwatch_timer(Unit *u, Watch *w);