d0811f
From b554f941a8f275124508794b0b83f0554c7b84dc Mon Sep 17 00:00:00 2001
d0811f
From: Anita Zhang <the.anitazha@gmail.com>
d0811f
Date: Thu, 22 Oct 2020 22:44:22 -0700
d0811f
Subject: [PATCH 2/3] core: clean up inactive/failed {service|scope}'s cgroups
d0811f
 when the last process exits
d0811f
d0811f
If processes remain in the unit's cgroup after the final SIGKILL is
d0811f
sent and the unit has exceeded stop timeout, don't release the unit's
d0811f
cgroup information. Pid1 will have failed to `rmdir` the cgroup path due
d0811f
to processes remaining in the cgroup and releasing would leave the cgroup
d0811f
path on the file system with no tracking for pid1 to clean it up.
d0811f
d0811f
Instead, keep the information around until the last process exits and pid1
d0811f
sends the cgroup empty notification. The service/scope can then prune
d0811f
the cgroup if the unit is inactive/failed.
d0811f
---
d0811f
 src/core/cgroup.c  | 26 +++++++++++++++++++++++++-
d0811f
 src/core/cgroup.h  |  6 +++++-
d0811f
 src/core/scope.c   |  5 +++++
d0811f
 src/core/service.c |  7 +++++++
d0811f
 4 files changed, 42 insertions(+), 2 deletions(-)
d0811f
d0811f
diff --git a/src/core/cgroup.c b/src/core/cgroup.c
d0811f
index 031b28a684..bce5f44e78 100644
d0811f
--- a/src/core/cgroup.c
d0811f
+++ b/src/core/cgroup.c
d0811f
@@ -2414,6 +2414,29 @@ void unit_release_cgroup(Unit *u) {
d0811f
         }
d0811f
 }
d0811f
 
d0811f
+bool unit_maybe_release_cgroup(Unit *u) {
d0811f
+        int r;
d0811f
+
d0811f
+        assert(u);
d0811f
+
d0811f
+        if (!u->cgroup_path)
d0811f
+                return true;
d0811f
+
d0811f
+        /* Don't release the cgroup if there are still processes under it. If we get notified later when all the
d0811f
+         * processes exit (e.g. the processes were in D-state and exited after the unit was marked as failed)
d0811f
+         * we need the cgroup paths to continue to be tracked by the manager so they can be looked up and cleaned
d0811f
+         * up later. */
d0811f
+        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
d0811f
+        if (r < 0)
d0811f
+                log_unit_debug_errno(u, r, "Error checking if the cgroup is recursively empty, ignoring: %m");
d0811f
+        else if (r == 1) {
d0811f
+                unit_release_cgroup(u);
d0811f
+                return true;
d0811f
+        }
d0811f
+
d0811f
+        return false;
d0811f
+}
d0811f
+
d0811f
 void unit_prune_cgroup(Unit *u) {
d0811f
         int r;
d0811f
         bool is_root_slice;
d0811f
@@ -2441,7 +2464,8 @@ void unit_prune_cgroup(Unit *u) {
d0811f
         if (is_root_slice)
d0811f
                 return;
d0811f
 
d0811f
-        unit_release_cgroup(u);
d0811f
+        if (!unit_maybe_release_cgroup(u)) /* Returns true if the cgroup was released */
d0811f
+                return;
d0811f
 
d0811f
         u->cgroup_realized = false;
d0811f
         u->cgroup_realized_mask = 0;
d0811f
diff --git a/src/core/cgroup.h b/src/core/cgroup.h
d0811f
index 52d028e740..be6856c20c 100644
d0811f
--- a/src/core/cgroup.h
d0811f
+++ b/src/core/cgroup.h
d0811f
@@ -220,11 +220,15 @@ int unit_set_cgroup_path(Unit *u, const char *path);
d0811f
 int unit_pick_cgroup_path(Unit *u);
d0811f
 
d0811f
 int unit_realize_cgroup(Unit *u);
d0811f
-void unit_release_cgroup(Unit *u);
d0811f
 void unit_prune_cgroup(Unit *u);
d0811f
 int unit_watch_cgroup(Unit *u);
d0811f
 int unit_watch_cgroup_memory(Unit *u);
d0811f
 
d0811f
+void unit_release_cgroup(Unit *u);
d0811f
+/* Releases the cgroup only if it is recursively empty.
d0811f
+ * Returns true if the cgroup was released, false otherwise. */
d0811f
+bool unit_maybe_release_cgroup(Unit *u);
d0811f
+
d0811f
 void unit_add_to_cgroup_empty_queue(Unit *u);
d0811f
 int unit_check_oom(Unit *u);
d0811f
 
d0811f
diff --git a/src/core/scope.c b/src/core/scope.c
d0811f
index 42c51b0865..ffee783a4c 100644
d0811f
--- a/src/core/scope.c
d0811f
+++ b/src/core/scope.c
d0811f
@@ -487,6 +487,11 @@ static void scope_notify_cgroup_empty_event(Unit *u) {
d0811f
 
d0811f
         if (IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
d0811f
                 scope_enter_dead(s, SCOPE_SUCCESS);
d0811f
+
d0811f
+        /* If the cgroup empty notification comes when the unit is not active, we must have failed to clean
d0811f
+         * up the cgroup earlier and should do it now. */
d0811f
+        if (IN_SET(s->state, SCOPE_DEAD, SCOPE_FAILED))
d0811f
+                unit_prune_cgroup(u);
d0811f
 }
d0811f
 
d0811f
 static void scope_sigchld_event(Unit *u, pid_t pid, int code, int status) {
d0811f
diff --git a/src/core/service.c b/src/core/service.c
d0811f
index 00e61945ba..db8f596ca6 100644
d0811f
--- a/src/core/service.c
d0811f
+++ b/src/core/service.c
d0811f
@@ -3334,6 +3334,13 @@ static void service_notify_cgroup_empty_event(Unit *u) {
d0811f
 
d0811f
                 break;
d0811f
 
d0811f
+        /* If the cgroup empty notification comes when the unit is not active, we must have failed to clean
d0811f
+         * up the cgroup earlier and should do it now. */
d0811f
+        case SERVICE_DEAD:
d0811f
+        case SERVICE_FAILED:
d0811f
+                unit_prune_cgroup(u);
d0811f
+                break;
d0811f
+
d0811f
         default:
d0811f
                 ;
d0811f
         }
d0811f
-- 
d0811f
2.24.1
d0811f