|
|
505ca8 |
From 5adb2f01405d7cb7ba3cf9d4ee035f57952f79a6 Mon Sep 17 00:00:00 2001
|
|
|
d0811f |
From: Chris Down <chris@chrisdown.name>
|
|
|
d0811f |
Date: Thu, 29 Oct 2020 12:03:52 +0000
|
|
|
505ca8 |
Subject: [PATCH 3/3] bpf: pid1: Pin reference to BPF programs for
|
|
|
d0811f |
post-coldplug
|
|
|
d0811f |
MIME-Version: 1.0
|
|
|
d0811f |
Content-Type: text/plain; charset=UTF-8
|
|
|
d0811f |
Content-Transfer-Encoding: 8bit
|
|
|
d0811f |
|
|
|
d0811f |
During `daemon-reload` and `daemon-reexec`, we detach and reattach all
|
|
|
d0811f |
BPF programs attached to cgroups. This, however, poses a real practical
|
|
|
d0811f |
problem for DevicePolicy (and some other settings using BPF): it
|
|
|
d0811f |
presents a period of time where the old device filtering BPF program has
|
|
|
d0811f |
been unloaded, but the new one has not been loaded yet.
|
|
|
d0811f |
|
|
|
d0811f |
Since the filtering is at open() time, it has become apparent that that
|
|
|
d0811f |
there's a non-trivial period where applications inside that ostensibly
|
|
|
d0811f |
filtered cgroup can grab any device -- and often do so -- and then
|
|
|
d0811f |
retain access to that device even after the reload is over. Due to the
|
|
|
d0811f |
file continuing to be available after the initial open(), this issue is
|
|
|
d0811f |
particularly visible for DevicePolicy={strict,closed}, however it also
|
|
|
d0811f |
applies to other BPF programs we install.
|
|
|
d0811f |
|
|
|
d0811f |
In particular, for BPF ingress/egress filtering this may have more
|
|
|
d0811f |
concerning implications: network traffic which is supposed to be
|
|
|
d0811f |
filtered will -- for a very brief period of time -- not be filtered or
|
|
|
d0811f |
subject to any restrictions imposed by BPF.
|
|
|
d0811f |
|
|
|
d0811f |
These BPF programs are fundamentally attached to a cgroup lifetime, not
|
|
|
d0811f |
our unit lifetime, so it's enough to pin these programs by taking a
|
|
|
d0811f |
reference to affected BPF programs before reload/reexec. We can then
|
|
|
d0811f |
serialise the program's kernel-facing FD and cgroup attachment FD for
|
|
|
d0811f |
the new daemon, and have the daemon on the other side unpin the programs
|
|
|
d0811f |
after it's finished with coldplug.
|
|
|
d0811f |
|
|
|
d0811f |
That means that, for example, the BPF program lifecycle during
|
|
|
d0811f |
daemon-reload or daemon-reexec changes from this:
|
|
|
d0811f |
|
|
|
d0811f |
manager_clear_jobs_and_units
|
|
|
d0811f |
│
|
|
|
d0811f |
╔══════╪═════════╤═══════╗
|
|
|
d0811f |
║ prog │ no prog │ prog' ║
|
|
|
d0811f |
╚══════╧═════════╪═══════╝
|
|
|
d0811f |
│
|
|
|
d0811f |
manager_coldplug
|
|
|
d0811f |
|
|
|
d0811f |
to this:
|
|
|
d0811f |
|
|
|
d0811f |
manager_clear_jobs_and_units manager_dispatch_cgroup_realize_queue
|
|
|
d0811f |
│ │
|
|
|
d0811f |
╔══════╪═══════════════╤═══════════════════════╪═══════╗
|
|
|
d0811f |
║ prog │ prog (orphan) │ prog (orphan) + prog' │ prog' ║
|
|
|
d0811f |
╚══════╧═══════════════╪═══════════════════════╧═══════╝
|
|
|
d0811f |
│
|
|
|
d0811f |
manager_coldplug
|
|
|
d0811f |
|
|
|
d0811f |
For daemon-reexec the semantics are mostly the same, but the point at
|
|
|
d0811f |
which the program becomes orphan is tied to the process lifecycle
|
|
|
d0811f |
instead.
|
|
|
d0811f |
|
|
|
d0811f |
None of the BPF programs we install require exclusive access, so having
|
|
|
d0811f |
multiple instances of them running at the same time is fine. Custom
|
|
|
d0811f |
programs, of course, are unknown, but it's hard to imagine legitimate
|
|
|
d0811f |
cases which should be affected, whereas the benefits of this "overlap"
|
|
|
d0811f |
approach with reference pinning is immediately tangible.
|
|
|
d0811f |
|
|
|
d0811f |
[keszybz: use _cleanup_ for unpin, use FOREACH_POINTER]
|
|
|
d0811f |
---
|
|
|
d0811f |
src/core/bpf-firewall.c | 9 +--
|
|
|
d0811f |
src/core/main.c | 9 +++
|
|
|
505ca8 |
src/core/manager.c | 161 ++++++++++++++++++++++++++++++++++++++-
|
|
|
d0811f |
src/core/manager.h | 6 ++
|
|
|
d0811f |
src/shared/bpf-program.c | 10 +++
|
|
|
d0811f |
src/shared/bpf-program.h | 1 +
|
|
|
505ca8 |
6 files changed, 189 insertions(+), 7 deletions(-)
|
|
|
d0811f |
|
|
|
d0811f |
diff --git a/src/core/bpf-firewall.c b/src/core/bpf-firewall.c
|
|
|
505ca8 |
index 99783aca22..c205ba1b19 100644
|
|
|
d0811f |
--- a/src/core/bpf-firewall.c
|
|
|
d0811f |
+++ b/src/core/bpf-firewall.c
|
|
|
505ca8 |
@@ -702,8 +702,7 @@ int bpf_firewall_install(Unit *u) {
|
|
|
d0811f |
if (r < 0)
|
|
|
d0811f |
return log_unit_error_errno(u, r, "Failed to determine cgroup path: %m");
|
|
|
505ca8 |
|
|
|
d0811f |
- flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
|
|
|
d0811f |
- (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0;
|
|
|
d0811f |
+ flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI) ? BPF_F_ALLOW_MULTI : 0;
|
|
|
505ca8 |
|
|
|
d0811f |
/* Unref the old BPF program (which will implicitly detach it) right before attaching the new program, to
|
|
|
d0811f |
* minimize the time window when we don't account for IP traffic. */
|
|
|
505ca8 |
@@ -711,8 +710,7 @@ int bpf_firewall_install(Unit *u) {
|
|
|
d0811f |
u->ip_bpf_ingress_installed = bpf_program_unref(u->ip_bpf_ingress_installed);
|
|
|
505ca8 |
|
|
|
d0811f |
if (u->ip_bpf_egress) {
|
|
|
d0811f |
- r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path,
|
|
|
d0811f |
- flags | (set_isempty(u->ip_bpf_custom_egress) ? 0 : BPF_F_ALLOW_MULTI));
|
|
|
d0811f |
+ r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
|
|
|
d0811f |
if (r < 0)
|
|
|
d0811f |
return log_unit_error_errno(u, r, "Attaching egress BPF program to cgroup %s failed: %m", path);
|
|
|
505ca8 |
|
|
|
505ca8 |
@@ -721,8 +719,7 @@ int bpf_firewall_install(Unit *u) {
|
|
|
d0811f |
}
|
|
|
505ca8 |
|
|
|
d0811f |
if (u->ip_bpf_ingress) {
|
|
|
d0811f |
- r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path,
|
|
|
d0811f |
- flags | (set_isempty(u->ip_bpf_custom_ingress) ? 0 : BPF_F_ALLOW_MULTI));
|
|
|
d0811f |
+ r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
|
|
|
d0811f |
if (r < 0)
|
|
|
d0811f |
return log_unit_error_errno(u, r, "Attaching ingress BPF program to cgroup %s failed: %m", path);
|
|
|
505ca8 |
|
|
|
d0811f |
diff --git a/src/core/main.c b/src/core/main.c
|
|
|
505ca8 |
index a280b756ff..2ace4cb89c 100644
|
|
|
d0811f |
--- a/src/core/main.c
|
|
|
d0811f |
+++ b/src/core/main.c
|
|
|
d0811f |
@@ -1144,6 +1144,14 @@ static int prepare_reexecute(
|
|
|
d0811f |
if (!fds)
|
|
|
d0811f |
return log_oom();
|
|
|
505ca8 |
|
|
|
d0811f |
+ /* We need existing BPF programs to survive reload, otherwise there will be a period where no BPF
|
|
|
d0811f |
+ * program is active during task execution within a cgroup. This would be bad since this may have
|
|
|
d0811f |
+ * security or reliability implications: devices we should filter won't be filtered, network activity
|
|
|
d0811f |
+ * we should filter won't be filtered, etc. We pin all the existing devices by bumping their
|
|
|
d0811f |
+ * refcount, and then storing them to later have it decremented. */
|
|
|
d0811f |
+ _cleanup_(manager_unpin_all_cgroup_bpf_programsp) Manager *m_unpin =
|
|
|
d0811f |
+ manager_pin_all_cgroup_bpf_programs(m);
|
|
|
d0811f |
+
|
|
|
d0811f |
r = manager_serialize(m, f, fds, switching_root);
|
|
|
d0811f |
if (r < 0)
|
|
|
d0811f |
return r;
|
|
|
d0811f |
@@ -1159,6 +1167,7 @@ static int prepare_reexecute(
|
|
|
d0811f |
if (r < 0)
|
|
|
d0811f |
return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization fds: %m");
|
|
|
505ca8 |
|
|
|
d0811f |
+ TAKE_PTR(m_unpin);
|
|
|
d0811f |
*ret_f = TAKE_PTR(f);
|
|
|
d0811f |
*ret_fds = TAKE_PTR(fds);
|
|
|
505ca8 |
|
|
|
d0811f |
diff --git a/src/core/manager.c b/src/core/manager.c
|
|
|
505ca8 |
index a1d6f7cc10..b373d27844 100644
|
|
|
d0811f |
--- a/src/core/manager.c
|
|
|
d0811f |
+++ b/src/core/manager.c
|
|
|
505ca8 |
@@ -65,6 +65,7 @@
|
|
|
d0811f |
#include "rm-rf.h"
|
|
|
505ca8 |
#include "selinux-util.h"
|
|
|
d0811f |
#include "serialize.h"
|
|
|
d0811f |
+#include "set.h"
|
|
|
d0811f |
#include "signal-util.h"
|
|
|
d0811f |
#include "socket-util.h"
|
|
|
d0811f |
#include "special.h"
|
|
|
505ca8 |
@@ -3217,6 +3218,79 @@ static void manager_serialize_gid_refs(Manager *m, FILE *f) {
|
|
|
d0811f |
manager_serialize_uid_refs_internal(m, f, &m->gid_refs, "destroy-ipc-gid");
|
|
|
d0811f |
}
|
|
|
505ca8 |
|
|
|
d0811f |
+static int serialize_limbo_bpf_program(FILE *f, FDSet *fds, BPFProgram *p) {
|
|
|
d0811f |
+ int copy;
|
|
|
d0811f |
+ _cleanup_free_ char *ap = NULL;
|
|
|
d0811f |
+
|
|
|
d0811f |
+ /* We don't actually need the instructions or other data, since this is only used on the other side
|
|
|
d0811f |
+ * for BPF limbo, which just requires the program type, cgroup path, and kernel-facing BPF file
|
|
|
d0811f |
+ * descriptor. We don't even need to know what unit or directive it's attached to, since we're just
|
|
|
d0811f |
+ * going to expire it after coldplug. */
|
|
|
d0811f |
+
|
|
|
d0811f |
+ assert(f);
|
|
|
d0811f |
+ assert(p);
|
|
|
d0811f |
+
|
|
|
d0811f |
+ /* If the program isn't attached to the kernel yet, there's no reason to serialise it for limbo. Just
|
|
|
d0811f |
+ * let it be skeletonized and then coldplug can do the work on the other side if it's still
|
|
|
d0811f |
+ * necessary. */
|
|
|
d0811f |
+ if (p->kernel_fd < 0 || !p->attached_path)
|
|
|
d0811f |
+ return -ENOTCONN;
|
|
|
d0811f |
+
|
|
|
d0811f |
+ copy = fdset_put_dup(fds, p->kernel_fd);
|
|
|
d0811f |
+ if (copy < 0)
|
|
|
d0811f |
+ return log_error_errno(copy, "Failed to add file descriptor to serialization set: %m");
|
|
|
d0811f |
+
|
|
|
d0811f |
+ /* Otherwise, on daemon-reload, we'd remain pinned. */
|
|
|
d0811f |
+ safe_close(p->kernel_fd);
|
|
|
d0811f |
+
|
|
|
d0811f |
+ ap = cescape(p->attached_path);
|
|
|
d0811f |
+ if (!ap)
|
|
|
d0811f |
+ return log_oom();
|
|
|
d0811f |
+
|
|
|
d0811f |
+ return serialize_item_format(f, "bpf-limbo", "%i %i %i \"%s\"",
|
|
|
d0811f |
+ copy, p->prog_type, p->attached_type, ap);
|
|
|
d0811f |
+}
|
|
|
d0811f |
+
|
|
|
d0811f |
+static void deserialize_limbo_bpf_program(Manager *m, FDSet *fds, const char *value) {
|
|
|
d0811f |
+ _cleanup_free_ char *raw_fd = NULL, *raw_pt = NULL, *raw_at = NULL, *cgpath = NULL;
|
|
|
d0811f |
+ int fd, r, prog_type, attached_type;
|
|
|
d0811f |
+
|
|
|
d0811f |
+ assert(m);
|
|
|
d0811f |
+ assert(value);
|
|
|
d0811f |
+
|
|
|
d0811f |
+ r = extract_first_word(&value, &raw_fd, NULL, 0);
|
|
|
d0811f |
+ if (r <= 0 || safe_atoi(raw_fd, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd))
|
|
|
d0811f |
+ return (void) log_error("Failed to parse bpf-limbo FD: %s", value);
|
|
|
d0811f |
+
|
|
|
d0811f |
+ r = extract_first_word(&value, &raw_pt, NULL, 0);
|
|
|
d0811f |
+ if (r <= 0 || safe_atoi(raw_pt, &prog_type) < 0)
|
|
|
d0811f |
+ return (void) log_error("Failed to parse bpf-limbo program type: %s", value);
|
|
|
d0811f |
+
|
|
|
d0811f |
+ r = extract_first_word(&value, &raw_at, NULL, 0);
|
|
|
d0811f |
+ if (r <= 0 || safe_atoi(raw_at, &attached_type) < 0)
|
|
|
d0811f |
+ return (void) log_error("Failed to parse bpf-limbo attached type: %s", value);
|
|
|
d0811f |
+
|
|
|
d0811f |
+ r = extract_first_word(&value, &cgpath, NULL, EXTRACT_CUNESCAPE | EXTRACT_UNQUOTE);
|
|
|
d0811f |
+ if (r <= 0)
|
|
|
d0811f |
+ return (void) log_error("Failed to parse attached path for BPF limbo FD %s", value);
|
|
|
d0811f |
+
|
|
|
d0811f |
+ _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
|
|
|
d0811f |
+ r = bpf_program_new(prog_type, &p);
|
|
|
d0811f |
+ if (r < 0)
|
|
|
d0811f |
+ return (void) log_error_errno(r, "Failed to create BPF limbo program: %m");
|
|
|
d0811f |
+
|
|
|
d0811f |
+ /* Just enough to free it when the time is right, this does not have enough information be used as a
|
|
|
d0811f |
+ * real BPFProgram. */
|
|
|
d0811f |
+ p->attached_type = attached_type;
|
|
|
d0811f |
+ p->kernel_fd = fdset_remove(fds, fd);
|
|
|
d0811f |
+ p->attached_path = TAKE_PTR(cgpath);
|
|
|
d0811f |
+
|
|
|
d0811f |
+ r = set_ensure_put(&m->bpf_limbo_progs, NULL, p);
|
|
|
d0811f |
+ if (r < 0)
|
|
|
d0811f |
+ return (void) log_error_errno(r, "Failed to register BPF limbo program for FD %s: %m", value);
|
|
|
d0811f |
+ TAKE_PTR(p);
|
|
|
d0811f |
+}
|
|
|
d0811f |
+
|
|
|
d0811f |
int manager_serialize(
|
|
|
d0811f |
Manager *m,
|
|
|
d0811f |
FILE *f,
|
|
|
505ca8 |
@@ -3226,6 +3300,7 @@ int manager_serialize(
|
|
|
505ca8 |
const char *t;
|
|
|
d0811f |
Unit *u;
|
|
|
d0811f |
int r;
|
|
|
d0811f |
+ BPFProgram *p;
|
|
|
505ca8 |
|
|
|
d0811f |
assert(m);
|
|
|
d0811f |
assert(f);
|
|
|
505ca8 |
@@ -3270,6 +3345,9 @@ int manager_serialize(
|
|
|
d0811f |
(void) serialize_dual_timestamp(f, joined, m->timestamps + q);
|
|
|
d0811f |
}
|
|
|
505ca8 |
|
|
|
505ca8 |
+ SET_FOREACH(p, m->bpf_limbo_progs)
|
|
|
d0811f |
+ (void) serialize_limbo_bpf_program(f, fds, p);
|
|
|
d0811f |
+
|
|
|
d0811f |
if (!switching_root)
|
|
|
d0811f |
(void) serialize_strv(f, "env", m->client_environment);
|
|
|
505ca8 |
|
|
|
505ca8 |
@@ -3588,7 +3666,10 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) {
|
|
|
d0811f |
else
|
|
|
d0811f |
m->n_failed_jobs += n;
|
|
|
505ca8 |
|
|
|
d0811f |
- } else if ((val = startswith(l, "taint-usr="))) {
|
|
|
d0811f |
+ } else if ((val = startswith(l, "bpf-limbo=")))
|
|
|
d0811f |
+ deserialize_limbo_bpf_program(m, fds, val);
|
|
|
d0811f |
+
|
|
|
d0811f |
+ else if ((val = startswith(l, "taint-usr="))) {
|
|
|
d0811f |
int b;
|
|
|
505ca8 |
|
|
|
d0811f |
b = parse_boolean(val);
|
|
|
505ca8 |
@@ -3764,6 +3845,65 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) {
|
|
|
d0811f |
return manager_deserialize_units(m, f, fds);
|
|
|
d0811f |
}
|
|
|
505ca8 |
|
|
|
d0811f |
+Manager* manager_pin_all_cgroup_bpf_programs(Manager *m) {
|
|
|
d0811f |
+ int r;
|
|
|
d0811f |
+ Unit *u;
|
|
|
d0811f |
+
|
|
|
d0811f |
+ assert(m);
|
|
|
d0811f |
+
|
|
|
505ca8 |
+ HASHMAP_FOREACH(u, m->units) {
|
|
|
d0811f |
+ BPFProgram *p;
|
|
|
d0811f |
+
|
|
|
d0811f |
+ FOREACH_POINTER(p,
|
|
|
d0811f |
+ u->bpf_device_control_installed,
|
|
|
d0811f |
+ u->ip_bpf_ingress,
|
|
|
d0811f |
+ u->ip_bpf_ingress_installed,
|
|
|
d0811f |
+ u->ip_bpf_egress,
|
|
|
d0811f |
+ u->ip_bpf_egress_installed)
|
|
|
d0811f |
+ if (p) {
|
|
|
d0811f |
+ r = set_ensure_put(&m->bpf_limbo_progs, NULL, p);
|
|
|
d0811f |
+ if (r < 0) {
|
|
|
d0811f |
+ log_unit_error_errno(u, r, "Cannot store BPF program for reload, ignoring: %m");
|
|
|
d0811f |
+ continue;
|
|
|
d0811f |
+ }
|
|
|
d0811f |
+
|
|
|
d0811f |
+ bpf_program_ref(p);
|
|
|
d0811f |
+ }
|
|
|
d0811f |
+
|
|
|
d0811f |
+ Set *s;
|
|
|
d0811f |
+ FOREACH_POINTER(s,
|
|
|
d0811f |
+ u->ip_bpf_custom_ingress,
|
|
|
d0811f |
+ u->ip_bpf_custom_ingress_installed,
|
|
|
d0811f |
+ u->ip_bpf_custom_egress,
|
|
|
d0811f |
+ u->ip_bpf_custom_egress_installed)
|
|
|
505ca8 |
+ SET_FOREACH(p, s) {
|
|
|
d0811f |
+ r = set_ensure_put(&m->bpf_limbo_progs, NULL, p);
|
|
|
d0811f |
+ if (r < 0) {
|
|
|
d0811f |
+ log_unit_error_errno(u, r, "Cannot store BPF program for reload, ignoring: %m");
|
|
|
d0811f |
+ continue;
|
|
|
d0811f |
+ }
|
|
|
d0811f |
+
|
|
|
d0811f |
+ bpf_program_ref(p);
|
|
|
d0811f |
+ }
|
|
|
d0811f |
+ }
|
|
|
d0811f |
+
|
|
|
d0811f |
+ log_debug("Pinned %d BPF programs", set_size(m->bpf_limbo_progs));
|
|
|
d0811f |
+
|
|
|
d0811f |
+ return m;
|
|
|
d0811f |
+}
|
|
|
d0811f |
+
|
|
|
d0811f |
+static void manager_skeletonize_all_cgroup_bpf_programs(Manager *m) {
|
|
|
d0811f |
+ BPFProgram *p;
|
|
|
d0811f |
+
|
|
|
505ca8 |
+ SET_FOREACH(p, m->bpf_limbo_progs)
|
|
|
d0811f |
+ bpf_program_skeletonize(p);
|
|
|
d0811f |
+}
|
|
|
d0811f |
+
|
|
|
d0811f |
+void manager_unpin_all_cgroup_bpf_programs(Manager *m) {
|
|
|
d0811f |
+ log_debug("Unpinning %d BPF programs", set_size(m->bpf_limbo_progs));
|
|
|
d0811f |
+ set_clear_with_destructor(m->bpf_limbo_progs, bpf_program_unref);
|
|
|
d0811f |
+}
|
|
|
d0811f |
+
|
|
|
d0811f |
int manager_reload(Manager *m) {
|
|
|
d0811f |
_cleanup_(manager_reloading_stopp) Manager *reloading = NULL;
|
|
|
d0811f |
_cleanup_fdset_free_ FDSet *fds = NULL;
|
|
|
505ca8 |
@@ -3783,6 +3923,13 @@ int manager_reload(Manager *m) {
|
|
|
d0811f |
/* We are officially in reload mode from here on. */
|
|
|
d0811f |
reloading = manager_reloading_start(m);
|
|
|
505ca8 |
|
|
|
d0811f |
+ /* We need existing BPF programs to survive reload, otherwise there will be a period where no BPF
|
|
|
d0811f |
+ * program is active during task execution within a cgroup. This would be bad since this may have
|
|
|
d0811f |
+ * security or reliability implications: devices we should filter won't be filtered, network activity
|
|
|
d0811f |
+ * we should filter won't be filtered, etc. We pin all the existing devices by bumping their
|
|
|
d0811f |
+ * refcount, and then storing them to later have it decremented. */
|
|
|
d0811f |
+ (void) manager_pin_all_cgroup_bpf_programs(m);
|
|
|
d0811f |
+
|
|
|
d0811f |
r = manager_serialize(m, f, fds, false);
|
|
|
d0811f |
if (r < 0)
|
|
|
d0811f |
return r;
|
|
|
505ca8 |
@@ -3807,6 +3954,12 @@ int manager_reload(Manager *m) {
|
|
|
d0811f |
m->uid_refs = hashmap_free(m->uid_refs);
|
|
|
d0811f |
m->gid_refs = hashmap_free(m->gid_refs);
|
|
|
505ca8 |
|
|
|
d0811f |
+ /* The only canonical reference left to the dynamically allocated parts of these BPF programs is
|
|
|
d0811f |
+ * going to be on the other side of manager_deserialize, so the freeable parts can now be freed. The
|
|
|
d0811f |
+ * program itself will be detached as part of manager_vacuum. */
|
|
|
d0811f |
+ manager_skeletonize_all_cgroup_bpf_programs(m);
|
|
|
d0811f |
+ m->bpf_limbo_progs = set_free(m->bpf_limbo_progs);
|
|
|
d0811f |
+
|
|
|
d0811f |
r = lookup_paths_init(&m->lookup_paths, m->unit_file_scope, 0, NULL);
|
|
|
d0811f |
if (r < 0)
|
|
|
d0811f |
log_warning_errno(r, "Failed to initialize path lookup table, ignoring: %m");
|
|
|
505ca8 |
@@ -4741,6 +4894,12 @@ static void manager_vacuum(Manager *m) {
|
|
|
505ca8 |
|
|
|
d0811f |
/* Release any runtimes no longer referenced */
|
|
|
d0811f |
exec_runtime_vacuum(m);
|
|
|
d0811f |
+
|
|
|
d0811f |
+ /* Release any outmoded BPF programs that were deserialized from the previous manager, since new ones
|
|
|
d0811f |
+ * should be in action now. We first need to make sure all entries in the cgroup realize queue are
|
|
|
d0811f |
+ * complete, otherwise BPF firewalls/etc may not have been set up yet. */
|
|
|
d0811f |
+ (void) manager_dispatch_cgroup_realize_queue(m);
|
|
|
d0811f |
+ manager_unpin_all_cgroup_bpf_programs(m);
|
|
|
d0811f |
}
|
|
|
505ca8 |
|
|
|
d0811f |
int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
|
|
|
d0811f |
diff --git a/src/core/manager.h b/src/core/manager.h
|
|
|
505ca8 |
index 19df889dd8..2c4a2b6063 100644
|
|
|
d0811f |
--- a/src/core/manager.h
|
|
|
d0811f |
+++ b/src/core/manager.h
|
|
|
505ca8 |
@@ -438,6 +438,8 @@ struct Manager {
|
|
|
d0811f |
VarlinkServer *varlink_server;
|
|
|
505ca8 |
/* Only systemd-oomd should be using this to subscribe to changes in ManagedOOM settings */
|
|
|
505ca8 |
Varlink *managed_oom_varlink_request;
|
|
|
d0811f |
+
|
|
|
d0811f |
+ Set *bpf_limbo_progs;
|
|
|
d0811f |
};
|
|
|
505ca8 |
|
|
|
d0811f |
static inline usec_t manager_default_timeout_abort_usec(Manager *m) {
|
|
|
505ca8 |
@@ -479,6 +481,10 @@ int manager_add_job_by_name(Manager *m, JobType type, const char *name, JobMode
|
|
|
d0811f |
int manager_add_job_by_name_and_warn(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs, Job **ret);
|
|
|
d0811f |
int manager_propagate_reload(Manager *m, Unit *unit, JobMode mode, sd_bus_error *e);
|
|
|
505ca8 |
|
|
|
d0811f |
+Manager* manager_pin_all_cgroup_bpf_programs(Manager *m);
|
|
|
d0811f |
+void manager_unpin_all_cgroup_bpf_programs(Manager *m);
|
|
|
d0811f |
+DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_unpin_all_cgroup_bpf_programs);
|
|
|
d0811f |
+
|
|
|
d0811f |
void manager_dump_units(Manager *s, FILE *f, const char *prefix);
|
|
|
d0811f |
void manager_dump_jobs(Manager *s, FILE *f, const char *prefix);
|
|
|
d0811f |
void manager_dump(Manager *s, FILE *f, const char *prefix);
|
|
|
d0811f |
diff --git a/src/shared/bpf-program.c b/src/shared/bpf-program.c
|
|
|
505ca8 |
index 10239142af..549490da6c 100644
|
|
|
d0811f |
--- a/src/shared/bpf-program.c
|
|
|
d0811f |
+++ b/src/shared/bpf-program.c
|
|
|
505ca8 |
@@ -209,6 +209,16 @@ int bpf_program_cgroup_detach(BPFProgram *p) {
|
|
|
d0811f |
return 0;
|
|
|
d0811f |
}
|
|
|
505ca8 |
|
|
|
d0811f |
+void bpf_program_skeletonize(BPFProgram *p) {
|
|
|
d0811f |
+ assert(p);
|
|
|
d0811f |
+
|
|
|
d0811f |
+ /* Called shortly after serialization. From this point on, we are frozen for serialization and entry
|
|
|
d0811f |
+ * into BPF limbo, so we should proactively free our instructions and attached path. However, we
|
|
|
d0811f |
+ * shouldn't detach the program or close the kernel FD -- we need those on the other side. */
|
|
|
d0811f |
+ free(p->instructions);
|
|
|
d0811f |
+ free(p->attached_path);
|
|
|
d0811f |
+}
|
|
|
d0811f |
+
|
|
|
d0811f |
int bpf_map_new(enum bpf_map_type type, size_t key_size, size_t value_size, size_t max_entries, uint32_t flags) {
|
|
|
505ca8 |
union bpf_attr attr;
|
|
|
505ca8 |
int fd;
|
|
|
d0811f |
diff --git a/src/shared/bpf-program.h b/src/shared/bpf-program.h
|
|
|
505ca8 |
index eef77f9d8e..5957a6ce30 100644
|
|
|
d0811f |
--- a/src/shared/bpf-program.h
|
|
|
d0811f |
+++ b/src/shared/bpf-program.h
|
|
|
d0811f |
@@ -28,6 +28,7 @@ struct BPFProgram {
|
|
|
d0811f |
int bpf_program_new(uint32_t prog_type, BPFProgram **ret);
|
|
|
d0811f |
BPFProgram *bpf_program_unref(BPFProgram *p);
|
|
|
d0811f |
BPFProgram *bpf_program_ref(BPFProgram *p);
|
|
|
d0811f |
+void bpf_program_skeletonize(BPFProgram *p);
|
|
|
505ca8 |
|
|
|
d0811f |
int bpf_program_add_instructions(BPFProgram *p, const struct bpf_insn *insn, size_t count);
|
|
|
d0811f |
int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size);
|
|
|
505ca8 |
--
|
|
|
505ca8 |
2.29.2
|
|
|
d0811f |
|