From 945960a5e4fa339e3d747d695c0e0996c2336e02 Mon Sep 17 00:00:00 2001
From: Chris Down <chris@chrisdown.name>
Date: Thu, 29 Oct 2020 12:03:52 +0000
Subject: [PATCH] bpf: pid1: Pin reference to BPF programs for post-coldplug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
During `daemon-reload` and `daemon-reexec`, we detach and reattach all
BPF programs attached to cgroups. This, however, poses a real practical
problem for DevicePolicy (and some other settings using BPF): it
presents a period of time where the old device filtering BPF program has
been unloaded, but the new one has not been loaded yet.
Since the filtering is at open() time, it has become apparent that
there's a non-trivial period where applications inside that ostensibly
filtered cgroup can grab any device -- and often do so -- and then
retain access to that device even after the reload is over. Due to the
file continuing to be available after the initial open(), this issue is
particularly visible for DevicePolicy={strict,closed}, however it also
applies to other BPF programs we install.
In particular, for BPF ingress/egress filtering this may have more
concerning implications: network traffic which is supposed to be
filtered will -- for a very brief period of time -- not be filtered or
subject to any restrictions imposed by BPF.
These BPF programs are fundamentally attached to a cgroup lifetime, not
our unit lifetime, so it's enough to pin these programs by taking a
reference to affected BPF programs before reload/reexec. We can then
serialise the program's kernel-facing FD and cgroup attachment FD for
the new daemon, and have the daemon on the other side unpin the programs
after it's finished with coldplug.
That means that, for example, the BPF program lifecycle during
daemon-reload or daemon-reexec changes from this:
manager_clear_jobs_and_units
│
╔══════╪═════════╤═══════╗
║ prog │ no prog │ prog' ║
╚══════╧═════════╪═══════╝
│
manager_coldplug
to this:
manager_clear_jobs_and_units manager_dispatch_cgroup_realize_queue
│ │
╔══════╪═══════════════╤═══════════════════════╪═══════╗
║ prog │ prog (orphan) │ prog (orphan) + prog' │ prog' ║
╚══════╧═══════════════╪═══════════════════════╧═══════╝
│
manager_coldplug
For daemon-reexec the semantics are mostly the same, but the point at
which the program becomes orphan is tied to the process lifecycle
instead.
None of the BPF programs we install require exclusive access, so having
multiple instances of them running at the same time is fine. Custom
programs, of course, are unknown, but it's hard to imagine legitimate
cases which should be affected, whereas the benefits of this "overlap"
approach with reference pinning are immediately tangible.
[keszybz: use _cleanup_ for unpin, use FOREACH_POINTER]
---
src/core/main.c | 9 +++
src/core/manager.c | 163 ++++++++++++++++++++++++++++++++++++++-
src/core/manager.h | 5 ++
src/core/unit.h | 8 +-
src/shared/bpf-program.c | 10 +++
src/shared/bpf-program.h | 1 +
6 files changed, 191 insertions(+), 5 deletions(-)
diff --git a/src/core/main.c b/src/core/main.c
index 3ee8d0a869..cb065dbc6e 100644
--- a/src/core/main.c
+++ b/src/core/main.c
@@ -1164,6 +1164,14 @@ static int prepare_reexecute(
if (!fds)
return log_oom();
+ /* We need existing BPF programs to survive reload, otherwise there will be a period where no BPF
+ * program is active during task execution within a cgroup. This would be bad since this may have
+ * security or reliability implications: devices we should filter won't be filtered, network activity
+ * we should filter won't be filtered, etc. We pin all the existing programs by bumping their
+ * refcount, and then storing them to later have them decremented. */
+ _cleanup_(manager_unpin_all_cgroup_bpf_programs) Manager *m_unpin =
+ manager_pin_all_cgroup_bpf_programs(m);
+
r = manager_serialize(m, f, fds, switching_root);
if (r < 0)
return r;
@@ -1179,6 +1187,7 @@ static int prepare_reexecute(
if (r < 0)
return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization fds: %m");
+ TAKE_PTR(m_unpin);
*ret_f = TAKE_PTR(f);
*ret_fds = TAKE_PTR(fds);
diff --git a/src/core/manager.c b/src/core/manager.c
index 688e6881c3..44a92e7577 100644
--- a/src/core/manager.c
+++ b/src/core/manager.c
@@ -65,6 +65,7 @@
#include "rm-rf.h"
#include "selinux-util.h"
#include "serialize.h"
+#include "set.h"
#include "signal-util.h"
#include "socket-util.h"
#include "special.h"
@@ -3203,6 +3204,79 @@ static void manager_serialize_gid_refs(Manager *m, FILE *f) {
manager_serialize_uid_refs_internal(f, m->gid_refs, "destroy-ipc-gid");
}
+static int serialize_limbo_bpf_program(FILE *f, FDSet *fds, BPFProgram *p) {
+ int copy;
+ _cleanup_free_ char *ap = NULL;
+
+ /* We don't actually need the instructions or other data, since this is only used on the other side
+ * for BPF limbo, which just requires the program type, cgroup path, and kernel-facing BPF file
+ * descriptor. We don't even need to know what unit or directive it's attached to, since we're just
+ * going to expire it after coldplug. */
+
+ assert(f);
+ assert(p);
+
+ /* If the program isn't attached to the kernel yet, there's no reason to serialise it for limbo. Just
+ * let it be skeletonized and then coldplug can do the work on the other side if it's still
+ * necessary. */
+ if (p->kernel_fd < 0 || !p->attached_path)
+ return -ENOTCONN;
+
+ copy = fdset_put_dup(fds, p->kernel_fd);
+ if (copy < 0)
+ return log_error_errno(copy, "Failed to add file descriptor to serialization set: %m");
+
+ /* Otherwise, on daemon-reload, we'd remain pinned. */
+ safe_close(p->kernel_fd);
+
+ ap = cescape(p->attached_path);
+ if (!ap)
+ return log_oom();
+
+ return serialize_item_format(f, "bpf-limbo", "%i %i %i \"%s\"",
+ copy, p->prog_type, p->attached_type, ap);
+}
+
+static void deserialize_limbo_bpf_program(Manager *m, FDSet *fds, const char *value) {
+ _cleanup_free_ char *raw_fd = NULL, *raw_pt = NULL, *raw_at = NULL, *cgpath = NULL;
+ int fd, r, prog_type, attached_type;
+
+ assert(m);
+ assert(value);
+
+ r = extract_first_word(&value, &raw_fd, NULL, 0);
+ if (r <= 0 || safe_atoi(raw_fd, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd))
+ return (void) log_error("Failed to parse bpf-limbo FD: %s", value);
+
+ r = extract_first_word(&value, &raw_pt, NULL, 0);
+ if (r <= 0 || safe_atoi(raw_pt, &prog_type) < 0)
+ return (void) log_error("Failed to parse bpf-limbo program type: %s", value);
+
+ r = extract_first_word(&value, &raw_at, NULL, 0);
+ if (r <= 0 || safe_atoi(raw_at, &attached_type) < 0)
+ return (void) log_error("Failed to parse bpf-limbo attached type: %s", value);
+
+ r = extract_first_word(&value, &cgpath, NULL, EXTRACT_CUNESCAPE | EXTRACT_UNQUOTE);
+ if (r <= 0)
+ return (void) log_error("Failed to parse attached path for BPF limbo FD %s", value);
+
+ _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
+ r = bpf_program_new(prog_type, &p);
+ if (r < 0)
+ return (void) log_error_errno(r, "Failed to create BPF limbo program: %m");
+
+ /* Just enough to free it when the time is right, this does not have enough information to be used as a
+ * real BPFProgram. */
+ p->attached_type = attached_type;
+ p->kernel_fd = fdset_remove(fds, fd);
+ p->attached_path = TAKE_PTR(cgpath);
+
+ r = set_ensure_put(&m->bpf_limbo_progs, NULL, p);
+ if (r < 0)
+ return (void) log_error_errno(r, "Failed to register BPF limbo program for FD %s: %m", value);
+ TAKE_PTR(p);
+}
+
int manager_serialize(
Manager *m,
FILE *f,
@@ -3212,6 +3286,7 @@ int manager_serialize(
const char *t;
Unit *u;
int r;
+ BPFProgram *p;
assert(m);
assert(f);
@@ -3256,6 +3331,9 @@ int manager_serialize(
(void) serialize_dual_timestamp(f, joined, m->timestamps + q);
}
+ SET_FOREACH(p, m->bpf_limbo_progs)
+ (void) serialize_limbo_bpf_program(f, fds, p);
+
if (!switching_root)
(void) serialize_strv(f, "env", m->client_environment);
@@ -3571,7 +3649,10 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) {
else
m->n_failed_jobs += n;
- } else if ((val = startswith(l, "taint-usr="))) {
+ } else if ((val = startswith(l, "bpf-limbo=")))
+ deserialize_limbo_bpf_program(m, fds, val);
+
+ else if ((val = startswith(l, "taint-usr="))) {
int b;
b = parse_boolean(val);
@@ -3747,6 +3828,67 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) {
return manager_deserialize_units(m, f, fds);
}
+Manager* manager_pin_all_cgroup_bpf_programs(Manager *m) {
+ int r;
+ Unit *u;
+
+ assert(m);
+
+ HASHMAP_FOREACH(u, m->units) {
+ BPFProgram *p;
+
+ FOREACH_POINTER(p,
+ u->bpf_device_control_installed,
+ u->ip_bpf_ingress,
+ u->ip_bpf_ingress_installed,
+ u->ip_bpf_egress,
+ u->ip_bpf_egress_installed)
+ if (p) {
+ r = set_ensure_put(&m->bpf_limbo_progs, NULL, p);
+ if (r < 0) {
+ log_unit_error_errno(u, r, "Cannot store BPF program for reload, ignoring: %m");
+ continue;
+ }
+
+ bpf_program_ref(p);
+ }
+
+ Set *s;
+ FOREACH_POINTER(s,
+ u->ip_bpf_custom_ingress,
+ u->ip_bpf_custom_ingress_installed,
+ u->ip_bpf_custom_egress,
+ u->ip_bpf_custom_egress_installed)
+ SET_FOREACH(p, s) {
+ r = set_ensure_put(&m->bpf_limbo_progs, NULL, p);
+ if (r < 0) {
+ log_unit_error_errno(u, r, "Cannot store BPF program for reload, ignoring: %m");
+ continue;
+ }
+
+ bpf_program_ref(p);
+ }
+ }
+
+ log_debug("Pinned %d BPF programs", set_size(m->bpf_limbo_progs));
+
+ return m;
+}
+
+static void manager_skeletonize_all_cgroup_bpf_programs(Manager *m) {
+ BPFProgram *p;
+
+ SET_FOREACH(p, m->bpf_limbo_progs)
+ bpf_program_skeletonize(p);
+}
+
+void manager_unpin_all_cgroup_bpf_programs(Manager **m) {
+ if (m && *m) {
+ log_debug("Unpinning %d BPF programs", set_size((*m)->bpf_limbo_progs));
+ set_clear_with_destructor((*m)->bpf_limbo_progs, bpf_program_unref);
+ }
+}
+
int manager_reload(Manager *m) {
_cleanup_(manager_reloading_stopp) Manager *reloading = NULL;
_cleanup_fdset_free_ FDSet *fds = NULL;
@@ -3766,6 +3908,13 @@ int manager_reload(Manager *m) {
/* We are officially in reload mode from here on. */
reloading = manager_reloading_start(m);
+ /* We need existing BPF programs to survive reload, otherwise there will be a period where no BPF
+ * program is active during task execution within a cgroup. This would be bad since this may have
+ * security or reliability implications: devices we should filter won't be filtered, network activity
+ * we should filter won't be filtered, etc. We pin all the existing programs by bumping their
+ * refcount, and then storing them to later have them decremented. */
+ (void) manager_pin_all_cgroup_bpf_programs(m);
+
r = manager_serialize(m, f, fds, false);
if (r < 0)
return r;
@@ -3790,6 +3939,12 @@ int manager_reload(Manager *m) {
m->uid_refs = hashmap_free(m->uid_refs);
m->gid_refs = hashmap_free(m->gid_refs);
+ /* The only canonical reference left to the dynamically allocated parts of these BPF programs is
+ * going to be on the other side of manager_deserialize, so the freeable parts can now be freed. The
+ * program itself will be detached as part of manager_vacuum. */
+ manager_skeletonize_all_cgroup_bpf_programs(m);
+ m->bpf_limbo_progs = set_free(m->bpf_limbo_progs);
+
r = lookup_paths_init(&m->lookup_paths, m->unit_file_scope, 0, NULL);
if (r < 0)
log_warning_errno(r, "Failed to initialize path lookup table, ignoring: %m");
@@ -4721,6 +4876,12 @@ static void manager_vacuum(Manager *m) {
/* Release any runtimes no longer referenced */
exec_runtime_vacuum(m);
+
+ /* Release any outmoded BPF programs that were deserialized from the previous manager, since new ones
+ * should be in action now. We first need to make sure all entries in the cgroup realize queue are
+ * complete, otherwise BPF firewalls/etc may not have been set up yet. */
+ (void) manager_dispatch_cgroup_realize_queue(m);
+ manager_unpin_all_cgroup_bpf_programs(&m);
}
int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
diff --git a/src/core/manager.h b/src/core/manager.h
index f58982a364..16bcdc1277 100644
--- a/src/core/manager.h
+++ b/src/core/manager.h
@@ -438,6 +438,8 @@ struct Manager {
VarlinkServer *varlink_server;
/* Only systemd-oomd should be using this to subscribe to changes in ManagedOOM settings */
Varlink *managed_oom_varlink_request;
+
+ Set *bpf_limbo_progs;
};
static inline usec_t manager_default_timeout_abort_usec(Manager *m) {
@@ -479,6 +481,9 @@ int manager_add_job_by_name(Manager *m, JobType type, const char *name, JobMode
int manager_add_job_by_name_and_warn(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs, Job **ret);
int manager_propagate_reload(Manager *m, Unit *unit, JobMode mode, sd_bus_error *e);
+Manager* manager_pin_all_cgroup_bpf_programs(Manager *m);
+void manager_unpin_all_cgroup_bpf_programs(Manager **m);
+
void manager_dump_units(Manager *s, FILE *f, const char *prefix);
void manager_dump_jobs(Manager *s, FILE *f, const char *prefix);
void manager_dump(Manager *s, FILE *f, const char *prefix);
diff --git a/src/core/unit.h b/src/core/unit.h
index 6de529af92..7c9cc47f92 100644
--- a/src/core/unit.h
+++ b/src/core/unit.h
@@ -927,10 +927,10 @@ int unit_thaw_vtable_common(Unit *u);
#define log_unit_full_errno(unit, level, error, ...) \
({ \
const Unit *_u = (unit); \
- const int _l = (level); \
- (log_get_max_level() < LOG_PRI(_l) || (_u && !unit_log_level_test(_u, _l))) ? -ERRNO_VALUE(error) : \
- _u ? log_object_internal(_l, error, PROJECT_FILE, __LINE__, __func__, _u->manager->unit_log_field, _u->id, _u->manager->invocation_log_field, _u->invocation_id_string, ##__VA_ARGS__) : \
- log_internal(_l, error, PROJECT_FILE, __LINE__, __func__, ##__VA_ARGS__); \
+ const int _v = (level); \
+ (log_get_max_level() < LOG_PRI(_v) || (_u && !unit_log_level_test(_u, _v))) ? -ERRNO_VALUE(error) : \
+ _u ? log_object_internal(_v, error, PROJECT_FILE, __LINE__, __func__, _u->manager->unit_log_field, _u->id, _u->manager->invocation_log_field, _u->invocation_id_string, ##__VA_ARGS__) : \
+ log_internal(_v, error, PROJECT_FILE, __LINE__, __func__, ##__VA_ARGS__); \
})
#define log_unit_full(unit, level, ...) (void) log_unit_full_errno(unit, level, 0, __VA_ARGS__)
diff --git a/src/shared/bpf-program.c b/src/shared/bpf-program.c
index a8a34521fd..314b48b229 100644
--- a/src/shared/bpf-program.c
+++ b/src/shared/bpf-program.c
@@ -104,6 +104,16 @@ int bpf_program_new_from_bpffs_path(const char *path, BPFProgram **ret) {
return 0;
}
+void bpf_program_skeletonize(BPFProgram *p) {
+ assert(p);
+
+ /* Called shortly after serialization. From this point on, we are frozen for serialization and entry
+ * into BPF limbo, so we should proactively free our instructions and attached path. However, we
+ * shouldn't detach the program or close the kernel FD -- we need those on the other side. */
+ free(p->instructions);
+ free(p->attached_path);
+}
+
static BPFProgram *bpf_program_free(BPFProgram *p) {
assert(p);
diff --git a/src/shared/bpf-program.h b/src/shared/bpf-program.h
index 86fd338c93..dcaeb6cde7 100644
--- a/src/shared/bpf-program.h
+++ b/src/shared/bpf-program.h
@@ -29,6 +29,7 @@ int bpf_program_new(uint32_t prog_type, BPFProgram **ret);
int bpf_program_new_from_bpffs_path(const char *path, BPFProgram **ret);
BPFProgram *bpf_program_ref(BPFProgram *p);
BPFProgram *bpf_program_unref(BPFProgram *p);
+void bpf_program_skeletonize(BPFProgram *p);
int bpf_program_add_instructions(BPFProgram *p, const struct bpf_insn *insn, size_t count);
int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size);
--
2.30.2