Blob Blame History Raw
From 4e42210d40f96e185a55d43041dd6b962ea830dd Mon Sep 17 00:00:00 2001
From: Chris Down <chris@chrisdown.name>
Date: Thu, 29 Oct 2020 12:03:52 +0000
Subject: [PATCH] bpf: pid1: Pin reference to BPF programs for post-coldplug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

During `daemon-reload` and `daemon-reexec`, we detach and reattach all
BPF programs attached to cgroups. This, however, poses a real practical
problem for DevicePolicy (and some other settings using BPF): it
presents a period of time where the old device filtering BPF program has
been unloaded, but the new one has not been loaded yet.

Since the filtering is at open() time, it has become apparent that that
there's a non-trivial period where applications inside that ostensibly
filtered cgroup can grab any device -- and often do so -- and then
retain access to that device even after the reload is over. Due to the
file continuing to be available after the initial open(), this issue is
particularly visible for DevicePolicy={strict,closed}, however it also
applies to other BPF programs we install.

In particular, for BPF ingress/egress filtering this may have more
concerning implications: network traffic which is supposed to be
filtered will -- for a very brief period of time -- not be filtered or
subject to any restrictions imposed by BPF.

These BPF programs are fundamentally attached to a cgroup lifetime, not
our unit lifetime, so it's enough to pin these programs by taking a
reference to affected BPF programs before reload/reexec. We can then
serialise the program's kernel-facing FD and cgroup attachment FD for
the new daemon, and have the daemon on the other side unpin the programs
after it's finished with coldplug.

That means that, for example, the BPF program lifecycle during
daemon-reload or daemon-reexec changes from this:

    manager_clear_jobs_and_units
                 │
          ╔══════╪═════════╤═══════╗
          ║ prog │ no prog │ prog' ║
          ╚══════╧═════════╪═══════╝
                           │
                    manager_coldplug

to this:

    manager_clear_jobs_and_units         manager_dispatch_cgroup_realize_queue
                 │                                       │
          ╔══════╪═══════════════╤═══════════════════════╪═══════╗
          ║ prog │ prog (orphan) │ prog (orphan) + prog' │ prog' ║
          ╚══════╧═══════════════╪═══════════════════════╧═══════╝
                                 │
                          manager_coldplug

For daemon-reexec the semantics are mostly the same, but the point at
which the program becomes orphan is tied to the process lifecycle
instead.

None of the BPF programs we install require exclusive access, so having
multiple instances of them running at the same time is fine. Custom
programs, of course, are unknown, but it's hard to imagine legitimate
cases which should be affected, whereas the benefits of this "overlap"
approach with reference pinning is immediately tangible.

[keszybz: use _cleanup_ for unpin, use FOREACH_POINTER]
---
 src/core/bpf-firewall.c  |   9 +--
 src/core/main.c          |   9 +++
 src/core/manager.c       | 161 ++++++++++++++++++++++++++++++++++++++-
 src/core/manager.h       |   6 ++
 src/shared/bpf-program.c |  10 +++
 src/shared/bpf-program.h |   1 +
 6 files changed, 189 insertions(+), 7 deletions(-)

diff --git a/src/core/bpf-firewall.c b/src/core/bpf-firewall.c
index 0f588b6ca5f..6283f15737b 100644
--- a/src/core/bpf-firewall.c
+++ b/src/core/bpf-firewall.c
@@ -698,8 +698,7 @@ int bpf_firewall_install(Unit *u) {
         if (r < 0)
                 return log_unit_error_errno(u, r, "Failed to determine cgroup path: %m");
 
-        flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
-                 (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0;
+        flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI) ? BPF_F_ALLOW_MULTI : 0;
 
         /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program, to
          * minimize the time window when we don't account for IP traffic. */
@@ -707,8 +706,7 @@ int bpf_firewall_install(Unit *u) {
         u->ip_bpf_ingress_installed = bpf_program_unref(u->ip_bpf_ingress_installed);
 
         if (u->ip_bpf_egress) {
-                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path,
-                                              flags | (set_isempty(u->ip_bpf_custom_egress) ? 0 : BPF_F_ALLOW_MULTI));
+                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
                 if (r < 0)
                         return log_unit_error_errno(u, r, "Attaching egress BPF program to cgroup %s failed: %m", path);
 
@@ -717,8 +715,7 @@ int bpf_firewall_install(Unit *u) {
         }
 
         if (u->ip_bpf_ingress) {
-                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path,
-                                              flags | (set_isempty(u->ip_bpf_custom_ingress) ? 0 : BPF_F_ALLOW_MULTI));
+                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
                 if (r < 0)
                         return log_unit_error_errno(u, r, "Attaching ingress BPF program to cgroup %s failed: %m", path);
 
diff --git a/src/core/main.c b/src/core/main.c
index eaa56aca2a4..0edf8c0abad 100644
--- a/src/core/main.c
+++ b/src/core/main.c
@@ -1144,6 +1144,14 @@ static int prepare_reexecute(
         if (!fds)
                 return log_oom();
 
+        /* We need existing BPF programs to survive reload, otherwise there will be a period where no BPF
+         * program is active during task execution within a cgroup. This would be bad since this may have
+         * security or reliability implications: devices we should filter won't be filtered, network activity
+         * we should filter won't be filtered, etc. We pin all the existing devices by bumping their
+         * refcount, and then storing them to later have it decremented. */
+        _cleanup_(manager_unpin_all_cgroup_bpf_programsp) Manager *m_unpin =
+                manager_pin_all_cgroup_bpf_programs(m);
+
         r = manager_serialize(m, f, fds, switching_root);
         if (r < 0)
                 return r;
@@ -1159,6 +1167,7 @@ static int prepare_reexecute(
         if (r < 0)
                 return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization fds: %m");
 
+        TAKE_PTR(m_unpin);
         *ret_f = TAKE_PTR(f);
         *ret_fds = TAKE_PTR(fds);
 
diff --git a/src/core/manager.c b/src/core/manager.c
index 4b215a61766..c330a02bf79 100644
--- a/src/core/manager.c
+++ b/src/core/manager.c
@@ -65,6 +65,7 @@
 #include "rm-rf.h"
 #include "selinux-util.h"
 #include "serialize.h"
+#include "set.h"
 #include "signal-util.h"
 #include "socket-util.h"
 #include "special.h"
@@ -3213,6 +3214,79 @@ static void manager_serialize_gid_refs(Manager *m, FILE *f) {
         manager_serialize_uid_refs_internal(m, f, &m->gid_refs, "destroy-ipc-gid");
 }
 
+static int serialize_limbo_bpf_program(FILE *f, FDSet *fds, BPFProgram *p) {
+        int copy;
+        _cleanup_free_ char *ap = NULL;
+
+        /* We don't actually need the instructions or other data, since this is only used on the other side
+         * for BPF limbo, which just requires the program type, cgroup path, and kernel-facing BPF file
+         * descriptor. We don't even need to know what unit or directive it's attached to, since we're just
+         * going to expire it after coldplug. */
+
+        assert(f);
+        assert(p);
+
+        /* If the program isn't attached to the kernel yet, there's no reason to serialise it for limbo. Just
+         * let it be skeletonized and then coldplug can do the work on the other side if it's still
+         * necessary. */
+        if (p->kernel_fd < 0 || !p->attached_path)
+                return -ENOTCONN;
+
+        copy = fdset_put_dup(fds, p->kernel_fd);
+        if (copy < 0)
+                return log_error_errno(copy, "Failed to add file descriptor to serialization set: %m");
+
+        /* Otherwise, on daemon-reload, we'd remain pinned. */
+        safe_close(p->kernel_fd);
+
+        ap = cescape(p->attached_path);
+        if (!ap)
+                return log_oom();
+
+        return serialize_item_format(f, "bpf-limbo", "%i %i %i \"%s\"",
+                                     copy, p->prog_type, p->attached_type, ap);
+}
+
+static void deserialize_limbo_bpf_program(Manager *m, FDSet *fds, const char *value) {
+        _cleanup_free_ char *raw_fd = NULL, *raw_pt = NULL, *raw_at = NULL, *cgpath = NULL;
+        int fd, r, prog_type, attached_type;
+
+        assert(m);
+        assert(value);
+
+        r = extract_first_word(&value, &raw_fd, NULL, 0);
+        if (r <= 0 || safe_atoi(raw_fd, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd))
+                return (void) log_error("Failed to parse bpf-limbo FD: %s", value);
+
+        r = extract_first_word(&value, &raw_pt, NULL, 0);
+        if (r <= 0 || safe_atoi(raw_pt, &prog_type) < 0)
+                return (void) log_error("Failed to parse bpf-limbo program type: %s", value);
+
+        r = extract_first_word(&value, &raw_at, NULL, 0);
+        if (r <= 0 || safe_atoi(raw_at, &attached_type) < 0)
+                return (void) log_error("Failed to parse bpf-limbo attached type: %s", value);
+
+        r = extract_first_word(&value, &cgpath, NULL, EXTRACT_CUNESCAPE | EXTRACT_UNQUOTE);
+        if (r <= 0)
+                return (void) log_error("Failed to parse attached path for BPF limbo FD %s", value);
+
+        _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
+        r = bpf_program_new(prog_type, &p);
+        if (r < 0)
+                return (void) log_error_errno(r, "Failed to create BPF limbo program: %m");
+
+        /* Just enough to free it when the time is right, this does not have enough information be used as a
+         * real BPFProgram. */
+        p->attached_type = attached_type;
+        p->kernel_fd = fdset_remove(fds, fd);
+        p->attached_path = TAKE_PTR(cgpath);
+
+        r = set_ensure_put(&m->bpf_limbo_progs, NULL, p);
+        if (r < 0)
+                return (void) log_error_errno(r, "Failed to register BPF limbo program for FD %s: %m", value);
+        TAKE_PTR(p);
+}
+
 int manager_serialize(
                 Manager *m,
                 FILE *f,
@@ -3222,6 +3296,7 @@ int manager_serialize(
         const char *t;
         Unit *u;
         int r;
+        BPFProgram *p;
 
         assert(m);
         assert(f);
@@ -3266,6 +3341,9 @@ int manager_serialize(
                 (void) serialize_dual_timestamp(f, joined, m->timestamps + q);
         }
 
+        SET_FOREACH(p, m->bpf_limbo_progs)
+                (void) serialize_limbo_bpf_program(f, fds, p);
+
         if (!switching_root)
                 (void) serialize_strv(f, "env", m->client_environment);
 
@@ -3584,7 +3662,10 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) {
                         else
                                 m->n_failed_jobs += n;
 
-                } else if ((val = startswith(l, "taint-usr="))) {
+                } else if ((val = startswith(l, "bpf-limbo=")))
+                        deserialize_limbo_bpf_program(m, fds, val);
+
+                else if ((val = startswith(l, "taint-usr="))) {
                         int b;
 
                         b = parse_boolean(val);
@@ -3760,6 +3841,65 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) {
         return manager_deserialize_units(m, f, fds);
 }
 
+Manager* manager_pin_all_cgroup_bpf_programs(Manager *m) {
+        int r;
+        Unit *u;
+
+        assert(m);
+
+        HASHMAP_FOREACH(u, m->units) {
+                BPFProgram *p;
+
+                FOREACH_POINTER(p,
+                                u->bpf_device_control_installed,
+                                u->ip_bpf_ingress,
+                                u->ip_bpf_ingress_installed,
+                                u->ip_bpf_egress,
+                                u->ip_bpf_egress_installed)
+                        if (p) {
+                                r = set_ensure_put(&m->bpf_limbo_progs, NULL, p);
+                                if (r < 0) {
+                                        log_unit_error_errno(u, r, "Cannot store BPF program for reload, ignoring: %m");
+                                        continue;
+                                }
+
+                                bpf_program_ref(p);
+                        }
+
+                Set *s;
+                FOREACH_POINTER(s,
+                                u->ip_bpf_custom_ingress,
+                                u->ip_bpf_custom_ingress_installed,
+                                u->ip_bpf_custom_egress,
+                                u->ip_bpf_custom_egress_installed)
+                        SET_FOREACH(p, s) {
+                                r = set_ensure_put(&m->bpf_limbo_progs, NULL, p);
+                                if (r < 0) {
+                                        log_unit_error_errno(u, r, "Cannot store BPF program for reload, ignoring: %m");
+                                        continue;
+                                }
+
+                                bpf_program_ref(p);
+                        }
+        }
+
+        log_debug("Pinned %d BPF programs", set_size(m->bpf_limbo_progs));
+
+        return m;
+}
+
+static void manager_skeletonize_all_cgroup_bpf_programs(Manager *m) {
+        BPFProgram *p;
+
+        SET_FOREACH(p, m->bpf_limbo_progs)
+                bpf_program_skeletonize(p);
+}
+
+void manager_unpin_all_cgroup_bpf_programs(Manager *m) {
+        log_debug("Unpinning %d BPF programs", set_size(m->bpf_limbo_progs));
+        set_clear_with_destructor(m->bpf_limbo_progs, bpf_program_unref);
+}
+
 int manager_reload(Manager *m) {
         _cleanup_(manager_reloading_stopp) Manager *reloading = NULL;
         _cleanup_fdset_free_ FDSet *fds = NULL;
@@ -3779,6 +3919,13 @@ int manager_reload(Manager *m) {
         /* We are officially in reload mode from here on. */
         reloading = manager_reloading_start(m);
 
+        /* We need existing BPF programs to survive reload, otherwise there will be a period where no BPF
+         * program is active during task execution within a cgroup. This would be bad since this may have
+         * security or reliability implications: devices we should filter won't be filtered, network activity
+         * we should filter won't be filtered, etc. We pin all the existing devices by bumping their
+         * refcount, and then storing them to later have it decremented. */
+        (void) manager_pin_all_cgroup_bpf_programs(m);
+
         r = manager_serialize(m, f, fds, false);
         if (r < 0)
                 return r;
@@ -3803,6 +3950,12 @@ int manager_reload(Manager *m) {
         m->uid_refs = hashmap_free(m->uid_refs);
         m->gid_refs = hashmap_free(m->gid_refs);
 
+        /* The only canonical reference left to the dynamically allocated parts of these BPF programs is
+         * going to be on the other side of manager_deserialize, so the freeable parts can now be freed. The
+         * program itself will be detached as part of manager_vacuum. */
+        manager_skeletonize_all_cgroup_bpf_programs(m);
+        m->bpf_limbo_progs = set_free(m->bpf_limbo_progs);
+
         r = lookup_paths_init(&m->lookup_paths, m->unit_file_scope, 0, NULL);
         if (r < 0)
                 log_warning_errno(r, "Failed to initialize path lookup table, ignoring: %m");
@@ -4737,6 +4890,12 @@ static void manager_vacuum(Manager *m) {
 
         /* Release any runtimes no longer referenced */
         exec_runtime_vacuum(m);
+
+        /* Release any outmoded BPF programs that were deserialized from the previous manager, since new ones
+         * should be in action now. We first need to make sure all entries in the cgroup realize queue are
+         * complete, otherwise BPF firewalls/etc may not have been set up yet. */
+        (void) manager_dispatch_cgroup_realize_queue(m);
+        manager_unpin_all_cgroup_bpf_programs(m);
 }
 
 int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
diff --git a/src/core/manager.h b/src/core/manager.h
index 19df889dd89..2c4a2b60632 100644
--- a/src/core/manager.h
+++ b/src/core/manager.h
@@ -438,6 +438,8 @@ struct Manager {
         VarlinkServer *varlink_server;
         /* Only systemd-oomd should be using this to subscribe to changes in ManagedOOM settings */
         Varlink *managed_oom_varlink_request;
+
+        Set *bpf_limbo_progs;
 };
 
 static inline usec_t manager_default_timeout_abort_usec(Manager *m) {
@@ -479,6 +481,10 @@ int manager_add_job_by_name(Manager *m, JobType type, const char *name, JobMode
 int manager_add_job_by_name_and_warn(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs,  Job **ret);
 int manager_propagate_reload(Manager *m, Unit *unit, JobMode mode, sd_bus_error *e);
 
+Manager* manager_pin_all_cgroup_bpf_programs(Manager *m);
+void manager_unpin_all_cgroup_bpf_programs(Manager *m);
+DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_unpin_all_cgroup_bpf_programs);
+
 void manager_dump_units(Manager *s, FILE *f, const char *prefix);
 void manager_dump_jobs(Manager *s, FILE *f, const char *prefix);
 void manager_dump(Manager *s, FILE *f, const char *prefix);
diff --git a/src/shared/bpf-program.c b/src/shared/bpf-program.c
index 10239142af3..549490da6ce 100644
--- a/src/shared/bpf-program.c
+++ b/src/shared/bpf-program.c
@@ -209,6 +209,16 @@ int bpf_program_cgroup_detach(BPFProgram *p) {
         return 0;
 }
 
+void bpf_program_skeletonize(BPFProgram *p) {
+        assert(p);
+
+        /* Called shortly after serialization. From this point on, we are frozen for serialization and entry
+         * into BPF limbo, so we should proactively free our instructions and attached path. However, we
+         * shouldn't detach the program or close the kernel FD -- we need those on the other side. */
+        free(p->instructions);
+        free(p->attached_path);
+}
+
 int bpf_map_new(enum bpf_map_type type, size_t key_size, size_t value_size, size_t max_entries, uint32_t flags) {
         union bpf_attr attr;
         int fd;
diff --git a/src/shared/bpf-program.h b/src/shared/bpf-program.h
index eef77f9d8e1..5957a6ce300 100644
--- a/src/shared/bpf-program.h
+++ b/src/shared/bpf-program.h
@@ -28,6 +28,7 @@ struct BPFProgram {
 int bpf_program_new(uint32_t prog_type, BPFProgram **ret);
 BPFProgram *bpf_program_unref(BPFProgram *p);
 BPFProgram *bpf_program_ref(BPFProgram *p);
+void bpf_program_skeletonize(BPFProgram *p);
 
 int bpf_program_add_instructions(BPFProgram *p, const struct bpf_insn *insn, size_t count);
 int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size);