6c7d9c
From 4e42210d40f96e185a55d43041dd6b962ea830dd Mon Sep 17 00:00:00 2001
d0811f
From: Chris Down <chris@chrisdown.name>
d0811f
Date: Thu, 29 Oct 2020 12:03:52 +0000
6c7d9c
Subject: [PATCH] bpf: pid1: Pin reference to BPF programs for post-coldplug
d0811f
MIME-Version: 1.0
d0811f
Content-Type: text/plain; charset=UTF-8
d0811f
Content-Transfer-Encoding: 8bit
d0811f
d0811f
During `daemon-reload` and `daemon-reexec`, we detach and reattach all
d0811f
BPF programs attached to cgroups. This, however, poses a real practical
d0811f
problem for DevicePolicy (and some other settings using BPF): it
d0811f
presents a period of time where the old device filtering BPF program has
d0811f
been unloaded, but the new one has not been loaded yet.
d0811f
d0811f
Since the filtering is at open() time, it has become apparent that that
d0811f
there's a non-trivial period where applications inside that ostensibly
d0811f
filtered cgroup can grab any device -- and often do so -- and then
d0811f
retain access to that device even after the reload is over. Due to the
d0811f
file continuing to be available after the initial open(), this issue is
d0811f
particularly visible for DevicePolicy={strict,closed}, however it also
d0811f
applies to other BPF programs we install.
d0811f
d0811f
In particular, for BPF ingress/egress filtering this may have more
d0811f
concerning implications: network traffic which is supposed to be
d0811f
filtered will -- for a very brief period of time -- not be filtered or
d0811f
subject to any restrictions imposed by BPF.
d0811f
d0811f
These BPF programs are fundamentally attached to a cgroup lifetime, not
d0811f
our unit lifetime, so it's enough to pin these programs by taking a
d0811f
reference to affected BPF programs before reload/reexec. We can then
d0811f
serialise the program's kernel-facing FD and cgroup attachment FD for
d0811f
the new daemon, and have the daemon on the other side unpin the programs
d0811f
after it's finished with coldplug.
d0811f
d0811f
That means that, for example, the BPF program lifecycle during
d0811f
daemon-reload or daemon-reexec changes from this:
d0811f
d0811f
    manager_clear_jobs_and_units
d0811f
d0811f
          ╔══════╪═════════╤═══════╗
d0811f
          ║ prog │ no prog │ prog' ║
d0811f
          ╚══════╧═════════╪═══════╝
d0811f
d0811f
                    manager_coldplug
d0811f
d0811f
to this:
d0811f
d0811f
    manager_clear_jobs_and_units         manager_dispatch_cgroup_realize_queue
d0811f
                 │                                       │
d0811f
          ╔══════╪═══════════════╤═══════════════════════╪═══════╗
d0811f
          ║ prog │ prog (orphan) │ prog (orphan) + prog' │ prog' ║
d0811f
          ╚══════╧═══════════════╪═══════════════════════╧═══════╝
d0811f
d0811f
                          manager_coldplug
d0811f
d0811f
For daemon-reexec the semantics are mostly the same, but the point at
d0811f
which the program becomes orphan is tied to the process lifecycle
d0811f
instead.
d0811f
d0811f
None of the BPF programs we install require exclusive access, so having
d0811f
multiple instances of them running at the same time is fine. Custom
d0811f
programs, of course, are unknown, but it's hard to imagine legitimate
d0811f
cases which should be affected, whereas the benefits of this "overlap"
d0811f
approach with reference pinning is immediately tangible.
d0811f
d0811f
[keszybz: use _cleanup_ for unpin, use FOREACH_POINTER]
d0811f
---
d0811f
 src/core/bpf-firewall.c  |   9 +--
d0811f
 src/core/main.c          |   9 +++
505ca8
 src/core/manager.c       | 161 ++++++++++++++++++++++++++++++++++++++-
d0811f
 src/core/manager.h       |   6 ++
d0811f
 src/shared/bpf-program.c |  10 +++
d0811f
 src/shared/bpf-program.h |   1 +
505ca8
 6 files changed, 189 insertions(+), 7 deletions(-)
d0811f
d0811f
diff --git a/src/core/bpf-firewall.c b/src/core/bpf-firewall.c
6c7d9c
index 0f588b6ca5f..6283f15737b 100644
d0811f
--- a/src/core/bpf-firewall.c
d0811f
+++ b/src/core/bpf-firewall.c
6c7d9c
@@ -698,8 +698,7 @@ int bpf_firewall_install(Unit *u) {
d0811f
         if (r < 0)
d0811f
                 return log_unit_error_errno(u, r, "Failed to determine cgroup path: %m");
6c7d9c
 
d0811f
-        flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
d0811f
-                 (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0;
d0811f
+        flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI) ? BPF_F_ALLOW_MULTI : 0;
6c7d9c
 
d0811f
         /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program, to
d0811f
          * minimize the time window when we don't account for IP traffic. */
6c7d9c
@@ -707,8 +706,7 @@ int bpf_firewall_install(Unit *u) {
d0811f
         u->ip_bpf_ingress_installed = bpf_program_unref(u->ip_bpf_ingress_installed);
6c7d9c
 
d0811f
         if (u->ip_bpf_egress) {
d0811f
-                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path,
d0811f
-                                              flags | (set_isempty(u->ip_bpf_custom_egress) ? 0 : BPF_F_ALLOW_MULTI));
d0811f
+                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
d0811f
                 if (r < 0)
d0811f
                         return log_unit_error_errno(u, r, "Attaching egress BPF program to cgroup %s failed: %m", path);
6c7d9c
 
6c7d9c
@@ -717,8 +715,7 @@ int bpf_firewall_install(Unit *u) {
d0811f
         }
6c7d9c
 
d0811f
         if (u->ip_bpf_ingress) {
d0811f
-                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path,
d0811f
-                                              flags | (set_isempty(u->ip_bpf_custom_ingress) ? 0 : BPF_F_ALLOW_MULTI));
d0811f
+                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
d0811f
                 if (r < 0)
d0811f
                         return log_unit_error_errno(u, r, "Attaching ingress BPF program to cgroup %s failed: %m", path);
6c7d9c
 
d0811f
diff --git a/src/core/main.c b/src/core/main.c
6c7d9c
index eaa56aca2a4..0edf8c0abad 100644
d0811f
--- a/src/core/main.c
d0811f
+++ b/src/core/main.c
d0811f
@@ -1144,6 +1144,14 @@ static int prepare_reexecute(
d0811f
         if (!fds)
d0811f
                 return log_oom();
6c7d9c
 
d0811f
+        /* We need existing BPF programs to survive reload, otherwise there will be a period where no BPF
d0811f
+         * program is active during task execution within a cgroup. This would be bad since this may have
d0811f
+         * security or reliability implications: devices we should filter won't be filtered, network activity
d0811f
+         * we should filter won't be filtered, etc. We pin all the existing devices by bumping their
d0811f
+         * refcount, and then storing them to later have it decremented. */
d0811f
+        _cleanup_(manager_unpin_all_cgroup_bpf_programsp) Manager *m_unpin =
d0811f
+                manager_pin_all_cgroup_bpf_programs(m);
d0811f
+
d0811f
         r = manager_serialize(m, f, fds, switching_root);
d0811f
         if (r < 0)
d0811f
                 return r;
d0811f
@@ -1159,6 +1167,7 @@ static int prepare_reexecute(
d0811f
         if (r < 0)
d0811f
                 return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization fds: %m");
6c7d9c
 
d0811f
+        TAKE_PTR(m_unpin);
d0811f
         *ret_f = TAKE_PTR(f);
d0811f
         *ret_fds = TAKE_PTR(fds);
6c7d9c
 
d0811f
diff --git a/src/core/manager.c b/src/core/manager.c
6c7d9c
index 4b215a61766..c330a02bf79 100644
d0811f
--- a/src/core/manager.c
d0811f
+++ b/src/core/manager.c
505ca8
@@ -65,6 +65,7 @@
d0811f
 #include "rm-rf.h"
505ca8
 #include "selinux-util.h"
d0811f
 #include "serialize.h"
d0811f
+#include "set.h"
d0811f
 #include "signal-util.h"
d0811f
 #include "socket-util.h"
d0811f
 #include "special.h"
6c7d9c
@@ -3213,6 +3214,79 @@ static void manager_serialize_gid_refs(Manager *m, FILE *f) {
d0811f
         manager_serialize_uid_refs_internal(m, f, &m->gid_refs, "destroy-ipc-gid");
d0811f
 }
6c7d9c
 
d0811f
+static int serialize_limbo_bpf_program(FILE *f, FDSet *fds, BPFProgram *p) {
d0811f
+        int copy;
d0811f
+        _cleanup_free_ char *ap = NULL;
d0811f
+
d0811f
+        /* We don't actually need the instructions or other data, since this is only used on the other side
d0811f
+         * for BPF limbo, which just requires the program type, cgroup path, and kernel-facing BPF file
d0811f
+         * descriptor. We don't even need to know what unit or directive it's attached to, since we're just
d0811f
+         * going to expire it after coldplug. */
d0811f
+
d0811f
+        assert(f);
d0811f
+        assert(p);
d0811f
+
d0811f
+        /* If the program isn't attached to the kernel yet, there's no reason to serialise it for limbo. Just
d0811f
+         * let it be skeletonized and then coldplug can do the work on the other side if it's still
d0811f
+         * necessary. */
d0811f
+        if (p->kernel_fd < 0 || !p->attached_path)
d0811f
+                return -ENOTCONN;
d0811f
+
d0811f
+        copy = fdset_put_dup(fds, p->kernel_fd);
d0811f
+        if (copy < 0)
d0811f
+                return log_error_errno(copy, "Failed to add file descriptor to serialization set: %m");
d0811f
+
d0811f
+        /* Otherwise, on daemon-reload, we'd remain pinned. */
d0811f
+        safe_close(p->kernel_fd);
d0811f
+
d0811f
+        ap = cescape(p->attached_path);
d0811f
+        if (!ap)
d0811f
+                return log_oom();
d0811f
+
d0811f
+        return serialize_item_format(f, "bpf-limbo", "%i %i %i \"%s\"",
d0811f
+                                     copy, p->prog_type, p->attached_type, ap);
d0811f
+}
d0811f
+
d0811f
+static void deserialize_limbo_bpf_program(Manager *m, FDSet *fds, const char *value) {
d0811f
+        _cleanup_free_ char *raw_fd = NULL, *raw_pt = NULL, *raw_at = NULL, *cgpath = NULL;
d0811f
+        int fd, r, prog_type, attached_type;
d0811f
+
d0811f
+        assert(m);
d0811f
+        assert(value);
d0811f
+
d0811f
+        r = extract_first_word(&value, &raw_fd, NULL, 0);
d0811f
+        if (r <= 0 || safe_atoi(raw_fd, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd))
d0811f
+                return (void) log_error("Failed to parse bpf-limbo FD: %s", value);
d0811f
+
d0811f
+        r = extract_first_word(&value, &raw_pt, NULL, 0);
d0811f
+        if (r <= 0 || safe_atoi(raw_pt, &prog_type) < 0)
d0811f
+                return (void) log_error("Failed to parse bpf-limbo program type: %s", value);
d0811f
+
d0811f
+        r = extract_first_word(&value, &raw_at, NULL, 0);
d0811f
+        if (r <= 0 || safe_atoi(raw_at, &attached_type) < 0)
d0811f
+                return (void) log_error("Failed to parse bpf-limbo attached type: %s", value);
d0811f
+
d0811f
+        r = extract_first_word(&value, &cgpath, NULL, EXTRACT_CUNESCAPE | EXTRACT_UNQUOTE);
d0811f
+        if (r <= 0)
d0811f
+                return (void) log_error("Failed to parse attached path for BPF limbo FD %s", value);
d0811f
+
d0811f
+        _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
d0811f
+        r = bpf_program_new(prog_type, &p);
d0811f
+        if (r < 0)
d0811f
+                return (void) log_error_errno(r, "Failed to create BPF limbo program: %m");
d0811f
+
d0811f
+        /* Just enough to free it when the time is right, this does not have enough information be used as a
d0811f
+         * real BPFProgram. */
d0811f
+        p->attached_type = attached_type;
d0811f
+        p->kernel_fd = fdset_remove(fds, fd);
d0811f
+        p->attached_path = TAKE_PTR(cgpath);
d0811f
+
d0811f
+        r = set_ensure_put(&m->bpf_limbo_progs, NULL, p);
d0811f
+        if (r < 0)
d0811f
+                return (void) log_error_errno(r, "Failed to register BPF limbo program for FD %s: %m", value);
d0811f
+        TAKE_PTR(p);
d0811f
+}
d0811f
+
d0811f
 int manager_serialize(
d0811f
                 Manager *m,
d0811f
                 FILE *f,
6c7d9c
@@ -3222,6 +3296,7 @@ int manager_serialize(
505ca8
         const char *t;
d0811f
         Unit *u;
d0811f
         int r;
d0811f
+        BPFProgram *p;
6c7d9c
 
d0811f
         assert(m);
d0811f
         assert(f);
6c7d9c
@@ -3266,6 +3341,9 @@ int manager_serialize(
d0811f
                 (void) serialize_dual_timestamp(f, joined, m->timestamps + q);
d0811f
         }
6c7d9c
 
505ca8
+        SET_FOREACH(p, m->bpf_limbo_progs)
d0811f
+                (void) serialize_limbo_bpf_program(f, fds, p);
d0811f
+
d0811f
         if (!switching_root)
d0811f
                 (void) serialize_strv(f, "env", m->client_environment);
6c7d9c
 
6c7d9c
@@ -3584,7 +3662,10 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) {
d0811f
                         else
d0811f
                                 m->n_failed_jobs += n;
6c7d9c
 
d0811f
-                } else if ((val = startswith(l, "taint-usr="))) {
d0811f
+                } else if ((val = startswith(l, "bpf-limbo=")))
d0811f
+                        deserialize_limbo_bpf_program(m, fds, val);
d0811f
+
d0811f
+                else if ((val = startswith(l, "taint-usr="))) {
d0811f
                         int b;
6c7d9c
 
d0811f
                         b = parse_boolean(val);
6c7d9c
@@ -3760,6 +3841,65 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) {
d0811f
         return manager_deserialize_units(m, f, fds);
d0811f
 }
6c7d9c
 
d0811f
+Manager* manager_pin_all_cgroup_bpf_programs(Manager *m) {
d0811f
+        int r;
d0811f
+        Unit *u;
d0811f
+
d0811f
+        assert(m);
d0811f
+
505ca8
+        HASHMAP_FOREACH(u, m->units) {
d0811f
+                BPFProgram *p;
d0811f
+
d0811f
+                FOREACH_POINTER(p,
d0811f
+                                u->bpf_device_control_installed,
d0811f
+                                u->ip_bpf_ingress,
d0811f
+                                u->ip_bpf_ingress_installed,
d0811f
+                                u->ip_bpf_egress,
d0811f
+                                u->ip_bpf_egress_installed)
d0811f
+                        if (p) {
d0811f
+                                r = set_ensure_put(&m->bpf_limbo_progs, NULL, p);
d0811f
+                                if (r < 0) {
d0811f
+                                        log_unit_error_errno(u, r, "Cannot store BPF program for reload, ignoring: %m");
d0811f
+                                        continue;
d0811f
+                                }
d0811f
+
d0811f
+                                bpf_program_ref(p);
d0811f
+                        }
d0811f
+
d0811f
+                Set *s;
d0811f
+                FOREACH_POINTER(s,
d0811f
+                                u->ip_bpf_custom_ingress,
d0811f
+                                u->ip_bpf_custom_ingress_installed,
d0811f
+                                u->ip_bpf_custom_egress,
d0811f
+                                u->ip_bpf_custom_egress_installed)
505ca8
+                        SET_FOREACH(p, s) {
d0811f
+                                r = set_ensure_put(&m->bpf_limbo_progs, NULL, p);
d0811f
+                                if (r < 0) {
d0811f
+                                        log_unit_error_errno(u, r, "Cannot store BPF program for reload, ignoring: %m");
d0811f
+                                        continue;
d0811f
+                                }
d0811f
+
d0811f
+                                bpf_program_ref(p);
d0811f
+                        }
d0811f
+        }
d0811f
+
d0811f
+        log_debug("Pinned %d BPF programs", set_size(m->bpf_limbo_progs));
d0811f
+
d0811f
+        return m;
d0811f
+}
d0811f
+
d0811f
+static void manager_skeletonize_all_cgroup_bpf_programs(Manager *m) {
d0811f
+        BPFProgram *p;
d0811f
+
505ca8
+        SET_FOREACH(p, m->bpf_limbo_progs)
d0811f
+                bpf_program_skeletonize(p);
d0811f
+}
d0811f
+
d0811f
+void manager_unpin_all_cgroup_bpf_programs(Manager *m) {
d0811f
+        log_debug("Unpinning %d BPF programs", set_size(m->bpf_limbo_progs));
d0811f
+        set_clear_with_destructor(m->bpf_limbo_progs, bpf_program_unref);
d0811f
+}
d0811f
+
d0811f
 int manager_reload(Manager *m) {
d0811f
         _cleanup_(manager_reloading_stopp) Manager *reloading = NULL;
d0811f
         _cleanup_fdset_free_ FDSet *fds = NULL;
6c7d9c
@@ -3779,6 +3919,13 @@ int manager_reload(Manager *m) {
d0811f
         /* We are officially in reload mode from here on. */
d0811f
         reloading = manager_reloading_start(m);
6c7d9c
 
d0811f
+        /* We need existing BPF programs to survive reload, otherwise there will be a period where no BPF
d0811f
+         * program is active during task execution within a cgroup. This would be bad since this may have
d0811f
+         * security or reliability implications: devices we should filter won't be filtered, network activity
d0811f
+         * we should filter won't be filtered, etc. We pin all the existing devices by bumping their
d0811f
+         * refcount, and then storing them to later have it decremented. */
d0811f
+        (void) manager_pin_all_cgroup_bpf_programs(m);
d0811f
+
d0811f
         r = manager_serialize(m, f, fds, false);
d0811f
         if (r < 0)
d0811f
                 return r;
6c7d9c
@@ -3803,6 +3950,12 @@ int manager_reload(Manager *m) {
d0811f
         m->uid_refs = hashmap_free(m->uid_refs);
d0811f
         m->gid_refs = hashmap_free(m->gid_refs);
6c7d9c
 
d0811f
+        /* The only canonical reference left to the dynamically allocated parts of these BPF programs is
d0811f
+         * going to be on the other side of manager_deserialize, so the freeable parts can now be freed. The
d0811f
+         * program itself will be detached as part of manager_vacuum. */
d0811f
+        manager_skeletonize_all_cgroup_bpf_programs(m);
d0811f
+        m->bpf_limbo_progs = set_free(m->bpf_limbo_progs);
d0811f
+
d0811f
         r = lookup_paths_init(&m->lookup_paths, m->unit_file_scope, 0, NULL);
d0811f
         if (r < 0)
d0811f
                 log_warning_errno(r, "Failed to initialize path lookup table, ignoring: %m");
6c7d9c
@@ -4737,6 +4890,12 @@ static void manager_vacuum(Manager *m) {
6c7d9c
 
d0811f
         /* Release any runtimes no longer referenced */
d0811f
         exec_runtime_vacuum(m);
d0811f
+
d0811f
+        /* Release any outmoded BPF programs that were deserialized from the previous manager, since new ones
d0811f
+         * should be in action now. We first need to make sure all entries in the cgroup realize queue are
d0811f
+         * complete, otherwise BPF firewalls/etc may not have been set up yet. */
d0811f
+        (void) manager_dispatch_cgroup_realize_queue(m);
d0811f
+        manager_unpin_all_cgroup_bpf_programs(m);
d0811f
 }
6c7d9c
 
d0811f
 int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
d0811f
diff --git a/src/core/manager.h b/src/core/manager.h
6c7d9c
index 19df889dd89..2c4a2b60632 100644
d0811f
--- a/src/core/manager.h
d0811f
+++ b/src/core/manager.h
505ca8
@@ -438,6 +438,8 @@ struct Manager {
d0811f
         VarlinkServer *varlink_server;
505ca8
         /* Only systemd-oomd should be using this to subscribe to changes in ManagedOOM settings */
505ca8
         Varlink *managed_oom_varlink_request;
d0811f
+
d0811f
+        Set *bpf_limbo_progs;
d0811f
 };
6c7d9c
 
d0811f
 static inline usec_t manager_default_timeout_abort_usec(Manager *m) {
505ca8
@@ -479,6 +481,10 @@ int manager_add_job_by_name(Manager *m, JobType type, const char *name, JobMode
d0811f
 int manager_add_job_by_name_and_warn(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs,  Job **ret);
d0811f
 int manager_propagate_reload(Manager *m, Unit *unit, JobMode mode, sd_bus_error *e);
6c7d9c
 
d0811f
+Manager* manager_pin_all_cgroup_bpf_programs(Manager *m);
d0811f
+void manager_unpin_all_cgroup_bpf_programs(Manager *m);
d0811f
+DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_unpin_all_cgroup_bpf_programs);
d0811f
+
d0811f
 void manager_dump_units(Manager *s, FILE *f, const char *prefix);
d0811f
 void manager_dump_jobs(Manager *s, FILE *f, const char *prefix);
d0811f
 void manager_dump(Manager *s, FILE *f, const char *prefix);
d0811f
diff --git a/src/shared/bpf-program.c b/src/shared/bpf-program.c
6c7d9c
index 10239142af3..549490da6ce 100644
d0811f
--- a/src/shared/bpf-program.c
d0811f
+++ b/src/shared/bpf-program.c
505ca8
@@ -209,6 +209,16 @@ int bpf_program_cgroup_detach(BPFProgram *p) {
d0811f
         return 0;
d0811f
 }
6c7d9c
 
d0811f
+void bpf_program_skeletonize(BPFProgram *p) {
d0811f
+        assert(p);
d0811f
+
d0811f
+        /* Called shortly after serialization. From this point on, we are frozen for serialization and entry
d0811f
+         * into BPF limbo, so we should proactively free our instructions and attached path. However, we
d0811f
+         * shouldn't detach the program or close the kernel FD -- we need those on the other side. */
d0811f
+        free(p->instructions);
d0811f
+        free(p->attached_path);
d0811f
+}
d0811f
+
d0811f
 int bpf_map_new(enum bpf_map_type type, size_t key_size, size_t value_size, size_t max_entries, uint32_t flags) {
505ca8
         union bpf_attr attr;
505ca8
         int fd;
d0811f
diff --git a/src/shared/bpf-program.h b/src/shared/bpf-program.h
6c7d9c
index eef77f9d8e1..5957a6ce300 100644
d0811f
--- a/src/shared/bpf-program.h
d0811f
+++ b/src/shared/bpf-program.h
d0811f
@@ -28,6 +28,7 @@ struct BPFProgram {
d0811f
 int bpf_program_new(uint32_t prog_type, BPFProgram **ret);
d0811f
 BPFProgram *bpf_program_unref(BPFProgram *p);
d0811f
 BPFProgram *bpf_program_ref(BPFProgram *p);
d0811f
+void bpf_program_skeletonize(BPFProgram *p);
6c7d9c
 
d0811f
 int bpf_program_add_instructions(BPFProgram *p, const struct bpf_insn *insn, size_t count);
d0811f
 int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size);