505ca8
From 5adb2f01405d7cb7ba3cf9d4ee035f57952f79a6 Mon Sep 17 00:00:00 2001
d0811f
From: Chris Down <chris@chrisdown.name>
d0811f
Date: Thu, 29 Oct 2020 12:03:52 +0000
505ca8
Subject: [PATCH 3/3] bpf: pid1: Pin reference to BPF programs for
d0811f
 post-coldplug
d0811f
MIME-Version: 1.0
d0811f
Content-Type: text/plain; charset=UTF-8
d0811f
Content-Transfer-Encoding: 8bit
d0811f
d0811f
During `daemon-reload` and `daemon-reexec`, we detach and reattach all
d0811f
BPF programs attached to cgroups. This, however, poses a real practical
d0811f
problem for DevicePolicy (and some other settings using BPF): it
d0811f
presents a period of time where the old device filtering BPF program has
d0811f
been unloaded, but the new one has not been loaded yet.
d0811f
d0811f
Since the filtering is at open() time, it has become apparent that that
d0811f
there's a non-trivial period where applications inside that ostensibly
d0811f
filtered cgroup can grab any device -- and often do so -- and then
d0811f
retain access to that device even after the reload is over. Due to the
d0811f
file continuing to be available after the initial open(), this issue is
d0811f
particularly visible for DevicePolicy={strict,closed}, however it also
d0811f
applies to other BPF programs we install.
d0811f
d0811f
In particular, for BPF ingress/egress filtering this may have more
d0811f
concerning implications: network traffic which is supposed to be
d0811f
filtered will -- for a very brief period of time -- not be filtered or
d0811f
subject to any restrictions imposed by BPF.
d0811f
d0811f
These BPF programs are fundamentally attached to a cgroup lifetime, not
d0811f
our unit lifetime, so it's enough to pin these programs by taking a
d0811f
reference to affected BPF programs before reload/reexec. We can then
d0811f
serialise the program's kernel-facing FD and cgroup attachment FD for
d0811f
the new daemon, and have the daemon on the other side unpin the programs
d0811f
after it's finished with coldplug.
d0811f
d0811f
That means that, for example, the BPF program lifecycle during
d0811f
daemon-reload or daemon-reexec changes from this:
d0811f
d0811f
    manager_clear_jobs_and_units
d0811f
d0811f
          ╔══════╪═════════╤═══════╗
d0811f
          ║ prog │ no prog │ prog' ║
d0811f
          ╚══════╧═════════╪═══════╝
d0811f
d0811f
                    manager_coldplug
d0811f
d0811f
to this:
d0811f
d0811f
    manager_clear_jobs_and_units         manager_dispatch_cgroup_realize_queue
d0811f
                 │                                       │
d0811f
          ╔══════╪═══════════════╤═══════════════════════╪═══════╗
d0811f
          ║ prog │ prog (orphan) │ prog (orphan) + prog' │ prog' ║
d0811f
          ╚══════╧═══════════════╪═══════════════════════╧═══════╝
d0811f
d0811f
                          manager_coldplug
d0811f
d0811f
For daemon-reexec the semantics are mostly the same, but the point at
d0811f
which the program becomes orphan is tied to the process lifecycle
d0811f
instead.
d0811f
d0811f
None of the BPF programs we install require exclusive access, so having
d0811f
multiple instances of them running at the same time is fine. Custom
d0811f
programs, of course, are unknown, but it's hard to imagine legitimate
d0811f
cases which should be affected, whereas the benefits of this "overlap"
d0811f
approach with reference pinning is immediately tangible.
d0811f
d0811f
[keszybz: use _cleanup_ for unpin, use FOREACH_POINTER]
d0811f
---
d0811f
 src/core/bpf-firewall.c  |   9 +--
d0811f
 src/core/main.c          |   9 +++
505ca8
 src/core/manager.c       | 161 ++++++++++++++++++++++++++++++++++++++-
d0811f
 src/core/manager.h       |   6 ++
d0811f
 src/shared/bpf-program.c |  10 +++
d0811f
 src/shared/bpf-program.h |   1 +
505ca8
 6 files changed, 189 insertions(+), 7 deletions(-)
d0811f
d0811f
diff --git a/src/core/bpf-firewall.c b/src/core/bpf-firewall.c
505ca8
index 99783aca22..c205ba1b19 100644
d0811f
--- a/src/core/bpf-firewall.c
d0811f
+++ b/src/core/bpf-firewall.c
505ca8
@@ -702,8 +702,7 @@ int bpf_firewall_install(Unit *u) {
d0811f
         if (r < 0)
d0811f
                 return log_unit_error_errno(u, r, "Failed to determine cgroup path: %m");
505ca8
d0811f
-        flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
d0811f
-                 (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0;
d0811f
+        flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI) ? BPF_F_ALLOW_MULTI : 0;
505ca8
d0811f
         /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program, to
d0811f
          * minimize the time window when we don't account for IP traffic. */
505ca8
@@ -711,8 +710,7 @@ int bpf_firewall_install(Unit *u) {
d0811f
         u->ip_bpf_ingress_installed = bpf_program_unref(u->ip_bpf_ingress_installed);
505ca8
d0811f
         if (u->ip_bpf_egress) {
d0811f
-                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path,
d0811f
-                                              flags | (set_isempty(u->ip_bpf_custom_egress) ? 0 : BPF_F_ALLOW_MULTI));
d0811f
+                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
d0811f
                 if (r < 0)
d0811f
                         return log_unit_error_errno(u, r, "Attaching egress BPF program to cgroup %s failed: %m", path);
505ca8
505ca8
@@ -721,8 +719,7 @@ int bpf_firewall_install(Unit *u) {
d0811f
         }
505ca8
d0811f
         if (u->ip_bpf_ingress) {
d0811f
-                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path,
d0811f
-                                              flags | (set_isempty(u->ip_bpf_custom_ingress) ? 0 : BPF_F_ALLOW_MULTI));
d0811f
+                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
d0811f
                 if (r < 0)
d0811f
                         return log_unit_error_errno(u, r, "Attaching ingress BPF program to cgroup %s failed: %m", path);
505ca8
d0811f
diff --git a/src/core/main.c b/src/core/main.c
505ca8
index a280b756ff..2ace4cb89c 100644
d0811f
--- a/src/core/main.c
d0811f
+++ b/src/core/main.c
d0811f
@@ -1144,6 +1144,14 @@ static int prepare_reexecute(
d0811f
         if (!fds)
d0811f
                 return log_oom();
505ca8
d0811f
+        /* We need existing BPF programs to survive reload, otherwise there will be a period where no BPF
d0811f
+         * program is active during task execution within a cgroup. This would be bad since this may have
d0811f
+         * security or reliability implications: devices we should filter won't be filtered, network activity
d0811f
+         * we should filter won't be filtered, etc. We pin all the existing devices by bumping their
d0811f
+         * refcount, and then storing them to later have it decremented. */
d0811f
+        _cleanup_(manager_unpin_all_cgroup_bpf_programsp) Manager *m_unpin =
d0811f
+                manager_pin_all_cgroup_bpf_programs(m);
d0811f
+
d0811f
         r = manager_serialize(m, f, fds, switching_root);
d0811f
         if (r < 0)
d0811f
                 return r;
d0811f
@@ -1159,6 +1167,7 @@ static int prepare_reexecute(
d0811f
         if (r < 0)
d0811f
                 return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization fds: %m");
505ca8
d0811f
+        TAKE_PTR(m_unpin);
d0811f
         *ret_f = TAKE_PTR(f);
d0811f
         *ret_fds = TAKE_PTR(fds);
505ca8
d0811f
diff --git a/src/core/manager.c b/src/core/manager.c
505ca8
index a1d6f7cc10..b373d27844 100644
d0811f
--- a/src/core/manager.c
d0811f
+++ b/src/core/manager.c
505ca8
@@ -65,6 +65,7 @@
d0811f
 #include "rm-rf.h"
505ca8
 #include "selinux-util.h"
d0811f
 #include "serialize.h"
d0811f
+#include "set.h"
d0811f
 #include "signal-util.h"
d0811f
 #include "socket-util.h"
d0811f
 #include "special.h"
505ca8
@@ -3217,6 +3218,79 @@ static void manager_serialize_gid_refs(Manager *m, FILE *f) {
d0811f
         manager_serialize_uid_refs_internal(m, f, &m->gid_refs, "destroy-ipc-gid");
d0811f
 }
505ca8
d0811f
+static int serialize_limbo_bpf_program(FILE *f, FDSet *fds, BPFProgram *p) {
d0811f
+        int copy;
d0811f
+        _cleanup_free_ char *ap = NULL;
d0811f
+
d0811f
+        /* We don't actually need the instructions or other data, since this is only used on the other side
d0811f
+         * for BPF limbo, which just requires the program type, cgroup path, and kernel-facing BPF file
d0811f
+         * descriptor. We don't even need to know what unit or directive it's attached to, since we're just
d0811f
+         * going to expire it after coldplug. */
d0811f
+
d0811f
+        assert(f);
d0811f
+        assert(p);
d0811f
+
d0811f
+        /* If the program isn't attached to the kernel yet, there's no reason to serialise it for limbo. Just
d0811f
+         * let it be skeletonized and then coldplug can do the work on the other side if it's still
d0811f
+         * necessary. */
d0811f
+        if (p->kernel_fd < 0 || !p->attached_path)
d0811f
+                return -ENOTCONN;
d0811f
+
d0811f
+        copy = fdset_put_dup(fds, p->kernel_fd);
d0811f
+        if (copy < 0)
d0811f
+                return log_error_errno(copy, "Failed to add file descriptor to serialization set: %m");
d0811f
+
d0811f
+        /* Otherwise, on daemon-reload, we'd remain pinned. */
d0811f
+        safe_close(p->kernel_fd);
d0811f
+
d0811f
+        ap = cescape(p->attached_path);
d0811f
+        if (!ap)
d0811f
+                return log_oom();
d0811f
+
d0811f
+        return serialize_item_format(f, "bpf-limbo", "%i %i %i \"%s\"",
d0811f
+                                     copy, p->prog_type, p->attached_type, ap);
d0811f
+}
d0811f
+
d0811f
+static void deserialize_limbo_bpf_program(Manager *m, FDSet *fds, const char *value) {
d0811f
+        _cleanup_free_ char *raw_fd = NULL, *raw_pt = NULL, *raw_at = NULL, *cgpath = NULL;
d0811f
+        int fd, r, prog_type, attached_type;
d0811f
+
d0811f
+        assert(m);
d0811f
+        assert(value);
d0811f
+
d0811f
+        r = extract_first_word(&value, &raw_fd, NULL, 0);
d0811f
+        if (r <= 0 || safe_atoi(raw_fd, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd))
d0811f
+                return (void) log_error("Failed to parse bpf-limbo FD: %s", value);
d0811f
+
d0811f
+        r = extract_first_word(&value, &raw_pt, NULL, 0);
d0811f
+        if (r <= 0 || safe_atoi(raw_pt, &prog_type) < 0)
d0811f
+                return (void) log_error("Failed to parse bpf-limbo program type: %s", value);
d0811f
+
d0811f
+        r = extract_first_word(&value, &raw_at, NULL, 0);
d0811f
+        if (r <= 0 || safe_atoi(raw_at, &attached_type) < 0)
d0811f
+                return (void) log_error("Failed to parse bpf-limbo attached type: %s", value);
d0811f
+
d0811f
+        r = extract_first_word(&value, &cgpath, NULL, EXTRACT_CUNESCAPE | EXTRACT_UNQUOTE);
d0811f
+        if (r <= 0)
d0811f
+                return (void) log_error("Failed to parse attached path for BPF limbo FD %s", value);
d0811f
+
d0811f
+        _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
d0811f
+        r = bpf_program_new(prog_type, &p);
d0811f
+        if (r < 0)
d0811f
+                return (void) log_error_errno(r, "Failed to create BPF limbo program: %m");
d0811f
+
d0811f
+        /* Just enough to free it when the time is right, this does not have enough information be used as a
d0811f
+         * real BPFProgram. */
d0811f
+        p->attached_type = attached_type;
d0811f
+        p->kernel_fd = fdset_remove(fds, fd);
d0811f
+        p->attached_path = TAKE_PTR(cgpath);
d0811f
+
d0811f
+        r = set_ensure_put(&m->bpf_limbo_progs, NULL, p);
d0811f
+        if (r < 0)
d0811f
+                return (void) log_error_errno(r, "Failed to register BPF limbo program for FD %s: %m", value);
d0811f
+        TAKE_PTR(p);
d0811f
+}
d0811f
+
d0811f
 int manager_serialize(
d0811f
                 Manager *m,
d0811f
                 FILE *f,
505ca8
@@ -3226,6 +3300,7 @@ int manager_serialize(
505ca8
         const char *t;
d0811f
         Unit *u;
d0811f
         int r;
d0811f
+        BPFProgram *p;
505ca8
d0811f
         assert(m);
d0811f
         assert(f);
505ca8
@@ -3270,6 +3345,9 @@ int manager_serialize(
d0811f
                 (void) serialize_dual_timestamp(f, joined, m->timestamps + q);
d0811f
         }
505ca8
505ca8
+        SET_FOREACH(p, m->bpf_limbo_progs)
d0811f
+                (void) serialize_limbo_bpf_program(f, fds, p);
d0811f
+
d0811f
         if (!switching_root)
d0811f
                 (void) serialize_strv(f, "env", m->client_environment);
505ca8
505ca8
@@ -3588,7 +3666,10 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) {
d0811f
                         else
d0811f
                                 m->n_failed_jobs += n;
505ca8
d0811f
-                } else if ((val = startswith(l, "taint-usr="))) {
d0811f
+                } else if ((val = startswith(l, "bpf-limbo=")))
d0811f
+                        deserialize_limbo_bpf_program(m, fds, val);
d0811f
+
d0811f
+                else if ((val = startswith(l, "taint-usr="))) {
d0811f
                         int b;
505ca8
d0811f
                         b = parse_boolean(val);
505ca8
@@ -3764,6 +3845,65 @@ int manager_deserialize(Manager *m, FILE *f, FDSet *fds) {
d0811f
         return manager_deserialize_units(m, f, fds);
d0811f
 }
505ca8
d0811f
+Manager* manager_pin_all_cgroup_bpf_programs(Manager *m) {
d0811f
+        int r;
d0811f
+        Unit *u;
d0811f
+
d0811f
+        assert(m);
d0811f
+
505ca8
+        HASHMAP_FOREACH(u, m->units) {
d0811f
+                BPFProgram *p;
d0811f
+
d0811f
+                FOREACH_POINTER(p,
d0811f
+                                u->bpf_device_control_installed,
d0811f
+                                u->ip_bpf_ingress,
d0811f
+                                u->ip_bpf_ingress_installed,
d0811f
+                                u->ip_bpf_egress,
d0811f
+                                u->ip_bpf_egress_installed)
d0811f
+                        if (p) {
d0811f
+                                r = set_ensure_put(&m->bpf_limbo_progs, NULL, p);
d0811f
+                                if (r < 0) {
d0811f
+                                        log_unit_error_errno(u, r, "Cannot store BPF program for reload, ignoring: %m");
d0811f
+                                        continue;
d0811f
+                                }
d0811f
+
d0811f
+                                bpf_program_ref(p);
d0811f
+                        }
d0811f
+
d0811f
+                Set *s;
d0811f
+                FOREACH_POINTER(s,
d0811f
+                                u->ip_bpf_custom_ingress,
d0811f
+                                u->ip_bpf_custom_ingress_installed,
d0811f
+                                u->ip_bpf_custom_egress,
d0811f
+                                u->ip_bpf_custom_egress_installed)
505ca8
+                        SET_FOREACH(p, s) {
d0811f
+                                r = set_ensure_put(&m->bpf_limbo_progs, NULL, p);
d0811f
+                                if (r < 0) {
d0811f
+                                        log_unit_error_errno(u, r, "Cannot store BPF program for reload, ignoring: %m");
d0811f
+                                        continue;
d0811f
+                                }
d0811f
+
d0811f
+                                bpf_program_ref(p);
d0811f
+                        }
d0811f
+        }
d0811f
+
d0811f
+        log_debug("Pinned %d BPF programs", set_size(m->bpf_limbo_progs));
d0811f
+
d0811f
+        return m;
d0811f
+}
d0811f
+
d0811f
+static void manager_skeletonize_all_cgroup_bpf_programs(Manager *m) {
d0811f
+        BPFProgram *p;
d0811f
+
505ca8
+        SET_FOREACH(p, m->bpf_limbo_progs)
d0811f
+                bpf_program_skeletonize(p);
d0811f
+}
d0811f
+
d0811f
+void manager_unpin_all_cgroup_bpf_programs(Manager *m) {
d0811f
+        log_debug("Unpinning %d BPF programs", set_size(m->bpf_limbo_progs));
d0811f
+        set_clear_with_destructor(m->bpf_limbo_progs, bpf_program_unref);
d0811f
+}
d0811f
+
d0811f
 int manager_reload(Manager *m) {
d0811f
         _cleanup_(manager_reloading_stopp) Manager *reloading = NULL;
d0811f
         _cleanup_fdset_free_ FDSet *fds = NULL;
505ca8
@@ -3783,6 +3923,13 @@ int manager_reload(Manager *m) {
d0811f
         /* We are officially in reload mode from here on. */
d0811f
         reloading = manager_reloading_start(m);
505ca8
d0811f
+        /* We need existing BPF programs to survive reload, otherwise there will be a period where no BPF
d0811f
+         * program is active during task execution within a cgroup. This would be bad since this may have
d0811f
+         * security or reliability implications: devices we should filter won't be filtered, network activity
d0811f
+         * we should filter won't be filtered, etc. We pin all the existing devices by bumping their
d0811f
+         * refcount, and then storing them to later have it decremented. */
d0811f
+        (void) manager_pin_all_cgroup_bpf_programs(m);
d0811f
+
d0811f
         r = manager_serialize(m, f, fds, false);
d0811f
         if (r < 0)
d0811f
                 return r;
505ca8
@@ -3807,6 +3954,12 @@ int manager_reload(Manager *m) {
d0811f
         m->uid_refs = hashmap_free(m->uid_refs);
d0811f
         m->gid_refs = hashmap_free(m->gid_refs);
505ca8
d0811f
+        /* The only canonical reference left to the dynamically allocated parts of these BPF programs is
d0811f
+         * going to be on the other side of manager_deserialize, so the freeable parts can now be freed. The
d0811f
+         * program itself will be detached as part of manager_vacuum. */
d0811f
+        manager_skeletonize_all_cgroup_bpf_programs(m);
d0811f
+        m->bpf_limbo_progs = set_free(m->bpf_limbo_progs);
d0811f
+
d0811f
         r = lookup_paths_init(&m->lookup_paths, m->unit_file_scope, 0, NULL);
d0811f
         if (r < 0)
d0811f
                 log_warning_errno(r, "Failed to initialize path lookup table, ignoring: %m");
505ca8
@@ -4741,6 +4894,12 @@ static void manager_vacuum(Manager *m) {
505ca8
d0811f
         /* Release any runtimes no longer referenced */
d0811f
         exec_runtime_vacuum(m);
d0811f
+
d0811f
+        /* Release any outmoded BPF programs that were deserialized from the previous manager, since new ones
d0811f
+         * should be in action now. We first need to make sure all entries in the cgroup realize queue are
d0811f
+         * complete, otherwise BPF firewalls/etc may not have been set up yet. */
d0811f
+        (void) manager_dispatch_cgroup_realize_queue(m);
d0811f
+        manager_unpin_all_cgroup_bpf_programs(m);
d0811f
 }
505ca8
d0811f
 int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
d0811f
diff --git a/src/core/manager.h b/src/core/manager.h
505ca8
index 19df889dd8..2c4a2b6063 100644
d0811f
--- a/src/core/manager.h
d0811f
+++ b/src/core/manager.h
505ca8
@@ -438,6 +438,8 @@ struct Manager {
d0811f
         VarlinkServer *varlink_server;
505ca8
         /* Only systemd-oomd should be using this to subscribe to changes in ManagedOOM settings */
505ca8
         Varlink *managed_oom_varlink_request;
d0811f
+
d0811f
+        Set *bpf_limbo_progs;
d0811f
 };
505ca8
d0811f
 static inline usec_t manager_default_timeout_abort_usec(Manager *m) {
505ca8
@@ -479,6 +481,10 @@ int manager_add_job_by_name(Manager *m, JobType type, const char *name, JobMode
d0811f
 int manager_add_job_by_name_and_warn(Manager *m, JobType type, const char *name, JobMode mode, Set *affected_jobs,  Job **ret);
d0811f
 int manager_propagate_reload(Manager *m, Unit *unit, JobMode mode, sd_bus_error *e);
505ca8
d0811f
+Manager* manager_pin_all_cgroup_bpf_programs(Manager *m);
d0811f
+void manager_unpin_all_cgroup_bpf_programs(Manager *m);
d0811f
+DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_unpin_all_cgroup_bpf_programs);
d0811f
+
d0811f
 void manager_dump_units(Manager *s, FILE *f, const char *prefix);
d0811f
 void manager_dump_jobs(Manager *s, FILE *f, const char *prefix);
d0811f
 void manager_dump(Manager *s, FILE *f, const char *prefix);
d0811f
diff --git a/src/shared/bpf-program.c b/src/shared/bpf-program.c
505ca8
index 10239142af..549490da6c 100644
d0811f
--- a/src/shared/bpf-program.c
d0811f
+++ b/src/shared/bpf-program.c
505ca8
@@ -209,6 +209,16 @@ int bpf_program_cgroup_detach(BPFProgram *p) {
d0811f
         return 0;
d0811f
 }
505ca8
d0811f
+void bpf_program_skeletonize(BPFProgram *p) {
d0811f
+        assert(p);
d0811f
+
d0811f
+        /* Called shortly after serialization. From this point on, we are frozen for serialization and entry
d0811f
+         * into BPF limbo, so we should proactively free our instructions and attached path. However, we
d0811f
+         * shouldn't detach the program or close the kernel FD -- we need those on the other side. */
d0811f
+        free(p->instructions);
d0811f
+        free(p->attached_path);
d0811f
+}
d0811f
+
d0811f
 int bpf_map_new(enum bpf_map_type type, size_t key_size, size_t value_size, size_t max_entries, uint32_t flags) {
505ca8
         union bpf_attr attr;
505ca8
         int fd;
d0811f
diff --git a/src/shared/bpf-program.h b/src/shared/bpf-program.h
505ca8
index eef77f9d8e..5957a6ce30 100644
d0811f
--- a/src/shared/bpf-program.h
d0811f
+++ b/src/shared/bpf-program.h
d0811f
@@ -28,6 +28,7 @@ struct BPFProgram {
d0811f
 int bpf_program_new(uint32_t prog_type, BPFProgram **ret);
d0811f
 BPFProgram *bpf_program_unref(BPFProgram *p);
d0811f
 BPFProgram *bpf_program_ref(BPFProgram *p);
d0811f
+void bpf_program_skeletonize(BPFProgram *p);
505ca8
d0811f
 int bpf_program_add_instructions(BPFProgram *p, const struct bpf_insn *insn, size_t count);
d0811f
 int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size);
505ca8
--
505ca8
2.29.2
d0811f