5d2ee9
From 77a273e02c1c811485d13ddca0f844512aed2cff Mon Sep 17 00:00:00 2001
5d2ee9
From: Jan Synacek <jsynacek@redhat.com>
5d2ee9
Date: Wed, 12 Feb 2020 12:58:54 +0100
5d2ee9
Subject: [PATCH] pid1: make sure to restore correct default values for some
5d2ee9
 rlimits
5d2ee9
5d2ee9
Commit fb39af4ce42d7ef9af63009f271f404038703704 forgot to restore the default
5d2ee9
rlimit values (RLIMIT_NOFILE and RLIMIT_MEMLOCK) while PID1 is reloading.
5d2ee9
5d2ee9
This patch extracts the code in charge of initializing the default values for
5d2ee9
those rlimits in order to create dedicated functions, which take care of their
5d2ee9
initialization.
5d2ee9
5d2ee9
These functions are then called in parse_configuration() so we make sure that
5d2ee9
the default values for these rlimits get restored every time PID1 is reloading
5d2ee9
its configuration.
5d2ee9
5d2ee9
(cherry picked from commit a9fd4cd1206832a61aaf61fff583bb133e6cb965)
5d2ee9
Resolves: #1789930
5d2ee9
---
5d2ee9
 src/core/main.c | 135 +++++++++++++++++++++++++++++++++++++-----------
5d2ee9
 1 file changed, 106 insertions(+), 29 deletions(-)
5d2ee9
5d2ee9
diff --git a/src/core/main.c b/src/core/main.c
5d2ee9
index c83249a8dc..b8c1e567ad 100644
5d2ee9
--- a/src/core/main.c
5d2ee9
+++ b/src/core/main.c
5d2ee9
@@ -136,7 +136,8 @@ static EmergencyAction arg_cad_burst_action;
5d2ee9
 static CPUSet arg_cpu_affinity;
5d2ee9
 static NUMAPolicy arg_numa_policy;
5d2ee9
 
5d2ee9
-static int parse_configuration(void);
5d2ee9
+static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
5d2ee9
+                               const struct rlimit *saved_rlimit_memlock);
5d2ee9
 
5d2ee9
 _noreturn_ static void freeze_or_reboot(void) {
5d2ee9
 
5d2ee9
@@ -1149,25 +1150,6 @@ static int prepare_reexecute(Manager *m, FILE **_f, FDSet **_fds, bool switching
5d2ee9
 static int bump_rlimit_nofile(struct rlimit *saved_rlimit) {
5d2ee9
         int r, nr;
5d2ee9
 
5d2ee9
-        assert(saved_rlimit);
5d2ee9
-
5d2ee9
-        /* Save the original RLIMIT_NOFILE so that we can reset it
5d2ee9
-         * later when transitioning from the initrd to the main
5d2ee9
-         * systemd or suchlike. */
5d2ee9
-        if (getrlimit(RLIMIT_NOFILE, saved_rlimit) < 0)
5d2ee9
-                return log_warning_errno(errno, "Reading RLIMIT_NOFILE failed, ignoring: %m");
5d2ee9
-
5d2ee9
-        /* Make sure forked processes get the default kernel setting */
5d2ee9
-        if (!arg_default_rlimit[RLIMIT_NOFILE]) {
5d2ee9
-                struct rlimit *rl;
5d2ee9
-
5d2ee9
-                rl = newdup(struct rlimit, saved_rlimit, 1);
5d2ee9
-                if (!rl)
5d2ee9
-                        return log_oom();
5d2ee9
-
5d2ee9
-                arg_default_rlimit[RLIMIT_NOFILE] = rl;
5d2ee9
-        }
5d2ee9
-
5d2ee9
         /* Bump up the resource limit for ourselves substantially, all the way to the maximum the kernel allows */
5d2ee9
         nr = read_nr_open();
5d2ee9
         r = setrlimit_closest(RLIMIT_NOFILE, &RLIMIT_MAKE_CONST(nr));
5d2ee9
@@ -1180,16 +1162,12 @@ static int bump_rlimit_nofile(struct rlimit *saved_rlimit) {
5d2ee9
 static int bump_rlimit_memlock(struct rlimit *saved_rlimit) {
5d2ee9
         int r;
5d2ee9
 
5d2ee9
-        assert(saved_rlimit);
5d2ee9
         assert(getuid() == 0);
5d2ee9
 
5d2ee9
         /* BPF_MAP_TYPE_LPM_TRIE bpf maps are charged against RLIMIT_MEMLOCK, even though we have CAP_IPC_LOCK which
5d2ee9
          * should normally disable such checks. We need them to implement IPAccessAllow= and IPAccessDeny=, hence let's
5d2ee9
          * bump the value high enough for the root user. */
5d2ee9
 
5d2ee9
-        if (getrlimit(RLIMIT_MEMLOCK, saved_rlimit) < 0)
5d2ee9
-                return log_warning_errno(errno, "Reading RLIMIT_MEMLOCK failed, ignoring: %m");
5d2ee9
-
5d2ee9
         r = setrlimit_closest(RLIMIT_MEMLOCK, &RLIMIT_MAKE_CONST(1024ULL*1024ULL*16ULL));
5d2ee9
         if (r < 0)
5d2ee9
                 return log_warning_errno(r, "Setting RLIMIT_MEMLOCK failed, ignoring: %m");
5d2ee9
@@ -1651,6 +1629,8 @@ static void do_reexecute(
5d2ee9
 
5d2ee9
 static int invoke_main_loop(
5d2ee9
                 Manager *m,
5d2ee9
+                const struct rlimit *saved_rlimit_nofile,
5d2ee9
+                const struct rlimit *saved_rlimit_memlock,
5d2ee9
                 bool *ret_reexecute,
5d2ee9
                 int *ret_retval,                   /* Return parameters relevant for shutting down */
5d2ee9
                 const char **ret_shutdown_verb,    /* … */
5d2ee9
@@ -1662,6 +1642,8 @@ static int invoke_main_loop(
5d2ee9
         int r;
5d2ee9
 
5d2ee9
         assert(m);
5d2ee9
+        assert(saved_rlimit_nofile);
5d2ee9
+        assert(saved_rlimit_memlock);
5d2ee9
         assert(ret_reexecute);
5d2ee9
         assert(ret_retval);
5d2ee9
         assert(ret_shutdown_verb);
5d2ee9
@@ -1691,7 +1673,7 @@ static int invoke_main_loop(
5d2ee9
                         saved_log_level = m->log_level_overridden ? log_get_max_level() : -1;
5d2ee9
                         saved_log_target = m->log_target_overridden ? log_get_target() : _LOG_TARGET_INVALID;
5d2ee9
 
5d2ee9
-                        (void) parse_configuration();
5d2ee9
+                        (void) parse_configuration(saved_rlimit_nofile, saved_rlimit_memlock);
5d2ee9
 
5d2ee9
                         set_manager_defaults(m);
5d2ee9
 
5d2ee9
@@ -1983,6 +1965,80 @@ static int do_queue_default_job(
5d2ee9
         return 0;
5d2ee9
 }
5d2ee9
 
5d2ee9
+static void save_rlimits(struct rlimit *saved_rlimit_nofile,
5d2ee9
+                         struct rlimit *saved_rlimit_memlock) {
5d2ee9
+
5d2ee9
+        assert(saved_rlimit_nofile);
5d2ee9
+        assert(saved_rlimit_memlock);
5d2ee9
+
5d2ee9
+        if (getrlimit(RLIMIT_NOFILE, saved_rlimit_nofile) < 0)
5d2ee9
+                log_warning_errno(errno, "Reading RLIMIT_NOFILE failed, ignoring: %m");
5d2ee9
+
5d2ee9
+        if (getrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock) < 0)
5d2ee9
+                log_warning_errno(errno, "Reading RLIMIT_MEMLOCK failed, ignoring: %m");
5d2ee9
+}
5d2ee9
+
5d2ee9
+static void fallback_rlimit_nofile(const struct rlimit *saved_rlimit_nofile) {
5d2ee9
+        struct rlimit *rl;
5d2ee9
+
5d2ee9
+        if (arg_default_rlimit[RLIMIT_NOFILE])
5d2ee9
+                return;
5d2ee9
+
5d2ee9
+        /* Make sure forked processes get limits based on the original kernel setting */
5d2ee9
+
5d2ee9
+        rl = newdup(struct rlimit, saved_rlimit_nofile, 1);
5d2ee9
+        if (!rl) {
5d2ee9
+                log_oom();
5d2ee9
+                return;
5d2ee9
+        }
5d2ee9
+
5d2ee9
+        /* Bump the hard limit for system services to a substantially higher value. The default
5d2ee9
+         * hard limit current kernels set is pretty low (4K), mostly for historical
5d2ee9
+         * reasons. According to kernel developers, the fd handling in recent kernels has been
5d2ee9
+         * optimized substantially enough, so that we can bump the limit now, without paying too
5d2ee9
+         * high a price in memory or performance. Note however that we only bump the hard limit,
5d2ee9
+         * not the soft limit. That's because select() works the way it works, and chokes on fds
5d2ee9
+         * >= 1024. If we'd bump the soft limit globally, it might accidentally happen to
5d2ee9
+         * unsuspecting programs that they get fds higher than what they can process using
5d2ee9
+         * select(). By only bumping the hard limit but leaving the soft limit as it is we avoid
5d2ee9
+         * this pitfall: programs that are written by folks with the select() problem in mind
5d2ee9
+         * (and thus use poll()/epoll instead of select(), the way everybody should) can
5d2ee9
+         * explicitly opt into high fds by bumping their soft limit beyond 1024, to the hard limit
5d2ee9
+         * we pass. */
5d2ee9
+        if (arg_system) {
5d2ee9
+                int nr;
5d2ee9
+
5d2ee9
+                /* Get the underlying absolute limit the kernel enforces */
5d2ee9
+                nr = read_nr_open();
5d2ee9
+
5d2ee9
+                rl->rlim_max = MIN((rlim_t) nr, MAX(rl->rlim_max, (rlim_t) HIGH_RLIMIT_NOFILE));
5d2ee9
+        }
5d2ee9
+
5d2ee9
+        /* If for some reason we were invoked with a soft limit above 1024 (which should never
5d2ee9
+         * happen, but who knows what we get passed in from pam_limits when invoked as --user
5d2ee9
+         * instance), then lower what we pass on to not confuse our children */
5d2ee9
+        rl->rlim_cur = MIN(rl->rlim_cur, (rlim_t) FD_SETSIZE);
5d2ee9
+
5d2ee9
+        arg_default_rlimit[RLIMIT_NOFILE] = rl;
5d2ee9
+}
5d2ee9
+
5d2ee9
+static void fallback_rlimit_memlock(const struct rlimit *saved_rlimit_memlock) {
5d2ee9
+        struct rlimit *rl;
5d2ee9
+
5d2ee9
+        /* Pass the original value down to invoked processes */
5d2ee9
+
5d2ee9
+        if (arg_default_rlimit[RLIMIT_MEMLOCK])
5d2ee9
+                return;
5d2ee9
+
5d2ee9
+        rl = newdup(struct rlimit, saved_rlimit_memlock, 1);
5d2ee9
+        if (!rl) {
5d2ee9
+                log_oom();
5d2ee9
+                return;
5d2ee9
+        }
5d2ee9
+
5d2ee9
+        arg_default_rlimit[RLIMIT_MEMLOCK] = rl;
5d2ee9
+}
5d2ee9
+
5d2ee9
 static void reset_arguments(void) {
5d2ee9
         /* Frees/resets arg_* variables, with a few exceptions commented below. */
5d2ee9
 
5d2ee9
@@ -2040,9 +2096,13 @@ static void reset_arguments(void) {
5d2ee9
         numa_policy_reset(&arg_numa_policy);
5d2ee9
 }
5d2ee9
 
5d2ee9
-static int parse_configuration(void) {
5d2ee9
+static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
5d2ee9
+                               const struct rlimit *saved_rlimit_memlock) {
5d2ee9
         int r;
5d2ee9
 
5d2ee9
+        assert(saved_rlimit_nofile);
5d2ee9
+        assert(saved_rlimit_memlock);
5d2ee9
+
5d2ee9
         arg_default_tasks_max = system_tasks_max_scale(DEFAULT_TASKS_MAX_PERCENTAGE, 100U);
5d2ee9
 
5d2ee9
         /* Assign configuration defaults */
5d2ee9
@@ -2058,18 +2118,29 @@ static int parse_configuration(void) {
5d2ee9
                         log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m");
5d2ee9
         }
5d2ee9
 
5d2ee9
+        /* Initialize some default rlimits for services if they haven't been configured */
5d2ee9
+        fallback_rlimit_nofile(saved_rlimit_nofile);
5d2ee9
+        fallback_rlimit_memlock(saved_rlimit_memlock);
5d2ee9
+
5d2ee9
         /* Note that this also parses bits from the kernel command line, including "debug". */
5d2ee9
         log_parse_environment();
5d2ee9
 
5d2ee9
         return 0;
5d2ee9
 }
5d2ee9
 
5d2ee9
-static int load_configuration(int argc, char **argv, const char **ret_error_message) {
5d2ee9
+static int load_configuration(
5d2ee9
+                int argc,
5d2ee9
+                char **argv,
5d2ee9
+                const struct rlimit *saved_rlimit_nofile,
5d2ee9
+                const struct rlimit *saved_rlimit_memlock,
5d2ee9
+                const char **ret_error_message) {
5d2ee9
         int r;
5d2ee9
 
5d2ee9
+        assert(saved_rlimit_nofile);
5d2ee9
+        assert(saved_rlimit_memlock);
5d2ee9
         assert(ret_error_message);
5d2ee9
 
5d2ee9
-        (void) parse_configuration();
5d2ee9
+        (void) parse_configuration(saved_rlimit_nofile, saved_rlimit_memlock);
5d2ee9
 
5d2ee9
         r = parse_argv(argc, argv);
5d2ee9
         if (r < 0) {
5d2ee9
@@ -2403,11 +2474,15 @@ int main(int argc, char *argv[]) {
5d2ee9
                 }
5d2ee9
         }
5d2ee9
 
5d2ee9
+        /* Save the original RLIMIT_NOFILE/RLIMIT_MEMLOCK so that we can reset them later when
5d2ee9
+         * transitioning from the initrd to the main systemd or suchlike. */
5d2ee9
+        save_rlimits(&saved_rlimit_nofile, &saved_rlimit_memlock);
5d2ee9
+
5d2ee9
         /* Reset all signal handlers. */
5d2ee9
         (void) reset_all_signal_handlers();
5d2ee9
         (void) ignore_signals(SIGNALS_IGNORE, -1);
5d2ee9
 
5d2ee9
-        r = load_configuration(argc, argv, &error_message);
5d2ee9
+        r = load_configuration(argc, argv, &saved_rlimit_nofile, &saved_rlimit_memlock, &error_message);
5d2ee9
         if (r < 0)
5d2ee9
                 goto finish;
5d2ee9
 
5d2ee9
@@ -2522,6 +2597,8 @@ int main(int argc, char *argv[]) {
5d2ee9
         }
5d2ee9
 
5d2ee9
         (void) invoke_main_loop(m,
5d2ee9
+                                &saved_rlimit_nofile,
5d2ee9
+                                &saved_rlimit_memlock,
5d2ee9
                                 &reexecute,
5d2ee9
                                 &retval,
5d2ee9
                                 &shutdown_verb,