Blob Blame History Raw
From 77a273e02c1c811485d13ddca0f844512aed2cff Mon Sep 17 00:00:00 2001
From: Jan Synacek <jsynacek@redhat.com>
Date: Wed, 12 Feb 2020 12:58:54 +0100
Subject: [PATCH] pid1: make sure to restore correct default values for some
 rlimits

Commit fb39af4ce42d7ef9af63009f271f404038703704 forgot to restore the default
rlimit values (RLIMIT_NOFILE and RLIMIT_MEMLOCK) while PID1 is reloading.

This patch extracts the code in charge of initializing the default values for
those rlimits in order to create dedicated functions, which take care of their
initialization.

These functions are then called in parse_configuration() so we make sure that
the default values for these rlimits get restored every time PID1 is reloading
its configuration.

(cherry picked from commit a9fd4cd1206832a61aaf61fff583bb133e6cb965)
Resolves: #1789930
---
 src/core/main.c | 135 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 106 insertions(+), 29 deletions(-)

diff --git a/src/core/main.c b/src/core/main.c
index c83249a8dc..b8c1e567ad 100644
--- a/src/core/main.c
+++ b/src/core/main.c
@@ -136,7 +136,8 @@ static EmergencyAction arg_cad_burst_action;
 static CPUSet arg_cpu_affinity;
 static NUMAPolicy arg_numa_policy;
 
-static int parse_configuration(void);
+static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
+                               const struct rlimit *saved_rlimit_memlock);
 
 _noreturn_ static void freeze_or_reboot(void) {
 
@@ -1149,25 +1150,6 @@ static int prepare_reexecute(Manager *m, FILE **_f, FDSet **_fds, bool switching
 static int bump_rlimit_nofile(struct rlimit *saved_rlimit) {
         int r, nr;
 
-        assert(saved_rlimit);
-
-        /* Save the original RLIMIT_NOFILE so that we can reset it
-         * later when transitioning from the initrd to the main
-         * systemd or suchlike. */
-        if (getrlimit(RLIMIT_NOFILE, saved_rlimit) < 0)
-                return log_warning_errno(errno, "Reading RLIMIT_NOFILE failed, ignoring: %m");
-
-        /* Make sure forked processes get the default kernel setting */
-        if (!arg_default_rlimit[RLIMIT_NOFILE]) {
-                struct rlimit *rl;
-
-                rl = newdup(struct rlimit, saved_rlimit, 1);
-                if (!rl)
-                        return log_oom();
-
-                arg_default_rlimit[RLIMIT_NOFILE] = rl;
-        }
-
         /* Bump up the resource limit for ourselves substantially, all the way to the maximum the kernel allows */
         nr = read_nr_open();
         r = setrlimit_closest(RLIMIT_NOFILE, &RLIMIT_MAKE_CONST(nr));
@@ -1180,16 +1162,12 @@ static int bump_rlimit_nofile(struct rlimit *saved_rlimit) {
 static int bump_rlimit_memlock(struct rlimit *saved_rlimit) {
         int r;
 
-        assert(saved_rlimit);
         assert(getuid() == 0);
 
         /* BPF_MAP_TYPE_LPM_TRIE bpf maps are charged against RLIMIT_MEMLOCK, even though we have CAP_IPC_LOCK which
          * should normally disable such checks. We need them to implement IPAccessAllow= and IPAccessDeny=, hence let's
          * bump the value high enough for the root user. */
 
-        if (getrlimit(RLIMIT_MEMLOCK, saved_rlimit) < 0)
-                return log_warning_errno(errno, "Reading RLIMIT_MEMLOCK failed, ignoring: %m");
-
         r = setrlimit_closest(RLIMIT_MEMLOCK, &RLIMIT_MAKE_CONST(1024ULL*1024ULL*16ULL));
         if (r < 0)
                 return log_warning_errno(r, "Setting RLIMIT_MEMLOCK failed, ignoring: %m");
@@ -1651,6 +1629,8 @@ static void do_reexecute(
 
 static int invoke_main_loop(
                 Manager *m,
+                const struct rlimit *saved_rlimit_nofile,
+                const struct rlimit *saved_rlimit_memlock,
                 bool *ret_reexecute,
                 int *ret_retval,                   /* Return parameters relevant for shutting down */
                 const char **ret_shutdown_verb,    /* … */
@@ -1662,6 +1642,8 @@ static int invoke_main_loop(
         int r;
 
         assert(m);
+        assert(saved_rlimit_nofile);
+        assert(saved_rlimit_memlock);
         assert(ret_reexecute);
         assert(ret_retval);
         assert(ret_shutdown_verb);
@@ -1691,7 +1673,7 @@ static int invoke_main_loop(
                         saved_log_level = m->log_level_overridden ? log_get_max_level() : -1;
                         saved_log_target = m->log_target_overridden ? log_get_target() : _LOG_TARGET_INVALID;
 
-                        (void) parse_configuration();
+                        (void) parse_configuration(saved_rlimit_nofile, saved_rlimit_memlock);
 
                         set_manager_defaults(m);
 
@@ -1983,6 +1965,80 @@ static int do_queue_default_job(
         return 0;
 }
 
+static void save_rlimits(struct rlimit *saved_rlimit_nofile,
+                         struct rlimit *saved_rlimit_memlock) {
+        /* Query the current RLIMIT_NOFILE and RLIMIT_MEMLOCK into the two caller-provided structs. */
+        assert(saved_rlimit_nofile);
+        assert(saved_rlimit_memlock);
+        /* A getrlimit() failure is only logged as a warning; nothing is reported back to the caller. */
+        if (getrlimit(RLIMIT_NOFILE, saved_rlimit_nofile) < 0)
+                log_warning_errno(errno, "Reading RLIMIT_NOFILE failed, ignoring: %m");
+
+        if (getrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock) < 0)
+                log_warning_errno(errno, "Reading RLIMIT_MEMLOCK failed, ignoring: %m");
+}
+
+static void fallback_rlimit_nofile(const struct rlimit *saved_rlimit_nofile) {
+        struct rlimit *rl;
+
+        if (arg_default_rlimit[RLIMIT_NOFILE])
+                return;
+
+        /* No default RLIMIT_NOFILE is set yet: make sure forked processes get limits based on the
+         * original kernel setting. If the copy cannot be allocated, the default is left unset. */
+        rl = newdup(struct rlimit, saved_rlimit_nofile, 1);
+        if (!rl) {
+                log_oom();
+                return;
+        }
+
+        /* Bump the hard limit for system services to a substantially higher value. The default
+         * hard limit current kernels set is pretty low (4K), mostly for historical
+         * reasons. According to kernel developers, the fd handling in recent kernels has been
+         * optimized substantially enough, so that we can bump the limit now, without paying too
+         * high a price in memory or performance. Note however that we only bump the hard limit,
+         * not the soft limit. That's because select() works the way it works, and chokes on fds
+         * >= 1024. If we'd bump the soft limit globally, it might accidentally happen to
+         * unsuspecting programs that they get fds higher than what they can process using
+         * select(). By only bumping the hard limit but leaving the soft limit as it is we avoid
+         * this pitfall: programs written by folks who have the select() problem in mind
+         * (and thus use poll()/epoll instead of select(), the way everybody should) can
+         * explicitly opt into high fds by bumping their soft limit beyond 1024, to the hard limit
+         * we pass. The hard limit is only bumped when arg_system is set. */
+        if (arg_system) {
+                int nr;
+
+                /* Get the underlying absolute limit the kernel enforces */
+                nr = read_nr_open();
+                /* Raise the hard limit to at least HIGH_RLIMIT_NOFILE, but never above nr */
+                rl->rlim_max = MIN((rlim_t) nr, MAX(rl->rlim_max, (rlim_t) HIGH_RLIMIT_NOFILE));
+        }
+
+        /* If for some reason we were invoked with a soft limit above 1024 (which should never
+         * happen, but who knows what we get passed in from pam_limits when invoked as a --user
+         * instance), then lower what we pass on to not confuse our children */
+        rl->rlim_cur = MIN(rl->rlim_cur, (rlim_t) FD_SETSIZE);
+
+        arg_default_rlimit[RLIMIT_NOFILE] = rl;
+}
+
+static void fallback_rlimit_memlock(const struct rlimit *saved_rlimit_memlock) {
+        struct rlimit *rl;
+
+        /* Pass the original RLIMIT_MEMLOCK value down to invoked processes, unless a default is
+         * already set. If the copy cannot be allocated, log_oom() is called and it stays unset. */
+        if (arg_default_rlimit[RLIMIT_MEMLOCK])
+                return;
+
+        rl = newdup(struct rlimit, saved_rlimit_memlock, 1);
+        if (!rl) {
+                log_oom();
+                return;
+        }
+
+        arg_default_rlimit[RLIMIT_MEMLOCK] = rl;
+}
+
 static void reset_arguments(void) {
         /* Frees/resets arg_* variables, with a few exceptions commented below. */
 
@@ -2040,9 +2096,13 @@ static void reset_arguments(void) {
         numa_policy_reset(&arg_numa_policy);
 }
 
-static int parse_configuration(void) {
+static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
+                               const struct rlimit *saved_rlimit_memlock) {
         int r;
 
+        assert(saved_rlimit_nofile);
+        assert(saved_rlimit_memlock);
+
         arg_default_tasks_max = system_tasks_max_scale(DEFAULT_TASKS_MAX_PERCENTAGE, 100U);
 
         /* Assign configuration defaults */
@@ -2058,18 +2118,29 @@ static int parse_configuration(void) {
                         log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m");
         }
 
+        /* Initialize some default rlimits for services if they haven't been configured */
+        fallback_rlimit_nofile(saved_rlimit_nofile);
+        fallback_rlimit_memlock(saved_rlimit_memlock);
+
         /* Note that this also parses bits from the kernel command line, including "debug". */
         log_parse_environment();
 
         return 0;
 }
 
-static int load_configuration(int argc, char **argv, const char **ret_error_message) {
+static int load_configuration(
+                int argc,
+                char **argv,
+                const struct rlimit *saved_rlimit_nofile,
+                const struct rlimit *saved_rlimit_memlock,
+                const char **ret_error_message) {
         int r;
 
+        assert(saved_rlimit_nofile);
+        assert(saved_rlimit_memlock);
         assert(ret_error_message);
 
-        (void) parse_configuration();
+        (void) parse_configuration(saved_rlimit_nofile, saved_rlimit_memlock);
 
         r = parse_argv(argc, argv);
         if (r < 0) {
@@ -2403,11 +2474,15 @@ int main(int argc, char *argv[]) {
                 }
         }
 
+        /* Save the original RLIMIT_NOFILE/RLIMIT_MEMLOCK so that we can reset them later when
+         * transitioning from the initrd to the main systemd or suchlike. */
+        save_rlimits(&saved_rlimit_nofile, &saved_rlimit_memlock);
+
         /* Reset all signal handlers. */
         (void) reset_all_signal_handlers();
         (void) ignore_signals(SIGNALS_IGNORE, -1);
 
-        r = load_configuration(argc, argv, &error_message);
+        r = load_configuration(argc, argv, &saved_rlimit_nofile, &saved_rlimit_memlock, &error_message);
         if (r < 0)
                 goto finish;
 
@@ -2522,6 +2597,8 @@ int main(int argc, char *argv[]) {
         }
 
         (void) invoke_main_loop(m,
+                                &saved_rlimit_nofile,
+                                &saved_rlimit_memlock,
                                 &reexecute,
                                 &retval,
                                 &shutdown_verb,