b9a53a
From 77a273e02c1c811485d13ddca0f844512aed2cff Mon Sep 17 00:00:00 2001
b9a53a
From: Jan Synacek <jsynacek@redhat.com>
b9a53a
Date: Wed, 12 Feb 2020 12:58:54 +0100
b9a53a
Subject: [PATCH] pid1: make sure to restore correct default values for some
b9a53a
 rlimits
b9a53a
b9a53a
Commit fb39af4ce42d7ef9af63009f271f404038703704 forgot to restore the default
b9a53a
rlimit values (RLIMIT_NOFILE and RLIMIT_MEMLOCK) while PID1 is reloading.
b9a53a
b9a53a
This patch extracts the code in charge of initializing the default values for
b9a53a
those rlimits in order to create dedicated functions, which take care of their
b9a53a
initialization.
b9a53a
b9a53a
These functions are then called in parse_configuration() so we make sure that
b9a53a
the default values for these rlimits get restored every time PID1 is reloading
b9a53a
its configuration.
b9a53a
b9a53a
(cherry picked from commit a9fd4cd1206832a61aaf61fff583bb133e6cb965)
b9a53a
Resolves: #1789930
b9a53a
---
b9a53a
 src/core/main.c | 135 +++++++++++++++++++++++++++++++++++++-----------
b9a53a
 1 file changed, 106 insertions(+), 29 deletions(-)
b9a53a
b9a53a
diff --git a/src/core/main.c b/src/core/main.c
b9a53a
index c83249a8dc..b8c1e567ad 100644
b9a53a
--- a/src/core/main.c
b9a53a
+++ b/src/core/main.c
b9a53a
@@ -136,7 +136,8 @@ static EmergencyAction arg_cad_burst_action;
b9a53a
 static CPUSet arg_cpu_affinity;
b9a53a
 static NUMAPolicy arg_numa_policy;
b9a53a
 
b9a53a
-static int parse_configuration(void);
b9a53a
+static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
b9a53a
+                               const struct rlimit *saved_rlimit_memlock);
b9a53a
 
b9a53a
 _noreturn_ static void freeze_or_reboot(void) {
b9a53a
 
b9a53a
@@ -1149,25 +1150,6 @@ static int prepare_reexecute(Manager *m, FILE **_f, FDSet **_fds, bool switching
b9a53a
 static int bump_rlimit_nofile(struct rlimit *saved_rlimit) {
b9a53a
         int r, nr;
b9a53a
 
b9a53a
-        assert(saved_rlimit);
b9a53a
-
b9a53a
-        /* Save the original RLIMIT_NOFILE so that we can reset it
b9a53a
-         * later when transitioning from the initrd to the main
b9a53a
-         * systemd or suchlike. */
b9a53a
-        if (getrlimit(RLIMIT_NOFILE, saved_rlimit) < 0)
b9a53a
-                return log_warning_errno(errno, "Reading RLIMIT_NOFILE failed, ignoring: %m");
b9a53a
-
b9a53a
-        /* Make sure forked processes get the default kernel setting */
b9a53a
-        if (!arg_default_rlimit[RLIMIT_NOFILE]) {
b9a53a
-                struct rlimit *rl;
b9a53a
-
b9a53a
-                rl = newdup(struct rlimit, saved_rlimit, 1);
b9a53a
-                if (!rl)
b9a53a
-                        return log_oom();
b9a53a
-
b9a53a
-                arg_default_rlimit[RLIMIT_NOFILE] = rl;
b9a53a
-        }
b9a53a
-
b9a53a
         /* Bump up the resource limit for ourselves substantially, all the way to the maximum the kernel allows */
b9a53a
         nr = read_nr_open();
b9a53a
         r = setrlimit_closest(RLIMIT_NOFILE, &RLIMIT_MAKE_CONST(nr));
b9a53a
@@ -1180,16 +1162,12 @@ static int bump_rlimit_nofile(struct rlimit *saved_rlimit) {
b9a53a
 static int bump_rlimit_memlock(struct rlimit *saved_rlimit) {
b9a53a
         int r;
b9a53a
 
b9a53a
-        assert(saved_rlimit);
b9a53a
         assert(getuid() == 0);
b9a53a
 
b9a53a
         /* BPF_MAP_TYPE_LPM_TRIE bpf maps are charged against RLIMIT_MEMLOCK, even though we have CAP_IPC_LOCK which
b9a53a
          * should normally disable such checks. We need them to implement IPAccessAllow= and IPAccessDeny=, hence let's
b9a53a
          * bump the value high enough for the root user. */
b9a53a
 
b9a53a
-        if (getrlimit(RLIMIT_MEMLOCK, saved_rlimit) < 0)
b9a53a
-                return log_warning_errno(errno, "Reading RLIMIT_MEMLOCK failed, ignoring: %m");
b9a53a
-
b9a53a
         r = setrlimit_closest(RLIMIT_MEMLOCK, &RLIMIT_MAKE_CONST(1024ULL*1024ULL*16ULL));
b9a53a
         if (r < 0)
b9a53a
                 return log_warning_errno(r, "Setting RLIMIT_MEMLOCK failed, ignoring: %m");
b9a53a
@@ -1651,6 +1629,8 @@ static void do_reexecute(
b9a53a
 
b9a53a
 static int invoke_main_loop(
b9a53a
                 Manager *m,
b9a53a
+                const struct rlimit *saved_rlimit_nofile,
b9a53a
+                const struct rlimit *saved_rlimit_memlock,
b9a53a
                 bool *ret_reexecute,
b9a53a
                 int *ret_retval,                   /* Return parameters relevant for shutting down */
b9a53a
                 const char **ret_shutdown_verb,    /* … */
b9a53a
@@ -1662,6 +1642,8 @@ static int invoke_main_loop(
b9a53a
         int r;
b9a53a
 
b9a53a
         assert(m);
b9a53a
+        assert(saved_rlimit_nofile);
b9a53a
+        assert(saved_rlimit_memlock);
b9a53a
         assert(ret_reexecute);
b9a53a
         assert(ret_retval);
b9a53a
         assert(ret_shutdown_verb);
b9a53a
@@ -1691,7 +1673,7 @@ static int invoke_main_loop(
b9a53a
                         saved_log_level = m->log_level_overridden ? log_get_max_level() : -1;
b9a53a
                         saved_log_target = m->log_target_overridden ? log_get_target() : _LOG_TARGET_INVALID;
b9a53a
 
b9a53a
-                        (void) parse_configuration();
b9a53a
+                        (void) parse_configuration(saved_rlimit_nofile, saved_rlimit_memlock);
b9a53a
 
b9a53a
                         set_manager_defaults(m);
b9a53a
 
b9a53a
@@ -1983,6 +1965,80 @@ static int do_queue_default_job(
b9a53a
         return 0;
b9a53a
 }
b9a53a
 
b9a53a
+static void save_rlimits(struct rlimit *saved_rlimit_nofile,
b9a53a
+                         struct rlimit *saved_rlimit_memlock) {
b9a53a
+
b9a53a
+        assert(saved_rlimit_nofile);
b9a53a
+        assert(saved_rlimit_memlock);
b9a53a
+
b9a53a
+        if (getrlimit(RLIMIT_NOFILE, saved_rlimit_nofile) < 0)
b9a53a
+                log_warning_errno(errno, "Reading RLIMIT_NOFILE failed, ignoring: %m");
b9a53a
+
b9a53a
+        if (getrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock) < 0)
b9a53a
+                log_warning_errno(errno, "Reading RLIMIT_MEMLOCK failed, ignoring: %m");
b9a53a
+}
b9a53a
+
b9a53a
+static void fallback_rlimit_nofile(const struct rlimit *saved_rlimit_nofile) {
b9a53a
+        struct rlimit *rl;
b9a53a
+
b9a53a
+        if (arg_default_rlimit[RLIMIT_NOFILE])
b9a53a
+                return;
b9a53a
+
b9a53a
+        /* Make sure forked processes get limits based on the original kernel setting */
b9a53a
+
b9a53a
+        rl = newdup(struct rlimit, saved_rlimit_nofile, 1);
b9a53a
+        if (!rl) {
b9a53a
+                log_oom();
b9a53a
+                return;
b9a53a
+        }
b9a53a
+
b9a53a
+        /* Bump the hard limit for system services to a substantially higher value. The default
b9a53a
+         * hard limit current kernels set is pretty low (4K), mostly for historical
b9a53a
+         * reasons. According to kernel developers, the fd handling in recent kernels has been
b9a53a
+         * optimized substantially enough, so that we can bump the limit now, without paying too
b9a53a
+         * high a price in memory or performance. Note however that we only bump the hard limit,
b9a53a
+         * not the soft limit. That's because select() works the way it works, and chokes on fds
b9a53a
+         * >= 1024. If we'd bump the soft limit globally, it might accidentally happen to
b9a53a
+         * unsuspecting programs that they get fds higher than what they can process using
b9a53a
+         * select(). By only bumping the hard limit but leaving the soft limit as it is we avoid
b9a53a
+         * this pitfall: programs that are written by folks with the select() problem in mind
b9a53a
+         * (and thus use poll()/epoll instead of select(), the way everybody should) can
b9a53a
+         * explicitly opt into high fds by bumping their soft limit beyond 1024, to the hard limit
b9a53a
+         * we pass. */
b9a53a
+        if (arg_system) {
b9a53a
+                int nr;
b9a53a
+
b9a53a
+                /* Get the underlying absolute limit the kernel enforces */
b9a53a
+                nr = read_nr_open();
b9a53a
+
b9a53a
+                rl->rlim_max = MIN((rlim_t) nr, MAX(rl->rlim_max, (rlim_t) HIGH_RLIMIT_NOFILE));
b9a53a
+        }
b9a53a
+
b9a53a
+        /* If for some reason we were invoked with a soft limit above 1024 (which should never
b9a53a
+         * happen, but who knows what we get passed in from pam_limits when invoked as --user
b9a53a
+         * instance), then lower what we pass on to not confuse our children */
b9a53a
+        rl->rlim_cur = MIN(rl->rlim_cur, (rlim_t) FD_SETSIZE);
b9a53a
+
b9a53a
+        arg_default_rlimit[RLIMIT_NOFILE] = rl;
b9a53a
+}
b9a53a
+
b9a53a
+static void fallback_rlimit_memlock(const struct rlimit *saved_rlimit_memlock) {
b9a53a
+        struct rlimit *rl;
b9a53a
+
b9a53a
+        /* Pass the original value down to invoked processes */
b9a53a
+
b9a53a
+        if (arg_default_rlimit[RLIMIT_MEMLOCK])
b9a53a
+                return;
b9a53a
+
b9a53a
+        rl = newdup(struct rlimit, saved_rlimit_memlock, 1);
b9a53a
+        if (!rl) {
b9a53a
+                log_oom();
b9a53a
+                return;
b9a53a
+        }
b9a53a
+
b9a53a
+        arg_default_rlimit[RLIMIT_MEMLOCK] = rl;
b9a53a
+}
b9a53a
+
b9a53a
 static void reset_arguments(void) {
b9a53a
         /* Frees/resets arg_* variables, with a few exceptions commented below. */
b9a53a
 
b9a53a
@@ -2040,9 +2096,13 @@ static void reset_arguments(void) {
b9a53a
         numa_policy_reset(&arg_numa_policy);
b9a53a
 }
b9a53a
 
b9a53a
-static int parse_configuration(void) {
b9a53a
+static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
b9a53a
+                               const struct rlimit *saved_rlimit_memlock) {
b9a53a
         int r;
b9a53a
 
b9a53a
+        assert(saved_rlimit_nofile);
b9a53a
+        assert(saved_rlimit_memlock);
b9a53a
+
b9a53a
         arg_default_tasks_max = system_tasks_max_scale(DEFAULT_TASKS_MAX_PERCENTAGE, 100U);
b9a53a
 
b9a53a
         /* Assign configuration defaults */
b9a53a
@@ -2058,18 +2118,29 @@ static int parse_configuration(void) {
b9a53a
                         log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m");
b9a53a
         }
b9a53a
 
b9a53a
+        /* Initialize some default rlimits for services if they haven't been configured */
b9a53a
+        fallback_rlimit_nofile(saved_rlimit_nofile);
b9a53a
+        fallback_rlimit_memlock(saved_rlimit_memlock);
b9a53a
+
b9a53a
         /* Note that this also parses bits from the kernel command line, including "debug". */
b9a53a
         log_parse_environment();
b9a53a
 
b9a53a
         return 0;
b9a53a
 }
b9a53a
 
b9a53a
-static int load_configuration(int argc, char **argv, const char **ret_error_message) {
b9a53a
+static int load_configuration(
b9a53a
+                int argc,
b9a53a
+                char **argv,
b9a53a
+                const struct rlimit *saved_rlimit_nofile,
b9a53a
+                const struct rlimit *saved_rlimit_memlock,
b9a53a
+                const char **ret_error_message) {
b9a53a
         int r;
b9a53a
 
b9a53a
+        assert(saved_rlimit_nofile);
b9a53a
+        assert(saved_rlimit_memlock);
b9a53a
         assert(ret_error_message);
b9a53a
 
b9a53a
-        (void) parse_configuration();
b9a53a
+        (void) parse_configuration(saved_rlimit_nofile, saved_rlimit_memlock);
b9a53a
 
b9a53a
         r = parse_argv(argc, argv);
b9a53a
         if (r < 0) {
b9a53a
@@ -2403,11 +2474,15 @@ int main(int argc, char *argv[]) {
b9a53a
                 }
b9a53a
         }
b9a53a
 
b9a53a
+        /* Save the original RLIMIT_NOFILE/RLIMIT_MEMLOCK so that we can reset them later when
b9a53a
+         * transitioning from the initrd to the main systemd or suchlike. */
b9a53a
+        save_rlimits(&saved_rlimit_nofile, &saved_rlimit_memlock);
b9a53a
+
b9a53a
         /* Reset all signal handlers. */
b9a53a
         (void) reset_all_signal_handlers();
b9a53a
         (void) ignore_signals(SIGNALS_IGNORE, -1);
b9a53a
 
b9a53a
-        r = load_configuration(argc, argv, &error_message);
b9a53a
+        r = load_configuration(argc, argv, &saved_rlimit_nofile, &saved_rlimit_memlock, &error_message);
b9a53a
         if (r < 0)
b9a53a
                 goto finish;
b9a53a
 
b9a53a
@@ -2522,6 +2597,8 @@ int main(int argc, char *argv[]) {
b9a53a
         }
b9a53a
 
b9a53a
         (void) invoke_main_loop(m,
b9a53a
+                                &saved_rlimit_nofile,
b9a53a
+                                &saved_rlimit_memlock,
b9a53a
                                 &reexecute,
b9a53a
                                 &retval,
b9a53a
                                 &shutdown_verb,