aed857
From 5ccae46f2a192a9347feb604901127c55ce1e039 Mon Sep 17 00:00:00 2001
510b6a
From: Kyle Walker <kwalker@redhat.com>
510b6a
Date: Wed, 13 Dec 2017 12:49:26 -0500
510b6a
Subject: [PATCH] core: Implement timeout based umount/remount limit
510b6a
510b6a
Remount, and subsequent umount, attempts can hang for inaccessible network
510b6a
based mount points. This can leave a system in a hard hang state that
510b6a
requires a hard reset in order to recover. This change moves the remount,
510b6a
and umount attempts into separate child processes. The remount and umount
510b6a
operations will block for up to 90 seconds (DEFAULT_TIMEOUT_USEC). Should
510b6a
those waits fail, the parent will issue a SIGKILL to the child and continue
510b6a
with the shutdown efforts.
510b6a
510b6a
In addition, instead of only reporting some additional errors on the final
510b6a
attempt, failures are reported as they occur.
510b6a
510b6a
(cherry picked from commit d5641e0d7e8f55937fbc3a7ecd667e42c5836d80)
510b6a
510b6a
Related: #1571098
510b6a
---
510b6a
 src/core/umount.c         | 112 +++++++++++++++++++++++++++++++++++++---------
510b6a
 src/shared/def.h          |   2 -
510b6a
 src/shared/login-shared.c |   1 +
510b6a
 src/shared/util.c         |  61 +++++++++++++++++++++++++
aed857
 src/shared/util.h         |  16 +++++++
aed857
 5 files changed, 168 insertions(+), 24 deletions(-)
510b6a
510b6a
diff --git a/src/core/umount.c b/src/core/umount.c
510b6a
index 91d67c06c..bd3896612 100644
510b6a
--- a/src/core/umount.c
510b6a
+++ b/src/core/umount.c
510b6a
@@ -363,7 +363,84 @@ static int delete_dm(dev_t devnum) {
510b6a
         return r >= 0 ? 0 : -errno;
510b6a
 }
510b6a
 
510b6a
-static int mount_points_list_umount(MountPoint **head, bool *changed, bool log_error) {
510b6a
+static int remount_with_timeout(MountPoint *m, char *options, int *n_failed) {
510b6a
+        pid_t pid;
510b6a
+        int r;
510b6a
+
510b6a
+        BLOCK_SIGNALS(SIGCHLD);
510b6a
+
510b6a
+        /* Due to the possiblity of a remount operation hanging, we
510b6a
+         * fork a child process and set a timeout. If the timeout
510b6a
+         * lapses, the assumption is that that particular remount
510b6a
+         * failed. */
510b6a
+        pid = fork();
510b6a
+        if (pid < 0)
510b6a
+                return log_error_errno(errno, "Failed to fork: %m");
510b6a
+
510b6a
+        if (pid == 0) {
510b6a
+                log_info("Remounting '%s' read-only in with options '%s'.", m->path, options);
510b6a
+
510b6a
+                /* Start the mount operation here in the child */
510b6a
+                r = mount(NULL, m->path, NULL, MS_REMOUNT|MS_RDONLY, options);
510b6a
+                if (r < 0)
510b6a
+                        log_error_errno(errno, "Failed to remount '%s' read-only: %m", m->path);
510b6a
+
510b6a
+                _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
510b6a
+        }
510b6a
+
510b6a
+        r = wait_for_terminate_with_timeout(pid, DEFAULT_TIMEOUT_USEC);
510b6a
+        if (r == -ETIMEDOUT) {
510b6a
+                log_error_errno(errno, "Remounting '%s' - timed out, issuing SIGKILL to PID "PID_FMT".", m->path, pid);
510b6a
+                (void) kill(pid, SIGKILL);
510b6a
+        } else if (r < 0)
510b6a
+                log_error_errno(r, "Failed to wait for process: %m");
510b6a
+
510b6a
+        return r;
510b6a
+}
510b6a
+
510b6a
+static int umount_with_timeout(MountPoint *m, bool *changed) {
510b6a
+        pid_t pid;
510b6a
+        int r;
510b6a
+
510b6a
+        BLOCK_SIGNALS(SIGCHLD);
510b6a
+
510b6a
+        /* Due to the possiblity of a umount operation hanging, we
510b6a
+         * fork a child process and set a timeout. If the timeout
510b6a
+         * lapses, the assumption is that that particular umount
510b6a
+         * failed. */
510b6a
+        pid = fork();
510b6a
+        if (pid < 0)
510b6a
+                return log_error_errno(errno, "Failed to fork: %m");
510b6a
+
510b6a
+        if (pid == 0) {
510b6a
+                log_info("Unmounting '%s'.", m->path);
510b6a
+
510b6a
+                /* Start the mount operation here in the child Using MNT_FORCE
510b6a
+                 * causes some filesystems (e.g. FUSE and NFS and other network
510b6a
+                 * filesystems) to abort any pending requests and return -EIO
510b6a
+                 * rather than blocking indefinitely. If the filesysten is
510b6a
+                 * "busy", this may allow processes to die, thus making the
510b6a
+                 * filesystem less busy so the unmount might succeed (rather
510b6a
+                 * then return EBUSY).*/
510b6a
+                r = umount2(m->path, MNT_FORCE);
510b6a
+                if (r < 0)
510b6a
+                        log_error_errno(errno, "Failed to unmount %s: %m", m->path);
510b6a
+
510b6a
+                _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
510b6a
+        }
510b6a
+
510b6a
+        r = wait_for_terminate_with_timeout(pid, DEFAULT_TIMEOUT_USEC);
510b6a
+        if (r == -ETIMEDOUT) {
510b6a
+                log_error_errno(errno, "Unmounting '%s' - timed out, issuing SIGKILL to PID "PID_FMT".", m->path, pid);
510b6a
+                (void) kill(pid, SIGKILL);
510b6a
+        } else if (r < 0)
510b6a
+                log_error_errno(r, "Failed to wait for process: %m");
510b6a
+
510b6a
+        return r;
510b6a
+}
510b6a
+
510b6a
+
510b6a
+static int mount_points_list_umount(MountPoint **head, bool *changed) {
510b6a
         MountPoint *m, *n;
510b6a
         int n_failed = 0;
510b6a
 
510b6a
@@ -405,9 +482,13 @@ static int mount_points_list_umount(MountPoint **head, bool *changed, bool log_e
510b6a
                          * explicitly remount the super block of that
510b6a
                          * alias read-only we hence should be
510b6a
                          * relatively safe regarding keeping the fs we
510b6a
-                         * can otherwise not see dirty. */
510b6a
-                        log_info("Remounting '%s' read-only with options '%s'.", m->path, options);
510b6a
-                        (void) mount(NULL, m->path, NULL, MS_REMOUNT|MS_RDONLY, options);
510b6a
+                         * can otherwise not see dirty.
510b6a
+                         *
510b6a
+                         * Since the remount can hang in the instance of
510b6a
+                         * remote filesystems, we remount asynchronously
510b6a
+                         * and skip the subsequent umount if it fails */
510b6a
+                        if (remount_with_timeout(m, options, &n_failed) < 0)
510b6a
+                                continue;
510b6a
                 }
510b6a
 
510b6a
                 /* Skip / and /usr since we cannot unmount that
510b6a
@@ -420,22 +501,14 @@ static int mount_points_list_umount(MountPoint **head, bool *changed, bool log_e
510b6a
                 )
510b6a
                         continue;
510b6a
 
510b6a
-                /* Trying to umount. Using MNT_FORCE causes some
510b6a
-                 * filesystems (e.g. FUSE and NFS and other network
510b6a
-                 * filesystems) to abort any pending requests and
510b6a
-                 * return -EIO rather than blocking indefinitely.
510b6a
-                 * If the filesysten is "busy", this may allow processes
510b6a
-                 * to die, thus making the filesystem less busy so
510b6a
-                 * the unmount might succeed (rather then return EBUSY).*/
510b6a
-                log_info("Unmounting %s.", m->path);
510b6a
-                if (umount2(m->path, MNT_FORCE) == 0) {
510b6a
+                /* Trying to umount */
510b6a
+                if (umount_with_timeout(m, changed) < 0)
510b6a
+                        n_failed++;
510b6a
+                else {
510b6a
                         if (changed)
510b6a
                                 *changed = true;
510b6a
 
510b6a
                         mount_point_free(head, m);
510b6a
-                } else if (log_error) {
510b6a
-                        log_warning_errno(errno, "Could not unmount %s: %m", m->path);
510b6a
-                        n_failed++;
510b6a
                 }
510b6a
         }
510b6a
 
510b6a
@@ -550,17 +623,12 @@ int umount_all(bool *changed) {
510b6a
         do {
510b6a
                 umount_changed = false;
510b6a
 
510b6a
-                mount_points_list_umount(&mp_list_head, &umount_changed, false);
510b6a
+                mount_points_list_umount(&mp_list_head, &umount_changed);
510b6a
                 if (umount_changed)
510b6a
                         *changed = true;
510b6a
 
510b6a
         } while (umount_changed);
510b6a
 
510b6a
-        /* umount one more time with logging enabled */
510b6a
-        r = mount_points_list_umount(&mp_list_head, &umount_changed, true);
510b6a
-        if (r <= 0)
510b6a
-                goto end;
510b6a
-
510b6a
   end:
510b6a
         mount_points_list_free(&mp_list_head);
510b6a
 
510b6a
diff --git a/src/shared/def.h b/src/shared/def.h
510b6a
index 9e008a6d2..f193ab1f9 100644
510b6a
--- a/src/shared/def.h
510b6a
+++ b/src/shared/def.h
510b6a
@@ -21,8 +21,6 @@
510b6a
   along with systemd; If not, see <http://www.gnu.org/licenses/>.
510b6a
 ***/
510b6a
 
510b6a
-#include "util.h"
510b6a
-
510b6a
 #define DEFAULT_TIMEOUT_USEC (90*USEC_PER_SEC)
510b6a
 #define DEFAULT_RESTART_USEC (100*USEC_PER_MSEC)
510b6a
 #define DEFAULT_CONFIRM_USEC (30*USEC_PER_SEC)
510b6a
diff --git a/src/shared/login-shared.c b/src/shared/login-shared.c
510b6a
index 054c77503..5da0f0583 100644
510b6a
--- a/src/shared/login-shared.c
510b6a
+++ b/src/shared/login-shared.c
510b6a
@@ -21,6 +21,7 @@
510b6a
 
510b6a
 #include "login-shared.h"
510b6a
 #include "def.h"
510b6a
+#include "util.h"
510b6a
 
510b6a
 bool session_id_valid(const char *id) {
510b6a
         assert(id);
510b6a
diff --git a/src/shared/util.c b/src/shared/util.c
aed857
index 982f5e044..3216f004a 100644
510b6a
--- a/src/shared/util.c
510b6a
+++ b/src/shared/util.c
aed857
@@ -9049,3 +9049,64 @@ try_dev_shm_without_o_tmpfile:
510b6a
 
aed857
         return -EOPNOTSUPP;
510b6a
 }
510b6a
+
510b6a
+/*
510b6a
+ * Return values:
510b6a
+ * < 0 : wait_for_terminate_with_timeout() failed to get the state of the
510b6a
+ *       process, the process timed out, the process was terminated by a
510b6a
+ *       signal, or failed for an unknown reason.
510b6a
+ * >=0 : The process terminated normally with no failures.
510b6a
+ *
510b6a
+ * Success is indicated by a return value of zero, a timeout is indicated
510b6a
+ * by ETIMEDOUT, and all other child failure states are indicated by error
510b6a
+ * is indicated by a non-zero value.
510b6a
+*/
510b6a
+int wait_for_terminate_with_timeout(pid_t pid, usec_t timeout) {
510b6a
+        sigset_t mask;
510b6a
+        int r;
510b6a
+        usec_t until;
510b6a
+
510b6a
+        assert_se(sigemptyset(&mask) == 0);
510b6a
+        assert_se(sigaddset(&mask, SIGCHLD) == 0);
510b6a
+
510b6a
+        /* Drop into a sigtimewait-based timeout. Waiting for the
510b6a
+         * pid to exit. */
510b6a
+        until = now(CLOCK_MONOTONIC) + timeout;
510b6a
+        for (;;) {
510b6a
+                usec_t n;
510b6a
+                siginfo_t status = {};
510b6a
+                struct timespec ts;
510b6a
+
510b6a
+                n = now(CLOCK_MONOTONIC);
510b6a
+                if (n >= until)
510b6a
+                        break;
510b6a
+
510b6a
+                r = sigtimedwait(&mask, NULL, timespec_store(&ts, until - n)) < 0 ? -errno : 0;
510b6a
+                /* Assuming we woke due to the child exiting. */
510b6a
+                if (waitid(P_PID, pid, &status, WEXITED|WNOHANG) == 0) {
510b6a
+                        if (status.si_pid == pid) {
510b6a
+                                /* This is the correct child.*/
510b6a
+                                if (status.si_code == CLD_EXITED)
510b6a
+                                        return (status.si_status == 0) ? 0 : -EPROTO;
510b6a
+                                else
510b6a
+                                        return -EPROTO;
510b6a
+                        }
510b6a
+                }
510b6a
+                /* Not the child, check for errors and proceed appropriately */
510b6a
+                if (r < 0) {
510b6a
+                        switch (r) {
510b6a
+                        case -EAGAIN:
510b6a
+                                /* Timed out, child is likely hung. */
510b6a
+                                return -ETIMEDOUT;
510b6a
+                        case -EINTR:
510b6a
+                                /* Received a different signal and should retry */
510b6a
+                                continue;
510b6a
+                        default:
510b6a
+                                /* Return any unexpected errors */
510b6a
+                                return r;
510b6a
+                        }
510b6a
+                }
510b6a
+        }
510b6a
+
510b6a
+        return -EPROTO;
510b6a
+}
510b6a
diff --git a/src/shared/util.h b/src/shared/util.h
aed857
index 9c4be0256..998f882bb 100644
510b6a
--- a/src/shared/util.h
510b6a
+++ b/src/shared/util.h
510b6a
@@ -22,6 +22,7 @@
510b6a
 ***/
510b6a
 
510b6a
 #include <alloca.h>
510b6a
+#include <def.h>
510b6a
 #include <fcntl.h>
510b6a
 #include <inttypes.h>
510b6a
 #include <time.h>
aed857
@@ -1122,3 +1123,18 @@ enum {
aed857
 };
510b6a
 
aed857
 int acquire_data_fd(const void *data, size_t size, unsigned flags);
510b6a
+
510b6a
+int wait_for_terminate_with_timeout(pid_t pid, usec_t timeout);
510b6a
+
510b6a
+static inline void block_signals_reset(sigset_t *ss) {
510b6a
+        assert_se(sigprocmask(SIG_SETMASK, ss, NULL) >= 0);
510b6a
+}
510b6a
+
510b6a
+#define BLOCK_SIGNALS(...)                                                         \
510b6a
+        _cleanup_(block_signals_reset) _unused_ sigset_t _saved_sigset = ({        \
510b6a
+                sigset_t _t;                                                       \
510b6a
+                assert_se(sigprocmask(SIG_SETMASK, NULL, &_t) == 0);               \
510b6a
+                assert_se(sigprocmask_many(SIG_BLOCK, __VA_ARGS__, -1) >= 0);      \
510b6a
+                _t;                                                                \
510b6a
+        })
aed857
+