2be57e
From dd531b196e9dd794f7409f97320814e24f50081b Mon Sep 17 00:00:00 2001
2be57e
From: Kyle Walker <kwalker@redhat.com>
2be57e
Date: Wed, 13 Dec 2017 12:49:26 -0500
2be57e
Subject: [PATCH] core: Implement timeout based umount/remount limit
2be57e
2be57e
Remount, and subsequent umount, attempts can hang for inaccessible network
2be57e
based mount points. This can leave a system in a hard hang state that
2be57e
requires a hard reset in order to recover. This change moves the remount,
2be57e
and umount attempts into separate child processes. The remount and umount
2be57e
operations will block for up to 90 seconds (DEFAULT_TIMEOUT_USEC). Should
2be57e
those waits fail, the parent will issue a SIGKILL to the child and continue
2be57e
with the shutdown efforts.
2be57e
2be57e
In addition, instead of only reporting some additional errors on the final
2be57e
attempt, failures are reported as they occur.
2be57e
2be57e
(cherry picked from commit d5641e0d7e8f55937fbc3a7ecd667e42c5836d80)
2be57e
2be57e
Related: #1571098
2be57e
---
2be57e
 src/core/umount.c         | 112 +++++++++++++++++++++++++++++++++++++---------
2be57e
 src/shared/def.h          |   2 -
2be57e
 src/shared/login-shared.c |   1 +
2be57e
 src/shared/util.c         |  61 +++++++++++++++++++++++++
2be57e
 src/shared/util.h         |  15 +++++++
2be57e
 5 files changed, 167 insertions(+), 24 deletions(-)
2be57e
2be57e
diff --git a/src/core/umount.c b/src/core/umount.c
2be57e
index 91d67c06c..bd3896612 100644
2be57e
--- a/src/core/umount.c
2be57e
+++ b/src/core/umount.c
2be57e
@@ -363,7 +363,84 @@ static int delete_dm(dev_t devnum) {
2be57e
         return r >= 0 ? 0 : -errno;
2be57e
 }
2be57e
 
2be57e
-static int mount_points_list_umount(MountPoint **head, bool *changed, bool log_error) {
2be57e
+static int remount_with_timeout(MountPoint *m, char *options, int *n_failed) {
2be57e
+        pid_t pid;
2be57e
+        int r;
2be57e
+
2be57e
+        BLOCK_SIGNALS(SIGCHLD);
2be57e
+
2be57e
+        /* Due to the possiblity of a remount operation hanging, we
2be57e
+         * fork a child process and set a timeout. If the timeout
2be57e
+         * lapses, the assumption is that that particular remount
2be57e
+         * failed. */
2be57e
+        pid = fork();
2be57e
+        if (pid < 0)
2be57e
+                return log_error_errno(errno, "Failed to fork: %m");
2be57e
+
2be57e
+        if (pid == 0) {
2be57e
+                log_info("Remounting '%s' read-only in with options '%s'.", m->path, options);
2be57e
+
2be57e
+                /* Start the mount operation here in the child */
2be57e
+                r = mount(NULL, m->path, NULL, MS_REMOUNT|MS_RDONLY, options);
2be57e
+                if (r < 0)
2be57e
+                        log_error_errno(errno, "Failed to remount '%s' read-only: %m", m->path);
2be57e
+
2be57e
+                _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
2be57e
+        }
2be57e
+
2be57e
+        r = wait_for_terminate_with_timeout(pid, DEFAULT_TIMEOUT_USEC);
2be57e
+        if (r == -ETIMEDOUT) {
2be57e
+                log_error_errno(errno, "Remounting '%s' - timed out, issuing SIGKILL to PID "PID_FMT".", m->path, pid);
2be57e
+                (void) kill(pid, SIGKILL);
2be57e
+        } else if (r < 0)
2be57e
+                log_error_errno(r, "Failed to wait for process: %m");
2be57e
+
2be57e
+        return r;
2be57e
+}
2be57e
+
2be57e
+static int umount_with_timeout(MountPoint *m, bool *changed) {
2be57e
+        pid_t pid;
2be57e
+        int r;
2be57e
+
2be57e
+        BLOCK_SIGNALS(SIGCHLD);
2be57e
+
2be57e
+        /* Due to the possiblity of a umount operation hanging, we
2be57e
+         * fork a child process and set a timeout. If the timeout
2be57e
+         * lapses, the assumption is that that particular umount
2be57e
+         * failed. */
2be57e
+        pid = fork();
2be57e
+        if (pid < 0)
2be57e
+                return log_error_errno(errno, "Failed to fork: %m");
2be57e
+
2be57e
+        if (pid == 0) {
2be57e
+                log_info("Unmounting '%s'.", m->path);
2be57e
+
2be57e
+                /* Start the mount operation here in the child Using MNT_FORCE
2be57e
+                 * causes some filesystems (e.g. FUSE and NFS and other network
2be57e
+                 * filesystems) to abort any pending requests and return -EIO
2be57e
+                 * rather than blocking indefinitely. If the filesysten is
2be57e
+                 * "busy", this may allow processes to die, thus making the
2be57e
+                 * filesystem less busy so the unmount might succeed (rather
2be57e
+                 * then return EBUSY).*/
2be57e
+                r = umount2(m->path, MNT_FORCE);
2be57e
+                if (r < 0)
2be57e
+                        log_error_errno(errno, "Failed to unmount %s: %m", m->path);
2be57e
+
2be57e
+                _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
2be57e
+        }
2be57e
+
2be57e
+        r = wait_for_terminate_with_timeout(pid, DEFAULT_TIMEOUT_USEC);
2be57e
+        if (r == -ETIMEDOUT) {
2be57e
+                log_error_errno(errno, "Unmounting '%s' - timed out, issuing SIGKILL to PID "PID_FMT".", m->path, pid);
2be57e
+                (void) kill(pid, SIGKILL);
2be57e
+        } else if (r < 0)
2be57e
+                log_error_errno(r, "Failed to wait for process: %m");
2be57e
+
2be57e
+        return r;
2be57e
+}
2be57e
+
2be57e
+
2be57e
+static int mount_points_list_umount(MountPoint **head, bool *changed) {
2be57e
         MountPoint *m, *n;
2be57e
         int n_failed = 0;
2be57e
 
2be57e
@@ -405,9 +482,13 @@ static int mount_points_list_umount(MountPoint **head, bool *changed, bool log_e
2be57e
                          * explicitly remount the super block of that
2be57e
                          * alias read-only we hence should be
2be57e
                          * relatively safe regarding keeping the fs we
2be57e
-                         * can otherwise not see dirty. */
2be57e
-                        log_info("Remounting '%s' read-only with options '%s'.", m->path, options);
2be57e
-                        (void) mount(NULL, m->path, NULL, MS_REMOUNT|MS_RDONLY, options);
2be57e
+                         * can otherwise not see dirty.
2be57e
+                         *
2be57e
+                         * Since the remount can hang in the instance of
2be57e
+                         * remote filesystems, we remount asynchronously
2be57e
+                         * and skip the subsequent umount if it fails */
2be57e
+                        if (remount_with_timeout(m, options, &n_failed) < 0)
2be57e
+                                continue;
2be57e
                 }
2be57e
 
2be57e
                 /* Skip / and /usr since we cannot unmount that
2be57e
@@ -420,22 +501,14 @@ static int mount_points_list_umount(MountPoint **head, bool *changed, bool log_e
2be57e
                 )
2be57e
                         continue;
2be57e
 
2be57e
-                /* Trying to umount. Using MNT_FORCE causes some
2be57e
-                 * filesystems (e.g. FUSE and NFS and other network
2be57e
-                 * filesystems) to abort any pending requests and
2be57e
-                 * return -EIO rather than blocking indefinitely.
2be57e
-                 * If the filesysten is "busy", this may allow processes
2be57e
-                 * to die, thus making the filesystem less busy so
2be57e
-                 * the unmount might succeed (rather then return EBUSY).*/
2be57e
-                log_info("Unmounting %s.", m->path);
2be57e
-                if (umount2(m->path, MNT_FORCE) == 0) {
2be57e
+                /* Trying to umount */
2be57e
+                if (umount_with_timeout(m, changed) < 0)
2be57e
+                        n_failed++;
2be57e
+                else {
2be57e
                         if (changed)
2be57e
                                 *changed = true;
2be57e
 
2be57e
                         mount_point_free(head, m);
2be57e
-                } else if (log_error) {
2be57e
-                        log_warning_errno(errno, "Could not unmount %s: %m", m->path);
2be57e
-                        n_failed++;
2be57e
                 }
2be57e
         }
2be57e
 
2be57e
@@ -550,17 +623,12 @@ int umount_all(bool *changed) {
2be57e
         do {
2be57e
                 umount_changed = false;
2be57e
 
2be57e
-                mount_points_list_umount(&mp_list_head, &umount_changed, false);
2be57e
+                mount_points_list_umount(&mp_list_head, &umount_changed);
2be57e
                 if (umount_changed)
2be57e
                         *changed = true;
2be57e
 
2be57e
         } while (umount_changed);
2be57e
 
2be57e
-        /* umount one more time with logging enabled */
2be57e
-        r = mount_points_list_umount(&mp_list_head, &umount_changed, true);
2be57e
-        if (r <= 0)
2be57e
-                goto end;
2be57e
-
2be57e
   end:
2be57e
         mount_points_list_free(&mp_list_head);
2be57e
 
2be57e
diff --git a/src/shared/def.h b/src/shared/def.h
2be57e
index 9e008a6d2..f193ab1f9 100644
2be57e
--- a/src/shared/def.h
2be57e
+++ b/src/shared/def.h
2be57e
@@ -21,8 +21,6 @@
2be57e
   along with systemd; If not, see <http://www.gnu.org/licenses/>.
2be57e
 ***/
2be57e
 
2be57e
-#include "util.h"
2be57e
-
2be57e
 #define DEFAULT_TIMEOUT_USEC (90*USEC_PER_SEC)
2be57e
 #define DEFAULT_RESTART_USEC (100*USEC_PER_MSEC)
2be57e
 #define DEFAULT_CONFIRM_USEC (30*USEC_PER_SEC)
2be57e
diff --git a/src/shared/login-shared.c b/src/shared/login-shared.c
2be57e
index 054c77503..5da0f0583 100644
2be57e
--- a/src/shared/login-shared.c
2be57e
+++ b/src/shared/login-shared.c
2be57e
@@ -21,6 +21,7 @@
2be57e
 
2be57e
 #include "login-shared.h"
2be57e
 #include "def.h"
2be57e
+#include "util.h"
2be57e
 
2be57e
 bool session_id_valid(const char *id) {
2be57e
         assert(id);
2be57e
diff --git a/src/shared/util.c b/src/shared/util.c
2be57e
index af0953273..5e27a41da 100644
2be57e
--- a/src/shared/util.c
2be57e
+++ b/src/shared/util.c
2be57e
@@ -8893,3 +8893,64 @@ uint64_t system_tasks_max_scale(uint64_t v, uint64_t max) {
2be57e
 
2be57e
         return m / max;
2be57e
 }
2be57e
+
2be57e
+/*
2be57e
+ * Return values:
2be57e
+ * < 0 : wait_for_terminate_with_timeout() failed to get the state of the
2be57e
+ *       process, the process timed out, the process was terminated by a
2be57e
+ *       signal, or failed for an unknown reason.
2be57e
+ * >=0 : The process terminated normally with no failures.
2be57e
+ *
2be57e
+ * Success is indicated by a return value of zero, a timeout is indicated
2be57e
+ * by ETIMEDOUT, and all other child failure states are indicated by error
2be57e
+ * is indicated by a non-zero value.
2be57e
+*/
2be57e
+int wait_for_terminate_with_timeout(pid_t pid, usec_t timeout) {
2be57e
+        sigset_t mask;
2be57e
+        int r;
2be57e
+        usec_t until;
2be57e
+
2be57e
+        assert_se(sigemptyset(&mask) == 0);
2be57e
+        assert_se(sigaddset(&mask, SIGCHLD) == 0);
2be57e
+
2be57e
+        /* Drop into a sigtimewait-based timeout. Waiting for the
2be57e
+         * pid to exit. */
2be57e
+        until = now(CLOCK_MONOTONIC) + timeout;
2be57e
+        for (;;) {
2be57e
+                usec_t n;
2be57e
+                siginfo_t status = {};
2be57e
+                struct timespec ts;
2be57e
+
2be57e
+                n = now(CLOCK_MONOTONIC);
2be57e
+                if (n >= until)
2be57e
+                        break;
2be57e
+
2be57e
+                r = sigtimedwait(&mask, NULL, timespec_store(&ts, until - n)) < 0 ? -errno : 0;
2be57e
+                /* Assuming we woke due to the child exiting. */
2be57e
+                if (waitid(P_PID, pid, &status, WEXITED|WNOHANG) == 0) {
2be57e
+                        if (status.si_pid == pid) {
2be57e
+                                /* This is the correct child.*/
2be57e
+                                if (status.si_code == CLD_EXITED)
2be57e
+                                        return (status.si_status == 0) ? 0 : -EPROTO;
2be57e
+                                else
2be57e
+                                        return -EPROTO;
2be57e
+                        }
2be57e
+                }
2be57e
+                /* Not the child, check for errors and proceed appropriately */
2be57e
+                if (r < 0) {
2be57e
+                        switch (r) {
2be57e
+                        case -EAGAIN:
2be57e
+                                /* Timed out, child is likely hung. */
2be57e
+                                return -ETIMEDOUT;
2be57e
+                        case -EINTR:
2be57e
+                                /* Received a different signal and should retry */
2be57e
+                                continue;
2be57e
+                        default:
2be57e
+                                /* Return any unexpected errors */
2be57e
+                                return r;
2be57e
+                        }
2be57e
+                }
2be57e
+        }
2be57e
+
2be57e
+        return -EPROTO;
2be57e
+}
2be57e
diff --git a/src/shared/util.h b/src/shared/util.h
2be57e
index 526a6fe84..81aef034e 100644
2be57e
--- a/src/shared/util.h
2be57e
+++ b/src/shared/util.h
2be57e
@@ -22,6 +22,7 @@
2be57e
 ***/
2be57e
 
2be57e
 #include <alloca.h>
2be57e
+#include <def.h>
2be57e
 #include <fcntl.h>
2be57e
 #include <inttypes.h>
2be57e
 #include <time.h>
2be57e
@@ -1112,3 +1113,17 @@ int parse_percent(const char *p);
2be57e
 
2be57e
 uint64_t system_tasks_max(void);
2be57e
 uint64_t system_tasks_max_scale(uint64_t v, uint64_t max);
2be57e
+
2be57e
+int wait_for_terminate_with_timeout(pid_t pid, usec_t timeout);
2be57e
+
2be57e
+static inline void block_signals_reset(sigset_t *ss) {
2be57e
+        assert_se(sigprocmask(SIG_SETMASK, ss, NULL) >= 0);
2be57e
+}
2be57e
+
2be57e
+#define BLOCK_SIGNALS(...)                                                         \
2be57e
+        _cleanup_(block_signals_reset) _unused_ sigset_t _saved_sigset = ({        \
2be57e
+                sigset_t _t;                                                       \
2be57e
+                assert_se(sigprocmask(SIG_SETMASK, NULL, &_t) == 0);               \
2be57e
+                assert_se(sigprocmask_many(SIG_BLOCK, __VA_ARGS__, -1) >= 0);      \
2be57e
+                _t;                                                                \
2be57e
+        })