Blob Blame History Raw
From ddc1464f3b129ea8490d7da3dd000ab1874b3a4d Mon Sep 17 00:00:00 2001
From: Kyle Walker <kwalker@redhat.com>
Date: Thu, 14 Dec 2017 11:46:03 -0500
Subject: [PATCH] core: Implement sync_with_progress()

In similar fashion to the previous change, sync() operations can stall
endlessly if cache is unable to be written out. In order to avoid an
unbounded hang, the sync takes place within a child process. Every 10
seconds (SYNC_TIMEOUT_USEC), the value of /proc/meminfo "Dirty" is checked
to verify it is smaller than the last iteration. If the sync is not making
progress for 3 successive iterations (SYNC_PROGRESS_ATTEMPTS), a SIGKILL is
sent to the sync process and the shutdown continues.

(cherry picked from commit 73ad712fcfea5d8ba475044698d31d2c15d4180d)

Related: #1571098
---
 src/core/shutdown.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 111 insertions(+), 5 deletions(-)

diff --git a/src/core/shutdown.c b/src/core/shutdown.c
index 71f001ac1..0b0a54a7d 100644
--- a/src/core/shutdown.c
+++ b/src/core/shutdown.c
@@ -53,6 +53,9 @@
 
 #define FINALIZE_ATTEMPTS 50
 
+#define SYNC_PROGRESS_ATTEMPTS 3
+#define SYNC_TIMEOUT_USEC (10*USEC_PER_SEC)
+
 static char* arg_verb;
 
 static int parse_argv(int argc, char *argv[]) {
@@ -152,6 +155,102 @@ static int switch_root_initramfs(void) {
         return switch_root("/run/initramfs", "/oldroot", false, MS_BIND);
 }
 
+/* Read the following fields from /proc/meminfo:
+ *
+ *  NFS_Unstable
+ *  Writeback
+ *  Dirty
+ *
+ * Return true if the sum of these fields is greater than the previous
+ * value input. For all other issues, report the failure and indicate that
+ * the sync is not making progress.
+ */
+static bool sync_making_progress(unsigned long long *prev_dirty) {
+        _cleanup_fclose_ FILE *f = NULL;
+        char line[LINE_MAX];
+        bool r = false;
+        unsigned long long val = 0;
+
+        f = fopen("/proc/meminfo", "re");
+        if (!f)
+                return log_warning_errno(errno, "Failed to open /proc/meminfo: %m");
+
+        FOREACH_LINE(line, f, log_warning_errno(errno, "Failed to parse /proc/meminfo: %m")) {
+                unsigned long long ull = 0;
+
+                if (!first_word(line, "NFS_Unstable:") && !first_word(line, "Writeback:") && !first_word(line, "Dirty:"))
+                        continue;
+
+                errno = 0;
+                if (sscanf(line, "%*s %llu %*s", &ull) != 1) {
+                        if (errno != 0)
+                                log_warning_errno(errno, "Failed to parse /proc/meminfo: %m");
+                        else
+                                log_warning("Failed to parse /proc/meminfo");
+
+                        return false;
+                }
+
+                val += ull;
+        }
+
+        r = *prev_dirty > val;
+
+        *prev_dirty = val;
+
+        return r;
+}
+
+static void sync_with_progress(void) {
+        unsigned checks;
+        pid_t pid;
+        int r;
+        unsigned long long dirty = ULONG_LONG_MAX;
+
+        BLOCK_SIGNALS(SIGCHLD);
+
+        /* Due to the possiblity of the sync operation hanging, we fork
+         * a child process and monitor the progress. If the timeout
+         * lapses, the assumption is that that particular sync stalled. */
+        pid = fork();
+        if (pid < 0) {
+                log_error_errno(errno, "Failed to fork: %m");
+                return;
+        }
+
+        if (pid == 0) {
+                /* Start the sync operation here in the child */
+                sync();
+                _exit(EXIT_SUCCESS);
+        }
+
+        log_info("Syncing filesystems and block devices.");
+
+        /* Start monitoring the sync operation. If more than
+         * SYNC_PROGRESS_ATTEMPTS lapse without progress being made,
+         * we assume that the sync is stalled */
+        for (checks = 0; checks < SYNC_PROGRESS_ATTEMPTS; checks++) {
+                r = wait_for_terminate_with_timeout(pid, SYNC_TIMEOUT_USEC);
+                if (r == 0)
+                        /* Sync finished without error.
+                         * (The sync itself does not return an error code) */
+                        return;
+                else if (r == -ETIMEDOUT) {
+                        /* Reset the check counter if the "Dirty" value is
+                         * decreasing */
+                        if (sync_making_progress(&dirty))
+                                checks = 0;
+                } else {
+                        log_error_errno(r, "Failed to sync filesystems and block devices: %m");
+                        return;
+                }
+        }
+
+        /* Only reached in the event of a timeout. We should issue a kill
+         * to the stray process. */
+        log_error("Syncing filesystems and block devices - timed out, issuing SIGKILL to PID "PID_FMT".", pid);
+        (void) kill(pid, SIGKILL);
+}
 
 int main(int argc, char *argv[]) {
         bool need_umount, need_swapoff, need_loop_detach, need_dm_detach;
@@ -202,6 +301,13 @@ int main(int argc, char *argv[]) {
         /* lock us into memory */
         mlockall(MCL_CURRENT|MCL_FUTURE);
 
+        /* Synchronize everything that is not written to disk yet at this point already. This is a good idea so that
+         * slow IO is processed here already and the final process killing spree is not impacted by processes
+         * desperately trying to sync IO to disk within their timeout. Do not remove this sync, data corruption will
+         * result. */
+        if (!in_container)
+                sync_with_progress();
+
         log_info("Sending SIGTERM to remaining processes...");
         broadcast_signal(SIGTERM, true, true);
 
@@ -338,12 +444,12 @@ int main(int argc, char *argv[]) {
                           need_loop_detach ? " loop devices," : "",
                           need_dm_detach ? " DM devices," : "");
 
-        /* The kernel will automaticall flush ATA disks and suchlike
-         * on reboot(), but the file systems need to be synce'd
-         * explicitly in advance. So let's do this here, but not
-         * needlessly slow down containers. */
+        /* The kernel will automatically flush ATA disks and suchlike on reboot(), but the file systems need to be
+         * sync'ed explicitly in advance. So let's do this here, but not needlessly slow down containers. Note that we
+         * sync'ed things already once above, but we did some more work since then which might have caused IO, hence
+         * let's do it once more. Do not remove this sync, data corruption will result. */
         if (!in_container)
-                sync();
+                sync_with_progress();
 
         switch (cmd) {