aed857
From db57bf73d3e5e650b261834a0c39c9d368f9eeea Mon Sep 17 00:00:00 2001
510b6a
From: Kyle Walker <kwalker@redhat.com>
510b6a
Date: Thu, 14 Dec 2017 11:46:03 -0500
510b6a
Subject: [PATCH] core: Implement sync_with_progress()
510b6a
510b6a
In similar fashion to the previous change, sync() operations can stall
510b6a
endlessly if cache is unable to be written out. In order to avoid an
510b6a
unbounded hang, the sync takes place within a child process. Every 10
510b6a
seconds (SYNC_TIMEOUT_USEC), the value of /proc/meminfo "Dirty" is checked
510b6a
to verify it is smaller than the last iteration. If the sync is not making
510b6a
progress for 3 successive iterations (SYNC_PROGRESS_ATTEMPTS), a SIGKILL is
510b6a
sent to the sync process and the shutdown continues.
510b6a
510b6a
(cherry picked from commit 73ad712fcfea5d8ba475044698d31d2c15d4180d)
510b6a
510b6a
Related: #1571098
510b6a
---
23b3cf
 src/core/shutdown.c | 116 ++++++++++++++++++++++++++++++++++++++++++--
510b6a
 1 file changed, 111 insertions(+), 5 deletions(-)
510b6a
510b6a
diff --git a/src/core/shutdown.c b/src/core/shutdown.c
510b6a
index 71f001ac1..0b0a54a7d 100644
510b6a
--- a/src/core/shutdown.c
510b6a
+++ b/src/core/shutdown.c
510b6a
@@ -53,6 +53,9 @@
510b6a
 
510b6a
 #define FINALIZE_ATTEMPTS 50
510b6a
 
510b6a
+#define SYNC_PROGRESS_ATTEMPTS 3
510b6a
+#define SYNC_TIMEOUT_USEC (10*USEC_PER_SEC)
510b6a
+
510b6a
 static char* arg_verb;
510b6a
 
510b6a
 static int parse_argv(int argc, char *argv[]) {
510b6a
@@ -152,6 +155,102 @@ static int switch_root_initramfs(void) {
510b6a
         return switch_root("/run/initramfs", "/oldroot", false, MS_BIND);
510b6a
 }
510b6a
 
510b6a
+/* Read the following fields from /proc/meminfo:
510b6a
+ *
510b6a
+ *  NFS_Unstable
510b6a
+ *  Writeback
510b6a
+ *  Dirty
510b6a
+ *
510b6a
+ * Return true if the sum of these fields is greater than the previous
510b6a
+ * value input. For all other issues, report the failure and indicate that
510b6a
+ * the sync is not making progress.
510b6a
+ */
510b6a
+static bool sync_making_progress(unsigned long long *prev_dirty) {
510b6a
+        _cleanup_fclose_ FILE *f = NULL;
510b6a
+        char line[LINE_MAX];
510b6a
+        bool r = false;
510b6a
+        unsigned long long val = 0;
510b6a
+
510b6a
+        f = fopen("/proc/meminfo", "re");
510b6a
+        if (!f)
510b6a
+                return log_warning_errno(errno, "Failed to open /proc/meminfo: %m");
510b6a
+
510b6a
+        FOREACH_LINE(line, f, log_warning_errno(errno, "Failed to parse /proc/meminfo: %m")) {
510b6a
+                unsigned long long ull = 0;
510b6a
+
510b6a
+                if (!first_word(line, "NFS_Unstable:") && !first_word(line, "Writeback:") && !first_word(line, "Dirty:"))
510b6a
+                        continue;
510b6a
+
510b6a
+                errno = 0;
510b6a
+                if (sscanf(line, "%*s %llu %*s", &ull) != 1) {
510b6a
+                        if (errno != 0)
510b6a
+                                log_warning_errno(errno, "Failed to parse /proc/meminfo: %m");
510b6a
+                        else
510b6a
+                                log_warning("Failed to parse /proc/meminfo");
510b6a
+
510b6a
+                        return false;
510b6a
+                }
510b6a
+
510b6a
+                val += ull;
510b6a
+        }
510b6a
+
510b6a
+        r = *prev_dirty > val;
510b6a
+
510b6a
+        *prev_dirty = val;
510b6a
+
510b6a
+        return r;
510b6a
+}
510b6a
+
510b6a
+static void sync_with_progress(void) {
510b6a
+        unsigned checks;
510b6a
+        pid_t pid;
510b6a
+        int r;
510b6a
+        unsigned long long dirty = ULONG_LONG_MAX;
510b6a
+
510b6a
+        BLOCK_SIGNALS(SIGCHLD);
510b6a
+
510b6a
+        /* Due to the possiblity of the sync operation hanging, we fork
510b6a
+         * a child process and monitor the progress. If the timeout
510b6a
+         * lapses, the assumption is that that particular sync stalled. */
510b6a
+        pid = fork();
510b6a
+        if (pid < 0) {
510b6a
+                log_error_errno(errno, "Failed to fork: %m");
510b6a
+                return;
510b6a
+        }
510b6a
+
510b6a
+        if (pid == 0) {
510b6a
+                /* Start the sync operation here in the child */
510b6a
+                sync();
510b6a
+                _exit(EXIT_SUCCESS);
510b6a
+        }
510b6a
+
510b6a
+        log_info("Syncing filesystems and block devices.");
510b6a
+
510b6a
+        /* Start monitoring the sync operation. If more than
510b6a
+         * SYNC_PROGRESS_ATTEMPTS lapse without progress being made,
510b6a
+         * we assume that the sync is stalled */
510b6a
+        for (checks = 0; checks < SYNC_PROGRESS_ATTEMPTS; checks++) {
510b6a
+                r = wait_for_terminate_with_timeout(pid, SYNC_TIMEOUT_USEC);
510b6a
+                if (r == 0)
510b6a
+                        /* Sync finished without error.
510b6a
+                         * (The sync itself does not return an error code) */
510b6a
+                        return;
510b6a
+                else if (r == -ETIMEDOUT) {
510b6a
+                        /* Reset the check counter if the "Dirty" value is
510b6a
+                         * decreasing */
510b6a
+                        if (sync_making_progress(&dirty))
510b6a
+                                checks = 0;
510b6a
+                } else {
510b6a
+                        log_error_errno(r, "Failed to sync filesystems and block devices: %m");
510b6a
+                        return;
510b6a
+                }
510b6a
+        }
510b6a
+
510b6a
+        /* Only reached in the event of a timeout. We should issue a kill
510b6a
+         * to the stray process. */
510b6a
+        log_error("Syncing filesystems and block devices - timed out, issuing SIGKILL to PID "PID_FMT".", pid);
510b6a
+        (void) kill(pid, SIGKILL);
510b6a
+}
510b6a
 
510b6a
 int main(int argc, char *argv[]) {
510b6a
         bool need_umount, need_swapoff, need_loop_detach, need_dm_detach;
510b6a
@@ -202,6 +301,13 @@ int main(int argc, char *argv[]) {
510b6a
         /* lock us into memory */
510b6a
         mlockall(MCL_CURRENT|MCL_FUTURE);
510b6a
 
510b6a
+        /* Synchronize everything that is not written to disk yet at this point already. This is a good idea so that
510b6a
+         * slow IO is processed here already and the final process killing spree is not impacted by processes
510b6a
+         * desperately trying to sync IO to disk within their timeout. Do not remove this sync, data corruption will
510b6a
+         * result. */
510b6a
+        if (!in_container)
510b6a
+                sync_with_progress();
510b6a
+
510b6a
         log_info("Sending SIGTERM to remaining processes...");
510b6a
         broadcast_signal(SIGTERM, true, true);
510b6a
 
510b6a
@@ -338,12 +444,12 @@ int main(int argc, char *argv[]) {
510b6a
                           need_loop_detach ? " loop devices," : "",
510b6a
                           need_dm_detach ? " DM devices," : "");
510b6a
 
510b6a
-        /* The kernel will automaticall flush ATA disks and suchlike
510b6a
-         * on reboot(), but the file systems need to be synce'd
510b6a
-         * explicitly in advance. So let's do this here, but not
510b6a
-         * needlessly slow down containers. */
510b6a
+        /* The kernel will automatically flush ATA disks and suchlike on reboot(), but the file systems need to be
510b6a
+         * sync'ed explicitly in advance. So let's do this here, but not needlessly slow down containers. Note that we
510b6a
+         * sync'ed things already once above, but we did some more work since then which might have caused IO, hence
510b6a
+         * let's do it once more. Do not remove this sync, data corruption will result. */
510b6a
         if (!in_container)
510b6a
-                sync();
510b6a
+                sync_with_progress();
510b6a
 
510b6a
         switch (cmd) {
510b6a