2be57e
From ddc1464f3b129ea8490d7da3dd000ab1874b3a4d Mon Sep 17 00:00:00 2001
2be57e
From: Kyle Walker <kwalker@redhat.com>
2be57e
Date: Thu, 14 Dec 2017 11:46:03 -0500
2be57e
Subject: [PATCH] core: Implement sync_with_progress()
2be57e
2be57e
In similar fashion to the previous change, sync() operations can stall
2be57e
endlessly if cache is unable to be written out. In order to avoid an
2be57e
unbounded hang, the sync takes place within a child process. Every 10
2be57e
seconds (SYNC_TIMEOUT_USEC), the value of /proc/meminfo "Dirty" is checked
2be57e
to verify it is smaller than the last iteration. If the sync is not making
2be57e
progress for 3 successive iterations (SYNC_PROGRESS_ATTEMPTS), a SIGKILL is
2be57e
sent to the sync process and the shutdown continues.
2be57e
2be57e
(cherry picked from commit 73ad712fcfea5d8ba475044698d31d2c15d4180d)
2be57e
2be57e
Related: #1571098
2be57e
---
2be57e
 src/core/shutdown.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++++---
2be57e
 1 file changed, 111 insertions(+), 5 deletions(-)
2be57e
2be57e
diff --git a/src/core/shutdown.c b/src/core/shutdown.c
2be57e
index 71f001ac1..0b0a54a7d 100644
2be57e
--- a/src/core/shutdown.c
2be57e
+++ b/src/core/shutdown.c
2be57e
@@ -53,6 +53,9 @@
2be57e
 
2be57e
 #define FINALIZE_ATTEMPTS 50
2be57e
 
2be57e
+#define SYNC_PROGRESS_ATTEMPTS 3
2be57e
+#define SYNC_TIMEOUT_USEC (10*USEC_PER_SEC)
2be57e
+
2be57e
 static char* arg_verb;
2be57e
 
2be57e
 static int parse_argv(int argc, char *argv[]) {
2be57e
@@ -152,6 +155,102 @@ static int switch_root_initramfs(void) {
2be57e
         return switch_root("/run/initramfs", "/oldroot", false, MS_BIND);
2be57e
 }
2be57e
 
2be57e
+/* Read the following fields from /proc/meminfo:
2be57e
+ *
2be57e
+ *  NFS_Unstable
2be57e
+ *  Writeback
2be57e
+ *  Dirty
2be57e
+ *
2be57e
+ * Return true if the sum of these fields is greater than the previous
2be57e
+ * value input. For all other issues, report the failure and indicate that
2be57e
+ * the sync is not making progress.
2be57e
+ */
2be57e
+static bool sync_making_progress(unsigned long long *prev_dirty) {
2be57e
+        _cleanup_fclose_ FILE *f = NULL;
2be57e
+        char line[LINE_MAX];
2be57e
+        bool r = false;
2be57e
+        unsigned long long val = 0;
2be57e
+
2be57e
+        f = fopen("/proc/meminfo", "re");
2be57e
+        if (!f)
2be57e
+                return log_warning_errno(errno, "Failed to open /proc/meminfo: %m");
2be57e
+
2be57e
+        FOREACH_LINE(line, f, log_warning_errno(errno, "Failed to parse /proc/meminfo: %m")) {
2be57e
+                unsigned long long ull = 0;
2be57e
+
2be57e
+                if (!first_word(line, "NFS_Unstable:") && !first_word(line, "Writeback:") && !first_word(line, "Dirty:"))
2be57e
+                        continue;
2be57e
+
2be57e
+                errno = 0;
2be57e
+                if (sscanf(line, "%*s %llu %*s", &ull) != 1) {
2be57e
+                        if (errno != 0)
2be57e
+                                log_warning_errno(errno, "Failed to parse /proc/meminfo: %m");
2be57e
+                        else
2be57e
+                                log_warning("Failed to parse /proc/meminfo");
2be57e
+
2be57e
+                        return false;
2be57e
+                }
2be57e
+
2be57e
+                val += ull;
2be57e
+        }
2be57e
+
2be57e
+        r = *prev_dirty > val;
2be57e
+
2be57e
+        *prev_dirty = val;
2be57e
+
2be57e
+        return r;
2be57e
+}
2be57e
+
2be57e
+static void sync_with_progress(void) {
2be57e
+        unsigned checks;
2be57e
+        pid_t pid;
2be57e
+        int r;
2be57e
+        unsigned long long dirty = ULONG_LONG_MAX;
2be57e
+
2be57e
+        BLOCK_SIGNALS(SIGCHLD);
2be57e
+
2be57e
+        /* Due to the possiblity of the sync operation hanging, we fork
2be57e
+         * a child process and monitor the progress. If the timeout
2be57e
+         * lapses, the assumption is that that particular sync stalled. */
2be57e
+        pid = fork();
2be57e
+        if (pid < 0) {
2be57e
+                log_error_errno(errno, "Failed to fork: %m");
2be57e
+                return;
2be57e
+        }
2be57e
+
2be57e
+        if (pid == 0) {
2be57e
+                /* Start the sync operation here in the child */
2be57e
+                sync();
2be57e
+                _exit(EXIT_SUCCESS);
2be57e
+        }
2be57e
+
2be57e
+        log_info("Syncing filesystems and block devices.");
2be57e
+
2be57e
+        /* Start monitoring the sync operation. If more than
2be57e
+         * SYNC_PROGRESS_ATTEMPTS lapse without progress being made,
2be57e
+         * we assume that the sync is stalled */
2be57e
+        for (checks = 0; checks < SYNC_PROGRESS_ATTEMPTS; checks++) {
2be57e
+                r = wait_for_terminate_with_timeout(pid, SYNC_TIMEOUT_USEC);
2be57e
+                if (r == 0)
2be57e
+                        /* Sync finished without error.
2be57e
+                         * (The sync itself does not return an error code) */
2be57e
+                        return;
2be57e
+                else if (r == -ETIMEDOUT) {
2be57e
+                        /* Reset the check counter if the "Dirty" value is
2be57e
+                         * decreasing */
2be57e
+                        if (sync_making_progress(&dirty))
2be57e
+                                checks = 0;
2be57e
+                } else {
2be57e
+                        log_error_errno(r, "Failed to sync filesystems and block devices: %m");
2be57e
+                        return;
2be57e
+                }
2be57e
+        }
2be57e
+
2be57e
+        /* Only reached in the event of a timeout. We should issue a kill
2be57e
+         * to the stray process. */
2be57e
+        log_error("Syncing filesystems and block devices - timed out, issuing SIGKILL to PID "PID_FMT".", pid);
2be57e
+        (void) kill(pid, SIGKILL);
2be57e
+}
2be57e
 
2be57e
 int main(int argc, char *argv[]) {
2be57e
         bool need_umount, need_swapoff, need_loop_detach, need_dm_detach;
2be57e
@@ -202,6 +301,13 @@ int main(int argc, char *argv[]) {
2be57e
         /* lock us into memory */
2be57e
         mlockall(MCL_CURRENT|MCL_FUTURE);
2be57e
 
2be57e
+        /* Synchronize everything that is not written to disk yet at this point already. This is a good idea so that
2be57e
+         * slow IO is processed here already and the final process killing spree is not impacted by processes
2be57e
+         * desperately trying to sync IO to disk within their timeout. Do not remove this sync, data corruption will
2be57e
+         * result. */
2be57e
+        if (!in_container)
2be57e
+                sync_with_progress();
2be57e
+
2be57e
         log_info("Sending SIGTERM to remaining processes...");
2be57e
         broadcast_signal(SIGTERM, true, true);
2be57e
 
2be57e
@@ -338,12 +444,12 @@ int main(int argc, char *argv[]) {
2be57e
                           need_loop_detach ? " loop devices," : "",
2be57e
                           need_dm_detach ? " DM devices," : "");
2be57e
 
2be57e
-        /* The kernel will automaticall flush ATA disks and suchlike
2be57e
-         * on reboot(), but the file systems need to be synce'd
2be57e
-         * explicitly in advance. So let's do this here, but not
2be57e
-         * needlessly slow down containers. */
2be57e
+        /* The kernel will automatically flush ATA disks and suchlike on reboot(), but the file systems need to be
2be57e
+         * sync'ed explicitly in advance. So let's do this here, but not needlessly slow down containers. Note that we
2be57e
+         * sync'ed things already once above, but we did some more work since then which might have caused IO, hence
2be57e
+         * let's do it once more. Do not remove this sync, data corruption will result. */
2be57e
         if (!in_container)
2be57e
-                sync();
2be57e
+                sync_with_progress();
2be57e
 
2be57e
         switch (cmd) {
2be57e