Blame SOURCES/0029-file-Fix-implementation-of-cache-none-for-writes.patch

6661d0
From aaead0f62b190f0686f86d825c698f9cd3e803c6 Mon Sep 17 00:00:00 2001
e7ca0c
From: "Richard W.M. Jones" <rjones@redhat.com>
e7ca0c
Date: Tue, 7 Dec 2021 21:08:26 +0000
e7ca0c
Subject: [PATCH] file: Fix implementation of cache=none for writes
e7ca0c
e7ca0c
When testing virt-v2v we found that cache=none had very pessimal
e7ca0c
performance in its current implementation when writing.  See:
e7ca0c
e7ca0c
  https://github.com/libguestfs/virt-v2v/commit/ac59d3b2310511b1537d408b675b19ec9a5d384e
e7ca0c
e7ca0c
However we know of a much better implementation - the one in nbdcopy.
e7ca0c
This commit copies that implementation (for writes only).
e7ca0c
e7ca0c
A simple test is to do:
e7ca0c
e7ca0c
  $ ./nbdkit file out.img cache=none --run 'nbdcopy fedora-33.img $uri'
e7ca0c
e7ca0c
and then check the cache usage of the output file, which should be
e7ca0c
around 0% (using https://github.com/Feh/nocache):
e7ca0c
e7ca0c
  $ cachestats out.img
e7ca0c
  pages in cache: 409/1572864 (0.0%)  [filesize=6291456.0K, pagesize=4K]
e7ca0c
e7ca0c
For modular virt-v2v doing a local disk to local disk conversion:
e7ca0c
e7ca0c
 - before this change, without cache=none
e7ca0c
   virt-v2v took 93.7 seconds, 19.1% pages cached in output file
e7ca0c
e7ca0c
 - before this change, enabling cache=none
e7ca0c
   virt-v2v took 125.4 seconds, 0.0% pages cached in output file
e7ca0c
                 ^^^ this is the bad case which caused the investigation
e7ca0c
e7ca0c
 - after this change, without cache=none
e7ca0c
   virt-v2v took 93.2 seconds, 19.1% pages cached in output file
e7ca0c
e7ca0c
 - after this change, enabling cache=none
e7ca0c
   virt-v2v took 97.9 seconds, 0.1% pages cached in output file
e7ca0c
e7ca0c
I tried to adjust NR_WINDOWS to find an optimum.  Increasing it made
e7ca0c
no difference in performance but predictably caused a slight increase
e7ca0c
in cached pages.  Reducing it slowed performance slightly.  So I
e7ca0c
conclude that 8 is about right, but it probably depends on the
e7ca0c
hardware.
e7ca0c
e7ca0c
(cherry picked from commit a956e2e75d6c88eeefecd967505667c9f176e3af)
e7ca0c
---
e7ca0c
 plugins/file/file.c                 | 79 +++++++++++++++++++++++++----
e7ca0c
 plugins/file/nbdkit-file-plugin.pod |  3 ++
e7ca0c
 2 files changed, 72 insertions(+), 10 deletions(-)
e7ca0c
e7ca0c
diff --git a/plugins/file/file.c b/plugins/file/file.c
e7ca0c
index 35270a24..caf24b2c 100644
e7ca0c
--- a/plugins/file/file.c
e7ca0c
+++ b/plugins/file/file.c
e7ca0c
@@ -85,6 +85,69 @@ static int fadvise_mode =
e7ca0c
 /* cache mode */
e7ca0c
 static enum { cache_default, cache_none } cache_mode = cache_default;
e7ca0c
 
e7ca0c
+/* Define EVICT_WRITES if we are going to evict the page cache
e7ca0c
+ * (cache=none) after writing.  This is only known to work on Linux.
e7ca0c
+ */
e7ca0c
+#ifdef __linux__
e7ca0c
+#define EVICT_WRITES 1
e7ca0c
+#endif
e7ca0c
+
e7ca0c
+#ifdef EVICT_WRITES
e7ca0c
+/* Queue writes so they will be evicted from the cache.  See
e7ca0c
+ * libnbd.git copy/file-ops.c for the rationale behind this.
e7ca0c
+ */
e7ca0c
+#define NR_WINDOWS 8
e7ca0c
+
e7ca0c
+struct write_window {
e7ca0c
+  int fd;
e7ca0c
+  uint64_t offset;
e7ca0c
+  size_t len;
e7ca0c
+};
e7ca0c
+
e7ca0c
+static pthread_mutex_t window_lock = PTHREAD_MUTEX_INITIALIZER;
e7ca0c
+static struct write_window window[NR_WINDOWS];
e7ca0c
+
e7ca0c
+static void
e7ca0c
+evict_writes (int fd, uint64_t offset, size_t len)
e7ca0c
+{
e7ca0c
+  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&window_lock);
e7ca0c
+
e7ca0c
+  /* Evict the oldest window from the page cache. */
e7ca0c
+  if (window[0].len > 0) {
e7ca0c
+    sync_file_range (window[0].fd, window[0].offset, window[0].len,
e7ca0c
+                     SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|
e7ca0c
+                     SYNC_FILE_RANGE_WAIT_AFTER);
e7ca0c
+    posix_fadvise (window[0].fd, window[0].offset, window[0].len,
e7ca0c
+                   POSIX_FADV_DONTNEED);
e7ca0c
+  }
e7ca0c
+
e7ca0c
+  /* Move the Nth window to N-1. */
e7ca0c
+  memmove (&window[0], &window[1], sizeof window[0] * (NR_WINDOWS-1));
e7ca0c
+
e7ca0c
+  /* Set up the current window and tell Linux to start writing it out
e7ca0c
+   * to disk (asynchronously).
e7ca0c
+   */
e7ca0c
+  sync_file_range (fd, offset, len, SYNC_FILE_RANGE_WRITE);
e7ca0c
+  window[NR_WINDOWS-1].fd = fd;
e7ca0c
+  window[NR_WINDOWS-1].offset = offset;
e7ca0c
+  window[NR_WINDOWS-1].len = len;
e7ca0c
+}
e7ca0c
+
e7ca0c
+/* When we close the handle we must remove any windows which are still
e7ca0c
+ * associated.  They missed the boat, oh well :-(
e7ca0c
+ */
e7ca0c
+static void
e7ca0c
+remove_fd_from_window (int fd)
e7ca0c
+{
e7ca0c
+  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&window_lock);
e7ca0c
+  size_t i;
e7ca0c
+
e7ca0c
+  for (i = 0; i < NR_WINDOWS; ++i)
e7ca0c
+    if (window[i].len > 0 && window[i].fd == fd)
e7ca0c
+      window[i].len = 0;
e7ca0c
+}
e7ca0c
+#endif /* EVICT_WRITES */
e7ca0c
+
e7ca0c
 /* Any callbacks using lseek must be protected by this lock. */
e7ca0c
 static pthread_mutex_t lseek_lock = PTHREAD_MUTEX_INITIALIZER;
e7ca0c
 
e7ca0c
@@ -431,6 +494,9 @@ file_close (void *handle)
e7ca0c
 {
e7ca0c
   struct handle *h = handle;
e7ca0c
 
e7ca0c
+#ifdef EVICT_WRITES
e7ca0c
+  remove_fd_from_window (h->fd);
e7ca0c
+#endif
e7ca0c
   close (h->fd);
e7ca0c
   free (h);
e7ca0c
 }
e7ca0c
@@ -583,15 +649,9 @@ file_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset,
e7ca0c
 {
e7ca0c
   struct handle *h = handle;
e7ca0c
 
e7ca0c
-#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_DONTNEED)
e7ca0c
+#if EVICT_WRITES
e7ca0c
   uint32_t orig_count = count;
e7ca0c
   uint64_t orig_offset = offset;
e7ca0c
-
e7ca0c
-  /* If cache=none we want to force pages we have just written to the
e7ca0c
-   * file to be flushed to disk so we can immediately evict them from
e7ca0c
-   * the page cache.
e7ca0c
-   */
e7ca0c
-  if (cache_mode == cache_none) flags |= NBDKIT_FLAG_FUA;
e7ca0c
 #endif
e7ca0c
 
e7ca0c
   while (count > 0) {
e7ca0c
@@ -608,10 +668,9 @@ file_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset,
e7ca0c
   if ((flags & NBDKIT_FLAG_FUA) && file_flush (handle, 0) == -1)
e7ca0c
     return -1;
e7ca0c
 
e7ca0c
-#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_DONTNEED)
e7ca0c
-  /* On Linux this will evict the pages we just wrote from the page cache. */
e7ca0c
+#if EVICT_WRITES
e7ca0c
   if (cache_mode == cache_none)
e7ca0c
-    posix_fadvise (h->fd, orig_offset, orig_count, POSIX_FADV_DONTNEED);
e7ca0c
+    evict_writes (h->fd, orig_offset, orig_count);
e7ca0c
 #endif
e7ca0c
 
e7ca0c
   return 0;
e7ca0c
diff --git a/plugins/file/nbdkit-file-plugin.pod b/plugins/file/nbdkit-file-plugin.pod
e7ca0c
index 0ac0ee53..f8f0e198 100644
e7ca0c
--- a/plugins/file/nbdkit-file-plugin.pod
e7ca0c
+++ b/plugins/file/nbdkit-file-plugin.pod
e7ca0c
@@ -117,6 +117,9 @@ cache:
e7ca0c
 
e7ca0c
  nbdkit file disk.img fadvise=sequential cache=none
e7ca0c
 
e7ca0c
+Only use fadvise=sequential if reading, and the reads are mainly
e7ca0c
+sequential.
e7ca0c
+
e7ca0c
 =head2 Files on tmpfs
e7ca0c
 
e7ca0c
 If you want to expose a file that resides on a file system known to
e7ca0c
-- 
e7ca0c
2.31.1
e7ca0c