|
|
6661d0 |
From aaead0f62b190f0686f86d825c698f9cd3e803c6 Mon Sep 17 00:00:00 2001
|
|
|
e7ca0c |
From: "Richard W.M. Jones" <rjones@redhat.com>
|
|
|
e7ca0c |
Date: Tue, 7 Dec 2021 21:08:26 +0000
|
|
|
e7ca0c |
Subject: [PATCH] file: Fix implementation of cache=none for writes
|
|
|
e7ca0c |
|
|
|
e7ca0c |
When testing virt-v2v we found that cache=none had very pessimal
|
|
|
e7ca0c |
performance in its current implementation when writing. See:
|
|
|
e7ca0c |
|
|
|
e7ca0c |
https://github.com/libguestfs/virt-v2v/commit/ac59d3b2310511b1537d408b675b19ec9a5d384e
|
|
|
e7ca0c |
|
|
|
e7ca0c |
However we know of a much better implementation - the one in nbdcopy.
|
|
|
e7ca0c |
This commit copies that implementation (for writes only).
|
|
|
e7ca0c |
|
|
|
e7ca0c |
A simple test is to do:
|
|
|
e7ca0c |
|
|
|
e7ca0c |
$ ./nbdkit file out.img cache=none --run 'nbdcopy fedora-33.img $uri'
|
|
|
e7ca0c |
|
|
|
e7ca0c |
and then check the cache usage of the output file, which should be
|
|
|
e7ca0c |
around 0% (using https://github.com/Feh/nocache):
|
|
|
e7ca0c |
|
|
|
e7ca0c |
$ cachestats out.img
|
|
|
e7ca0c |
pages in cache: 409/1572864 (0.0%) [filesize=6291456.0K, pagesize=4K]
|
|
|
e7ca0c |
|
|
|
e7ca0c |
For modular virt-v2v doing a local disk to local disk conversion:
|
|
|
e7ca0c |
|
|
|
e7ca0c |
- before this change, without cache=none
|
|
|
e7ca0c |
virt-v2v took 93.7 seconds, 19.1% pages cached in output file
|
|
|
e7ca0c |
|
|
|
e7ca0c |
- before this change, enabling cache=none
|
|
|
e7ca0c |
virt-v2v took 125.4 seconds, 0.0% pages cached in output file
|
|
|
e7ca0c |
^^^ this is the bad case which caused the investigation
|
|
|
e7ca0c |
|
|
|
e7ca0c |
- after this change, without cache=none
|
|
|
e7ca0c |
virt-v2v took 93.2 seconds, 19.1% pages cached in output file
|
|
|
e7ca0c |
|
|
|
e7ca0c |
- after this change, enabling cache=none
|
|
|
e7ca0c |
virt-v2v took 97.9 seconds, 0.1% pages cached in output file
|
|
|
e7ca0c |
|
|
|
e7ca0c |
I tried to adjust NR_WINDOWS to find an optimum. Increasing it made
|
|
|
e7ca0c |
no difference in performance but predictably caused a slight increase
|
|
|
e7ca0c |
in cached pages. Reducing it slowed performance slightly. So I
|
|
|
e7ca0c |
conclude that 8 is about right, but it probably depends on the
|
|
|
e7ca0c |
hardware.
|
|
|
e7ca0c |
|
|
|
e7ca0c |
(cherry picked from commit a956e2e75d6c88eeefecd967505667c9f176e3af)
|
|
|
e7ca0c |
|
|
|
e7ca0c |
plugins/file/file.c | 79 +++++++++++++++++++++++++
|
|
|
e7ca0c |
plugins/file/nbdkit-file-plugin.pod | 3 ++
|
|
|
e7ca0c |
2 files changed, 72 insertions(+), 10 deletions(-)
|
|
|
e7ca0c |
|
|
|
e7ca0c |
diff --git a/plugins/file/file.c b/plugins/file/file.c
|
|
|
e7ca0c |
index 35270a24..caf24b2c 100644
|
|
|
e7ca0c |
|
|
|
e7ca0c |
|
|
|
e7ca0c |
@@ -85,6 +85,69 @@ static int fadvise_mode =
|
|
|
e7ca0c |
|
|
|
e7ca0c |
static enum { cache_default, cache_none } cache_mode = cache_default;
|
|
|
e7ca0c |
|
|
|
e7ca0c |
+/* Define EVICT_WRITES if we are going to evict the page cache
|
|
|
e7ca0c |
+ * (cache=none) after writing. This is only known to work on Linux.
|
|
|
e7ca0c |
+ */
|
|
|
e7ca0c |
+#ifdef __linux__
|
|
|
e7ca0c |
+#define EVICT_WRITES 1
|
|
|
e7ca0c |
+#endif
|
|
|
e7ca0c |
+
|
|
|
e7ca0c |
+#ifdef EVICT_WRITES
|
|
|
e7ca0c |
+/* Queue writes so they will be evicted from the cache. See
|
|
|
e7ca0c |
+ * libnbd.git copy/file-ops.c for the rationale behind this.
|
|
|
e7ca0c |
+ */
|
|
|
e7ca0c |
+#define NR_WINDOWS 8
|
|
|
e7ca0c |
+
|
|
|
e7ca0c |
+struct write_window {
|
|
|
e7ca0c |
+ int fd;
|
|
|
e7ca0c |
+ uint64_t offset;
|
|
|
e7ca0c |
+ size_t len;
|
|
|
e7ca0c |
+};
|
|
|
e7ca0c |
+
|
|
|
e7ca0c |
+static pthread_mutex_t window_lock = PTHREAD_MUTEX_INITIALIZER;
|
|
|
e7ca0c |
+static struct write_window window[NR_WINDOWS];
|
|
|
e7ca0c |
+
|
|
|
e7ca0c |
+static void
|
|
|
e7ca0c |
+evict_writes (int fd, uint64_t offset, size_t len)
|
|
|
e7ca0c |
+{
|
|
|
e7ca0c |
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&window_lock);
|
|
|
e7ca0c |
+
|
|
|
e7ca0c |
+ /* Evict the oldest window from the page cache. */
|
|
|
e7ca0c |
+ if (window[0].len > 0) {
|
|
|
e7ca0c |
+ sync_file_range (window[0].fd, window[0].offset, window[0].len,
|
|
|
e7ca0c |
+ SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|
|
|
|
e7ca0c |
+ SYNC_FILE_RANGE_WAIT_AFTER);
|
|
|
e7ca0c |
+ posix_fadvise (window[0].fd, window[0].offset, window[0].len,
|
|
|
e7ca0c |
+ POSIX_FADV_DONTNEED);
|
|
|
e7ca0c |
+ }
|
|
|
e7ca0c |
+
|
|
|
e7ca0c |
+ /* Move the Nth window to N-1. */
|
|
|
e7ca0c |
+ memmove (&window[0], &window[1], sizeof window[0] * (NR_WINDOWS-1));
|
|
|
e7ca0c |
+
|
|
|
e7ca0c |
+ /* Set up the current window and tell Linux to start writing it out
|
|
|
e7ca0c |
+ * to disk (asynchronously).
|
|
|
e7ca0c |
+ */
|
|
|
e7ca0c |
+ sync_file_range (fd, offset, len, SYNC_FILE_RANGE_WRITE);
|
|
|
e7ca0c |
+ window[NR_WINDOWS-1].fd = fd;
|
|
|
e7ca0c |
+ window[NR_WINDOWS-1].offset = offset;
|
|
|
e7ca0c |
+ window[NR_WINDOWS-1].len = len;
|
|
|
e7ca0c |
+}
|
|
|
e7ca0c |
+
|
|
|
e7ca0c |
+/* When we close the handle we must remove any windows which are still
|
|
|
e7ca0c |
+ * associated. They missed the boat, oh well :-(
|
|
|
e7ca0c |
+ */
|
|
|
e7ca0c |
+static void
|
|
|
e7ca0c |
+remove_fd_from_window (int fd)
|
|
|
e7ca0c |
+{
|
|
|
e7ca0c |
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&window_lock);
|
|
|
e7ca0c |
+ size_t i;
|
|
|
e7ca0c |
+
|
|
|
e7ca0c |
+ for (i = 0; i < NR_WINDOWS; ++i)
|
|
|
e7ca0c |
+ if (window[i].len > 0 && window[i].fd == fd)
|
|
|
e7ca0c |
+ window[i].len = 0;
|
|
|
e7ca0c |
+}
|
|
|
e7ca0c |
+#endif
|
|
|
e7ca0c |
+
|
|
|
e7ca0c |
|
|
|
e7ca0c |
static pthread_mutex_t lseek_lock = PTHREAD_MUTEX_INITIALIZER;
|
|
|
e7ca0c |
|
|
|
e7ca0c |
@@ -431,6 +494,9 @@ file_close (void *handle)
|
|
|
e7ca0c |
{
|
|
|
e7ca0c |
struct handle *h = handle;
|
|
|
e7ca0c |
|
|
|
e7ca0c |
+#ifdef EVICT_WRITES
|
|
|
e7ca0c |
+ remove_fd_from_window (h->fd);
|
|
|
e7ca0c |
+#endif
|
|
|
e7ca0c |
close (h->fd);
|
|
|
e7ca0c |
free (h);
|
|
|
e7ca0c |
}
|
|
|
e7ca0c |
@@ -583,15 +649,9 @@ file_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset,
|
|
|
e7ca0c |
{
|
|
|
e7ca0c |
struct handle *h = handle;
|
|
|
e7ca0c |
|
|
|
e7ca0c |
-#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_DONTNEED)
|
|
|
e7ca0c |
+#if EVICT_WRITES
|
|
|
e7ca0c |
uint32_t orig_count = count;
|
|
|
e7ca0c |
uint64_t orig_offset = offset;
|
|
|
e7ca0c |
-
|
|
|
e7ca0c |
- /* If cache=none we want to force pages we have just written to the
|
|
|
e7ca0c |
- * file to be flushed to disk so we can immediately evict them from
|
|
|
e7ca0c |
- * the page cache.
|
|
|
e7ca0c |
- */
|
|
|
e7ca0c |
- if (cache_mode == cache_none) flags |= NBDKIT_FLAG_FUA;
|
|
|
e7ca0c |
#endif
|
|
|
e7ca0c |
|
|
|
e7ca0c |
while (count > 0) {
|
|
|
e7ca0c |
@@ -608,10 +668,9 @@ file_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset,
|
|
|
e7ca0c |
if ((flags & NBDKIT_FLAG_FUA) && file_flush (handle, 0) == -1)
|
|
|
e7ca0c |
return -1;
|
|
|
e7ca0c |
|
|
|
e7ca0c |
-#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_DONTNEED)
|
|
|
e7ca0c |
-
|
|
|
e7ca0c |
+#if EVICT_WRITES
|
|
|
e7ca0c |
if (cache_mode == cache_none)
|
|
|
e7ca0c |
- posix_fadvise (h->fd, orig_offset, orig_count, POSIX_FADV_DONTNEED);
|
|
|
e7ca0c |
+ evict_writes (h->fd, orig_offset, orig_count);
|
|
|
e7ca0c |
#endif
|
|
|
e7ca0c |
|
|
|
e7ca0c |
return 0;
|
|
|
e7ca0c |
diff --git a/plugins/file/nbdkit-file-plugin.pod b/plugins/file/nbdkit-file-plugin.pod
|
|
|
e7ca0c |
index 0ac0ee53..f8f0e198 100644
|
|
|
e7ca0c |
|
|
|
e7ca0c |
|
|
|
e7ca0c |
@@ -117,6 +117,9 @@ cache:
|
|
|
e7ca0c |
|
|
|
e7ca0c |
nbdkit file disk.img fadvise=sequential cache=none
|
|
|
e7ca0c |
|
|
|
e7ca0c |
+Only use fadvise=sequential if reading, and the reads are mainly
|
|
|
e7ca0c |
+sequential.
|
|
|
e7ca0c |
+
|
|
|
e7ca0c |
=head2 Files on tmpfs
|
|
|
e7ca0c |
|
|
|
e7ca0c |
If you want to expose a file that resides on a file system known to
|
|
|
e7ca0c |
--
|
|
|
e7ca0c |
2.31.1
|
|
|
e7ca0c |
|