Blame SOURCES/0003-cache-cow-Add-blk_read_multiple-function.patch

f954f3
From b5dc8577c5c6d1205e2106b629fad327c3a409ea Mon Sep 17 00:00:00 2001
f954f3
From: "Richard W.M. Jones" <rjones@redhat.com>
f954f3
Date: Mon, 26 Jul 2021 13:55:21 +0100
f954f3
Subject: [PATCH] cache, cow: Add blk_read_multiple function
f954f3
f954f3
Currently the cache and cow filters break up large requests into many
f954f3
single block-sized requests to the underlying plugin.  For some
f954f3
plugins (eg. curl) this is very inefficient and causes huge
f954f3
slow-downs.
f954f3
f954f3
For example I tested nbdkit + curl vs nbdkit + cache + curl against a
f954f3
slow, remote VMware server.  A simple run of virt-inspector was at
f954f3
least 6-7 times slower with the cache filter.  (It was so slow that I
f954f3
didn't actually let it run to completion - I am estimating the
f954f3
slowdown multiple using interim debug messages).
f954f3
f954f3
Implement a new blk_read_multiple function in the cache filter.  It
f954f3
does not break up "runs" of blocks which all have the same cache
f954f3
state.  The cache .pread method uses the new function to read the
f954f3
block-aligned part of the request.
f954f3
f954f3
(cherry picked from commit ab661ccef5b3369fa22c33d0289baddc251b73bf)
f954f3
---
f954f3
 filters/cache/blk.c   | 83 ++++++++++++++++++++++++++++++++-----------
f954f3
 filters/cache/blk.h   |  6 ++++
f954f3
 filters/cache/cache.c | 21 +++++------
f954f3
 filters/cow/blk.c     | 63 +++++++++++++++++++++++---------
f954f3
 filters/cow/blk.h     |  6 ++++
f954f3
 filters/cow/cow.c     | 21 +++++------
f954f3
 6 files changed, 138 insertions(+), 62 deletions(-)
f954f3
f954f3
diff --git a/filters/cache/blk.c b/filters/cache/blk.c
f954f3
index f52f30e3..f85ada35 100644
f954f3
--- a/filters/cache/blk.c
f954f3
+++ b/filters/cache/blk.c
f954f3
@@ -44,6 +44,7 @@
f954f3
 #include <string.h>
f954f3
 #include <unistd.h>
f954f3
 #include <fcntl.h>
f954f3
+#include <limits.h>
f954f3
 #include <errno.h>
f954f3
 
f954f3
 #ifdef HAVE_SYS_STATVFS_H
f954f3
@@ -193,26 +194,40 @@ blk_set_size (uint64_t new_size)
f954f3
   return 0;
f954f3
 }
f954f3
 
f954f3
-int
f954f3
-blk_read (nbdkit_next *next,
f954f3
-          uint64_t blknum, uint8_t *block, int *err)
f954f3
+static int
f954f3
+_blk_read_multiple (nbdkit_next *next,
f954f3
+                    uint64_t blknum, uint64_t nrblocks,
f954f3
+                    uint8_t *block, int *err)
f954f3
 {
f954f3
   off_t offset = blknum * blksize;
f954f3
-  enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_CACHED);
f954f3
+  bool not_cached =
f954f3
+    bitmap_get_blk (&bm, blknum, BLOCK_NOT_CACHED) == BLOCK_NOT_CACHED;
f954f3
+  uint64_t b, runblocks;
f954f3
 
f954f3
-  reclaim (fd, &bm);
f954f3
+  assert (nrblocks > 0);
f954f3
 
f954f3
   if (cache_debug_verbose)
f954f3
-    nbdkit_debug ("cache: blk_read block %" PRIu64
f954f3
+    nbdkit_debug ("cache: blk_read_multiple block %" PRIu64
f954f3
                   " (offset %" PRIu64 ") is %s",
f954f3
                   blknum, (uint64_t) offset,
f954f3
-                  state == BLOCK_NOT_CACHED ? "not cached" :
f954f3
-                  state == BLOCK_CLEAN ? "clean" :
f954f3
-                  state == BLOCK_DIRTY ? "dirty" :
f954f3
-                  "unknown");
f954f3
+                  not_cached ? "not cached" : "cached");
f954f3
 
f954f3
-  if (state == BLOCK_NOT_CACHED) { /* Read underlying plugin. */
f954f3
-    unsigned n = blksize, tail = 0;
f954f3
+  /* Find out how many of the following blocks form a "run" with the
f954f3
+   * same cached/not-cached state.  We can process that many blocks in
f954f3
+   * one go.
f954f3
+   */
f954f3
+  for (b = 1, runblocks = 1; b < nrblocks; ++b, ++runblocks) {
f954f3
+    bool s =
f954f3
+      bitmap_get_blk (&bm, blknum + b, BLOCK_NOT_CACHED) == BLOCK_NOT_CACHED;
f954f3
+    if (not_cached != s)
f954f3
+      break;
f954f3
+  }
f954f3
+
f954f3
+  if (not_cached) {             /* Read underlying plugin. */
f954f3
+    unsigned n, tail = 0;
f954f3
+
f954f3
+    assert (blksize * runblocks <= UINT_MAX);
f954f3
+    n = blksize * runblocks;
f954f3
 
f954f3
     if (offset + n > size) {
f954f3
       tail = offset + n - size;
f954f3
@@ -228,32 +243,60 @@ blk_read (nbdkit_next *next,
f954f3
      */
f954f3
     memset (block + n, 0, tail);
f954f3
 
f954f3
-    /* If cache-on-read, copy the block to the cache. */
f954f3
+    /* If cache-on-read, copy the blocks to the cache. */
f954f3
     if (cache_on_read) {
f954f3
       if (cache_debug_verbose)
f954f3
         nbdkit_debug ("cache: cache-on-read block %" PRIu64
f954f3
                       " (offset %" PRIu64 ")",
f954f3
                       blknum, (uint64_t) offset);
f954f3
 
f954f3
-      if (pwrite (fd, block, blksize, offset) == -1) {
f954f3
+      if (pwrite (fd, block, blksize * runblocks, offset) == -1) {
f954f3
         *err = errno;
f954f3
         nbdkit_error ("pwrite: %m");
f954f3
         return -1;
f954f3
       }
f954f3
-      bitmap_set_blk (&bm, blknum, BLOCK_CLEAN);
f954f3
-      lru_set_recently_accessed (blknum);
f954f3
+      for (b = 0; b < runblocks; ++b) {
f954f3
+        bitmap_set_blk (&bm, blknum + b, BLOCK_CLEAN);
f954f3
+        lru_set_recently_accessed (blknum + b);
f954f3
+      }
f954f3
     }
f954f3
-    return 0;
f954f3
   }
f954f3
   else {                        /* Read cache. */
f954f3
-    if (pread (fd, block, blksize, offset) == -1) {
f954f3
+    if (pread (fd, block, blksize * runblocks, offset) == -1) {
f954f3
       *err = errno;
f954f3
       nbdkit_error ("pread: %m");
f954f3
       return -1;
f954f3
     }
f954f3
-    lru_set_recently_accessed (blknum);
f954f3
-    return 0;
f954f3
+    for (b = 0; b < runblocks; ++b)
f954f3
+      lru_set_recently_accessed (blknum + b);
f954f3
   }
f954f3
+
f954f3
+  /* If all done, return. */
f954f3
+  if (runblocks == nrblocks)
f954f3
+    return 0;
f954f3
+
f954f3
+  /* Recurse to read remaining blocks. */
f954f3
+  return _blk_read_multiple (next,
f954f3
+                             blknum + runblocks,
f954f3
+                             nrblocks - runblocks,
f954f3
+                             block + blksize * runblocks,
f954f3
+                             err);
f954f3
+}
f954f3
+
f954f3
+int
f954f3
+blk_read_multiple (nbdkit_next *next,
f954f3
+                   uint64_t blknum, uint64_t nrblocks,
f954f3
+                   uint8_t *block, int *err)
f954f3
+{
f954f3
+  reclaim (fd, &bm);
f954f3
+  return _blk_read_multiple (next, blknum, nrblocks, block, err);
f954f3
+}
f954f3
+
f954f3
+int
f954f3
+blk_read (nbdkit_next *next,
f954f3
+          uint64_t blknum, uint8_t *block, int *err)
f954f3
+{
f954f3
+  return blk_read_multiple (next, blknum, 1, block, err);
f954f3
 }
f954f3
 
f954f3
 int
f954f3
diff --git a/filters/cache/blk.h b/filters/cache/blk.h
f954f3
index 87c753e2..1ee33ed7 100644
f954f3
--- a/filters/cache/blk.h
f954f3
+++ b/filters/cache/blk.h
f954f3
@@ -55,6 +55,12 @@ extern int blk_read (nbdkit_next *next,
f954f3
                      uint64_t blknum, uint8_t *block, int *err)
f954f3
   __attribute__((__nonnull__ (1, 3, 4)));
f954f3
 
f954f3
+/* As above, but read multiple blocks. */
f954f3
+extern int blk_read_multiple (nbdkit_next *next,
f954f3
+                              uint64_t blknum, uint64_t nrblocks,
f954f3
+                              uint8_t *block, int *err)
f954f3
+  __attribute__((__nonnull__ (1, 4, 5)));
f954f3
+
f954f3
 /* If a single block is not cached, copy it from the plugin. */
f954f3
 extern int blk_cache (nbdkit_next *next,
f954f3
                       uint64_t blknum, uint8_t *block, int *err)
f954f3
diff --git a/filters/cache/cache.c b/filters/cache/cache.c
f954f3
index 745f552d..14cc03f2 100644
f954f3
--- a/filters/cache/cache.c
f954f3
+++ b/filters/cache/cache.c
f954f3
@@ -313,7 +313,7 @@ cache_pread (nbdkit_next *next,
f954f3
              uint32_t flags, int *err)
f954f3
 {
f954f3
   CLEANUP_FREE uint8_t *block = NULL;
f954f3
-  uint64_t blknum, blkoffs;
f954f3
+  uint64_t blknum, blkoffs, nrblocks;
f954f3
   int r;
f954f3
 
f954f3
   assert (!flags);
f954f3
@@ -348,22 +348,17 @@ cache_pread (nbdkit_next *next,
f954f3
   }
f954f3
 
f954f3
   /* Aligned body */
f954f3
-  /* XXX This breaks up large read requests into smaller ones, which
f954f3
-   * is a problem for plugins which have a large, fixed per-request
f954f3
-   * overhead (hello, curl).  We should try to keep large requests
f954f3
-   * together as much as possible, but that requires us to be much
f954f3
-   * smarter here.
f954f3
-   */
f954f3
-  while (count >= blksize) {
f954f3
+  nrblocks = count / blksize;
f954f3
+  if (nrblocks > 0) {
f954f3
     ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
f954f3
-    r = blk_read (next, blknum, buf, err);
f954f3
+    r = blk_read_multiple (next, blknum, nrblocks, buf, err);
f954f3
     if (r == -1)
f954f3
       return -1;
f954f3
 
f954f3
-    buf += blksize;
f954f3
-    count -= blksize;
f954f3
-    offset += blksize;
f954f3
-    blknum++;
f954f3
+    buf += nrblocks * blksize;
f954f3
+    count -= nrblocks * blksize;
f954f3
+    offset += nrblocks * blksize;
f954f3
+    blknum += nrblocks;
f954f3
   }
f954f3
 
f954f3
   /* Unaligned tail */
f954f3
diff --git a/filters/cow/blk.c b/filters/cow/blk.c
f954f3
index b7c4d7f1..4ec8d1b8 100644
f954f3
--- a/filters/cow/blk.c
f954f3
+++ b/filters/cow/blk.c
f954f3
@@ -79,6 +79,7 @@
f954f3
 #include <inttypes.h>
f954f3
 #include <unistd.h>
f954f3
 #include <fcntl.h>
f954f3
+#include <limits.h>
f954f3
 #include <errno.h>
f954f3
 #include <sys/types.h>
f954f3
 
f954f3
@@ -219,33 +220,48 @@ blk_status (uint64_t blknum, bool *present, bool *trimmed)
f954f3
   *trimmed = state == BLOCK_TRIMMED;
f954f3
 }
f954f3
 
f954f3
-/* These are the block operations.  They always read or write a single
f954f3
- * whole block of size ‘blksize’.
f954f3
+/* These are the block operations.  They always read or write whole
f954f3
+ * blocks of size ‘blksize’.
f954f3
  */
f954f3
 int
f954f3
-blk_read (nbdkit_next *next,
f954f3
-          uint64_t blknum, uint8_t *block, int *err)
f954f3
+blk_read_multiple (nbdkit_next *next,
f954f3
+                   uint64_t blknum, uint64_t nrblocks,
f954f3
+                   uint8_t *block, int *err)
f954f3
 {
f954f3
   off_t offset = blknum * BLKSIZE;
f954f3
   enum bm_entry state;
f954f3
+  uint64_t b, runblocks;
f954f3
 
f954f3
-  /* The state might be modified from another thread - for example
f954f3
-   * another thread might write (BLOCK_NOT_ALLOCATED ->
f954f3
-   * BLOCK_ALLOCATED) while we are reading from the plugin, returning
f954f3
-   * the old data.  However a read issued after the write returns
f954f3
-   * should always return the correct data.
f954f3
+  /* Find out how many of the following blocks form a "run" with the
f954f3
+   * same state.  We can process that many blocks in one go.
f954f3
+   *
f954f3
+   * About the locking: The state might be modified from another
f954f3
+   * thread - for example another thread might write
f954f3
+   * (BLOCK_NOT_ALLOCATED -> BLOCK_ALLOCATED) while we are reading
f954f3
+   * from the plugin, returning the old data.  However a read issued
f954f3
+   * after the write returns should always return the correct data.
f954f3
    */
f954f3
   {
f954f3
     ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
f954f3
     state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
f954f3
+
f954f3
+    for (b = 1, runblocks = 1; b < nrblocks; ++b, ++runblocks) {
f954f3
+      enum bm_entry s = bitmap_get_blk (&bm, blknum + b, BLOCK_NOT_ALLOCATED);
f954f3
+      if (state != s)
f954f3
+        break;
f954f3
+    }
f954f3
   }
f954f3
 
f954f3
   if (cow_debug_verbose)
f954f3
-    nbdkit_debug ("cow: blk_read block %" PRIu64 " (offset %" PRIu64 ") is %s",
f954f3
+    nbdkit_debug ("cow: blk_read_multiple block %" PRIu64
f954f3
+                  " (offset %" PRIu64 ") is %s",
f954f3
                   blknum, (uint64_t) offset, state_to_string (state));
f954f3
 
f954f3
   if (state == BLOCK_NOT_ALLOCATED) { /* Read underlying plugin. */
f954f3
-    unsigned n = BLKSIZE, tail = 0;
f954f3
+    unsigned n, tail = 0;
f954f3
+
f954f3
+    assert (BLKSIZE * runblocks <= UINT_MAX);
f954f3
+    n = BLKSIZE * runblocks;
f954f3
 
f954f3
     if (offset + n > size) {
f954f3
       tail = offset + n - size;
f954f3
@@ -260,20 +276,35 @@ blk_read (nbdkit_next *next,
f954f3
      * zeroing the tail.
f954f3
      */
f954f3
     memset (block + n, 0, tail);
f954f3
-    return 0;
f954f3
   }
f954f3
   else if (state == BLOCK_ALLOCATED) { /* Read overlay. */
f954f3
-    if (pread (fd, block, BLKSIZE, offset) == -1) {
f954f3
+    if (pread (fd, block, BLKSIZE * runblocks, offset) == -1) {
f954f3
       *err = errno;
f954f3
       nbdkit_error ("pread: %m");
f954f3
       return -1;
f954f3
     }
f954f3
-    return 0;
f954f3
   }
f954f3
   else /* state == BLOCK_TRIMMED */ {
f954f3
-    memset (block, 0, BLKSIZE);
f954f3
-    return 0;
f954f3
+    memset (block, 0, BLKSIZE * runblocks);
f954f3
   }
f954f3
+
f954f3
+  /* If all done, return. */
f954f3
+  if (runblocks == nrblocks)
f954f3
+    return 0;
f954f3
+
f954f3
+  /* Recurse to read remaining blocks. */
f954f3
+  return blk_read_multiple (next,
f954f3
+                            blknum + runblocks,
f954f3
+                            nrblocks - runblocks,
f954f3
+                            block + BLKSIZE * runblocks,
f954f3
+                            err);
f954f3
+}
f954f3
+
f954f3
+int
f954f3
+blk_read (nbdkit_next *next,
f954f3
+          uint64_t blknum, uint8_t *block, int *err)
f954f3
+{
f954f3
+  return blk_read_multiple (next, blknum, 1, block, err);
f954f3
 }
f954f3
 
f954f3
 int
f954f3
diff --git a/filters/cow/blk.h b/filters/cow/blk.h
f954f3
index e6fd7417..b066c602 100644
f954f3
--- a/filters/cow/blk.h
f954f3
+++ b/filters/cow/blk.h
f954f3
@@ -55,6 +55,12 @@ extern int blk_read (nbdkit_next *next,
f954f3
                      uint64_t blknum, uint8_t *block, int *err)
f954f3
   __attribute__((__nonnull__ (1, 3, 4)));
f954f3
 
f954f3
+/* Read multiple blocks from the overlay or plugin. */
f954f3
+extern int blk_read_multiple (nbdkit_next *next,
f954f3
+                              uint64_t blknum, uint64_t nrblocks,
f954f3
+                              uint8_t *block, int *err)
f954f3
+  __attribute__((__nonnull__ (1, 4, 5)));
f954f3
+
f954f3
 /* Cache mode for blocks not already in overlay */
f954f3
 enum cache_mode {
f954f3
   BLK_CACHE_IGNORE,      /* Do nothing */
f954f3
diff --git a/filters/cow/cow.c b/filters/cow/cow.c
f954f3
index f30b7505..78daca22 100644
f954f3
--- a/filters/cow/cow.c
f954f3
+++ b/filters/cow/cow.c
f954f3
@@ -210,7 +210,7 @@ cow_pread (nbdkit_next *next,
f954f3
            uint32_t flags, int *err)
f954f3
 {
f954f3
   CLEANUP_FREE uint8_t *block = NULL;
f954f3
-  uint64_t blknum, blkoffs;
f954f3
+  uint64_t blknum, blkoffs, nrblocks;
f954f3
   int r;
f954f3
 
f954f3
   if (!IS_ALIGNED (count | offset, BLKSIZE)) {
f954f3
@@ -243,21 +243,16 @@ cow_pread (nbdkit_next *next,
f954f3
   }
f954f3
 
f954f3
   /* Aligned body */
f954f3
-  /* XXX This breaks up large read requests into smaller ones, which
f954f3
-   * is a problem for plugins which have a large, fixed per-request
f954f3
-   * overhead (hello, curl).  We should try to keep large requests
f954f3
-   * together as much as possible, but that requires us to be much
f954f3
-   * smarter here.
f954f3
-   */
f954f3
-  while (count >= BLKSIZE) {
f954f3
-    r = blk_read (next, blknum, buf, err);
f954f3
+  nrblocks = count / BLKSIZE;
f954f3
+  if (nrblocks > 0) {
f954f3
+    r = blk_read_multiple (next, blknum, nrblocks, buf, err);
f954f3
     if (r == -1)
f954f3
       return -1;
f954f3
 
f954f3
-    buf += BLKSIZE;
f954f3
-    count -= BLKSIZE;
f954f3
-    offset += BLKSIZE;
f954f3
-    blknum++;
f954f3
+    buf += nrblocks * BLKSIZE;
f954f3
+    count -= nrblocks * BLKSIZE;
f954f3
+    offset += nrblocks * BLKSIZE;
f954f3
+    blknum += nrblocks;
f954f3
   }
f954f3
 
f954f3
   /* Unaligned tail */
f954f3
-- 
f954f3
2.31.1
f954f3