Blame SOURCES/0003-cache-cow-Add-blk_read_multiple-function.patch

f954f3
From b5dc8577c5c6d1205e2106b629fad327c3a409ea Mon Sep 17 00:00:00 2001
f954f3
From: "Richard W.M. Jones" <rjones@redhat.com>
f954f3
Date: Mon, 26 Jul 2021 13:55:21 +0100
f954f3
Subject: [PATCH] cache, cow: Add blk_read_multiple function
f954f3
f954f3
Currently the cache and cow filters break up large requests into many
f954f3
single block-sized requests to the underlying plugin.  For some
f954f3
plugins (eg. curl) this is very inefficient and causes huge
f954f3
slow-downs.
f954f3
f954f3
For example I tested nbdkit + curl vs nbdkit + cache + curl against a
f954f3
slow, remote VMware server.  A simple run of virt-inspector was at
f954f3
least 6-7 times slower with the cache filter.  (It was so slow that I
f954f3
didn't actually let it run to completion - I am estimating the
f954f3
slowdown multiple using interim debug messages).
f954f3
f954f3
Implement a new blk_read_multiple function in the cache filter.  It
f954f3
does not break up "runs" of blocks which all have the same cache
f954f3
state.  The cache .pread method uses the new function to read the
f954f3
block-aligned part of the request.
f954f3
f954f3
(cherry picked from commit ab661ccef5b3369fa22c33d0289baddc251b73bf)
f954f3
---
f954f3
 filters/cache/blk.c   | 83 ++++++++++++++++++++++++++++++++-----------
f954f3
 filters/cache/blk.h   |  6 ++++
f954f3
 filters/cache/cache.c | 21 +++++------
f954f3
 filters/cow/blk.c     | 63 +++++++++++++++++++++++---------
f954f3
 filters/cow/blk.h     |  6 ++++
f954f3
 filters/cow/cow.c     | 21 +++++------
f954f3
 6 files changed, 138 insertions(+), 62 deletions(-)
f954f3
f954f3
diff --git a/filters/cache/blk.c b/filters/cache/blk.c
f954f3
index f52f30e3..f85ada35 100644
f954f3
--- a/filters/cache/blk.c
f954f3
+++ b/filters/cache/blk.c
f954f3
@@ -44,6 +44,7 @@
f954f3
 #include <string.h>
f954f3
 #include <unistd.h>
f954f3
 #include <fcntl.h>
f954f3
+#include <limits.h>
f954f3
 #include <errno.h>
f954f3
 
f954f3
 #ifdef HAVE_SYS_STATVFS_H
f954f3
@@ -193,26 +194,40 @@ blk_set_size (uint64_t new_size)
f954f3
   return 0;
f954f3
 }
f954f3
 
f954f3
-int
f954f3
-blk_read (nbdkit_next *next,
f954f3
-          uint64_t blknum, uint8_t *block, int *err)
f954f3
+static int
f954f3
+_blk_read_multiple (nbdkit_next *next,
f954f3
+                    uint64_t blknum, uint64_t nrblocks,
f954f3
+                    uint8_t *block, int *err)
f954f3
 {
f954f3
   off_t offset = blknum * blksize;
f954f3
-  enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_CACHED);
f954f3
+  bool not_cached =
f954f3
+    bitmap_get_blk (&bm, blknum, BLOCK_NOT_CACHED) == BLOCK_NOT_CACHED;
f954f3
+  uint64_t b, runblocks;
f954f3
 
f954f3
-  reclaim (fd, &bm);
f954f3
+  assert (nrblocks > 0);
f954f3
 
f954f3
   if (cache_debug_verbose)
f954f3
-    nbdkit_debug ("cache: blk_read block %" PRIu64
f954f3
+    nbdkit_debug ("cache: blk_read_multiple block %" PRIu64
f954f3
                   " (offset %" PRIu64 ") is %s",
f954f3
                   blknum, (uint64_t) offset,
f954f3
-                  state == BLOCK_NOT_CACHED ? "not cached" :
f954f3
-                  state == BLOCK_CLEAN ? "clean" :
f954f3
-                  state == BLOCK_DIRTY ? "dirty" :
f954f3
-                  "unknown");
f954f3
+                  not_cached ? "not cached" : "cached");
f954f3
 
f954f3
-  if (state == BLOCK_NOT_CACHED) { /* Read underlying plugin. */
f954f3
-    unsigned n = blksize, tail = 0;
f954f3
+  /* Find out how many of the following blocks form a "run" with the
f954f3
+   * same cached/not-cached state.  We can process that many blocks in
f954f3
+   * one go.
f954f3
+   */
f954f3
+  for (b = 1, runblocks = 1; b < nrblocks; ++b, ++runblocks) {
f954f3
+    bool s =
f954f3
+      bitmap_get_blk (&bm, blknum + b, BLOCK_NOT_CACHED) == BLOCK_NOT_CACHED;
f954f3
+    if (not_cached != s)
f954f3
+      break;
f954f3
+  }
f954f3
+
f954f3
+  if (not_cached) {             /* Read underlying plugin. */
f954f3
+    unsigned n, tail = 0;
f954f3
+
f954f3
+    assert (blksize * runblocks <= UINT_MAX);
f954f3
+    n = blksize * runblocks;
f954f3
 
f954f3
     if (offset + n > size) {
f954f3
       tail = offset + n - size;
f954f3
@@ -228,32 +243,60 @@ blk_read (nbdkit_next *next,
f954f3
      */
f954f3
     memset (block + n, 0, tail);
f954f3
 
f954f3
-    /* If cache-on-read, copy the block to the cache. */
f954f3
+    /* If cache-on-read, copy the blocks to the cache. */
f954f3
     if (cache_on_read) {
f954f3
       if (cache_debug_verbose)
f954f3
         nbdkit_debug ("cache: cache-on-read block %" PRIu64
f954f3
                       " (offset %" PRIu64 ")",
f954f3
                       blknum, (uint64_t) offset);
f954f3
 
f954f3
-      if (pwrite (fd, block, blksize, offset) == -1) {
f954f3
+      if (pwrite (fd, block, blksize * runblocks, offset) == -1) {
f954f3
         *err = errno;
f954f3
         nbdkit_error ("pwrite: %m");
f954f3
         return -1;
f954f3
       }
f954f3
-      bitmap_set_blk (&bm, blknum, BLOCK_CLEAN);
f954f3
-      lru_set_recently_accessed (blknum);
f954f3
+      for (b = 0; b < runblocks; ++b) {
f954f3
+        bitmap_set_blk (&bm, blknum + b, BLOCK_CLEAN);
f954f3
+        lru_set_recently_accessed (blknum + b);
f954f3
+      }
f954f3
     }
f954f3
-    return 0;
f954f3
   }
f954f3
   else {                        /* Read cache. */
f954f3
-    if (pread (fd, block, blksize, offset) == -1) {
f954f3
+    if (pread (fd, block, blksize * runblocks, offset) == -1) {
f954f3
       *err = errno;
f954f3
       nbdkit_error ("pread: %m");
f954f3
       return -1;
f954f3
     }
f954f3
-    lru_set_recently_accessed (blknum);
f954f3
-    return 0;
f954f3
+    for (b = 0; b < runblocks; ++b)
f954f3
+      lru_set_recently_accessed (blknum + b);
f954f3
   }
f954f3
+
f954f3
+  /* If all done, return. */
f954f3
+  if (runblocks == nrblocks)
f954f3
+    return 0;
f954f3
+
f954f3
+  /* Recurse to read remaining blocks. */
f954f3
+  return _blk_read_multiple (next,
f954f3
+                             blknum + runblocks,
f954f3
+                             nrblocks - runblocks,
f954f3
+                             block + blksize * runblocks,
f954f3
+                             err);
f954f3
+}
f954f3
+
f954f3
+int
f954f3
+blk_read_multiple (nbdkit_next *next,
f954f3
+                   uint64_t blknum, uint64_t nrblocks,
f954f3
+                   uint8_t *block, int *err)
f954f3
+{
f954f3
+  reclaim (fd, &bm);
f954f3
+  return _blk_read_multiple (next, blknum, nrblocks, block, err);
f954f3
+}
f954f3
+
f954f3
+int
f954f3
+blk_read (nbdkit_next *next,
f954f3
+          uint64_t blknum, uint8_t *block, int *err)
f954f3
+{
f954f3
+  return blk_read_multiple (next, blknum, 1, block, err);
f954f3
 }
f954f3
 
f954f3
 int
f954f3
diff --git a/filters/cache/blk.h b/filters/cache/blk.h
f954f3
index 87c753e2..1ee33ed7 100644
f954f3
--- a/filters/cache/blk.h
f954f3
+++ b/filters/cache/blk.h
f954f3
@@ -55,6 +55,12 @@ extern int blk_read (nbdkit_next *next,
f954f3
                      uint64_t blknum, uint8_t *block, int *err)
f954f3
   __attribute__((__nonnull__ (1, 3, 4)));
f954f3
 
f954f3
+/* As above, but read multiple blocks. */
f954f3
+extern int blk_read_multiple (nbdkit_next *next,
f954f3
+                              uint64_t blknum, uint64_t nrblocks,
f954f3
+                              uint8_t *block, int *err)
f954f3
+  __attribute__((__nonnull__ (1, 4, 5)));
f954f3
+
f954f3
 /* If a single block is not cached, copy it from the plugin. */
f954f3
 extern int blk_cache (nbdkit_next *next,
f954f3
                       uint64_t blknum, uint8_t *block, int *err)
f954f3
diff --git a/filters/cache/cache.c b/filters/cache/cache.c
f954f3
index 745f552d..14cc03f2 100644
f954f3
--- a/filters/cache/cache.c
f954f3
+++ b/filters/cache/cache.c
f954f3
@@ -313,7 +313,7 @@ cache_pread (nbdkit_next *next,
f954f3
              uint32_t flags, int *err)
f954f3
 {
f954f3
   CLEANUP_FREE uint8_t *block = NULL;
f954f3
-  uint64_t blknum, blkoffs;
f954f3
+  uint64_t blknum, blkoffs, nrblocks;
f954f3
   int r;
f954f3
 
f954f3
   assert (!flags);
f954f3
@@ -348,22 +348,17 @@ cache_pread (nbdkit_next *next,
f954f3
   }
f954f3
 
f954f3
   /* Aligned body */
f954f3
-  /* XXX This breaks up large read requests into smaller ones, which
f954f3
-   * is a problem for plugins which have a large, fixed per-request
f954f3
-   * overhead (hello, curl).  We should try to keep large requests
f954f3
-   * together as much as possible, but that requires us to be much
f954f3
-   * smarter here.
f954f3
-   */
f954f3
-  while (count >= blksize) {
f954f3
+  nrblocks = count / blksize;
f954f3
+  if (nrblocks > 0) {
f954f3
     ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
f954f3
-    r = blk_read (next, blknum, buf, err);
f954f3
+    r = blk_read_multiple (next, blknum, nrblocks, buf, err);
f954f3
     if (r == -1)
f954f3
       return -1;
f954f3
 
f954f3
-    buf += blksize;
f954f3
-    count -= blksize;
f954f3
-    offset += blksize;
f954f3
-    blknum++;
f954f3
+    buf += nrblocks * blksize;
f954f3
+    count -= nrblocks * blksize;
f954f3
+    offset += nrblocks * blksize;
f954f3
+    blknum += nrblocks;
f954f3
   }
f954f3
 
f954f3
   /* Unaligned tail */
f954f3
diff --git a/filters/cow/blk.c b/filters/cow/blk.c
f954f3
index b7c4d7f1..4ec8d1b8 100644
f954f3
--- a/filters/cow/blk.c
f954f3
+++ b/filters/cow/blk.c
f954f3
@@ -79,6 +79,7 @@
f954f3
 #include <inttypes.h>
f954f3
 #include <unistd.h>
f954f3
 #include <fcntl.h>
f954f3
+#include <limits.h>
f954f3
 #include <errno.h>
f954f3
 #include <sys/types.h>
f954f3
 
f954f3
@@ -219,33 +220,48 @@ blk_status (uint64_t blknum, bool *present, bool *trimmed)
f954f3
   *trimmed = state == BLOCK_TRIMMED;
f954f3
 }
f954f3
 
f954f3
-/* These are the block operations.  They always read or write a single
f954f3
- * whole block of size ‘blksize’.
f954f3
+/* These are the block operations.  They always read or write whole
f954f3
+ * blocks of size ‘blksize’.
f954f3
  */
f954f3
 int
f954f3
-blk_read (nbdkit_next *next,
f954f3
-          uint64_t blknum, uint8_t *block, int *err)
f954f3
+blk_read_multiple (nbdkit_next *next,
f954f3
+                   uint64_t blknum, uint64_t nrblocks,
f954f3
+                   uint8_t *block, int *err)
f954f3
 {
f954f3
   off_t offset = blknum * BLKSIZE;
f954f3
   enum bm_entry state;
f954f3
+  uint64_t b, runblocks;
f954f3
 
f954f3
-  /* The state might be modified from another thread - for example
f954f3
-   * another thread might write (BLOCK_NOT_ALLOCATED ->
f954f3
-   * BLOCK_ALLOCATED) while we are reading from the plugin, returning
f954f3
-   * the old data.  However a read issued after the write returns
f954f3
-   * should always return the correct data.
f954f3
+  /* Find out how many of the following blocks form a "run" with the
f954f3
+   * same state.  We can process that many blocks in one go.
f954f3
+   *
f954f3
+   * About the locking: The state might be modified from another
f954f3
+   * thread - for example another thread might write
f954f3
+   * (BLOCK_NOT_ALLOCATED -> BLOCK_ALLOCATED) while we are reading
f954f3
+   * from the plugin, returning the old data.  However a read issued
f954f3
+   * after the write returns should always return the correct data.
f954f3
    */
f954f3
   {
f954f3
     ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
f954f3
     state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
f954f3
+
f954f3
+    for (b = 1, runblocks = 1; b < nrblocks; ++b, ++runblocks) {
f954f3
+      enum bm_entry s = bitmap_get_blk (&bm, blknum + b, BLOCK_NOT_ALLOCATED);
f954f3
+      if (state != s)
f954f3
+        break;
f954f3
+    }
f954f3
   }
f954f3
 
f954f3
   if (cow_debug_verbose)
f954f3
-    nbdkit_debug ("cow: blk_read block %" PRIu64 " (offset %" PRIu64 ") is %s",
f954f3
+    nbdkit_debug ("cow: blk_read_multiple block %" PRIu64
f954f3
+                  " (offset %" PRIu64 ") is %s",
f954f3
                   blknum, (uint64_t) offset, state_to_string (state));
f954f3
 
f954f3
   if (state == BLOCK_NOT_ALLOCATED) { /* Read underlying plugin. */
f954f3
-    unsigned n = BLKSIZE, tail = 0;
f954f3
+    unsigned n, tail = 0;
f954f3
+
f954f3
+    assert (BLKSIZE * runblocks <= UINT_MAX);
f954f3
+    n = BLKSIZE * runblocks;
f954f3
 
f954f3
     if (offset + n > size) {
f954f3
       tail = offset + n - size;
f954f3
@@ -260,20 +276,35 @@ blk_read (nbdkit_next *next,
f954f3
      * zeroing the tail.
f954f3
      */
f954f3
     memset (block + n, 0, tail);
f954f3
-    return 0;
f954f3
   }
f954f3
   else if (state == BLOCK_ALLOCATED) { /* Read overlay. */
f954f3
-    if (pread (fd, block, BLKSIZE, offset) == -1) {
f954f3
+    if (pread (fd, block, BLKSIZE * runblocks, offset) == -1) {
f954f3
       *err = errno;
f954f3
       nbdkit_error ("pread: %m");
f954f3
       return -1;
f954f3
     }
f954f3
-    return 0;
f954f3
   }
f954f3
   else /* state == BLOCK_TRIMMED */ {
f954f3
-    memset (block, 0, BLKSIZE);
f954f3
-    return 0;
f954f3
+    memset (block, 0, BLKSIZE * runblocks);
f954f3
   }
f954f3
+
f954f3
+  /* If all done, return. */
f954f3
+  if (runblocks == nrblocks)
f954f3
+    return 0;
f954f3
+
f954f3
+  /* Recurse to read remaining blocks. */
f954f3
+  return blk_read_multiple (next,
f954f3
+                            blknum + runblocks,
f954f3
+                            nrblocks - runblocks,
f954f3
+                            block + BLKSIZE * runblocks,
f954f3
+                            err);
f954f3
+}
f954f3
+
f954f3
+int
f954f3
+blk_read (nbdkit_next *next,
f954f3
+          uint64_t blknum, uint8_t *block, int *err)
f954f3
+{
f954f3
+  return blk_read_multiple (next, blknum, 1, block, err);
f954f3
 }
f954f3
 
f954f3
 int
f954f3
diff --git a/filters/cow/blk.h b/filters/cow/blk.h
f954f3
index e6fd7417..b066c602 100644
f954f3
--- a/filters/cow/blk.h
f954f3
+++ b/filters/cow/blk.h
f954f3
@@ -55,6 +55,12 @@ extern int blk_read (nbdkit_next *next,
f954f3
                      uint64_t blknum, uint8_t *block, int *err)
f954f3
   __attribute__((__nonnull__ (1, 3, 4)));
f954f3
 
f954f3
+/* Read multiple blocks from the overlay or plugin. */
f954f3
+extern int blk_read_multiple (nbdkit_next *next,
f954f3
+                              uint64_t blknum, uint64_t nrblocks,
f954f3
+                              uint8_t *block, int *err)
f954f3
+  __attribute__((__nonnull__ (1, 4, 5)));
f954f3
+
f954f3
 /* Cache mode for blocks not already in overlay */
f954f3
 enum cache_mode {
f954f3
   BLK_CACHE_IGNORE,      /* Do nothing */
f954f3
diff --git a/filters/cow/cow.c b/filters/cow/cow.c
f954f3
index f30b7505..78daca22 100644
f954f3
--- a/filters/cow/cow.c
f954f3
+++ b/filters/cow/cow.c
f954f3
@@ -210,7 +210,7 @@ cow_pread (nbdkit_next *next,
f954f3
            uint32_t flags, int *err)
f954f3
 {
f954f3
   CLEANUP_FREE uint8_t *block = NULL;
f954f3
-  uint64_t blknum, blkoffs;
f954f3
+  uint64_t blknum, blkoffs, nrblocks;
f954f3
   int r;
f954f3
 
f954f3
   if (!IS_ALIGNED (count | offset, BLKSIZE)) {
f954f3
@@ -243,21 +243,16 @@ cow_pread (nbdkit_next *next,
f954f3
   }
f954f3
 
f954f3
   /* Aligned body */
f954f3
-  /* XXX This breaks up large read requests into smaller ones, which
f954f3
-   * is a problem for plugins which have a large, fixed per-request
f954f3
-   * overhead (hello, curl).  We should try to keep large requests
f954f3
-   * together as much as possible, but that requires us to be much
f954f3
-   * smarter here.
f954f3
-   */
f954f3
-  while (count >= BLKSIZE) {
f954f3
-    r = blk_read (next, blknum, buf, err);
f954f3
+  nrblocks = count / BLKSIZE;
f954f3
+  if (nrblocks > 0) {
f954f3
+    r = blk_read_multiple (next, blknum, nrblocks, buf, err);
f954f3
     if (r == -1)
f954f3
       return -1;
f954f3
 
f954f3
-    buf += BLKSIZE;
f954f3
-    count -= BLKSIZE;
f954f3
-    offset += BLKSIZE;
f954f3
-    blknum++;
f954f3
+    buf += nrblocks * BLKSIZE;
f954f3
+    count -= nrblocks * BLKSIZE;
f954f3
+    offset += nrblocks * BLKSIZE;
f954f3
+    blknum += nrblocks;
f954f3
   }
f954f3
 
f954f3
   /* Unaligned tail */
f954f3
-- 
f954f3
2.31.1
f954f3