From 13223e8e3219d0310ce4d94093bbdb7732a891fb Mon Sep 17 00:00:00 2001 From: "Richard W.M. Jones" Date: Wed, 27 Oct 2021 10:17:22 +0100 Subject: [PATCH] vddk: Implement parallel thread model Since VDDK 6.0, asynchronous read and write operations are available. This commit makes use of these, allowing us to use the parallel thread model for increased performance. Note that at least VDDK 6.5 is required because VDDK 6.0 had a different and incompatible signature for VixDiskLibCompletionCB. Also note at least vSphere 6.7 is required for asynch calls to make any performance difference. In older versions they work synchronously. In the parallel thread model, nbdkit will be calling us in parallel from multiple nbdkit threads. VDDK does not allow multiple threads to simultaneously call VDDK operations on the same handle. So we create a background thread per handle (== connection). Only the background thread makes VDDK calls[1]. The background thread handles a mix of synchronous (like extents, flush) and asynchronous (like read, write) operations, but all from one thread. Parallel nbdkit threads issue commands to the background thread associated with each handle, and wait until they are retired. [1] All VDDK calls except for connecting and disconnecting which for different reasons are protected by a global lock, so I did not need to change those. (cherry picked from commit 1eecf15fc3d8ea253ccec4f5883fdbb9aa6f8c2b) --- plugins/vddk/Makefile.am | 1 + plugins/vddk/nbdkit-vddk-plugin.pod | 11 +- plugins/vddk/vddk.c | 380 +++++-------------- plugins/vddk/vddk.h | 49 ++- plugins/vddk/worker.c | 567 ++++++++++++++++++++++++++++ tests/dummy-vddk.c | 32 ++ 6 files changed, 745 insertions(+), 295 deletions(-) create mode 100644 plugins/vddk/worker.c diff --git a/plugins/vddk/Makefile.am b/plugins/vddk/Makefile.am index 4f470ff9..f8382fc9 100644 --- a/plugins/vddk/Makefile.am +++ b/plugins/vddk/Makefile.am @@ -49,6 +49,7 @@ nbdkit_vddk_plugin_la_SOURCES = \ stats.c \ vddk-structs.h \ vddk-stubs.h \ + worker.c \ $(top_srcdir)/include/nbdkit-plugin.h \ $(NULL) diff --git a/plugins/vddk/nbdkit-vddk-plugin.pod b/plugins/vddk/nbdkit-vddk-plugin.pod index 1c16d096..ce82a734 100644 --- a/plugins/vddk/nbdkit-vddk-plugin.pod +++ b/plugins/vddk/nbdkit-vddk-plugin.pod @@ -523,6 +523,14 @@ read bandwidth to the VMware server. Same as above, but for writing and flushing writes. +=item C + +=item C + +Same as above, but for asynchronous read and write calls introduced in +nbdkit 1.30. Unfortunately at the moment the amount of time spent in +these calls is not accounted for correctly. + =item C This call is used to query information about the sparseness of the @@ -580,7 +588,8 @@ Debug extents returned by C. =item B<-D vddk.datapath=0> -Suppress debugging of datapath calls (C and C). +Suppress debugging of datapath calls (C, C, C +and C). =item B<-D vddk.stats=1> diff --git a/plugins/vddk/vddk.c b/plugins/vddk/vddk.c index 67ac775c..9f223db0 100644 --- a/plugins/vddk/vddk.c +++ b/plugins/vddk/vddk.c @@ -50,9 +50,6 @@ #include #include "cleanup.h" -#include "minmax.h" -#include "rounding.h" -#include "tvdiff.h" #include "vector.h" #include "vddk.h" @@ -522,23 +519,18 @@ vddk_dump_plugin (void) /* The rules on threads and VDDK are here: * https://code.vmware.com/docs/11750/virtual-disk-development-kit-programming-guide/GUID-6BE903E8-DC70-46D9-98E4-E34A2002C2AD.html * - * Before nbdkit 1.22 we used SERIALIZE_ALL_REQUESTS. Since nbdkit - * 1.22 we changed this to SERIALIZE_REQUESTS and added a mutex around - * calls to VixDiskLib_Open and VixDiskLib_Close. This is not quite - * within the letter of the rules, but is within the spirit. + * Before nbdkit 1.22 we used SERIALIZE_ALL_REQUESTS. In nbdkit + * 1.22-1.28 we changed this to SERIALIZE_REQUESTS and added a mutex + * around calls to VixDiskLib_Open and VixDiskLib_Close. In nbdkit + * 1.30 and above we assign a background thread per connection to do + * asynch operations and use the PARALLEL model. We still need the + * lock around Open and Close. */ -#define THREAD_MODEL NBDKIT_THREAD_MODEL_SERIALIZE_REQUESTS +#define THREAD_MODEL NBDKIT_THREAD_MODEL_PARALLEL /* Lock protecting open/close calls - see above. */ static pthread_mutex_t open_close_lock = PTHREAD_MUTEX_INITIALIZER; -/* The per-connection handle. */ -struct vddk_handle { - VixDiskLibConnectParams *params; /* connection parameters */ - VixDiskLibConnection connection; /* connection */ - VixDiskLibHandle handle; /* disk handle */ -}; - static inline VixDiskLibConnectParams * allocate_connect_params (void) { @@ -579,12 +571,16 @@ vddk_open (int readonly) VixError err; uint32_t flags; const char *transport_mode; + int pterr; - h = malloc (sizeof *h); + h = calloc (1, sizeof *h); if (h == NULL) { - nbdkit_error ("malloc: %m"); + nbdkit_error ("calloc: %m"); return NULL; } + h->commands = (command_queue) empty_vector; + pthread_mutex_init (&h->commands_lock, NULL); + pthread_cond_init (&h->commands_cond, NULL); h->params = allocate_connect_params (); if (h->params == NULL) { @@ -661,8 +657,22 @@ vddk_open (int readonly) VDDK_CALL_END (VixDiskLib_GetTransportMode, 0); nbdkit_debug ("transport mode: %s", transport_mode); + /* Start the background thread which actually does the asynchronous + * work. + */ + pterr = pthread_create (&h->thread, NULL, vddk_worker_thread, h); + if (pterr != 0) { + errno = pterr; + nbdkit_error ("pthread_create: %m"); + goto err3; + } + return h; + err3: + VDDK_CALL_START (VixDiskLib_Close, "handle") + VixDiskLib_Close (h->handle); + VDDK_CALL_END (VixDiskLib_Close, 0); err2: VDDK_CALL_START (VixDiskLib_Disconnect, "connection") VixDiskLib_Disconnect (h->connection); @@ -670,6 +680,8 @@ vddk_open (int readonly) err1: free_connect_params (h->params); err0: + pthread_mutex_destroy (&h->commands_lock); + pthread_cond_destroy (&h->commands_cond); free (h); return NULL; } @@ -680,6 +692,10 @@ vddk_close (void *handle) { ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&open_close_lock); struct vddk_handle *h = handle; + struct command stop_cmd = { .type = STOP }; + + send_command_and_wait (h, &stop_cmd); + pthread_join (h->thread, NULL); VDDK_CALL_START (VixDiskLib_Close, "handle") VixDiskLib_Close (h->handle); @@ -689,6 +705,9 @@ vddk_close (void *handle) VDDK_CALL_END (VixDiskLib_Disconnect, 0); free_connect_params (h->params); + pthread_mutex_destroy (&h->commands_lock); + pthread_cond_destroy (&h->commands_cond); + command_queue_reset (&h->commands); free (h); } @@ -697,54 +716,29 @@ static int64_t vddk_get_size (void *handle) { struct vddk_handle *h = handle; - VixDiskLibInfo *info; - VixError err; uint64_t size; + struct command get_size_cmd = { .type = GET_SIZE, .ptr = &size }; - VDDK_CALL_START (VixDiskLib_GetInfo, "handle, &info") - err = VixDiskLib_GetInfo (h->handle, &info); - VDDK_CALL_END (VixDiskLib_GetInfo, 0); - if (err != VIX_OK) { - VDDK_ERROR (err, "VixDiskLib_GetInfo"); + if (send_command_and_wait (h, &get_size_cmd) == -1) return -1; - } - - size = info->capacity * (uint64_t)VIXDISKLIB_SECTOR_SIZE; - - if (vddk_debug_diskinfo) { - nbdkit_debug ("disk info: capacity: %" PRIu64 " sectors " - "(%" PRIi64 " bytes)", - info->capacity, size); - nbdkit_debug ("disk info: biosGeo: C:%" PRIu32 " H:%" PRIu32 " S:%" PRIu32, - info->biosGeo.cylinders, - info->biosGeo.heads, - info->biosGeo.sectors); - nbdkit_debug ("disk info: physGeo: C:%" PRIu32 " H:%" PRIu32 " S:%" PRIu32, - info->physGeo.cylinders, - info->physGeo.heads, - info->physGeo.sectors); - nbdkit_debug ("disk info: adapter type: %d", - (int) info->adapterType); - nbdkit_debug ("disk info: num links: %d", info->numLinks); - nbdkit_debug ("disk info: parent filename hint: %s", - info->parentFileNameHint ? : "NULL"); - nbdkit_debug ("disk info: uuid: %s", - info->uuid ? : "NULL"); - if (library_version >= 7) { - nbdkit_debug ("disk info: sector size: " - "logical %" PRIu32 " physical %" PRIu32, - info->logicalSectorSize, - info->physicalSectorSize); - } - } - - VDDK_CALL_START (VixDiskLib_FreeInfo, "info") - VixDiskLib_FreeInfo (info); - VDDK_CALL_END (VixDiskLib_FreeInfo, 0); return (int64_t) size; } +static int +vddk_can_fua (void *handle) +{ + /* The Flush call was not available in VDDK < 6.0. */ + return VixDiskLib_Flush != NULL ? NBDKIT_FUA_NATIVE : NBDKIT_FUA_NONE; +} + +static int +vddk_can_flush (void *handle) +{ + /* The Flush call was not available in VDDK < 6.0. */ + return VixDiskLib_Flush != NULL; +} + /* Read data from the file. * * Note that reads have to be aligned to sectors (XXX). @@ -754,32 +748,14 @@ vddk_pread (void *handle, void *buf, uint32_t count, uint64_t offset, uint32_t flags) { struct vddk_handle *h = handle; - VixError err; + struct command read_cmd = { + .type = READ, + .ptr = buf, + .count = count, + .offset = offset, + }; - /* Align to sectors. */ - if (!IS_ALIGNED (offset, VIXDISKLIB_SECTOR_SIZE)) { - nbdkit_error ("%s is not aligned to sectors", "read"); - return -1; - } - if (!IS_ALIGNED (count, VIXDISKLIB_SECTOR_SIZE)) { - nbdkit_error ("%s is not aligned to sectors", "read"); - return -1; - } - offset /= VIXDISKLIB_SECTOR_SIZE; - count /= VIXDISKLIB_SECTOR_SIZE; - - VDDK_CALL_START (VixDiskLib_Read, - "handle, %" PRIu64 " sectors, " - "%" PRIu32 " sectors, buffer", - offset, count) - err = VixDiskLib_Read (h->handle, offset, count, buf); - VDDK_CALL_END (VixDiskLib_Read, count * VIXDISKLIB_SECTOR_SIZE); - if (err != VIX_OK) { - VDDK_ERROR (err, "VixDiskLib_Read"); - return -1; - } - - return 0; + return send_command_and_wait (h, &read_cmd); } static int vddk_flush (void *handle, uint32_t flags); @@ -792,32 +768,17 @@ static int vddk_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset, uint32_t flags) { + struct vddk_handle *h = handle; const bool fua = flags & NBDKIT_FLAG_FUA; - struct vddk_handle *h = handle; - VixError err; + struct command write_cmd = { + .type = WRITE, + .ptr = (void *) buf, + .count = count, + .offset = offset, + }; - /* Align to sectors. */ - if (!IS_ALIGNED (offset, VIXDISKLIB_SECTOR_SIZE)) { - nbdkit_error ("%s is not aligned to sectors", "write"); + if (send_command_and_wait (h, &write_cmd) == -1) return -1; - } - if (!IS_ALIGNED (count, VIXDISKLIB_SECTOR_SIZE)) { - nbdkit_error ("%s is not aligned to sectors", "write"); - return -1; - } - offset /= VIXDISKLIB_SECTOR_SIZE; - count /= VIXDISKLIB_SECTOR_SIZE; - - VDDK_CALL_START (VixDiskLib_Write, - "handle, %" PRIu64 " sectors, " - "%" PRIu32 " sectors, buffer", - offset, count) - err = VixDiskLib_Write (h->handle, offset, count, buf); - VDDK_CALL_END (VixDiskLib_Write, count * VIXDISKLIB_SECTOR_SIZE); - if (err != VIX_OK) { - VDDK_ERROR (err, "VixDiskLib_Write"); - return -1; - } if (fua) { if (vddk_flush (handle, 0) == -1) @@ -827,126 +788,32 @@ vddk_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset, return 0; } -static int -vddk_can_fua (void *handle) -{ - /* The Flush call was not available in VDDK < 6.0. */ - return VixDiskLib_Flush != NULL ? NBDKIT_FUA_NATIVE : NBDKIT_FUA_NONE; -} - -static int -vddk_can_flush (void *handle) -{ - /* The Flush call was not available in VDDK < 6.0. */ - return VixDiskLib_Flush != NULL; -} - /* Flush data to the file. */ static int vddk_flush (void *handle, uint32_t flags) { struct vddk_handle *h = handle; - VixError err; + struct command flush_cmd = { + .type = FLUSH, + }; - /* The documentation for Flush is missing, but the comment in the - * header file seems to indicate that it waits for WriteAsync - * commands to finish. We don't use WriteAsync, and in any case - * there's a new function Wait to wait for those. However I - * verified using strace that in fact Flush does call fsync on the - * file so it appears to be the correct call to use here. - */ - - VDDK_CALL_START (VixDiskLib_Flush, "handle") - err = VixDiskLib_Flush (h->handle); - VDDK_CALL_END (VixDiskLib_Flush, 0); - if (err != VIX_OK) { - VDDK_ERROR (err, "VixDiskLib_Flush"); - return -1; - } - - return 0; + return send_command_and_wait (h, &flush_cmd); } static int vddk_can_extents (void *handle) { struct vddk_handle *h = handle; - VixError err; - VixDiskLibBlockList *block_list; + int ret; + struct command can_extents_cmd = { + .type = CAN_EXTENTS, + .ptr = &ret, + }; - /* This call was added in VDDK 6.7. In earlier versions the - * function pointer will be NULL and we cannot query extents. - */ - if (VixDiskLib_QueryAllocatedBlocks == NULL) { - nbdkit_debug ("can_extents: VixDiskLib_QueryAllocatedBlocks == NULL, " - "probably this is VDDK < 6.7"); - return 0; - } - - /* Suppress errors around this call. See: - * https://bugzilla.redhat.com/show_bug.cgi?id=1709211#c7 - */ - error_suppression = 1; - - /* However even when the call is available it rarely works well so - * the best thing we can do here is to try the call and if it's - * non-functional return false. - */ - VDDK_CALL_START (VixDiskLib_QueryAllocatedBlocks, - "handle, 0, %d sectors, %d sectors", - VIXDISKLIB_MIN_CHUNK_SIZE, VIXDISKLIB_MIN_CHUNK_SIZE) - err = VixDiskLib_QueryAllocatedBlocks (h->handle, - 0, VIXDISKLIB_MIN_CHUNK_SIZE, - VIXDISKLIB_MIN_CHUNK_SIZE, - &block_list); - VDDK_CALL_END (VixDiskLib_QueryAllocatedBlocks, 0); - error_suppression = 0; - if (err == VIX_OK) { - VDDK_CALL_START (VixDiskLib_FreeBlockList, "block_list") - VixDiskLib_FreeBlockList (block_list); - VDDK_CALL_END (VixDiskLib_FreeBlockList, 0); - } - if (err != VIX_OK) { - char *errmsg = VixDiskLib_GetErrorText (err, NULL); - nbdkit_debug ("can_extents: VixDiskLib_QueryAllocatedBlocks test failed, " - "extents support will be disabled: " - "original error: %s", - errmsg); - VixDiskLib_FreeErrorText (errmsg); - return 0; - } - - return 1; -} - -static int -add_extent (struct nbdkit_extents *extents, - uint64_t *position, uint64_t next_position, bool is_hole) -{ - uint32_t type = 0; - const uint64_t length = next_position - *position; - - if (is_hole) { - type = NBDKIT_EXTENT_HOLE; - /* Images opened as single link might be backed by another file in the - chain, so the holes are not guaranteed to be zeroes. */ - if (!single_link) - type |= NBDKIT_EXTENT_ZERO; - } - - assert (*position <= next_position); - if (*position == next_position) - return 0; - - if (vddk_debug_extents) - nbdkit_debug ("adding extent type %s at [%" PRIu64 "...%" PRIu64 "]", - is_hole ? "hole" : "allocated data", - *position, next_position-1); - if (nbdkit_add_extent (extents, *position, length, type) == -1) + if (send_command_and_wait (h, &can_extents_cmd) == -1) return -1; - *position = next_position; - return 0; + return ret; } static int @@ -955,88 +822,15 @@ vddk_extents (void *handle, uint32_t count, uint64_t offset, uint32_t flags, { struct vddk_handle *h = handle; bool req_one = flags & NBDKIT_FLAG_REQ_ONE; - uint64_t position, end, start_sector; - - position = offset; - end = offset + count; - - /* We can only query whole chunks. Therefore start with the first - * chunk before offset. - */ - start_sector = - ROUND_DOWN (offset, VIXDISKLIB_MIN_CHUNK_SIZE * VIXDISKLIB_SECTOR_SIZE) - / VIXDISKLIB_SECTOR_SIZE; - while (start_sector * VIXDISKLIB_SECTOR_SIZE < end) { - VixError err; - uint32_t i; - uint64_t nr_chunks, nr_sectors; - VixDiskLibBlockList *block_list; - - assert (IS_ALIGNED (start_sector, VIXDISKLIB_MIN_CHUNK_SIZE)); - - nr_chunks = - ROUND_UP (end - start_sector * VIXDISKLIB_SECTOR_SIZE, - VIXDISKLIB_MIN_CHUNK_SIZE * VIXDISKLIB_SECTOR_SIZE) - / (VIXDISKLIB_MIN_CHUNK_SIZE * VIXDISKLIB_SECTOR_SIZE); - nr_chunks = MIN (nr_chunks, VIXDISKLIB_MAX_CHUNK_NUMBER); - nr_sectors = nr_chunks * VIXDISKLIB_MIN_CHUNK_SIZE; - - VDDK_CALL_START (VixDiskLib_QueryAllocatedBlocks, - "handle, %" PRIu64 " sectors, %" PRIu64 " sectors, " - "%d sectors", - start_sector, nr_sectors, VIXDISKLIB_MIN_CHUNK_SIZE) - err = VixDiskLib_QueryAllocatedBlocks (h->handle, - start_sector, nr_sectors, - VIXDISKLIB_MIN_CHUNK_SIZE, - &block_list); - VDDK_CALL_END (VixDiskLib_QueryAllocatedBlocks, 0); - if (err != VIX_OK) { - VDDK_ERROR (err, "VixDiskLib_QueryAllocatedBlocks"); - return -1; - } - - for (i = 0; i < block_list->numBlocks; ++i) { - uint64_t blk_offset, blk_length; - - blk_offset = block_list->blocks[i].offset * VIXDISKLIB_SECTOR_SIZE; - blk_length = block_list->blocks[i].length * VIXDISKLIB_SECTOR_SIZE; - - /* The query returns allocated blocks. We must insert holes - * between the blocks as necessary. - */ - if ((position < blk_offset && - add_extent (extents, &position, blk_offset, true) == -1) || - (add_extent (extents, - &position, blk_offset + blk_length, false) == -1)) { - VDDK_CALL_START (VixDiskLib_FreeBlockList, "block_list") - VixDiskLib_FreeBlockList (block_list); - VDDK_CALL_END (VixDiskLib_FreeBlockList, 0); - return -1; - } - } - VDDK_CALL_START (VixDiskLib_FreeBlockList, "block_list") - VixDiskLib_FreeBlockList (block_list); - VDDK_CALL_END (VixDiskLib_FreeBlockList, 0); - - /* There's an implicit hole after the returned list of blocks, up - * to the end of the QueryAllocatedBlocks request. - */ - if (add_extent (extents, - &position, - (start_sector + nr_sectors) * VIXDISKLIB_SECTOR_SIZE, - true) == -1) - return -1; - - start_sector += nr_sectors; - - /* If one extent was requested, as long as we've added an extent - * overlapping the original offset we're done. - */ - if (req_one && position > offset) - break; - } - - return 0; + struct command extents_cmd = { + .type = EXTENTS, + .ptr = extents, + .count = count, + .offset = offset, + .req_one = req_one, + }; + + return send_command_and_wait (h, &extents_cmd); } static struct nbdkit_plugin plugin = { diff --git a/plugins/vddk/vddk.h b/plugins/vddk/vddk.h index 1400589d..be0b3492 100644 --- a/plugins/vddk/vddk.h +++ b/plugins/vddk/vddk.h @@ -90,7 +90,9 @@ extern int vddk_debug_stats; /* GCC can optimize this away at compile time: */ \ const bool datapath = \ strcmp (#fn, "VixDiskLib_Read") == 0 || \ - strcmp (#fn, "VixDiskLib_Write") == 0; \ + strcmp (#fn, "VixDiskLib_ReadAsync") == 0 || \ + strcmp (#fn, "VixDiskLib_Write") == 0 || \ + strcmp (#fn, "VixDiskLib_WriteAsync") == 0; \ if (vddk_debug_stats) \ gettimeofday (&start_t, NULL); \ if (!datapath || vddk_debug_datapath) \ @@ -120,6 +122,46 @@ extern int vddk_debug_stats; VDDK_CALL_END (VixDiskLib_FreeErrorText, 0); \ } while (0) +/* Queue of asynchronous commands sent to the background thread. */ +enum command_type { GET_SIZE, READ, WRITE, FLUSH, CAN_EXTENTS, EXTENTS, STOP }; +struct command { + /* These fields are set by the caller. */ + enum command_type type; /* command */ + void *ptr; /* buffer, extents list, return values */ + uint32_t count; /* READ, WRITE, EXTENTS */ + uint64_t offset; /* READ, WRITE, EXTENTS */ + bool req_one; /* EXTENTS NBDKIT_FLAG_REQ_ONE */ + + /* This field is set to a unique value by send_command_and_wait. */ + uint64_t id; /* serial number */ + + /* These fields are used by the internal implementation. */ + pthread_mutex_t mutex; /* completion mutex */ + pthread_cond_t cond; /* completion condition */ + enum { SUBMITTED, SUCCEEDED, FAILED } status; +}; + +DEFINE_VECTOR_TYPE(command_queue, struct command *) + +/* The per-connection handle. */ +struct vddk_handle { + VixDiskLibConnectParams *params; /* connection parameters */ + VixDiskLibConnection connection; /* connection */ + VixDiskLibHandle handle; /* disk handle */ + + pthread_t thread; /* background thread for asynch work */ + + /* Command queue of commands sent to the background thread. Use + * send_command_and_wait to add a command. Only the background + * thread must make VDDK API calls (apart from opening and closing). + * The lock protects all of these fields. + */ + pthread_mutex_t commands_lock; /* lock */ + command_queue commands; /* command queue */ + pthread_cond_t commands_cond; /* condition (queue size 0 -> 1) */ + uint64_t id; /* next command ID */ +}; + /* reexec.c */ extern bool noreexec; extern char *reexeced; @@ -141,4 +183,9 @@ extern pthread_mutex_t stats_lock; #undef OPTIONAL_STUB extern void display_stats (void); +/* worker.c */ +extern const char *command_type_string (enum command_type type); +extern int send_command_and_wait (struct vddk_handle *h, struct command *cmd); +extern void *vddk_worker_thread (void *handle); + #endif /* NBDKIT_VDDK_H */ diff --git a/plugins/vddk/worker.c b/plugins/vddk/worker.c new file mode 100644 index 00000000..2a1d4f26 --- /dev/null +++ b/plugins/vddk/worker.c @@ -0,0 +1,567 @@ +/* nbdkit + * Copyright (C) 2013-2021 Red Hat Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the name of Red Hat nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include +#include +#include + +#include + +#define NBDKIT_API_VERSION 2 +#include + +#include "cleanup.h" +#include "minmax.h" +#include "rounding.h" +#include "vector.h" + +#include "vddk.h" + +const char * +command_type_string (enum command_type type) +{ + switch (type) { + case GET_SIZE: return "get_size"; + case READ: return "read"; + case WRITE: return "write"; + case FLUSH: return "flush"; + case CAN_EXTENTS: return "can_extents"; + case EXTENTS: return "extents"; + case STOP: return "stop"; + default: abort (); + } +} + +/* Send command to the background thread and wait for completion. + * + * Returns 0 for OK + * On error, calls nbdkit_error and returns -1. + */ +int +send_command_and_wait (struct vddk_handle *h, struct command *cmd) +{ + /* Add the command to the command queue. */ + { + ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&h->commands_lock); + cmd->id = h->id++; + + if (command_queue_append (&h->commands, cmd) == -1) + /* On error command_queue_append will call nbdkit_error. */ + return -1; + + /* Signal the caller if it could be sleeping on an empty queue. */ + if (h->commands.size == 1) + pthread_cond_signal (&h->commands_cond); + + /* This will be used to signal command completion back to us. */ + pthread_mutex_init (&cmd->mutex, NULL); + pthread_cond_init (&cmd->cond, NULL); + } + + /* Wait for the command to be completed by the background thread. */ + { + ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&cmd->mutex); + while (cmd->status == SUBMITTED) + pthread_cond_wait (&cmd->cond, &cmd->mutex); + } + + pthread_mutex_destroy (&cmd->mutex); + pthread_cond_destroy (&cmd->cond); + + /* On error the background thread will call nbdkit_error. */ + switch (cmd->status) { + case SUCCEEDED: return 0; + case FAILED: return -1; + default: abort (); + } +} + +/* Asynchronous commands are completed when this function is called. */ +static void +complete_command (void *vp, VixError result) +{ + struct command *cmd = vp; + + if (vddk_debug_datapath) + nbdkit_debug ("command %" PRIu64 " completed", cmd->id); + + ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&cmd->mutex); + + if (result == VIX_OK) { + cmd->status = SUCCEEDED; + } else { + VDDK_ERROR (result, "command %" PRIu64 ": asynchronous %s failed", + cmd->id, command_type_string (cmd->type)); + cmd->status = FAILED; + } + + pthread_cond_signal (&cmd->cond); +} + +/* Wait for any asynchronous commands to complete. */ +static int +do_stop (struct command *cmd, struct vddk_handle *h) +{ + VixError err; + + /* Because we assume VDDK >= 6.5, VixDiskLib_Wait must exist. */ + VDDK_CALL_START (VixDiskLib_Wait, "handle") + err = VixDiskLib_Wait (h->handle); + VDDK_CALL_END (VixDiskLib_Wait, 0); + if (err != VIX_OK) { + VDDK_ERROR (err, "VixDiskLib_Wait"); + /* In the end this error indication is ignored because it only + * happens on the close path when we cannot handle errors. + */ + return -1; + } + return 0; +} + +/* Get size command. */ +static int64_t +do_get_size (struct command *cmd, struct vddk_handle *h) +{ + VixError err; + VixDiskLibInfo *info; + uint64_t size; + + VDDK_CALL_START (VixDiskLib_GetInfo, "handle, &info") + err = VixDiskLib_GetInfo (h->handle, &info); + VDDK_CALL_END (VixDiskLib_GetInfo, 0); + if (err != VIX_OK) { + VDDK_ERROR (err, "VixDiskLib_GetInfo"); + return -1; + } + + size = info->capacity * (uint64_t)VIXDISKLIB_SECTOR_SIZE; + + if (vddk_debug_diskinfo) { + nbdkit_debug ("disk info: capacity: %" PRIu64 " sectors " + "(%" PRIi64 " bytes)", + info->capacity, size); + nbdkit_debug ("disk info: biosGeo: C:%" PRIu32 " H:%" PRIu32 " S:%" PRIu32, + info->biosGeo.cylinders, + info->biosGeo.heads, + info->biosGeo.sectors); + nbdkit_debug ("disk info: physGeo: C:%" PRIu32 " H:%" PRIu32 " S:%" PRIu32, + info->physGeo.cylinders, + info->physGeo.heads, + info->physGeo.sectors); + nbdkit_debug ("disk info: adapter type: %d", + (int) info->adapterType); + nbdkit_debug ("disk info: num links: %d", info->numLinks); + nbdkit_debug ("disk info: parent filename hint: %s", + info->parentFileNameHint ? : "NULL"); + nbdkit_debug ("disk info: uuid: %s", + info->uuid ? : "NULL"); + if (library_version >= 7) { + nbdkit_debug ("disk info: sector size: " + "logical %" PRIu32 " physical %" PRIu32, + info->logicalSectorSize, + info->physicalSectorSize); + } + } + + VDDK_CALL_START (VixDiskLib_FreeInfo, "info") + VixDiskLib_FreeInfo (info); + VDDK_CALL_END (VixDiskLib_FreeInfo, 0); + + return (int64_t) size; +} + +static int +do_read (struct command *cmd, struct vddk_handle *h) +{ + VixError err; + uint32_t count = cmd->count; + uint64_t offset = cmd->offset; + void *buf = cmd->ptr; + + /* Align to sectors. */ + if (!IS_ALIGNED (offset, VIXDISKLIB_SECTOR_SIZE)) { + nbdkit_error ("%s is not aligned to sectors", "read"); + return -1; + } + if (!IS_ALIGNED (count, VIXDISKLIB_SECTOR_SIZE)) { + nbdkit_error ("%s is not aligned to sectors", "read"); + return -1; + } + offset /= VIXDISKLIB_SECTOR_SIZE; + count /= VIXDISKLIB_SECTOR_SIZE; + + VDDK_CALL_START (VixDiskLib_ReadAsync, + "handle, %" PRIu64 " sectors, " + "%" PRIu32 " sectors, buffer, callback, %" PRIu64, + offset, count, cmd->id) + err = VixDiskLib_ReadAsync (h->handle, offset, count, buf, + complete_command, cmd); + VDDK_CALL_END (VixDiskLib_ReadAsync, count * VIXDISKLIB_SECTOR_SIZE); + if (err != VIX_ASYNC) { + VDDK_ERROR (err, "VixDiskLib_ReadAsync"); + return -1; + } + + return 0; +} + +static int +do_write (struct command *cmd, struct vddk_handle *h) +{ + VixError err; + uint32_t count = cmd->count; + uint64_t offset = cmd->offset; + const void *buf = cmd->ptr; + + /* Align to sectors. */ + if (!IS_ALIGNED (offset, VIXDISKLIB_SECTOR_SIZE)) { + nbdkit_error ("%s is not aligned to sectors", "write"); + return -1; + } + if (!IS_ALIGNED (count, VIXDISKLIB_SECTOR_SIZE)) { + nbdkit_error ("%s is not aligned to sectors", "write"); + return -1; + } + offset /= VIXDISKLIB_SECTOR_SIZE; + count /= VIXDISKLIB_SECTOR_SIZE; + + VDDK_CALL_START (VixDiskLib_WriteAsync, + "handle, %" PRIu64 " sectors, " + "%" PRIu32 " sectors, buffer, callback, %" PRIu64, + offset, count, cmd->id) + err = VixDiskLib_WriteAsync (h->handle, offset, count, buf, + complete_command, cmd); + VDDK_CALL_END (VixDiskLib_WriteAsync, count * VIXDISKLIB_SECTOR_SIZE); + if (err != VIX_ASYNC) { + VDDK_ERROR (err, "VixDiskLib_WriteAsync"); + return -1; + } + + return 0; +} + +static int +do_flush (struct command *cmd, struct vddk_handle *h) +{ + VixError err; + + /* It seems safer to wait for outstanding asynchronous commands to + * complete before doing a flush, so do this but ignore errors + * except to print them. + */ + VDDK_CALL_START (VixDiskLib_Wait, "handle") + err = VixDiskLib_Wait (h->handle); + VDDK_CALL_END (VixDiskLib_Wait, 0); + if (err != VIX_OK) + VDDK_ERROR (err, "VixDiskLib_Wait"); + + /* The documentation for Flush is missing, but the comment in the + * header file seems to indicate that it waits for WriteAsync + * commands to finish. There's a new function Wait to wait for + * those. However I verified using strace that in fact Flush calls + * fsync on the file so it appears to be the correct call to use + * here. + */ + VDDK_CALL_START (VixDiskLib_Flush, "handle") + err = VixDiskLib_Flush (h->handle); + VDDK_CALL_END (VixDiskLib_Flush, 0); + if (err != VIX_OK) { + VDDK_ERROR (err, "VixDiskLib_Flush"); + return -1; + } + + return 0; +} + +static int +do_can_extents (struct command *cmd, struct vddk_handle *h) +{ + VixError err; + VixDiskLibBlockList *block_list; + + /* This call was added in VDDK 6.7. In earlier versions the + * function pointer will be NULL and we cannot query extents. + */ + if (VixDiskLib_QueryAllocatedBlocks == NULL) { + nbdkit_debug ("can_extents: VixDiskLib_QueryAllocatedBlocks == NULL, " + "probably this is VDDK < 6.7"); + return 0; + } + + /* Suppress errors around this call. See: + * https://bugzilla.redhat.com/show_bug.cgi?id=1709211#c7 + */ + error_suppression = 1; + + /* However even when the call is available it rarely works well so + * the best thing we can do here is to try the call and if it's + * non-functional return false. + */ + VDDK_CALL_START (VixDiskLib_QueryAllocatedBlocks, + "handle, 0, %d sectors, %d sectors", + VIXDISKLIB_MIN_CHUNK_SIZE, VIXDISKLIB_MIN_CHUNK_SIZE) + err = VixDiskLib_QueryAllocatedBlocks (h->handle, + 0, VIXDISKLIB_MIN_CHUNK_SIZE, + VIXDISKLIB_MIN_CHUNK_SIZE, + &block_list); + VDDK_CALL_END (VixDiskLib_QueryAllocatedBlocks, 0); + error_suppression = 0; + if (err == VIX_OK) { + VDDK_CALL_START (VixDiskLib_FreeBlockList, "block_list") + VixDiskLib_FreeBlockList (block_list); + VDDK_CALL_END (VixDiskLib_FreeBlockList, 0); + } + if (err != VIX_OK) { + char *errmsg = VixDiskLib_GetErrorText (err, NULL); + nbdkit_debug ("can_extents: " + "VixDiskLib_QueryAllocatedBlocks test failed, " + "extents support will be disabled: " + "original error: %s", + errmsg); + VixDiskLib_FreeErrorText (errmsg); + return 0; + } + + return 1; +} + +/* Add an extent to the list of extents. */ +static int +add_extent (struct nbdkit_extents *extents, + uint64_t *position, uint64_t next_position, bool is_hole) +{ + uint32_t type = 0; + const uint64_t length = next_position - *position; + + if (is_hole) { + type = NBDKIT_EXTENT_HOLE; + /* Images opened as single link might be backed by another file in the + chain, so the holes are not guaranteed to be zeroes. */ + if (!single_link) + type |= NBDKIT_EXTENT_ZERO; + } + + assert (*position <= next_position); + if (*position == next_position) + return 0; + + if (vddk_debug_extents) + nbdkit_debug ("adding extent type %s at [%" PRIu64 "...%" PRIu64 "]", + is_hole ? "hole" : "allocated data", + *position, next_position-1); + if (nbdkit_add_extent (extents, *position, length, type) == -1) + return -1; + + *position = next_position; + return 0; +} + +static int +do_extents (struct command *cmd, struct vddk_handle *h) +{ + uint32_t count = cmd->count; + uint64_t offset = cmd->offset; + bool req_one = cmd->req_one; + struct nbdkit_extents *extents = cmd->ptr; + uint64_t position, end, start_sector; + + position = offset; + end = offset + count; + + /* We can only query whole chunks. Therefore start with the + * first chunk before offset. + */ + start_sector = + ROUND_DOWN (offset, VIXDISKLIB_MIN_CHUNK_SIZE * VIXDISKLIB_SECTOR_SIZE) + / VIXDISKLIB_SECTOR_SIZE; + while (start_sector * VIXDISKLIB_SECTOR_SIZE < end) { + VixError err; + uint32_t i; + uint64_t nr_chunks, nr_sectors; + VixDiskLibBlockList *block_list; + + assert (IS_ALIGNED (start_sector, VIXDISKLIB_MIN_CHUNK_SIZE)); + + nr_chunks = + ROUND_UP (end - start_sector * VIXDISKLIB_SECTOR_SIZE, + VIXDISKLIB_MIN_CHUNK_SIZE * VIXDISKLIB_SECTOR_SIZE) + / (VIXDISKLIB_MIN_CHUNK_SIZE * VIXDISKLIB_SECTOR_SIZE); + nr_chunks = MIN (nr_chunks, VIXDISKLIB_MAX_CHUNK_NUMBER); + nr_sectors = nr_chunks * VIXDISKLIB_MIN_CHUNK_SIZE; + + VDDK_CALL_START (VixDiskLib_QueryAllocatedBlocks, + "handle, %" PRIu64 " sectors, %" PRIu64 " sectors, " + "%d sectors", + start_sector, nr_sectors, VIXDISKLIB_MIN_CHUNK_SIZE) + err = VixDiskLib_QueryAllocatedBlocks (h->handle, + start_sector, nr_sectors, + VIXDISKLIB_MIN_CHUNK_SIZE, + &block_list); + VDDK_CALL_END (VixDiskLib_QueryAllocatedBlocks, 0); + if (err != VIX_OK) { + VDDK_ERROR (err, "VixDiskLib_QueryAllocatedBlocks"); + return -1; + } + + for (i = 0; i < block_list->numBlocks; ++i) { + uint64_t blk_offset, blk_length; + + blk_offset = block_list->blocks[i].offset * VIXDISKLIB_SECTOR_SIZE; + blk_length = block_list->blocks[i].length * VIXDISKLIB_SECTOR_SIZE; + + /* The query returns allocated blocks. We must insert holes + * between the blocks as necessary. + */ + if ((position < blk_offset && + add_extent (extents, &position, blk_offset, true) == -1) || + (add_extent (extents, + &position, blk_offset + blk_length, false) == -1)) { + VDDK_CALL_START (VixDiskLib_FreeBlockList, "block_list") + VixDiskLib_FreeBlockList (block_list); + VDDK_CALL_END (VixDiskLib_FreeBlockList, 0); + return -1; + } + } + VDDK_CALL_START (VixDiskLib_FreeBlockList, "block_list") + VixDiskLib_FreeBlockList (block_list); + VDDK_CALL_END (VixDiskLib_FreeBlockList, 0); + + /* There's an implicit hole after the returned list of blocks, + * up to the end of the QueryAllocatedBlocks request. + */ + if (add_extent (extents, + &position, + (start_sector + nr_sectors) * VIXDISKLIB_SECTOR_SIZE, + true) == -1) { + return -1; + } + + start_sector += nr_sectors; + + /* If one extent was requested, as long as we've added an extent + * overlapping the original offset we're done. + */ + if (req_one && position > offset) + break; + } + + return 0; +} + +/* Background worker thread, one per connection, which is where the + * VDDK commands are issued. + */ +void * +vddk_worker_thread (void *handle) +{ + struct vddk_handle *h = handle; + bool stop = false; + + while (!stop) { + struct command *cmd; + int r; + bool async = false; + + /* Wait until we are sent at least one command. */ + { + ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&h->commands_lock); + while (h->commands.size == 0) + pthread_cond_wait (&h->commands_cond, &h->commands_lock); + cmd = h->commands.ptr[0]; + command_queue_remove (&h->commands, 0); + } + + switch (cmd->type) { + case STOP: + r = do_stop (cmd, h); + stop = true; + break; + + case GET_SIZE: { + int64_t size = do_get_size (cmd, h); + if (size == -1) + r = -1; + else { + r = 0; + *(uint64_t *)cmd->ptr = size; + } + break; + } + + case READ: + r = do_read (cmd, h); + /* If async is true, don't retire this command now. */ + async = r == 0; + break; + + case WRITE: + r = do_write (cmd, h); + /* If async is true, don't retire this command now. */ + async = r == 0; + break; + + case FLUSH: + r = do_flush (cmd, h); + break; + + case CAN_EXTENTS: + r = do_can_extents (cmd, h); + if (r >= 0) + *(int *)cmd->ptr = r; + break; + + case EXTENTS: + r = do_extents (cmd, h); + break; + + default: abort (); /* impossible, but keeps GCC happy */ + } /* switch */ + + if (!async) { + /* Update the command status. */ + ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&cmd->mutex); + cmd->status = r >= 0 ? SUCCEEDED : FAILED; + + /* For synchronous commands signal the caller thread that the + * command has completed. (Asynchronous commands are completed in + * the callback handler). + */ + pthread_cond_signal (&cmd->cond); + } + } /* while (!stop) */ + + /* Exit the worker thread. */ + return NULL; +} diff --git a/tests/dummy-vddk.c b/tests/dummy-vddk.c index cb88380c..b6f12042 100644 --- a/tests/dummy-vddk.c +++ b/tests/dummy-vddk.c @@ -188,6 +188,19 @@ VixDiskLib_Read (VixDiskLibHandle handle, return VIX_OK; } +NBDKIT_DLL_PUBLIC VixError +VixDiskLib_ReadAsync (VixDiskLibHandle handle, + uint64_t start_sector, uint64_t nr_sectors, + unsigned char *buf, + VixDiskLibCompletionCB callback, void *data) +{ + size_t offset = start_sector * VIXDISKLIB_SECTOR_SIZE; + + memcpy (buf, disk + offset, nr_sectors * VIXDISKLIB_SECTOR_SIZE); + callback (data, VIX_OK); + return VIX_ASYNC; +} + NBDKIT_DLL_PUBLIC VixError VixDiskLib_Write (VixDiskLibHandle handle, uint64_t start_sector, uint64_t nr_sectors, @@ -199,6 +212,25 @@ VixDiskLib_Write (VixDiskLibHandle handle, return VIX_OK; } +NBDKIT_DLL_PUBLIC VixError +VixDiskLib_WriteAsync (VixDiskLibHandle handle, + uint64_t start_sector, uint64_t nr_sectors, + const unsigned char *buf, + VixDiskLibCompletionCB callback, void *data) +{ + size_t offset = start_sector * VIXDISKLIB_SECTOR_SIZE; + + memcpy (disk + offset, buf, nr_sectors * VIXDISKLIB_SECTOR_SIZE); + callback (data, VIX_OK); + return VIX_ASYNC; +} + +NBDKIT_DLL_PUBLIC VixError +VixDiskLib_Flush (VixDiskLibHandle handle) +{ + return VIX_OK; +} + NBDKIT_DLL_PUBLIC VixError VixDiskLib_Wait (VixDiskLibHandle handle) { -- 2.31.1