Blame SOURCES/0014-vddk-Implement-parallel-thread-model.patch

6661d0
From 13223e8e3219d0310ce4d94093bbdb7732a891fb Mon Sep 17 00:00:00 2001
7084e2
From: "Richard W.M. Jones" <rjones@redhat.com>
7084e2
Date: Wed, 27 Oct 2021 10:17:22 +0100
7084e2
Subject: [PATCH] vddk: Implement parallel thread model
7084e2
7084e2
Since VDDK 6.0, asynchronous read and write operations are available.
7084e2
This commit makes use of these, allowing us to use the parallel thread
7084e2
model for increased performance.
7084e2
7084e2
Note that at least VDDK 6.5 is required because VDDK 6.0 had a
7084e2
different and incompatible signature for VixDiskLibCompletionCB.
7084e2
7084e2
Also note at least vSphere 6.7 is required for asynch calls to make
7084e2
any performance difference.  In older versions they work
7084e2
synchronously.
7084e2
7084e2
In the parallel thread model, nbdkit will be calling us in parallel
7084e2
from multiple nbdkit threads.  VDDK does not allow multiple threads to
7084e2
simultaneously call VDDK operations on the same handle.  So we create
7084e2
a background thread per handle (== connection).
7084e2
7084e2
Only the background thread makes VDDK calls[1].  The background thread
7084e2
handles a mix of synchronous (like extents, flush) and asynchronous
7084e2
(like read, write) operations, but all from one thread.
7084e2
7084e2
Parallel nbdkit threads issue commands to the background thread
7084e2
associated with each handle, and wait until they are retired.
7084e2
7084e2
[1] All VDDK calls except for connecting and disconnecting which for
7084e2
different reasons are protected by a global lock, so I did not need to
7084e2
change those.
7084e2
7084e2
(cherry picked from commit 1eecf15fc3d8ea253ccec4f5883fdbb9aa6f8c2b)
7084e2
---
7084e2
 plugins/vddk/Makefile.am            |   1 +
7084e2
 plugins/vddk/nbdkit-vddk-plugin.pod |  11 +-
7084e2
 plugins/vddk/vddk.c                 | 380 +++++--------------
7084e2
 plugins/vddk/vddk.h                 |  49 ++-
7084e2
 plugins/vddk/worker.c               | 567 ++++++++++++++++++++++++++++
7084e2
 tests/dummy-vddk.c                  |  32 ++
7084e2
 6 files changed, 745 insertions(+), 295 deletions(-)
7084e2
 create mode 100644 plugins/vddk/worker.c
7084e2
7084e2
diff --git a/plugins/vddk/Makefile.am b/plugins/vddk/Makefile.am
7084e2
index 4f470ff9..f8382fc9 100644
7084e2
--- a/plugins/vddk/Makefile.am
7084e2
+++ b/plugins/vddk/Makefile.am
7084e2
@@ -49,6 +49,7 @@ nbdkit_vddk_plugin_la_SOURCES = \
7084e2
 	stats.c \
7084e2
 	vddk-structs.h \
7084e2
 	vddk-stubs.h \
7084e2
+	worker.c \
7084e2
 	$(top_srcdir)/include/nbdkit-plugin.h \
7084e2
 	$(NULL)
7084e2
 
7084e2
diff --git a/plugins/vddk/nbdkit-vddk-plugin.pod b/plugins/vddk/nbdkit-vddk-plugin.pod
7084e2
index 1c16d096..ce82a734 100644
7084e2
--- a/plugins/vddk/nbdkit-vddk-plugin.pod
7084e2
+++ b/plugins/vddk/nbdkit-vddk-plugin.pod
7084e2
@@ -523,6 +523,14 @@ read bandwidth to the VMware server.
7084e2
 
7084e2
 Same as above, but for writing and flushing writes.
7084e2
 
7084e2
+=item C<ReadAsync>
7084e2
+
7084e2
+=item C<WriteAsync>
7084e2
+
7084e2
+Same as above, but for asynchronous read and write calls introduced in
7084e2
+nbdkit 1.30.  Unfortunately at the moment the amount of time spent in
7084e2
+these calls is not accounted for correctly.
7084e2
+
7084e2
 =item C<QueryAllocatedBlocks>
7084e2
 
7084e2
 This call is used to query information about the sparseness of the
7084e2
@@ -580,7 +588,8 @@ Debug extents returned by C<QueryAllocatedBlocks>.
7084e2
 
7084e2
 =item B<-D vddk.datapath=0>
7084e2
 
7084e2
-Suppress debugging of datapath calls (C<Read> and C<Write>).
7084e2
+Suppress debugging of datapath calls (C<Read>, C<ReadAsync>, C<Write>
7084e2
+and C<WriteAsync>).
7084e2
 
7084e2
 =item B<-D vddk.stats=1>
7084e2
 
7084e2
diff --git a/plugins/vddk/vddk.c b/plugins/vddk/vddk.c
7084e2
index 67ac775c..9f223db0 100644
7084e2
--- a/plugins/vddk/vddk.c
7084e2
+++ b/plugins/vddk/vddk.c
7084e2
@@ -50,9 +50,6 @@
7084e2
 #include <nbdkit-plugin.h>
7084e2
 
7084e2
 #include "cleanup.h"
7084e2
-#include "minmax.h"
7084e2
-#include "rounding.h"
7084e2
-#include "tvdiff.h"
7084e2
 #include "vector.h"
7084e2
 
7084e2
 #include "vddk.h"
7084e2
@@ -522,23 +519,18 @@ vddk_dump_plugin (void)
7084e2
 /* The rules on threads and VDDK are here:
7084e2
  * https://code.vmware.com/docs/11750/virtual-disk-development-kit-programming-guide/GUID-6BE903E8-DC70-46D9-98E4-E34A2002C2AD.html
7084e2
  *
7084e2
- * Before nbdkit 1.22 we used SERIALIZE_ALL_REQUESTS.  Since nbdkit
7084e2
- * 1.22 we changed this to SERIALIZE_REQUESTS and added a mutex around
7084e2
- * calls to VixDiskLib_Open and VixDiskLib_Close.  This is not quite
7084e2
- * within the letter of the rules, but is within the spirit.
7084e2
+ * Before nbdkit 1.22 we used SERIALIZE_ALL_REQUESTS.  In nbdkit
7084e2
+ * 1.22-1.28 we changed this to SERIALIZE_REQUESTS and added a mutex
7084e2
+ * around calls to VixDiskLib_Open and VixDiskLib_Close.  In nbdkit
7084e2
+ * 1.30 and above we assign a background thread per connection to do
7084e2
+ * asynch operations and use the PARALLEL model.  We still need the
7084e2
+ * lock around Open and Close.
7084e2
  */
7084e2
-#define THREAD_MODEL NBDKIT_THREAD_MODEL_SERIALIZE_REQUESTS
7084e2
+#define THREAD_MODEL NBDKIT_THREAD_MODEL_PARALLEL
7084e2
 
7084e2
 /* Lock protecting open/close calls - see above. */
7084e2
 static pthread_mutex_t open_close_lock = PTHREAD_MUTEX_INITIALIZER;
7084e2
 
7084e2
-/* The per-connection handle. */
7084e2
-struct vddk_handle {
7084e2
-  VixDiskLibConnectParams *params; /* connection parameters */
7084e2
-  VixDiskLibConnection connection; /* connection */
7084e2
-  VixDiskLibHandle handle;         /* disk handle */
7084e2
-};
7084e2
-
7084e2
 static inline VixDiskLibConnectParams *
7084e2
 allocate_connect_params (void)
7084e2
 {
7084e2
@@ -579,12 +571,16 @@ vddk_open (int readonly)
7084e2
   VixError err;
7084e2
   uint32_t flags;
7084e2
   const char *transport_mode;
7084e2
+  int pterr;
7084e2
 
7084e2
-  h = malloc (sizeof *h);
7084e2
+  h = calloc (1, sizeof *h);
7084e2
   if (h == NULL) {
7084e2
-    nbdkit_error ("malloc: %m");
7084e2
+    nbdkit_error ("calloc: %m");
7084e2
     return NULL;
7084e2
   }
7084e2
+  h->commands = (command_queue) empty_vector;
7084e2
+  pthread_mutex_init (&h->commands_lock, NULL);
7084e2
+  pthread_cond_init (&h->commands_cond, NULL);
7084e2
 
7084e2
   h->params = allocate_connect_params ();
7084e2
   if (h->params == NULL) {
7084e2
@@ -661,8 +657,22 @@ vddk_open (int readonly)
7084e2
   VDDK_CALL_END (VixDiskLib_GetTransportMode, 0);
7084e2
   nbdkit_debug ("transport mode: %s", transport_mode);
7084e2
 
7084e2
+  /* Start the background thread which actually does the asynchronous
7084e2
+   * work.
7084e2
+   */
7084e2
+  pterr = pthread_create (&h->thread, NULL, vddk_worker_thread, h);
7084e2
+  if (pterr != 0) {
7084e2
+    errno = pterr;
7084e2
+    nbdkit_error ("pthread_create: %m");
7084e2
+    goto err3;
7084e2
+  }
7084e2
+
7084e2
   return h;
7084e2
 
7084e2
+ err3:
7084e2
+  VDDK_CALL_START (VixDiskLib_Close, "handle")
7084e2
+    VixDiskLib_Close (h->handle);
7084e2
+  VDDK_CALL_END (VixDiskLib_Close, 0);
7084e2
  err2:
7084e2
   VDDK_CALL_START (VixDiskLib_Disconnect, "connection")
7084e2
     VixDiskLib_Disconnect (h->connection);
7084e2
@@ -670,6 +680,8 @@ vddk_open (int readonly)
7084e2
  err1:
7084e2
   free_connect_params (h->params);
7084e2
  err0:
7084e2
+  pthread_mutex_destroy (&h->commands_lock);
7084e2
+  pthread_cond_destroy (&h->commands_cond);
7084e2
   free (h);
7084e2
   return NULL;
7084e2
 }
7084e2
@@ -680,6 +692,10 @@ vddk_close (void *handle)
7084e2
 {
7084e2
   ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&open_close_lock);
7084e2
   struct vddk_handle *h = handle;
7084e2
+  struct command stop_cmd = { .type = STOP };
7084e2
+
7084e2
+  send_command_and_wait (h, &stop_cmd);
7084e2
+  pthread_join (h->thread, NULL);
7084e2
 
7084e2
   VDDK_CALL_START (VixDiskLib_Close, "handle")
7084e2
     VixDiskLib_Close (h->handle);
7084e2
@@ -689,6 +705,9 @@ vddk_close (void *handle)
7084e2
   VDDK_CALL_END (VixDiskLib_Disconnect, 0);
7084e2
 
7084e2
   free_connect_params (h->params);
7084e2
+  pthread_mutex_destroy (&h->commands_lock);
7084e2
+  pthread_cond_destroy (&h->commands_cond);
7084e2
+  command_queue_reset (&h->commands);
7084e2
   free (h);
7084e2
 }
7084e2
 
7084e2
@@ -697,54 +716,29 @@ static int64_t
7084e2
 vddk_get_size (void *handle)
7084e2
 {
7084e2
   struct vddk_handle *h = handle;
7084e2
-  VixDiskLibInfo *info;
7084e2
-  VixError err;
7084e2
   uint64_t size;
7084e2
+  struct command get_size_cmd = { .type = GET_SIZE, .ptr = &size };
7084e2
 
7084e2
-  VDDK_CALL_START (VixDiskLib_GetInfo, "handle, &info")
7084e2
-    err = VixDiskLib_GetInfo (h->handle, &info);
7084e2
-  VDDK_CALL_END (VixDiskLib_GetInfo, 0);
7084e2
-  if (err != VIX_OK) {
7084e2
-    VDDK_ERROR (err, "VixDiskLib_GetInfo");
7084e2
+  if (send_command_and_wait (h, &get_size_cmd) == -1)
7084e2
     return -1;
7084e2
-  }
7084e2
-
7084e2
-  size = info->capacity * (uint64_t)VIXDISKLIB_SECTOR_SIZE;
7084e2
-
7084e2
-  if (vddk_debug_diskinfo) {
7084e2
-    nbdkit_debug ("disk info: capacity: %" PRIu64 " sectors "
7084e2
-                  "(%" PRIi64 " bytes)",
7084e2
-                  info->capacity, size);
7084e2
-    nbdkit_debug ("disk info: biosGeo: C:%" PRIu32 " H:%" PRIu32 " S:%" PRIu32,
7084e2
-                  info->biosGeo.cylinders,
7084e2
-                  info->biosGeo.heads,
7084e2
-                  info->biosGeo.sectors);
7084e2
-    nbdkit_debug ("disk info: physGeo: C:%" PRIu32 " H:%" PRIu32 " S:%" PRIu32,
7084e2
-                  info->physGeo.cylinders,
7084e2
-                  info->physGeo.heads,
7084e2
-                  info->physGeo.sectors);
7084e2
-    nbdkit_debug ("disk info: adapter type: %d",
7084e2
-                  (int) info->adapterType);
7084e2
-    nbdkit_debug ("disk info: num links: %d", info->numLinks);
7084e2
-    nbdkit_debug ("disk info: parent filename hint: %s",
7084e2
-                  info->parentFileNameHint ? : "NULL");
7084e2
-    nbdkit_debug ("disk info: uuid: %s",
7084e2
-                  info->uuid ? : "NULL");
7084e2
-    if (library_version >= 7) {
7084e2
-      nbdkit_debug ("disk info: sector size: "
7084e2
-                    "logical %" PRIu32 " physical %" PRIu32,
7084e2
-                    info->logicalSectorSize,
7084e2
-                    info->physicalSectorSize);
7084e2
-    }
7084e2
-  }
7084e2
-
7084e2
-  VDDK_CALL_START (VixDiskLib_FreeInfo, "info")
7084e2
-    VixDiskLib_FreeInfo (info);
7084e2
-  VDDK_CALL_END (VixDiskLib_FreeInfo, 0);
7084e2
 
7084e2
   return (int64_t) size;
7084e2
 }
7084e2
 
7084e2
+static int
7084e2
+vddk_can_fua (void *handle)
7084e2
+{
7084e2
+  /* The Flush call was not available in VDDK < 6.0. */
7084e2
+  return VixDiskLib_Flush != NULL ? NBDKIT_FUA_NATIVE : NBDKIT_FUA_NONE;
7084e2
+}
7084e2
+
7084e2
+static int
7084e2
+vddk_can_flush (void *handle)
7084e2
+{
7084e2
+  /* The Flush call was not available in VDDK < 6.0. */
7084e2
+  return VixDiskLib_Flush != NULL;
7084e2
+}
7084e2
+
7084e2
 /* Read data from the file.
7084e2
  *
7084e2
  * Note that reads have to be aligned to sectors (XXX).
7084e2
@@ -754,32 +748,14 @@ vddk_pread (void *handle, void *buf, uint32_t count, uint64_t offset,
7084e2
             uint32_t flags)
7084e2
 {
7084e2
   struct vddk_handle *h = handle;
7084e2
-  VixError err;
7084e2
+  struct command read_cmd = {
7084e2
+    .type = READ,
7084e2
+    .ptr = buf,
7084e2
+    .count = count,
7084e2
+    .offset = offset,
7084e2
+  };
7084e2
 
7084e2
-  /* Align to sectors. */
7084e2
-  if (!IS_ALIGNED (offset, VIXDISKLIB_SECTOR_SIZE)) {
7084e2
-    nbdkit_error ("%s is not aligned to sectors", "read");
7084e2
-    return -1;
7084e2
-  }
7084e2
-  if (!IS_ALIGNED (count, VIXDISKLIB_SECTOR_SIZE)) {
7084e2
-    nbdkit_error ("%s is not aligned to sectors", "read");
7084e2
-    return -1;
7084e2
-  }
7084e2
-  offset /= VIXDISKLIB_SECTOR_SIZE;
7084e2
-  count /= VIXDISKLIB_SECTOR_SIZE;
7084e2
-
7084e2
-  VDDK_CALL_START (VixDiskLib_Read,
7084e2
-                   "handle, %" PRIu64 " sectors, "
7084e2
-                   "%" PRIu32 " sectors, buffer",
7084e2
-                   offset, count)
7084e2
-    err = VixDiskLib_Read (h->handle, offset, count, buf);
7084e2
-  VDDK_CALL_END (VixDiskLib_Read, count * VIXDISKLIB_SECTOR_SIZE);
7084e2
-  if (err != VIX_OK) {
7084e2
-    VDDK_ERROR (err, "VixDiskLib_Read");
7084e2
-    return -1;
7084e2
-  }
7084e2
-
7084e2
-  return 0;
7084e2
+  return send_command_and_wait (h, &read_cmd);
7084e2
 }
7084e2
 
7084e2
 static int vddk_flush (void *handle, uint32_t flags);
7084e2
@@ -792,32 +768,17 @@ static int
7084e2
 vddk_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset,
7084e2
              uint32_t flags)
7084e2
 {
7084e2
+  struct vddk_handle *h = handle;
7084e2
   const bool fua = flags & NBDKIT_FLAG_FUA;
7084e2
-  struct vddk_handle *h = handle;
7084e2
-  VixError err;
7084e2
+  struct command write_cmd = {
7084e2
+    .type = WRITE,
7084e2
+    .ptr = (void *) buf,
7084e2
+    .count = count,
7084e2
+    .offset = offset,
7084e2
+  };
7084e2
 
7084e2
-  /* Align to sectors. */
7084e2
-  if (!IS_ALIGNED (offset, VIXDISKLIB_SECTOR_SIZE)) {
7084e2
-    nbdkit_error ("%s is not aligned to sectors", "write");
7084e2
+  if (send_command_and_wait (h, &write_cmd) == -1)
7084e2
     return -1;
7084e2
-  }
7084e2
-  if (!IS_ALIGNED (count, VIXDISKLIB_SECTOR_SIZE)) {
7084e2
-    nbdkit_error ("%s is not aligned to sectors", "write");
7084e2
-    return -1;
7084e2
-  }
7084e2
-  offset /= VIXDISKLIB_SECTOR_SIZE;
7084e2
-  count /= VIXDISKLIB_SECTOR_SIZE;
7084e2
-
7084e2
-  VDDK_CALL_START (VixDiskLib_Write,
7084e2
-                   "handle, %" PRIu64 " sectors, "
7084e2
-                   "%" PRIu32 " sectors, buffer",
7084e2
-                   offset, count)
7084e2
-    err = VixDiskLib_Write (h->handle, offset, count, buf);
7084e2
-  VDDK_CALL_END (VixDiskLib_Write, count * VIXDISKLIB_SECTOR_SIZE);
7084e2
-  if (err != VIX_OK) {
7084e2
-    VDDK_ERROR (err, "VixDiskLib_Write");
7084e2
-    return -1;
7084e2
-  }
7084e2
 
7084e2
   if (fua) {
7084e2
     if (vddk_flush (handle, 0) == -1)
7084e2
@@ -827,126 +788,32 @@ vddk_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset,
7084e2
   return 0;
7084e2
 }
7084e2
 
7084e2
-static int
7084e2
-vddk_can_fua (void *handle)
7084e2
-{
7084e2
-  /* The Flush call was not available in VDDK < 6.0. */
7084e2
-  return VixDiskLib_Flush != NULL ? NBDKIT_FUA_NATIVE : NBDKIT_FUA_NONE;
7084e2
-}
7084e2
-
7084e2
-static int
7084e2
-vddk_can_flush (void *handle)
7084e2
-{
7084e2
-  /* The Flush call was not available in VDDK < 6.0. */
7084e2
-  return VixDiskLib_Flush != NULL;
7084e2
-}
7084e2
-
7084e2
 /* Flush data to the file. */
7084e2
 static int
7084e2
 vddk_flush (void *handle, uint32_t flags)
7084e2
 {
7084e2
   struct vddk_handle *h = handle;
7084e2
-  VixError err;
7084e2
+  struct command flush_cmd = {
7084e2
+    .type = FLUSH,
7084e2
+  };
7084e2
 
7084e2
-  /* The documentation for Flush is missing, but the comment in the
7084e2
-   * header file seems to indicate that it waits for WriteAsync
7084e2
-   * commands to finish.  We don't use WriteAsync, and in any case
7084e2
-   * there's a new function Wait to wait for those.  However I
7084e2
-   * verified using strace that in fact Flush does call fsync on the
7084e2
-   * file so it appears to be the correct call to use here.
7084e2
-   */
7084e2
-
7084e2
-  VDDK_CALL_START (VixDiskLib_Flush, "handle")
7084e2
-    err = VixDiskLib_Flush (h->handle);
7084e2
-  VDDK_CALL_END (VixDiskLib_Flush, 0);
7084e2
-  if (err != VIX_OK) {
7084e2
-    VDDK_ERROR (err, "VixDiskLib_Flush");
7084e2
-    return -1;
7084e2
-  }
7084e2
-
7084e2
-  return 0;
7084e2
+  return send_command_and_wait (h, &flush_cmd);
7084e2
 }
7084e2
 
7084e2
 static int
7084e2
 vddk_can_extents (void *handle)
7084e2
 {
7084e2
   struct vddk_handle *h = handle;
7084e2
-  VixError err;
7084e2
-  VixDiskLibBlockList *block_list;
7084e2
+  int ret;
7084e2
+  struct command can_extents_cmd = {
7084e2
+    .type = CAN_EXTENTS,
7084e2
+    .ptr = &ret,
7084e2
+  };
7084e2
 
7084e2
-  /* This call was added in VDDK 6.7.  In earlier versions the
7084e2
-   * function pointer will be NULL and we cannot query extents.
7084e2
-   */
7084e2
-  if (VixDiskLib_QueryAllocatedBlocks == NULL) {
7084e2
-    nbdkit_debug ("can_extents: VixDiskLib_QueryAllocatedBlocks == NULL, "
7084e2
-                  "probably this is VDDK < 6.7");
7084e2
-    return 0;
7084e2
-  }
7084e2
-
7084e2
-  /* Suppress errors around this call.  See:
7084e2
-   * https://bugzilla.redhat.com/show_bug.cgi?id=1709211#c7
7084e2
-   */
7084e2
-  error_suppression = 1;
7084e2
-
7084e2
-  /* However even when the call is available it rarely works well so
7084e2
-   * the best thing we can do here is to try the call and if it's
7084e2
-   * non-functional return false.
7084e2
-   */
7084e2
-  VDDK_CALL_START (VixDiskLib_QueryAllocatedBlocks,
7084e2
-                   "handle, 0, %d sectors, %d sectors",
7084e2
-                   VIXDISKLIB_MIN_CHUNK_SIZE, VIXDISKLIB_MIN_CHUNK_SIZE)
7084e2
-    err = VixDiskLib_QueryAllocatedBlocks (h->handle,
7084e2
-                                           0, VIXDISKLIB_MIN_CHUNK_SIZE,
7084e2
-                                           VIXDISKLIB_MIN_CHUNK_SIZE,
7084e2
-                                           &block_list);
7084e2
-  VDDK_CALL_END (VixDiskLib_QueryAllocatedBlocks, 0);
7084e2
-  error_suppression = 0;
7084e2
-  if (err == VIX_OK) {
7084e2
-    VDDK_CALL_START (VixDiskLib_FreeBlockList, "block_list")
7084e2
-      VixDiskLib_FreeBlockList (block_list);
7084e2
-    VDDK_CALL_END (VixDiskLib_FreeBlockList, 0);
7084e2
-  }
7084e2
-  if (err != VIX_OK) {
7084e2
-    char *errmsg = VixDiskLib_GetErrorText (err, NULL);
7084e2
-    nbdkit_debug ("can_extents: VixDiskLib_QueryAllocatedBlocks test failed, "
7084e2
-                  "extents support will be disabled: "
7084e2
-                  "original error: %s",
7084e2
-                  errmsg);
7084e2
-    VixDiskLib_FreeErrorText (errmsg);
7084e2
-    return 0;
7084e2
-  }
7084e2
-
7084e2
-  return 1;
7084e2
-}
7084e2
-
7084e2
-static int
7084e2
-add_extent (struct nbdkit_extents *extents,
7084e2
-            uint64_t *position, uint64_t next_position, bool is_hole)
7084e2
-{
7084e2
-  uint32_t type = 0;
7084e2
-  const uint64_t length = next_position - *position;
7084e2
-
7084e2
-  if (is_hole) {
7084e2
-    type = NBDKIT_EXTENT_HOLE;
7084e2
-    /* Images opened as single link might be backed by another file in the
7084e2
-       chain, so the holes are not guaranteed to be zeroes. */
7084e2
-    if (!single_link)
7084e2
-      type |= NBDKIT_EXTENT_ZERO;
7084e2
-  }
7084e2
-
7084e2
-  assert (*position <= next_position);
7084e2
-  if (*position == next_position)
7084e2
-    return 0;
7084e2
-
7084e2
-  if (vddk_debug_extents)
7084e2
-    nbdkit_debug ("adding extent type %s at [%" PRIu64 "...%" PRIu64 "]",
7084e2
-                  is_hole ? "hole" : "allocated data",
7084e2
-                  *position, next_position-1);
7084e2
-  if (nbdkit_add_extent (extents, *position, length, type) == -1)
7084e2
+  if (send_command_and_wait (h, &can_extents_cmd) == -1)
7084e2
     return -1;
7084e2
 
7084e2
-  *position = next_position;
7084e2
-  return 0;
7084e2
+  return ret;
7084e2
 }
7084e2
 
7084e2
 static int
7084e2
@@ -955,88 +822,15 @@ vddk_extents (void *handle, uint32_t count, uint64_t offset, uint32_t flags,
7084e2
 {
7084e2
   struct vddk_handle *h = handle;
7084e2
   bool req_one = flags & NBDKIT_FLAG_REQ_ONE;
7084e2
-  uint64_t position, end, start_sector;
7084e2
-
7084e2
-  position = offset;
7084e2
-  end = offset + count;
7084e2
-
7084e2
-  /* We can only query whole chunks.  Therefore start with the first
7084e2
-   * chunk before offset.
7084e2
-   */
7084e2
-  start_sector =
7084e2
-    ROUND_DOWN (offset, VIXDISKLIB_MIN_CHUNK_SIZE * VIXDISKLIB_SECTOR_SIZE)
7084e2
-    / VIXDISKLIB_SECTOR_SIZE;
7084e2
-  while (start_sector * VIXDISKLIB_SECTOR_SIZE < end) {
7084e2
-    VixError err;
7084e2
-    uint32_t i;
7084e2
-    uint64_t nr_chunks, nr_sectors;
7084e2
-    VixDiskLibBlockList *block_list;
7084e2
-
7084e2
-    assert (IS_ALIGNED (start_sector, VIXDISKLIB_MIN_CHUNK_SIZE));
7084e2
-
7084e2
-    nr_chunks =
7084e2
-      ROUND_UP (end - start_sector * VIXDISKLIB_SECTOR_SIZE,
7084e2
-                VIXDISKLIB_MIN_CHUNK_SIZE * VIXDISKLIB_SECTOR_SIZE)
7084e2
-      / (VIXDISKLIB_MIN_CHUNK_SIZE * VIXDISKLIB_SECTOR_SIZE);
7084e2
-    nr_chunks = MIN (nr_chunks, VIXDISKLIB_MAX_CHUNK_NUMBER);
7084e2
-    nr_sectors = nr_chunks * VIXDISKLIB_MIN_CHUNK_SIZE;
7084e2
-
7084e2
-    VDDK_CALL_START (VixDiskLib_QueryAllocatedBlocks,
7084e2
-                     "handle, %" PRIu64 " sectors, %" PRIu64 " sectors, "
7084e2
-                     "%d sectors",
7084e2
-                     start_sector, nr_sectors, VIXDISKLIB_MIN_CHUNK_SIZE)
7084e2
-      err = VixDiskLib_QueryAllocatedBlocks (h->handle,
7084e2
-                                             start_sector, nr_sectors,
7084e2
-                                             VIXDISKLIB_MIN_CHUNK_SIZE,
7084e2
-                                             &block_list);
7084e2
-    VDDK_CALL_END (VixDiskLib_QueryAllocatedBlocks, 0);
7084e2
-    if (err != VIX_OK) {
7084e2
-      VDDK_ERROR (err, "VixDiskLib_QueryAllocatedBlocks");
7084e2
-      return -1;
7084e2
-    }
7084e2
-
7084e2
-    for (i = 0; i < block_list->numBlocks; ++i) {
7084e2
-      uint64_t blk_offset, blk_length;
7084e2
-
7084e2
-      blk_offset = block_list->blocks[i].offset * VIXDISKLIB_SECTOR_SIZE;
7084e2
-      blk_length = block_list->blocks[i].length * VIXDISKLIB_SECTOR_SIZE;
7084e2
-
7084e2
-      /* The query returns allocated blocks.  We must insert holes
7084e2
-       * between the blocks as necessary.
7084e2
-       */
7084e2
-      if ((position < blk_offset &&
7084e2
-           add_extent (extents, &position, blk_offset, true) == -1) ||
7084e2
-          (add_extent (extents,
7084e2
-                       &position, blk_offset + blk_length, false) == -1)) {
7084e2
-        VDDK_CALL_START (VixDiskLib_FreeBlockList, "block_list")
7084e2
-          VixDiskLib_FreeBlockList (block_list);
7084e2
-        VDDK_CALL_END (VixDiskLib_FreeBlockList, 0);
7084e2
-        return -1;
7084e2
-      }
7084e2
-    }
7084e2
-    VDDK_CALL_START (VixDiskLib_FreeBlockList, "block_list")
7084e2
-      VixDiskLib_FreeBlockList (block_list);
7084e2
-    VDDK_CALL_END (VixDiskLib_FreeBlockList, 0);
7084e2
-
7084e2
-    /* There's an implicit hole after the returned list of blocks, up
7084e2
-     * to the end of the QueryAllocatedBlocks request.
7084e2
-     */
7084e2
-    if (add_extent (extents,
7084e2
-                    &position,
7084e2
-                    (start_sector + nr_sectors) * VIXDISKLIB_SECTOR_SIZE,
7084e2
-                    true) == -1)
7084e2
-      return -1;
7084e2
-
7084e2
-    start_sector += nr_sectors;
7084e2
-
7084e2
-    /* If one extent was requested, as long as we've added an extent
7084e2
-     * overlapping the original offset we're done.
7084e2
-     */
7084e2
-    if (req_one && position > offset)
7084e2
-      break;
7084e2
-  }
7084e2
-
7084e2
-  return 0;
7084e2
+  struct command extents_cmd = {
7084e2
+    .type = EXTENTS,
7084e2
+    .ptr = extents,
7084e2
+    .count = count,
7084e2
+    .offset = offset,
7084e2
+    .req_one = req_one,
7084e2
+  };
7084e2
+
7084e2
+  return send_command_and_wait (h, &extents_cmd);
7084e2
 }
7084e2
 
7084e2
 static struct nbdkit_plugin plugin = {
7084e2
diff --git a/plugins/vddk/vddk.h b/plugins/vddk/vddk.h
7084e2
index 1400589d..be0b3492 100644
7084e2
--- a/plugins/vddk/vddk.h
7084e2
+++ b/plugins/vddk/vddk.h
7084e2
@@ -90,7 +90,9 @@ extern int vddk_debug_stats;
7084e2
   /* GCC can optimize this away at compile time: */                     \
7084e2
   const bool datapath =                                                 \
7084e2
     strcmp (#fn, "VixDiskLib_Read") == 0 ||                             \
7084e2
-    strcmp (#fn, "VixDiskLib_Write") == 0;                              \
7084e2
+    strcmp (#fn, "VixDiskLib_ReadAsync") == 0 ||                        \
7084e2
+    strcmp (#fn, "VixDiskLib_Write") == 0 ||                            \
7084e2
+    strcmp (#fn, "VixDiskLib_WriteAsync") == 0;                         \
7084e2
   if (vddk_debug_stats)                                                 \
7084e2
     gettimeofday (&start_t, NULL);                                      \
7084e2
   if (!datapath || vddk_debug_datapath)                                 \
7084e2
@@ -120,6 +122,46 @@ extern int vddk_debug_stats;
7084e2
     VDDK_CALL_END (VixDiskLib_FreeErrorText, 0);                \
7084e2
   } while (0)
7084e2
 
7084e2
+/* Queue of asynchronous commands sent to the background thread. */
7084e2
+enum command_type { GET_SIZE, READ, WRITE, FLUSH, CAN_EXTENTS, EXTENTS, STOP };
7084e2
+struct command {
7084e2
+  /* These fields are set by the caller. */
7084e2
+  enum command_type type;       /* command */
7084e2
+  void *ptr;                    /* buffer, extents list, return values */
7084e2
+  uint32_t count;               /* READ, WRITE, EXTENTS */
7084e2
+  uint64_t offset;              /* READ, WRITE, EXTENTS */
7084e2
+  bool req_one;                 /* EXTENTS NBDKIT_FLAG_REQ_ONE */
7084e2
+
7084e2
+  /* This field is set to a unique value by send_command_and_wait. */
7084e2
+  uint64_t id;                  /* serial number */
7084e2
+
7084e2
+  /* These fields are used by the internal implementation. */
7084e2
+  pthread_mutex_t mutex;        /* completion mutex */
7084e2
+  pthread_cond_t cond;          /* completion condition */
7084e2
+  enum { SUBMITTED, SUCCEEDED, FAILED } status;
7084e2
+};
7084e2
+
7084e2
+DEFINE_VECTOR_TYPE(command_queue, struct command *)
7084e2
+
7084e2
+/* The per-connection handle. */
7084e2
+struct vddk_handle {
7084e2
+  VixDiskLibConnectParams *params; /* connection parameters */
7084e2
+  VixDiskLibConnection connection; /* connection */
7084e2
+  VixDiskLibHandle handle;         /* disk handle */
7084e2
+
7084e2
+  pthread_t thread;                /* background thread for asynch work */
7084e2
+
7084e2
+  /* Command queue of commands sent to the background thread.  Use
7084e2
+   * send_command_and_wait to add a command.  Only the background
7084e2
+   * thread must make VDDK API calls (apart from opening and closing).
7084e2
+   * The lock protects all of these fields.
7084e2
+   */
7084e2
+  pthread_mutex_t commands_lock;   /* lock */
7084e2
+  command_queue commands;          /* command queue */
7084e2
+  pthread_cond_t commands_cond;    /* condition (queue size 0 -> 1) */
7084e2
+  uint64_t id;                     /* next command ID */
7084e2
+};
7084e2
+
7084e2
 /* reexec.c */
7084e2
 extern bool noreexec;
7084e2
 extern char *reexeced;
7084e2
@@ -141,4 +183,9 @@ extern pthread_mutex_t stats_lock;
7084e2
 #undef OPTIONAL_STUB
7084e2
 extern void display_stats (void);
7084e2
 
7084e2
+/* worker.c */
7084e2
+extern const char *command_type_string (enum command_type type);
7084e2
+extern int send_command_and_wait (struct vddk_handle *h, struct command *cmd);
7084e2
+extern void *vddk_worker_thread (void *handle);
7084e2
+
7084e2
 #endif /* NBDKIT_VDDK_H */
7084e2
diff --git a/plugins/vddk/worker.c b/plugins/vddk/worker.c
7084e2
new file mode 100644
7084e2
index 00000000..2a1d4f26
7084e2
--- /dev/null
7084e2
+++ b/plugins/vddk/worker.c
7084e2
@@ -0,0 +1,567 @@
7084e2
+/* nbdkit
7084e2
+ * Copyright (C) 2013-2021 Red Hat Inc.
7084e2
+ *
7084e2
+ * Redistribution and use in source and binary forms, with or without
7084e2
+ * modification, are permitted provided that the following conditions are
7084e2
+ * met:
7084e2
+ *
7084e2
+ * * Redistributions of source code must retain the above copyright
7084e2
+ * notice, this list of conditions and the following disclaimer.
7084e2
+ *
7084e2
+ * * Redistributions in binary form must reproduce the above copyright
7084e2
+ * notice, this list of conditions and the following disclaimer in the
7084e2
+ * documentation and/or other materials provided with the distribution.
7084e2
+ *
7084e2
+ * * Neither the name of Red Hat nor the names of its contributors may be
7084e2
+ * used to endorse or promote products derived from this software without
7084e2
+ * specific prior written permission.
7084e2
+ *
7084e2
+ * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
7084e2
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
7084e2
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
7084e2
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
7084e2
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
7084e2
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
7084e2
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
7084e2
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
7084e2
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
7084e2
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
7084e2
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
7084e2
+ * SUCH DAMAGE.
7084e2
+ */
7084e2
+
7084e2
+#include <config.h>
7084e2
+
7084e2
+#include <stdio.h>
7084e2
+#include <stdlib.h>
7084e2
+#include <stdint.h>
7084e2
+#include <inttypes.h>
7084e2
+
7084e2
+#include <pthread.h>
7084e2
+
7084e2
+#define NBDKIT_API_VERSION 2
7084e2
+#include <nbdkit-plugin.h>
7084e2
+
7084e2
+#include "cleanup.h"
7084e2
+#include "minmax.h"
7084e2
+#include "rounding.h"
7084e2
+#include "vector.h"
7084e2
+
7084e2
+#include "vddk.h"
7084e2
+
7084e2
+const char *
7084e2
+command_type_string (enum command_type type)
7084e2
+{
7084e2
+  switch (type) {
7084e2
+  case GET_SIZE:    return "get_size";
7084e2
+  case READ:        return "read";
7084e2
+  case WRITE:       return "write";
7084e2
+  case FLUSH:       return "flush";
7084e2
+  case CAN_EXTENTS: return "can_extents";
7084e2
+  case EXTENTS:     return "extents";
7084e2
+  case STOP:        return "stop";
7084e2
+  default:          abort ();
7084e2
+  }
7084e2
+}
7084e2
+
7084e2
+/* Send command to the background thread and wait for completion.
7084e2
+ *
7084e2
+ * Returns 0 for OK
7084e2
+ * On error, calls nbdkit_error and returns -1.
7084e2
+ */
7084e2
+int
7084e2
+send_command_and_wait (struct vddk_handle *h, struct command *cmd)
7084e2
+{
7084e2
+  /* Add the command to the command queue. */
7084e2
+  {
7084e2
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&h->commands_lock);
7084e2
+    cmd->id = h->id++;
7084e2
+
7084e2
+    if (command_queue_append (&h->commands, cmd) == -1)
7084e2
+      /* On error command_queue_append will call nbdkit_error. */
7084e2
+      return -1;
7084e2
+
7084e2
+    /* Signal the worker thread if it could be sleeping on an empty queue. */
7084e2
+    if (h->commands.size == 1)
7084e2
+      pthread_cond_signal (&h->commands_cond);
7084e2
+
7084e2
+    /* This will be used to signal command completion back to us. */
7084e2
+    pthread_mutex_init (&cmd->mutex, NULL);
7084e2
+    pthread_cond_init (&cmd->cond, NULL);
7084e2
+  }
7084e2
+
7084e2
+  /* Wait for the command to be completed by the background thread. */
7084e2
+  {
7084e2
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&cmd->mutex);
7084e2
+    while (cmd->status == SUBMITTED)
7084e2
+      pthread_cond_wait (&cmd->cond, &cmd->mutex);
7084e2
+  }
7084e2
+
7084e2
+  pthread_mutex_destroy (&cmd->mutex);
7084e2
+  pthread_cond_destroy (&cmd->cond);
7084e2
+
7084e2
+  /* On error the background thread will call nbdkit_error. */
7084e2
+  switch (cmd->status) {
7084e2
+  case SUCCEEDED: return 0;
7084e2
+  case FAILED:    return -1;
7084e2
+  default:        abort ();
7084e2
+  }
7084e2
+}
7084e2
+
7084e2
+/* Asynchronous commands are completed when this function is called. */
7084e2
+static void
7084e2
+complete_command (void *vp, VixError result)
7084e2
+{
7084e2
+  struct command *cmd = vp;
7084e2
+
7084e2
+  if (vddk_debug_datapath)
7084e2
+    nbdkit_debug ("command %" PRIu64 " completed", cmd->id);
7084e2
+
7084e2
+  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&cmd->mutex);
7084e2
+
7084e2
+  if (result == VIX_OK) {
7084e2
+    cmd->status = SUCCEEDED;
7084e2
+  } else {
7084e2
+    VDDK_ERROR (result, "command %" PRIu64 ": asynchronous %s failed",
7084e2
+                cmd->id, command_type_string (cmd->type));
7084e2
+    cmd->status = FAILED;
7084e2
+  }
7084e2
+
7084e2
+  pthread_cond_signal (&cmd->cond);
7084e2
+}
7084e2
+
7084e2
+/* Wait for any asynchronous commands to complete. */
7084e2
+static int
7084e2
+do_stop (struct command *cmd, struct vddk_handle *h)
7084e2
+{
7084e2
+  VixError err;
7084e2
+
7084e2
+  /* Because we assume VDDK >= 6.5, VixDiskLib_Wait must exist. */
7084e2
+  VDDK_CALL_START (VixDiskLib_Wait, "handle")
7084e2
+    err = VixDiskLib_Wait (h->handle);
7084e2
+  VDDK_CALL_END (VixDiskLib_Wait, 0);
7084e2
+  if (err != VIX_OK) {
7084e2
+    VDDK_ERROR (err, "VixDiskLib_Wait");
7084e2
+    /* In the end this error indication is ignored because it only
7084e2
+     * happens on the close path when we cannot handle errors.
7084e2
+     */
7084e2
+    return -1;
7084e2
+  }
7084e2
+  return 0;
7084e2
+}
7084e2
+
7084e2
+/* Get size command. */
7084e2
+static int64_t
7084e2
+do_get_size (struct command *cmd, struct vddk_handle *h)
7084e2
+{
7084e2
+  VixError err;
7084e2
+  VixDiskLibInfo *info;
7084e2
+  uint64_t size;
7084e2
+
7084e2
+  VDDK_CALL_START (VixDiskLib_GetInfo, "handle, &info")
7084e2
+    err = VixDiskLib_GetInfo (h->handle, &info);
7084e2
+  VDDK_CALL_END (VixDiskLib_GetInfo, 0);
7084e2
+  if (err != VIX_OK) {
7084e2
+    VDDK_ERROR (err, "VixDiskLib_GetInfo");
7084e2
+    return -1;
7084e2
+  }
7084e2
+
7084e2
+  size = info->capacity * (uint64_t)VIXDISKLIB_SECTOR_SIZE;
7084e2
+
7084e2
+  if (vddk_debug_diskinfo) {
7084e2
+    nbdkit_debug ("disk info: capacity: %" PRIu64 " sectors "
7084e2
+                  "(%" PRIi64 " bytes)",
7084e2
+                  info->capacity, size);
7084e2
+    nbdkit_debug ("disk info: biosGeo: C:%" PRIu32 " H:%" PRIu32 " S:%" PRIu32,
7084e2
+                  info->biosGeo.cylinders,
7084e2
+                  info->biosGeo.heads,
7084e2
+                  info->biosGeo.sectors);
7084e2
+    nbdkit_debug ("disk info: physGeo: C:%" PRIu32 " H:%" PRIu32 " S:%" PRIu32,
7084e2
+                  info->physGeo.cylinders,
7084e2
+                  info->physGeo.heads,
7084e2
+                  info->physGeo.sectors);
7084e2
+    nbdkit_debug ("disk info: adapter type: %d",
7084e2
+                  (int) info->adapterType);
7084e2
+    nbdkit_debug ("disk info: num links: %d", info->numLinks);
7084e2
+    nbdkit_debug ("disk info: parent filename hint: %s",
7084e2
+                  info->parentFileNameHint ? : "NULL");
7084e2
+    nbdkit_debug ("disk info: uuid: %s",
7084e2
+                  info->uuid ? : "NULL");
7084e2
+    if (library_version >= 7) {
7084e2
+      nbdkit_debug ("disk info: sector size: "
7084e2
+                    "logical %" PRIu32 " physical %" PRIu32,
7084e2
+                    info->logicalSectorSize,
7084e2
+                    info->physicalSectorSize);
7084e2
+    }
7084e2
+  }
7084e2
+
7084e2
+  VDDK_CALL_START (VixDiskLib_FreeInfo, "info")
7084e2
+    VixDiskLib_FreeInfo (info);
7084e2
+  VDDK_CALL_END (VixDiskLib_FreeInfo, 0);
7084e2
+
7084e2
+  return (int64_t) size;
7084e2
+}
7084e2
+
7084e2
+static int
7084e2
+do_read (struct command *cmd, struct vddk_handle *h)
7084e2
+{
7084e2
+  VixError err;
7084e2
+  uint32_t count = cmd->count;
7084e2
+  uint64_t offset = cmd->offset;
7084e2
+  void *buf = cmd->ptr;
7084e2
+
7084e2
+  /* Align to sectors. */
7084e2
+  if (!IS_ALIGNED (offset, VIXDISKLIB_SECTOR_SIZE)) {
7084e2
+    nbdkit_error ("%s is not aligned to sectors", "read");
7084e2
+    return -1;
7084e2
+  }
7084e2
+  if (!IS_ALIGNED (count, VIXDISKLIB_SECTOR_SIZE)) {
7084e2
+    nbdkit_error ("%s is not aligned to sectors", "read");
7084e2
+    return -1;
7084e2
+  }
7084e2
+  offset /= VIXDISKLIB_SECTOR_SIZE;
7084e2
+  count /= VIXDISKLIB_SECTOR_SIZE;
7084e2
+
7084e2
+  VDDK_CALL_START (VixDiskLib_ReadAsync,
7084e2
+                   "handle, %" PRIu64 " sectors, "
7084e2
+                   "%" PRIu32 " sectors, buffer, callback, %" PRIu64,
7084e2
+                   offset, count, cmd->id)
7084e2
+    err = VixDiskLib_ReadAsync (h->handle, offset, count, buf,
7084e2
+                                complete_command, cmd);
7084e2
+  VDDK_CALL_END (VixDiskLib_ReadAsync, count * VIXDISKLIB_SECTOR_SIZE);
7084e2
+  if (err != VIX_ASYNC) {
7084e2
+    VDDK_ERROR (err, "VixDiskLib_ReadAsync");
7084e2
+    return -1;
7084e2
+  }
7084e2
+
7084e2
+  return 0;
7084e2
+}
7084e2
+
7084e2
+static int
7084e2
+do_write (struct command *cmd, struct vddk_handle *h)
7084e2
+{
7084e2
+  VixError err;
7084e2
+  uint32_t count = cmd->count;
7084e2
+  uint64_t offset = cmd->offset;
7084e2
+  const void *buf = cmd->ptr;
7084e2
+
7084e2
+  /* Align to sectors. */
7084e2
+  if (!IS_ALIGNED (offset, VIXDISKLIB_SECTOR_SIZE)) {
7084e2
+    nbdkit_error ("%s is not aligned to sectors", "write");
7084e2
+    return -1;
7084e2
+  }
7084e2
+  if (!IS_ALIGNED (count, VIXDISKLIB_SECTOR_SIZE)) {
7084e2
+    nbdkit_error ("%s is not aligned to sectors", "write");
7084e2
+    return -1;
7084e2
+  }
7084e2
+  offset /= VIXDISKLIB_SECTOR_SIZE;
7084e2
+  count /= VIXDISKLIB_SECTOR_SIZE;
7084e2
+
7084e2
+  VDDK_CALL_START (VixDiskLib_WriteAsync,
7084e2
+                   "handle, %" PRIu64 " sectors, "
7084e2
+                   "%" PRIu32 " sectors, buffer, callback, %" PRIu64,
7084e2
+                   offset, count, cmd->id)
7084e2
+    err = VixDiskLib_WriteAsync (h->handle, offset, count, buf,
7084e2
+                                 complete_command, cmd);
7084e2
+  VDDK_CALL_END (VixDiskLib_WriteAsync, count * VIXDISKLIB_SECTOR_SIZE);
7084e2
+  if (err != VIX_ASYNC) {
7084e2
+    VDDK_ERROR (err, "VixDiskLib_WriteAsync");
7084e2
+    return -1;
7084e2
+  }
7084e2
+
7084e2
+  return 0;
7084e2
+}
7084e2
+
7084e2
+static int
7084e2
+do_flush (struct command *cmd, struct vddk_handle *h)
7084e2
+{
7084e2
+  VixError err;
7084e2
+
7084e2
+  /* It seems safer to wait for outstanding asynchronous commands to
7084e2
+   * complete before doing a flush, so do this but ignore errors
7084e2
+   * except to print them.
7084e2
+   */
7084e2
+  VDDK_CALL_START (VixDiskLib_Wait, "handle")
7084e2
+    err = VixDiskLib_Wait (h->handle);
7084e2
+  VDDK_CALL_END (VixDiskLib_Wait, 0);
7084e2
+  if (err != VIX_OK)
7084e2
+    VDDK_ERROR (err, "VixDiskLib_Wait");
7084e2
+
7084e2
+  /* The documentation for Flush is missing, but the comment in the
7084e2
+   * header file seems to indicate that it waits for WriteAsync
7084e2
+   * commands to finish.  There's a new function Wait to wait for
7084e2
+   * those.  However I verified using strace that in fact Flush calls
7084e2
+   * fsync on the file so it appears to be the correct call to use
7084e2
+   * here.
7084e2
+   */
7084e2
+  VDDK_CALL_START (VixDiskLib_Flush, "handle")
7084e2
+    err = VixDiskLib_Flush (h->handle);
7084e2
+  VDDK_CALL_END (VixDiskLib_Flush, 0);
7084e2
+  if (err != VIX_OK) {
7084e2
+    VDDK_ERROR (err, "VixDiskLib_Flush");
7084e2
+    return -1;
7084e2
+  }
7084e2
+
7084e2
+  return 0;
7084e2
+}
7084e2
+
7084e2
+static int
7084e2
+do_can_extents (struct command *cmd, struct vddk_handle *h)
7084e2
+{
7084e2
+  VixError err;
7084e2
+  VixDiskLibBlockList *block_list;
7084e2
+
7084e2
+  /* This call was added in VDDK 6.7.  In earlier versions the
7084e2
+   * function pointer will be NULL and we cannot query extents.
7084e2
+   */
7084e2
+  if (VixDiskLib_QueryAllocatedBlocks == NULL) {
7084e2
+    nbdkit_debug ("can_extents: VixDiskLib_QueryAllocatedBlocks == NULL, "
7084e2
+                  "probably this is VDDK < 6.7");
7084e2
+    return 0;
7084e2
+  }
7084e2
+
7084e2
+  /* Suppress errors around this call.  See:
7084e2
+   * https://bugzilla.redhat.com/show_bug.cgi?id=1709211#c7
7084e2
+   */
7084e2
+  error_suppression = 1;
7084e2
+
7084e2
+  /* However even when the call is available it rarely works well so
7084e2
+   * the best thing we can do here is to try the call and if it's
7084e2
+   * non-functional return false.
7084e2
+   */
7084e2
+  VDDK_CALL_START (VixDiskLib_QueryAllocatedBlocks,
7084e2
+                   "handle, 0, %d sectors, %d sectors",
7084e2
+                   VIXDISKLIB_MIN_CHUNK_SIZE, VIXDISKLIB_MIN_CHUNK_SIZE)
7084e2
+    err = VixDiskLib_QueryAllocatedBlocks (h->handle,
7084e2
+                                           0, VIXDISKLIB_MIN_CHUNK_SIZE,
7084e2
+                                           VIXDISKLIB_MIN_CHUNK_SIZE,
7084e2
+                                           &block_list);
7084e2
+  VDDK_CALL_END (VixDiskLib_QueryAllocatedBlocks, 0);
7084e2
+  error_suppression = 0;
7084e2
+  if (err == VIX_OK) {
7084e2
+    VDDK_CALL_START (VixDiskLib_FreeBlockList, "block_list")
7084e2
+      VixDiskLib_FreeBlockList (block_list);
7084e2
+    VDDK_CALL_END (VixDiskLib_FreeBlockList, 0);
7084e2
+  }
7084e2
+  if (err != VIX_OK) {
7084e2
+    char *errmsg = VixDiskLib_GetErrorText (err, NULL);
7084e2
+    nbdkit_debug ("can_extents: "
7084e2
+                  "VixDiskLib_QueryAllocatedBlocks test failed, "
7084e2
+                  "extents support will be disabled: "
7084e2
+                  "original error: %s",
7084e2
+                  errmsg);
7084e2
+    VixDiskLib_FreeErrorText (errmsg);
7084e2
+    return 0;
7084e2
+  }
7084e2
+
7084e2
+  return 1;
7084e2
+}
7084e2
+
7084e2
+/* Add an extent to the list of extents. */
7084e2
+static int
7084e2
+add_extent (struct nbdkit_extents *extents,
7084e2
+            uint64_t *position, uint64_t next_position, bool is_hole)
7084e2
+{
7084e2
+  uint32_t type = 0;
7084e2
+  const uint64_t length = next_position - *position;
7084e2
+
7084e2
+  if (is_hole) {
7084e2
+    type = NBDKIT_EXTENT_HOLE;
7084e2
+    /* Images opened as single link might be backed by another file in the
7084e2
+       chain, so the holes are not guaranteed to be zeroes. */
7084e2
+    if (!single_link)
7084e2
+      type |= NBDKIT_EXTENT_ZERO;
7084e2
+  }
7084e2
+
7084e2
+  assert (*position <= next_position);
7084e2
+  if (*position == next_position)
7084e2
+    return 0;
7084e2
+
7084e2
+  if (vddk_debug_extents)
7084e2
+    nbdkit_debug ("adding extent type %s at [%" PRIu64 "...%" PRIu64 "]",
7084e2
+                  is_hole ? "hole" : "allocated data",
7084e2
+                  *position, next_position-1);
7084e2
+  if (nbdkit_add_extent (extents, *position, length, type) == -1)
7084e2
+    return -1;
7084e2
+
7084e2
+  *position = next_position;
7084e2
+  return 0;
7084e2
+}
7084e2
+
7084e2
+static int
7084e2
+do_extents (struct command *cmd, struct vddk_handle *h)
7084e2
+{
7084e2
+  uint32_t count = cmd->count;
7084e2
+  uint64_t offset = cmd->offset;
7084e2
+  bool req_one = cmd->req_one;
7084e2
+  struct nbdkit_extents *extents = cmd->ptr;
7084e2
+  uint64_t position, end, start_sector;
7084e2
+
7084e2
+  position = offset;
7084e2
+  end = offset + count;
7084e2
+
7084e2
+  /* We can only query whole chunks.  Therefore start with the
7084e2
+   * first chunk before offset.
7084e2
+   */
7084e2
+  start_sector =
7084e2
+    ROUND_DOWN (offset, VIXDISKLIB_MIN_CHUNK_SIZE * VIXDISKLIB_SECTOR_SIZE)
7084e2
+    / VIXDISKLIB_SECTOR_SIZE;
7084e2
+  while (start_sector * VIXDISKLIB_SECTOR_SIZE < end) {
7084e2
+    VixError err;
7084e2
+    uint32_t i;
7084e2
+    uint64_t nr_chunks, nr_sectors;
7084e2
+    VixDiskLibBlockList *block_list;
7084e2
+
7084e2
+    assert (IS_ALIGNED (start_sector, VIXDISKLIB_MIN_CHUNK_SIZE));
7084e2
+
7084e2
+    nr_chunks =
7084e2
+      ROUND_UP (end - start_sector * VIXDISKLIB_SECTOR_SIZE,
7084e2
+                VIXDISKLIB_MIN_CHUNK_SIZE * VIXDISKLIB_SECTOR_SIZE)
7084e2
+      / (VIXDISKLIB_MIN_CHUNK_SIZE * VIXDISKLIB_SECTOR_SIZE);
7084e2
+    nr_chunks = MIN (nr_chunks, VIXDISKLIB_MAX_CHUNK_NUMBER);
7084e2
+    nr_sectors = nr_chunks * VIXDISKLIB_MIN_CHUNK_SIZE;
7084e2
+
7084e2
+    VDDK_CALL_START (VixDiskLib_QueryAllocatedBlocks,
7084e2
+                     "handle, %" PRIu64 " sectors, %" PRIu64 " sectors, "
7084e2
+                     "%d sectors",
7084e2
+                     start_sector, nr_sectors, VIXDISKLIB_MIN_CHUNK_SIZE)
7084e2
+      err = VixDiskLib_QueryAllocatedBlocks (h->handle,
7084e2
+                                             start_sector, nr_sectors,
7084e2
+                                             VIXDISKLIB_MIN_CHUNK_SIZE,
7084e2
+                                             &block_list);
7084e2
+    VDDK_CALL_END (VixDiskLib_QueryAllocatedBlocks, 0);
7084e2
+    if (err != VIX_OK) {
7084e2
+      VDDK_ERROR (err, "VixDiskLib_QueryAllocatedBlocks");
7084e2
+      return -1;
7084e2
+    }
7084e2
+
7084e2
+    for (i = 0; i < block_list->numBlocks; ++i) {
7084e2
+      uint64_t blk_offset, blk_length;
7084e2
+
7084e2
+      blk_offset = block_list->blocks[i].offset * VIXDISKLIB_SECTOR_SIZE;
7084e2
+      blk_length = block_list->blocks[i].length * VIXDISKLIB_SECTOR_SIZE;
7084e2
+
7084e2
+      /* The query returns allocated blocks.  We must insert holes
7084e2
+       * between the blocks as necessary.
7084e2
+       */
7084e2
+      if ((position < blk_offset &&
7084e2
+           add_extent (extents, &position, blk_offset, true) == -1) ||
7084e2
+          (add_extent (extents,
7084e2
+                       &position, blk_offset + blk_length, false) == -1)) {
7084e2
+        VDDK_CALL_START (VixDiskLib_FreeBlockList, "block_list")
7084e2
+          VixDiskLib_FreeBlockList (block_list);
7084e2
+        VDDK_CALL_END (VixDiskLib_FreeBlockList, 0);
7084e2
+        return -1;
7084e2
+      }
7084e2
+    }
7084e2
+    VDDK_CALL_START (VixDiskLib_FreeBlockList, "block_list")
7084e2
+      VixDiskLib_FreeBlockList (block_list);
7084e2
+    VDDK_CALL_END (VixDiskLib_FreeBlockList, 0);
7084e2
+
7084e2
+    /* There's an implicit hole after the returned list of blocks,
7084e2
+     * up to the end of the QueryAllocatedBlocks request.
7084e2
+     */
7084e2
+    if (add_extent (extents,
7084e2
+                    &position,
7084e2
+                    (start_sector + nr_sectors) * VIXDISKLIB_SECTOR_SIZE,
7084e2
+                    true) == -1) {
7084e2
+      return -1;
7084e2
+    }
7084e2
+
7084e2
+    start_sector += nr_sectors;
7084e2
+
7084e2
+    /* If one extent was requested, as long as we've added an extent
7084e2
+     * overlapping the original offset we're done.
7084e2
+     */
7084e2
+    if (req_one && position > offset)
7084e2
+      break;
7084e2
+  }
7084e2
+
7084e2
+  return 0;
7084e2
+}
7084e2
+
7084e2
+/* Background worker thread, one per connection, which is where the
7084e2
+ * VDDK commands are issued.
7084e2
+ */
7084e2
+void *
7084e2
+vddk_worker_thread (void *handle)
7084e2
+{
7084e2
+  struct vddk_handle *h = handle;
7084e2
+  bool stop = false;
7084e2
+
7084e2
+  while (!stop) {
7084e2
+    struct command *cmd;
7084e2
+    int r;
7084e2
+    bool async = false;
7084e2
+
7084e2
+    /* Wait until we are sent at least one command. */
7084e2
+    {
7084e2
+      ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&h->commands_lock);
7084e2
+      while (h->commands.size == 0)
7084e2
+        pthread_cond_wait (&h->commands_cond, &h->commands_lock);
7084e2
+      cmd = h->commands.ptr[0];
7084e2
+      command_queue_remove (&h->commands, 0);
7084e2
+    }
7084e2
+
7084e2
+    switch (cmd->type) {
7084e2
+    case STOP:
7084e2
+      r = do_stop (cmd, h);
7084e2
+      stop = true;
7084e2
+      break;
7084e2
+
7084e2
+    case GET_SIZE: {
7084e2
+      int64_t size = do_get_size (cmd, h);
7084e2
+      if (size == -1)
7084e2
+        r = -1;
7084e2
+      else {
7084e2
+        r = 0;
7084e2
+        *(uint64_t *)cmd->ptr = size;
7084e2
+      }
7084e2
+      break;
7084e2
+    }
7084e2
+
7084e2
+    case READ:
7084e2
+      r = do_read (cmd, h);
7084e2
+      /* If async is true, don't retire this command now. */
7084e2
+      async = r == 0;
7084e2
+      break;
7084e2
+
7084e2
+    case WRITE:
7084e2
+      r = do_write (cmd, h);
7084e2
+      /* If async is true, don't retire this command now. */
7084e2
+      async = r == 0;
7084e2
+      break;
7084e2
+
7084e2
+    case FLUSH:
7084e2
+      r = do_flush (cmd, h);
7084e2
+      break;
7084e2
+
7084e2
+    case CAN_EXTENTS:
7084e2
+      r = do_can_extents (cmd, h);
7084e2
+      if (r >= 0)
7084e2
+        *(int *)cmd->ptr = r;
7084e2
+      break;
7084e2
+
7084e2
+    case EXTENTS:
7084e2
+      r = do_extents (cmd, h);
7084e2
+      break;
7084e2
+
7084e2
+    default: abort (); /* impossible, but keeps GCC happy */
7084e2
+    } /* switch */
7084e2
+
7084e2
+    if (!async) {
7084e2
+      /* Update the command status. */
7084e2
+      ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&cmd->mutex);
7084e2
+      cmd->status = r >= 0 ? SUCCEEDED : FAILED;
7084e2
+
7084e2
+      /* For synchronous commands signal the caller thread that the
7084e2
+       * command has completed.  (Asynchronous commands are completed in
7084e2
+       * the callback handler).
7084e2
+       */
7084e2
+      pthread_cond_signal (&cmd->cond);
7084e2
+    }
7084e2
+  } /* while (!stop) */
7084e2
+
7084e2
+  /* Exit the worker thread. */
7084e2
+  return NULL;
7084e2
+}
7084e2
diff --git a/tests/dummy-vddk.c b/tests/dummy-vddk.c
7084e2
index cb88380c..b6f12042 100644
7084e2
--- a/tests/dummy-vddk.c
7084e2
+++ b/tests/dummy-vddk.c
7084e2
@@ -188,6 +188,19 @@ VixDiskLib_Read (VixDiskLibHandle handle,
7084e2
   return VIX_OK;
7084e2
 }
7084e2
 
7084e2
+NBDKIT_DLL_PUBLIC VixError
7084e2
+VixDiskLib_ReadAsync (VixDiskLibHandle handle,
7084e2
+                      uint64_t start_sector, uint64_t nr_sectors,
7084e2
+                      unsigned char *buf,
7084e2
+                      VixDiskLibCompletionCB callback, void *data)
7084e2
+{
7084e2
+  size_t offset = start_sector * VIXDISKLIB_SECTOR_SIZE;
7084e2
+
7084e2
+  memcpy (buf, disk + offset, nr_sectors * VIXDISKLIB_SECTOR_SIZE);
7084e2
+  callback (data, VIX_OK);
7084e2
+  return VIX_ASYNC;
7084e2
+}
7084e2
+
7084e2
 NBDKIT_DLL_PUBLIC VixError
7084e2
 VixDiskLib_Write (VixDiskLibHandle handle,
7084e2
                   uint64_t start_sector, uint64_t nr_sectors,
7084e2
@@ -199,6 +212,25 @@ VixDiskLib_Write (VixDiskLibHandle handle,
7084e2
   return VIX_OK;
7084e2
 }
7084e2
 
7084e2
+NBDKIT_DLL_PUBLIC VixError
7084e2
+VixDiskLib_WriteAsync (VixDiskLibHandle handle,
7084e2
+                       uint64_t start_sector, uint64_t nr_sectors,
7084e2
+                       const unsigned char *buf,
7084e2
+                       VixDiskLibCompletionCB callback, void *data)
7084e2
+{
7084e2
+  size_t offset = start_sector * VIXDISKLIB_SECTOR_SIZE;
7084e2
+
7084e2
+  memcpy (disk + offset, buf, nr_sectors * VIXDISKLIB_SECTOR_SIZE);
7084e2
+  callback (data, VIX_OK);
7084e2
+  return VIX_ASYNC;
7084e2
+}
7084e2
+
7084e2
+NBDKIT_DLL_PUBLIC VixError
7084e2
+VixDiskLib_Flush (VixDiskLibHandle handle)
7084e2
+{
7084e2
+  return VIX_OK;
7084e2
+}
7084e2
+
7084e2
 NBDKIT_DLL_PUBLIC VixError
7084e2
 VixDiskLib_Wait (VixDiskLibHandle handle)
7084e2
 {
7084e2
-- 
7084e2
2.31.1
7084e2