0a122b
From f02d8f3eb38c3ed03742cbb981823a1917b3c5b2 Mon Sep 17 00:00:00 2001
0a122b
From: Jeffrey Cody <jcody@redhat.com>
0a122b
Date: Wed, 20 Nov 2013 19:44:00 +0100
0a122b
Subject: [PATCH 17/25] block: vhdx write support
0a122b
0a122b
RH-Author: Jeffrey Cody <jcody@redhat.com>
0a122b
Message-id: <aa16eed6f83efd7ff007cb38cca8d52f4c696054.1384975172.git.jcody@redhat.com>
0a122b
Patchwork-id: 55810
0a122b
O-Subject: [RHEL7 qemu-kvm PATCH 17/26] block: vhdx write support
0a122b
Bugzilla: 879234
0a122b
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
0a122b
RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
0a122b
RH-Acked-by: Fam Zheng <famz@redhat.com>
0a122b
0a122b
This adds support for writing to VHDX image files, using coroutines.
0a122b
Writes into the BAT table go through the VHDX log.  Currently, BAT
0a122b
table writes occur when expanding a dynamic VHDX file, and allocating a
0a122b
new BAT entry.
0a122b
0a122b
Signed-off-by: Jeff Cody <jcody@redhat.com>
0a122b
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
0a122b
(cherry picked from commit d92aa8833c051b53d3bf2614ff885df0037f10bb)
0a122b
---
0a122b
 block/vhdx.c | 212 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
0a122b
 block/vhdx.h |   2 +-
0a122b
 2 files changed, 209 insertions(+), 5 deletions(-)
0a122b
0a122b
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
0a122b
---
0a122b
 block/vhdx.c |  212 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
0a122b
 block/vhdx.h |    2 +-
0a122b
 2 files changed, 209 insertions(+), 5 deletions(-)
0a122b
0a122b
diff --git a/block/vhdx.c b/block/vhdx.c
0a122b
index e36c60e..baf8970 100644
0a122b
--- a/block/vhdx.c
0a122b
+++ b/block/vhdx.c
0a122b
@@ -914,7 +914,7 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags,
0a122b
         if (payblocks--) {
0a122b
             /* payload bat entries */
0a122b
             if ((s->bat[i] & VHDX_BAT_STATE_BIT_MASK) ==
0a122b
-                    PAYLOAD_BLOCK_FULL_PRESENT) {
0a122b
+                    PAYLOAD_BLOCK_FULLY_PRESENT) {
0a122b
                 ret = vhdx_region_check(s, s->bat[i] & VHDX_BAT_FILE_OFF_MASK,
0a122b
                                         s->block_size);
0a122b
                 if (ret < 0) {
0a122b
@@ -935,7 +935,7 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags,
0a122b
         }
0a122b
     }
0a122b
 
0a122b
-    /* TODO: differencing files, write */
0a122b
+    /* TODO: differencing files */
0a122b
 
0a122b
     /* Disable migration when VHDX images are used */
0a122b
     error_set(&s->migration_blocker,
0a122b
@@ -1052,7 +1052,7 @@ static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num,
0a122b
                 /* return zero */
0a122b
                 qemu_iovec_memset(&hd_qiov, 0, 0, sinfo.bytes_avail);
0a122b
                 break;
0a122b
-            case PAYLOAD_BLOCK_FULL_PRESENT:
0a122b
+            case PAYLOAD_BLOCK_FULLY_PRESENT:
0a122b
                 qemu_co_mutex_unlock(&s->lock);
0a122b
                 ret = bdrv_co_readv(bs->file,
0a122b
                                     sinfo.file_offset >> BDRV_SECTOR_BITS,
0a122b
@@ -1082,7 +1082,43 @@ exit:
0a122b
     return ret;
0a122b
 }
0a122b
 
0a122b
+/*
0a122b
+ * Allocate a new payload block at the end of the file.
0a122b
+ *
0a122b
+ * Allocation will happen at 1MB alignment inside the file
0a122b
+ *
0a122b
+ * On return, *new_offset holds the file offset of the new payload block
0a122b
+ */
0a122b
+static int vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s,
0a122b
+                                    uint64_t *new_offset)
0a122b
+{
0a122b
+    *new_offset = bdrv_getlength(bs->file);
0a122b
+
0a122b
+    /* per the spec, the address for a block is in units of 1MB */
0a122b
+    *new_offset = ROUND_UP(*new_offset, 1024 * 1024);
0a122b
+
0a122b
+    return bdrv_truncate(bs->file, *new_offset + s->block_size);
0a122b
+}
0a122b
+
0a122b
+/*
0a122b
+ * Update the BAT table entry with the new file offset, and the new entry
0a122b
+ * state */
0a122b
+static void vhdx_update_bat_table_entry(BlockDriverState *bs, BDRVVHDXState *s,
0a122b
+                                       VHDXSectorInfo *sinfo,
0a122b
+                                       uint64_t *bat_entry_le,
0a122b
+                                       uint64_t *bat_offset, int state)
0a122b
+{
0a122b
+    /* The BAT entry is a uint64, with 44 bits for the file offset in units of
0a122b
+     * 1MB, and 3 bits for the block state. */
0a122b
+    s->bat[sinfo->bat_idx]  = ((sinfo->file_offset>>20) <<
0a122b
+                               VHDX_BAT_FILE_OFF_BITS);
0a122b
 
0a122b
+    s->bat[sinfo->bat_idx] |= state & VHDX_BAT_STATE_BIT_MASK;
0a122b
+
0a122b
+    *bat_entry_le = cpu_to_le64(s->bat[sinfo->bat_idx]);
0a122b
+    *bat_offset = s->bat_offset + sinfo->bat_idx * sizeof(VHDXBatEntry);
0a122b
+
0a122b
+}
0a122b
 
0a122b
 /* Per the spec, on the first write of guest-visible data to the file the
0a122b
  * data write guid must be updated in the header */
0a122b
@@ -1099,7 +1135,175 @@ int vhdx_user_visible_write(BlockDriverState *bs, BDRVVHDXState *s)
0a122b
 static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
0a122b
                                       int nb_sectors, QEMUIOVector *qiov)
0a122b
 {
0a122b
-    return -ENOTSUP;
0a122b
+    int ret = -ENOTSUP;
0a122b
+    BDRVVHDXState *s = bs->opaque;
0a122b
+    VHDXSectorInfo sinfo;
0a122b
+    uint64_t bytes_done = 0;
0a122b
+    uint64_t bat_entry = 0;
0a122b
+    uint64_t bat_entry_offset = 0;
0a122b
+    QEMUIOVector hd_qiov;
0a122b
+    struct iovec iov1 = { 0 };
0a122b
+    struct iovec iov2 = { 0 };
0a122b
+    int sectors_to_write;
0a122b
+    int bat_state;
0a122b
+    uint64_t bat_prior_offset = 0;
0a122b
+    bool bat_update = false;
0a122b
+
0a122b
+    qemu_iovec_init(&hd_qiov, qiov->niov);
0a122b
+
0a122b
+    qemu_co_mutex_lock(&s->lock);
0a122b
+
0a122b
+    ret = vhdx_user_visible_write(bs, s);
0a122b
+    if (ret < 0) {
0a122b
+        goto exit;
0a122b
+    }
0a122b
+
0a122b
+    while (nb_sectors > 0) {
0a122b
+        bool use_zero_buffers = false;
0a122b
+        bat_update = false;
0a122b
+        if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) {
0a122b
+            /* not supported yet */
0a122b
+            ret = -ENOTSUP;
0a122b
+            goto exit;
0a122b
+        } else {
0a122b
+            vhdx_block_translate(s, sector_num, nb_sectors, &sinfo);
0a122b
+            sectors_to_write = sinfo.sectors_avail;
0a122b
+
0a122b
+            qemu_iovec_reset(&hd_qiov);
0a122b
+            /* check the payload block state */
0a122b
+            bat_state = s->bat[sinfo.bat_idx] & VHDX_BAT_STATE_BIT_MASK;
0a122b
+            switch (bat_state) {
0a122b
+            case PAYLOAD_BLOCK_ZERO:
0a122b
+                /* in this case, we need to preserve zero writes for
0a122b
+                 * data that is not part of this write, so we must pad
0a122b
+                 * the rest of the buffer to zeroes */
0a122b
+
0a122b
+                /* if we are on a posix system with ftruncate() that extends
0a122b
+                 * a file, then it is zero-filled for us.  On Win32, the raw
0a122b
+                 * layer uses SetFilePointer and SetFileEnd, which does not
0a122b
+                 * zero fill AFAIK */
0a122b
+
0a122b
+                /* Queue another write of zero buffers if the underlying file
0a122b
+                 * does not zero-fill on file extension */
0a122b
+
0a122b
+                if (bdrv_has_zero_init(bs->file) == 0) {
0a122b
+                    use_zero_buffers = true;
0a122b
+
0a122b
+                    /* zero fill the front, if any */
0a122b
+                    if (sinfo.block_offset) {
0a122b
+                        iov1.iov_len = sinfo.block_offset;
0a122b
+                        iov1.iov_base = qemu_blockalign(bs, iov1.iov_len);
0a122b
+                        memset(iov1.iov_base, 0, iov1.iov_len);
0a122b
+                        qemu_iovec_concat_iov(&hd_qiov, &iov1, 1, 0,
0a122b
+                                              sinfo.block_offset);
0a122b
+                        sectors_to_write += iov1.iov_len >> BDRV_SECTOR_BITS;
0a122b
+                    }
0a122b
+
0a122b
+                    /* our actual data */
0a122b
+                    qemu_iovec_concat(&hd_qiov, qiov,  bytes_done,
0a122b
+                                      sinfo.bytes_avail);
0a122b
+
0a122b
+                    /* zero fill the back, if any */
0a122b
+                    if ((sinfo.bytes_avail - sinfo.block_offset) <
0a122b
+                         s->block_size) {
0a122b
+                        iov2.iov_len = s->block_size -
0a122b
+                                      (sinfo.bytes_avail + sinfo.block_offset);
0a122b
+                        iov2.iov_base = qemu_blockalign(bs, iov2.iov_len);
0a122b
+                        memset(iov2.iov_base, 0, iov2.iov_len);
0a122b
+                        qemu_iovec_concat_iov(&hd_qiov, &iov2, 1, 0,
0a122b
+                                              sinfo.block_offset);
0a122b
+                        sectors_to_write += iov2.iov_len >> BDRV_SECTOR_BITS;
0a122b
+                    }
0a122b
+                }
0a122b
+
0a122b
+                /* fall through */
0a122b
+            case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */
0a122b
+            case PAYLOAD_BLOCK_UNMAPPED:    /* fall through */
0a122b
+            case PAYLOAD_BLOCK_UNDEFINED:   /* fall through */
0a122b
+                bat_prior_offset = sinfo.file_offset;
0a122b
+                ret = vhdx_allocate_block(bs, s, &sinfo.file_offset);
0a122b
+                if (ret < 0) {
0a122b
+                    goto exit;
0a122b
+                }
0a122b
+                /* once we support differencing files, this may also be
0a122b
+                 * partially present */
0a122b
+                /* update block state to the newly specified state */
0a122b
+                vhdx_update_bat_table_entry(bs, s, &sinfo, &bat_entry,
0a122b
+                                            &bat_entry_offset,
0a122b
+                                            PAYLOAD_BLOCK_FULLY_PRESENT);
0a122b
+                bat_update = true;
0a122b
+                /* since we just allocated a block, file_offset is the
0a122b
+                 * beginning of the payload block. It needs to be the
0a122b
+                 * write address, which includes the offset into the block */
0a122b
+                if (!use_zero_buffers) {
0a122b
+                    sinfo.file_offset += sinfo.block_offset;
0a122b
+                }
0a122b
+                /* fall through */
0a122b
+            case PAYLOAD_BLOCK_FULLY_PRESENT:
0a122b
+                /* if the file offset address is in the header zone,
0a122b
+                 * there is a problem */
0a122b
+                if (sinfo.file_offset < (1024 * 1024)) {
0a122b
+                    ret = -EFAULT;
0a122b
+                    goto error_bat_restore;
0a122b
+                }
0a122b
+
0a122b
+                if (!use_zero_buffers) {
0a122b
+                    qemu_iovec_concat(&hd_qiov, qiov,  bytes_done,
0a122b
+                                      sinfo.bytes_avail);
0a122b
+                }
0a122b
+                /* block exists, so we can just overwrite it */
0a122b
+                qemu_co_mutex_unlock(&s->lock);
0a122b
+                ret = bdrv_co_writev(bs->file,
0a122b
+                                    sinfo.file_offset >> BDRV_SECTOR_BITS,
0a122b
+                                    sectors_to_write, &hd_qiov);
0a122b
+                qemu_co_mutex_lock(&s->lock);
0a122b
+                if (ret < 0) {
0a122b
+                    goto error_bat_restore;
0a122b
+                }
0a122b
+                break;
0a122b
+            case PAYLOAD_BLOCK_PARTIALLY_PRESENT:
0a122b
+                /* we don't yet support difference files, fall through
0a122b
+                 * to error */
0a122b
+            default:
0a122b
+                ret = -EIO;
0a122b
+                goto exit;
0a122b
+                break;
0a122b
+            }
0a122b
+
0a122b
+            if (bat_update) {
0a122b
+                /* this will update the BAT entry into the log journal, and
0a122b
+                 * then flush the log journal out to disk */
0a122b
+                ret =  vhdx_log_write_and_flush(bs, s, &bat_entry,
0a122b
+                                                sizeof(VHDXBatEntry),
0a122b
+                                                bat_entry_offset);
0a122b
+                if (ret < 0) {
0a122b
+                    goto exit;
0a122b
+                }
0a122b
+            }
0a122b
+
0a122b
+            nb_sectors -= sinfo.sectors_avail;
0a122b
+            sector_num += sinfo.sectors_avail;
0a122b
+            bytes_done += sinfo.bytes_avail;
0a122b
+
0a122b
+        }
0a122b
+    }
0a122b
+
0a122b
+    goto exit;
0a122b
+
0a122b
+error_bat_restore:
0a122b
+    if (bat_update) {
0a122b
+        /* keep metadata in sync, and restore the bat entry state
0a122b
+         * if error. */
0a122b
+        sinfo.file_offset = bat_prior_offset;
0a122b
+        vhdx_update_bat_table_entry(bs, s, &sinfo, &bat_entry,
0a122b
+                                    &bat_entry_offset, bat_state);
0a122b
+    }
0a122b
+exit:
0a122b
+    qemu_vfree(iov1.iov_base);
0a122b
+    qemu_vfree(iov2.iov_base);
0a122b
+    qemu_co_mutex_unlock(&s->lock);
0a122b
+    qemu_iovec_destroy(&hd_qiov);
0a122b
+    return ret;
0a122b
 }
0a122b
 
0a122b
 
0a122b
diff --git a/block/vhdx.h b/block/vhdx.h
0a122b
index 4bb83de..a85c5c8 100644
0a122b
--- a/block/vhdx.h
0a122b
+++ b/block/vhdx.h
0a122b
@@ -217,7 +217,7 @@ typedef struct QEMU_PACKED VHDXLogDataSector {
0a122b
 #define PAYLOAD_BLOCK_UNDEFINED         1
0a122b
 #define PAYLOAD_BLOCK_ZERO              2
0a122b
 #define PAYLOAD_BLOCK_UNMAPPED          5
0a122b
-#define PAYLOAD_BLOCK_FULL_PRESENT      6
0a122b
+#define PAYLOAD_BLOCK_FULLY_PRESENT     6
0a122b
 #define PAYLOAD_BLOCK_PARTIALLY_PRESENT 7
0a122b
 
0a122b
 #define SB_BLOCK_NOT_PRESENT    0
0a122b
-- 
0a122b
1.7.1
0a122b