yeahuh / rpms / qemu-kvm

Forked from rpms/qemu-kvm 2 years ago
Clone
0a122b
From 243d499717ebba0c0644620237c266112164d5ed Mon Sep 17 00:00:00 2001
0a122b
From: Jeffrey Cody <jcody@redhat.com>
0a122b
Date: Wed, 20 Nov 2013 19:44:05 +0100
0a122b
Subject: [PATCH 22/25] block: vhdx - add .bdrv_create() support
0a122b
0a122b
RH-Author: Jeffrey Cody <jcody@redhat.com>
0a122b
Message-id: <450971418e351130082c4c5f3c8ac8231810c556.1384975172.git.jcody@redhat.com>
0a122b
Patchwork-id: 55814
0a122b
O-Subject: [RHEL7 qemu-kvm PATCH 22/26] block: vhdx - add .bdrv_create() support
0a122b
Bugzilla: 879234
0a122b
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
0a122b
RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
0a122b
RH-Acked-by: Fam Zheng <famz@redhat.com>
0a122b
0a122b
This adds support for VHDX image creation, for images of type "Fixed"
0a122b
and "Dynamic".  "Differencing" types (i.e., VHDX images with backing
0a122b
files) are currently not supported.
0a122b
0a122b
Options for image creation include:
0a122b
    * log size:
0a122b
        The size of the journaling log for VHDX.  Minimum is 1MB,
0a122b
        and it must be a multiple of 1MB. Invalid log sizes will be
0a122b
        silently fixed by rounding up to the nearest MB.
0a122b
0a122b
        Default is 1MB.
0a122b
0a122b
    * block size:
0a122b
        This is the size of a payload block.  The range is 1MB to 256MB,
0a122b
        inclusive, and must be a multiple of 1MB as well.  Invalid sizes
0a122b
        and multiples will be silently fixed.  If '0' is passed, then
0a122b
        a sane size is chosen (depending on virtual image size).
0a122b
0a122b
        Default is 0 (Auto-select).
0a122b
0a122b
    * subformat:
0a122b
        - "dynamic"
0a122b
            An image without data pre-allocated.
0a122b
        - "fixed"
0a122b
            An image with data pre-allocated.
0a122b
0a122b
        Default is "dynamic"
0a122b
0a122b
When creating the image file, the lettered sections are created:
0a122b
0a122b
.-----------------------------------------------------------------.
0a122b
|   (A)    |   (B)    |    (C)    |     (D)       |     (E)
0a122b
|  File ID |  Header1 |  Header 2 |  Region Tbl 1 |  Region Tbl 2
0a122b
|          |          |           |               |
0a122b
.-----------------------------------------------------------------.
0a122b
0         64KB      128KB       192KB           256KB          320KB
0a122b
0a122b
.---- ~ ----------- ~ ------------ ~ ---------------- ~ -----------.
0a122b
|     (F)     |     (G)       |    (H)    |
0a122b
| Journal Log |  BAT / Bitmap |  Metadata |  .... data ......
0a122b
|             |               |           |
0a122b
.---- ~ ----------- ~ ------------ ~ ---------------- ~ -----------.
0a122b
1MB         (var.)          (var.)      (var.)
0a122b
0a122b
Signed-off-by: Jeff Cody <jcody@redhat.com>
0a122b
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
0a122b
(cherry picked from commit 3412f7b1bd8f250c34c9f933767d06b9444bb821)
0a122b
0a122b
RHEL7 Note: Although the cherry-pick applied clean, the 'bdrv_unref()'
0a122b
            call in the upstream version was reverted back to the
0a122b
            original 'bdrv_delete()' that is present in RHEL7.
0a122b
0a122b
Signed-off-by: Jeff Cody <jcody@redhat.com>
0a122b
---
0a122b
 block/vhdx.c | 558 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
0a122b
 block/vhdx.h |  15 +-
0a122b
 2 files changed, 572 insertions(+), 1 deletion(-)
0a122b
0a122b
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
0a122b
---
0a122b
 block/vhdx.c |  557 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
0a122b
 block/vhdx.h |   15 ++-
0a122b
 2 files changed, 571 insertions(+), 1 deletions(-)
0a122b
0a122b
diff --git a/block/vhdx.c b/block/vhdx.c
0a122b
index 5a112e8..8e3b371 100644
0a122b
--- a/block/vhdx.c
0a122b
+++ b/block/vhdx.c
0a122b
@@ -23,6 +23,19 @@
0a122b
 #include "migration/migration.h"
0a122b
 
0a122b
 #include <uuid/uuid.h>
0a122b
+#include <glib.h>
0a122b
+
0a122b
+/* Options for VHDX creation: names of the driver-specific create options */
0a122b
+
0a122b
+#define VHDX_BLOCK_OPT_LOG_SIZE   "log_size"   /* journaling log size */
0a122b
+#define VHDX_BLOCK_OPT_BLOCK_SIZE "block_size" /* payload block size */
0a122b
+#define VHDX_BLOCK_OPT_ZERO "block_state_zero" /* force 'ZERO' payload blocks */
0a122b
+
0a122b
+typedef enum VHDXImageType { /* image subformat selected at creation time */
0a122b
+    VHDX_TYPE_DYNAMIC = 0,    /* image without data pre-allocated */
0a122b
+    VHDX_TYPE_FIXED,          /* image with data pre-allocated */
0a122b
+    VHDX_TYPE_DIFFERENCING,   /* Currently unsupported */
0a122b
+} VHDXImageType;
0a122b
 
0a122b
 /* Several metadata and region table data entries are identified by
0a122b
  * guids in  a MS-specific GUID format. */
0a122b
@@ -1332,6 +1345,548 @@ exit:
0a122b
 }
0a122b
 
0a122b
 
0a122b
+
0a122b
+/*
+ * Write the two VHDX headers of a newly created image.
+ *
+ * Both copies share the same content apart from the sequence number: the
+ * copy at VHDX_HEADER2_OFFSET is written with a sequence number one higher
+ * than the copy at VHDX_HEADER1_OFFSET.  The header with the highest
+ * sequence number represents the active header.
+ *
+ * @bs:         image file being created
+ * @image_size: virtual disk size (not used by this function)
+ * @log_size:   stored as the log length; the log offset is set to
+ *              VHDX_HEADER_SECTION_END
+ *
+ * Returns the value of the last vhdx_write_header() call that was made;
+ * a negative value is treated as failure.
+ */
+static int vhdx_create_new_headers(BlockDriverState *bs, uint64_t image_size,
+                                   uint32_t log_size)
+{
+    VHDXHeader *header = g_malloc0(sizeof(*header));
+    int rc;
+
+    header->signature       = VHDX_HEADER_SIGNATURE;
+    header->sequence_number = g_random_int();
+    header->log_version     = 0;
+    header->version         = 1;
+    header->log_length      = log_size;
+    header->log_offset      = VHDX_HEADER_SECTION_END;
+    vhdx_guid_generate(&header->file_write_guid);
+    vhdx_guid_generate(&header->data_write_guid);
+
+    rc = vhdx_write_header(bs, header, VHDX_HEADER1_OFFSET, false);
+    if (rc >= 0) {
+        /* the second copy carries the higher sequence number */
+        header->sequence_number++;
+        rc = vhdx_write_header(bs, header, VHDX_HEADER2_OFFSET, false);
+    }
+
+    g_free(header);
+    return rc;
+}
0a122b
+
0a122b
+
0a122b
+/*
+ * Create the Metadata entries.
+ *
+ * For more details on the entries, see section 3.5 (pg 29) in the
+ * VHDX 1.00 specification.
+ *
+ * We support 5 metadata entries (all required by spec):
+ *          File Parameters,
+ *          Virtual Disk Size,
+ *          Page 83 Data,
+ *          Logical Sector Size,
+ *          Physical Sector Size
+ *
+ * The first 64KB of the Metadata section is reserved for the metadata
+ * header and entries; beyond that, the metadata items themselves reside.
+ *
+ * @bs:              image file being created
+ * @image_size:      stored as the virtual disk size
+ * @block_size:      stored as the payload block size
+ * @sector_size:     stored as both logical and physical sector size
+ * @metadata_offset: file offset at which the metadata table is written;
+ *                   the items are written 64KB after it
+ * @type:            VHDX_TYPE_FIXED sets VHDX_PARAMS_LEAVE_BLOCKS_ALLOCED
+ *                   in the file parameters
+ *
+ * Returns the result of the last bdrv_pwrite() call that was made; a
+ * negative value is treated as failure.
+ */
+static int vhdx_create_new_metadata(BlockDriverState *bs,
+                                    uint64_t image_size,
+                                    uint32_t block_size,
+                                    uint32_t sector_size,
+                                    uint64_t metadata_offset,
+                                    VHDXImageType type)
+{
+    int ret = 0;
+    uint32_t offset = 0;
+    void *buffer = NULL;
+    void *entry_buffer;
+    VHDXMetadataTableHeader *md_table;
+    VHDXMetadataTableEntry  *md_table_entry;
+
+    /* Total size of the five metadata items, which are stored back to
+     * back.  This is both the allocation size and the amount written to
+     * disk; writing more would read past the end of the allocation. */
+    const size_t entry_buffer_size =
+                             sizeof(VHDXFileParameters)               +
+                             sizeof(VHDXVirtualDiskSize)              +
+                             sizeof(VHDXPage83Data)                   +
+                             sizeof(VHDXVirtualDiskLogicalSectorSize) +
+                             sizeof(VHDXVirtualDiskPhysicalSectorSize);
+
+    /* Metadata entries */
+    VHDXFileParameters     *mt_file_params;
+    VHDXVirtualDiskSize    *mt_virtual_size;
+    VHDXPage83Data         *mt_page83;
+    VHDXVirtualDiskLogicalSectorSize  *mt_log_sector_size;
+    VHDXVirtualDiskPhysicalSectorSize *mt_phys_sector_size;
+
+    entry_buffer = g_malloc0(entry_buffer_size);
+
+    /* carve the individual items out of the single item buffer */
+    mt_file_params = entry_buffer;
+    offset += sizeof(VHDXFileParameters);
+    mt_virtual_size = entry_buffer + offset;
+    offset += sizeof(VHDXVirtualDiskSize);
+    mt_page83 = entry_buffer + offset;
+    offset += sizeof(VHDXPage83Data);
+    mt_log_sector_size = entry_buffer + offset;
+    offset += sizeof(VHDXVirtualDiskLogicalSectorSize);
+    mt_phys_sector_size = entry_buffer + offset;
+
+    mt_file_params->block_size = cpu_to_le32(block_size);
+    if (type == VHDX_TYPE_FIXED) {
+        mt_file_params->data_bits |= VHDX_PARAMS_LEAVE_BLOCKS_ALLOCED;
+        cpu_to_le32s(&mt_file_params->data_bits);
+    }
+
+    vhdx_guid_generate(&mt_page83->page_83_data);
+    cpu_to_leguids(&mt_page83->page_83_data);
+    mt_virtual_size->virtual_disk_size        = cpu_to_le64(image_size);
+    mt_log_sector_size->logical_sector_size   = cpu_to_le32(sector_size);
+    mt_phys_sector_size->physical_sector_size = cpu_to_le32(sector_size);
+
+    buffer = g_malloc0(VHDX_HEADER_BLOCK_SIZE);
+    md_table = buffer;
+
+    md_table->signature   = VHDX_METADATA_SIGNATURE;
+    md_table->entry_count = 5;
+    vhdx_metadata_header_le_export(md_table);
+
+
+    /* This will reference beyond the reserved table portion */
+    offset = 64 * KiB;
+
+    md_table_entry = buffer + sizeof(VHDXMetadataTableHeader);
+
+    /* Each table entry records where its item lives, relative to the
+     * start of the metadata region, in the same order the items were
+     * laid out in entry_buffer above. */
+    md_table_entry[0].item_id = file_param_guid;
+    md_table_entry[0].offset  = offset;
+    md_table_entry[0].length  = sizeof(VHDXFileParameters);
+    md_table_entry[0].data_bits |= VHDX_META_FLAGS_IS_REQUIRED;
+    offset += md_table_entry[0].length;
+    vhdx_metadata_entry_le_export(&md_table_entry[0]);
+
+    md_table_entry[1].item_id = virtual_size_guid;
+    md_table_entry[1].offset  = offset;
+    md_table_entry[1].length  = sizeof(VHDXVirtualDiskSize);
+    md_table_entry[1].data_bits |= VHDX_META_FLAGS_IS_REQUIRED |
+                                   VHDX_META_FLAGS_IS_VIRTUAL_DISK;
+    offset += md_table_entry[1].length;
+    vhdx_metadata_entry_le_export(&md_table_entry[1]);
+
+    md_table_entry[2].item_id = page83_guid;
+    md_table_entry[2].offset  = offset;
+    md_table_entry[2].length  = sizeof(VHDXPage83Data);
+    md_table_entry[2].data_bits |= VHDX_META_FLAGS_IS_REQUIRED |
+                                   VHDX_META_FLAGS_IS_VIRTUAL_DISK;
+    offset += md_table_entry[2].length;
+    vhdx_metadata_entry_le_export(&md_table_entry[2]);
+
+    md_table_entry[3].item_id = logical_sector_guid;
+    md_table_entry[3].offset  = offset;
+    md_table_entry[3].length  = sizeof(VHDXVirtualDiskLogicalSectorSize);
+    md_table_entry[3].data_bits |= VHDX_META_FLAGS_IS_REQUIRED |
+                                   VHDX_META_FLAGS_IS_VIRTUAL_DISK;
+    offset += md_table_entry[3].length;
+    vhdx_metadata_entry_le_export(&md_table_entry[3]);
+
+    md_table_entry[4].item_id = phys_sector_guid;
+    md_table_entry[4].offset  = offset;
+    md_table_entry[4].length  = sizeof(VHDXVirtualDiskPhysicalSectorSize);
+    md_table_entry[4].data_bits |= VHDX_META_FLAGS_IS_REQUIRED |
+                                   VHDX_META_FLAGS_IS_VIRTUAL_DISK;
+    vhdx_metadata_entry_le_export(&md_table_entry[4]);
+
+    /* metadata table: header plus entries, one full 64KB block */
+    ret = bdrv_pwrite(bs, metadata_offset, buffer, VHDX_HEADER_BLOCK_SIZE);
+    if (ret < 0) {
+        goto exit;
+    }
+
+    /* metadata items: write only what was actually allocated */
+    ret = bdrv_pwrite(bs, metadata_offset + (64 * KiB), entry_buffer,
+                      entry_buffer_size);
+    if (ret < 0) {
+        goto exit;
+    }
+
+
+exit:
+    g_free(buffer);
+    g_free(entry_buffer);
+    return ret;
+}
0a122b
+
0a122b
+/* This creates the actual BAT itself, and extends the image file to its
+ * initial size.  We currently only support 'Dynamic' and 'Fixed' image
+ * types; any other type fails with -ENOTSUP.
+ *
+ *  Dynamic images: the file is extended up to the start of the data area;
+ *                  default state of the BAT is all zeroes.
+ *
+ *  Fixed images: the file is extended to also hold image_size bytes of
+ *                data; default state of the BAT is fully populated, with
+ *                file offsets and state PAYLOAD_BLOCK_FULLY_PRESENT.
+ *
+ * The BAT is written out explicitly only for fixed images, when
+ * use_zero_blocks is set (entries get state PAYLOAD_BLOCK_ZERO), or when
+ * bdrv_has_zero_init() returns 0 for the file.
+ *
+ * s->bat must be NULL on entry.  The table allocated here is temporary:
+ * it is freed and s->bat is reset to NULL before returning.
+ *
+ * rt_bat supplies the BAT file offset and length, and is read before the
+ * caller converts it for writing to disk.
+ *
+ * Returns a negative value on failure.
+ */
+static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s,
+                           uint64_t image_size, VHDXImageType type,
+                           bool use_zero_blocks, VHDXRegionTableEntry *rt_bat)
+{
+    int ret = 0;
+    uint64_t data_file_offset;
+    uint64_t total_sectors = 0;
+    uint64_t sector_num = 0;
+    uint64_t unused;
+    int block_state;
+    VHDXSectorInfo sinfo;
+
+    assert(s->bat == NULL);
+
+    /* this gives a data start after BAT/bitmap entries, and well
+     * past any metadata entries (with a 4 MB buffer for future
+     * expansion) */
+    data_file_offset = rt_bat->file_offset + rt_bat->length + 5 * MiB;
+    total_sectors = image_size >> s->logical_sector_size_bits;
+
+    if (type == VHDX_TYPE_DYNAMIC) {
+        /* All zeroes, so we can just extend the file - the end of the BAT
+         * is the furthest thing we have written yet */
+        ret = bdrv_truncate(bs, data_file_offset);
+        if (ret < 0) {
+            goto exit;
+        }
+    } else if (type == VHDX_TYPE_FIXED) {
+        ret = bdrv_truncate(bs, data_file_offset + image_size);
+        if (ret < 0) {
+            goto exit;
+        }
+    } else {
+        ret = -ENOTSUP;
+        goto exit;
+    }
+
+    if (type == VHDX_TYPE_FIXED ||
+                use_zero_blocks ||
+                bdrv_has_zero_init(bs) == 0) {
+        /* for a fixed file, the default BAT entry is not zero */
+        s->bat = g_malloc0(rt_bat->length);
+        block_state = type == VHDX_TYPE_FIXED ? PAYLOAD_BLOCK_FULLY_PRESENT :
+                                                PAYLOAD_BLOCK_NOT_PRESENT;
+        /* use_zero_blocks overrides the per-type default state */
+        block_state = use_zero_blocks ? PAYLOAD_BLOCK_ZERO : block_state;
+        /* fill the BAT by emulating sector writes of sectors_per_block size */
+        while (sector_num < total_sectors) {
+            vhdx_block_translate(s, sector_num, s->sectors_per_block, &sinfo);
+            sinfo.file_offset = data_file_offset +
+                                (sector_num << s->logical_sector_size_bits);
+            sinfo.file_offset = ROUND_UP(sinfo.file_offset, MiB);
+            vhdx_update_bat_table_entry(bs, s, &sinfo, &unused, &unused,
+                                        block_state);
+            cpu_to_le64s(&s->bat[sinfo.bat_idx]);
+            sector_num += s->sectors_per_block;
+        }
+        ret = bdrv_pwrite(bs, rt_bat->file_offset, s->bat, rt_bat->length);
+        if (ret < 0) {
+            goto exit;
+        }
+    }
+
+
+
+exit:
+    g_free(s->bat);
+    /* s belongs to the caller: do not leave a dangling pointer behind */
+    s->bat = NULL;
+    return ret;
+}
0a122b
+
0a122b
+/* Creates the region table header, and region table entries.
+ * There are 2 supported region table entries: BAT, and Metadata.
+ *
+ * As the calculations for the BAT region table are also needed
+ * to create the BAT itself, we will also cause the BAT to be
+ * created.
+ *
+ * Layout chosen here:
+ *   - the BAT starts at VHDX_HEADER_SECTION_END + log_size, rounded up
+ *     to 1MB, and its length is rounded up to 1MB;
+ *   - the metadata region follows the BAT (1MB aligned), and is 1MB long.
+ *
+ * The same 64KB region table block is written to both
+ * VHDX_REGION_TABLE_OFFSET and VHDX_REGION_TABLE2_OFFSET.
+ *
+ * On return, *metadata_offset holds the file offset of the metadata
+ * region; metadata_offset must not be NULL.
+ *
+ * Returns a negative value on failure, including a failure to create
+ * the BAT.
+ */
+static int vhdx_create_new_region_table(BlockDriverState *bs,
+                                        uint64_t image_size,
+                                        uint32_t block_size,
+                                        uint32_t sector_size,
+                                        uint32_t log_size,
+                                        bool use_zero_blocks,
+                                        VHDXImageType type,
+                                        uint64_t *metadata_offset)
+{
+    int ret = 0;
+    uint32_t offset = 0;
+    void *buffer = NULL;
+    BDRVVHDXState *s = NULL;
+    VHDXRegionTableHeader *region_table;
+    VHDXRegionTableEntry *rt_bat;
+    VHDXRegionTableEntry *rt_metadata;
+
+    assert(metadata_offset != NULL);
+
+    /* Populate enough of the BDRVVHDXState to be able to use the
+     * pre-existing BAT calculation, translation, and update functions */
+    s = g_malloc0(sizeof(BDRVVHDXState));
+
+    s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) *
+                     (uint64_t) sector_size / (uint64_t) block_size;
+
+    s->sectors_per_block = block_size / sector_size;
+    s->virtual_disk_size = image_size;
+    s->block_size = block_size;
+    s->logical_sector_size = sector_size;
+
+    vhdx_set_shift_bits(s);
+
+    vhdx_calc_bat_entries(s);
+
+    /* At this point the VHDX state is populated enough for creation */
+
+    /* a single buffer is used so we can calculate the checksum over the
+     * entire 64KB block */
+    buffer = g_malloc0(VHDX_HEADER_BLOCK_SIZE);
+    region_table = buffer;
+    offset += sizeof(VHDXRegionTableHeader);
+    rt_bat = buffer + offset;
+    offset += sizeof(VHDXRegionTableEntry);
+    rt_metadata  = buffer + offset;
+
+    region_table->signature = VHDX_REGION_SIGNATURE;
+    region_table->entry_count = 2;   /* BAT and Metadata */
+
+    rt_bat->guid        = bat_guid;
+    rt_bat->length      = ROUND_UP(s->bat_entries * sizeof(VHDXBatEntry), MiB);
+    rt_bat->file_offset = ROUND_UP(VHDX_HEADER_SECTION_END + log_size, MiB);
+    s->bat_offset = rt_bat->file_offset;
+
+    rt_metadata->guid        = metadata_guid;
+    rt_metadata->file_offset = ROUND_UP(rt_bat->file_offset + rt_bat->length,
+                                        MiB);
+    rt_metadata->length      = 1 * MiB; /* min size, and more than enough */
+    *metadata_offset = rt_metadata->file_offset;
+
+    /* NOTE(review): the checksum is computed before the le_export calls
+     * below convert the table for disk.  Presumably harmless on
+     * little-endian hosts; confirm the on-disk checksum on big-endian. */
+    vhdx_update_checksum(buffer, VHDX_HEADER_BLOCK_SIZE,
+                         offsetof(VHDXRegionTableHeader, checksum));
+
+
+    /* The region table gives us the data we need to create the BAT,
+     * so do that now.  This must happen before the entries are exported,
+     * as vhdx_create_bat() reads rt_bat's offset and length. */
+    ret = vhdx_create_bat(bs, s, image_size, type, use_zero_blocks, rt_bat);
+    if (ret < 0) {
+        /* do not lose the error by carrying on with the table writes */
+        goto exit;
+    }
+
+    /* Now write out the region headers to disk */
+    vhdx_region_header_le_export(region_table);
+    vhdx_region_entry_le_export(rt_bat);
+    vhdx_region_entry_le_export(rt_metadata);
+
+    ret = bdrv_pwrite(bs, VHDX_REGION_TABLE_OFFSET, buffer,
+                      VHDX_HEADER_BLOCK_SIZE);
+    if (ret < 0) {
+        goto exit;
+    }
+
+    ret = bdrv_pwrite(bs, VHDX_REGION_TABLE2_OFFSET, buffer,
+                      VHDX_HEADER_BLOCK_SIZE);
+    if (ret < 0) {
+        goto exit;
+    }
+
+
+exit:
+    g_free(s);
+    g_free(buffer);
+    return ret;
+}
0a122b
+
0a122b
+/* Create a new VHDX image file (.bdrv_create implementation).
+ *
+ * We need to create the following elements:
+ *
+ *    .-----------------------------------------------------------------.
+ *    |   (A)    |   (B)    |    (C)    |     (D)       |     (E)       |
+ *    |  File ID |  Header1 |  Header 2 |  Region Tbl 1 |  Region Tbl 2 |
+ *    |          |          |           |               |               |
+ *    .-----------------------------------------------------------------.
+ *    0         64KB      128KB       192KB           256KB           320KB
+ *
+ *
+ *    .---- ~ ----------- ~ ------------ ~ ---------------- ~ -----------.
+ *    |     (F)     |     (G)       |    (H)    |                        |
+ *    | Journal Log |  BAT / Bitmap |  Metadata |  .... data ......      |
+ *    |             |               |           |                        |
+ *    .---- ~ ----------- ~ ------------ ~ ---------------- ~ -----------.
+ *   1MB
+ *
+ * Recognized options: size (default 2GB, max VHDX_MAX_IMAGE_SIZE),
+ * log_size (default 1MB; rounded up to a multiple of 1MB, min 1MB),
+ * block_size (0 selects a size based on the image size; otherwise
+ * rounded up to a multiple of 1MB and capped at VHDX_BLOCK_SIZE_MAX),
+ * subformat ("dynamic", the default, or "fixed") and block_state_zero.
+ *
+ * The sector size is fixed at 512 bytes.
+ *
+ * Returns a negative errno value on failure.  errp is set for an
+ * oversized image, an unsupported or unknown subformat, and for
+ * failures to create or open the file.
+ */
+static int vhdx_create(const char *filename, QEMUOptionParameter *options,
+                       Error **errp)
+{
+    int ret = 0;
+    uint64_t image_size = (uint64_t) 2 * GiB;
+    uint32_t log_size   = 1 * MiB;
+    uint32_t block_size = 0;
+    uint64_t signature;
+    uint64_t metadata_offset;
+    bool use_zero_blocks = false;
+
+    gunichar2 *creator = NULL;
+    glong creator_items;
+    BlockDriverState *bs;
+    const char *type = NULL;
+    VHDXImageType image_type;
+    Error *local_err = NULL;
+
+    /* Note that this loop advances 'options' itself, so after it the
+     * pointer refers to the end of the list (or is NULL). */
+    while (options && options->name) {
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            image_size = options->value.n;
+        } else if (!strcmp(options->name, VHDX_BLOCK_OPT_LOG_SIZE)) {
+            log_size = options->value.n;
+        } else if (!strcmp(options->name, VHDX_BLOCK_OPT_BLOCK_SIZE)) {
+            block_size = options->value.n;
+        } else if (!strcmp(options->name, BLOCK_OPT_SUBFMT)) {
+            type = options->value.s;
+        } else if (!strcmp(options->name, VHDX_BLOCK_OPT_ZERO)) {
+            use_zero_blocks = options->value.n != 0;
+        }
+        options++;
+    }
+
+    if (image_size > VHDX_MAX_IMAGE_SIZE) {
+        error_setg_errno(errp, EINVAL, "Image size too large; max of 64TB");
+        ret = -EINVAL;
+        goto exit;
+    }
+
+    if (type == NULL) {
+        type = "dynamic";
+    }
+
+    if (!strcmp(type, "dynamic")) {
+        image_type = VHDX_TYPE_DYNAMIC;
+    } else if (!strcmp(type, "fixed")) {
+        image_type = VHDX_TYPE_FIXED;
+    } else if (!strcmp(type, "differencing")) {
+        error_setg_errno(errp, ENOTSUP,
+                         "Differencing files not yet supported");
+        ret = -ENOTSUP;
+        goto exit;
+    } else {
+        /* report the failure instead of returning a bare error code */
+        error_setg_errno(errp, EINVAL, "Invalid subformat '%s'", type);
+        ret = -EINVAL;
+        goto exit;
+    }
+
+    /* These are pretty arbitrary, and mainly designed to keep the BAT
+     * size reasonable to load into RAM */
+    if (block_size == 0) {
+        if (image_size > 32 * TiB) {
+            block_size = 64 * MiB;
+        } else if (image_size > (uint64_t) 100 * GiB) {
+            block_size = 32 * MiB;
+        } else if (image_size > 1 * GiB) {
+            block_size = 16 * MiB;
+        } else {
+            block_size = 8 * MiB;
+        }
+    }
+
+
+    /* make the log size close to what was specified, but must be
+     * min 1MB, and multiple of 1MB */
+    log_size = ROUND_UP(log_size, MiB);
+    if (log_size < MiB) {
+        /* rounding up leaves 0 at 0; enforce the 1MB minimum */
+        log_size = MiB;
+    }
+
+    block_size = ROUND_UP(block_size, MiB);
+    block_size = block_size > VHDX_BLOCK_SIZE_MAX ? VHDX_BLOCK_SIZE_MAX :
+                                                    block_size;
+
+    /* 'options' was advanced by the parsing loop above, so the option
+     * entries parsed there are not what is handed over here. */
+    ret = bdrv_create_file(filename, options, &local_err);
+    if (ret < 0) {
+        error_propagate(errp, local_err);
+        goto exit;
+    }
+
+    ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR, &local_err);
+    if (ret < 0) {
+        error_propagate(errp, local_err);
+        goto exit;
+    }
+
+    /* Create (A) */
+
+    /* The creator field is optional, but may be useful for
+     * debugging / diagnostics */
+    creator = g_utf8_to_utf16("QEMU v" QEMU_VERSION, -1, NULL,
+                              &creator_items, NULL);
+    signature = cpu_to_le64(VHDX_FILE_SIGNATURE);
+    ret = bdrv_pwrite(bs, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature));
+    if (ret < 0) {
+        goto delete_and_exit;
+    }
+    if (creator) {
+        ret = bdrv_pwrite(bs, VHDX_FILE_ID_OFFSET + sizeof(signature),
+                          creator, creator_items * sizeof(gunichar2));
+        if (ret < 0) {
+            goto delete_and_exit;
+        }
+    }
+
+
+    /* Creates (B),(C) */
+    ret = vhdx_create_new_headers(bs, image_size, log_size);
+    if (ret < 0) {
+        goto delete_and_exit;
+    }
+
+    /* Creates (D),(E),(G) explicitly. (F) created as by-product */
+    ret = vhdx_create_new_region_table(bs, image_size, block_size, 512,
+                                       log_size, use_zero_blocks, image_type,
+                                       &metadata_offset);
+    if (ret < 0) {
+        goto delete_and_exit;
+    }
+
+    /* Creates (H) */
+    ret = vhdx_create_new_metadata(bs, image_size, block_size, 512,
+                                   metadata_offset, image_type);
+    if (ret < 0) {
+        goto delete_and_exit;
+    }
+
+
+
+delete_and_exit:
+    bdrv_delete(bs);
+exit:
+    g_free(creator);
+    return ret;
+}
0a122b
+
0a122b
+/* Create-time options understood by vhdx_create(); the list ends with
+ * an entry whose name is NULL. */
+static QEMUOptionParameter vhdx_create_options[] = {
+    {
+        /* virtual disk size */
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size; max of 64TB."
+    },
+    {
+        /* journaling log size */
+        .name = VHDX_BLOCK_OPT_LOG_SIZE,
+        .type = OPT_SIZE,
+        .value.n = 1 * MiB,
+        .help = "Log size; min 1MB."
+    },
+    {
+        /* payload block size */
+        .name = VHDX_BLOCK_OPT_BLOCK_SIZE,
+        .type = OPT_SIZE,
+        .value.n = 0,
+        .help = "Block Size; min 1MB, max 256MB. "
+                "0 means auto-calculate based on image size."
+    },
+    {
+        /* image subformat */
+        .name = BLOCK_OPT_SUBFMT,
+        .type = OPT_STRING,
+        .help = "VHDX format type, can be either 'dynamic' or 'fixed'. "
+                "Default is 'dynamic'."
+    },
+    {
+        /* force payload block state ZERO in the BAT */
+        .name = VHDX_BLOCK_OPT_ZERO,
+        .type = OPT_FLAG,
+        .help = "Force use of payload blocks of type 'ZERO'.  Non-standard."
+    },
+    { .name = NULL }
+};
0a122b
+
0a122b
 static BlockDriver bdrv_vhdx = {
0a122b
     .format_name            = "vhdx",
0a122b
     .instance_size          = sizeof(BDRVVHDXState),
0a122b
@@ -1342,6 +1897,8 @@ static BlockDriver bdrv_vhdx = {
0a122b
     .bdrv_co_readv          = vhdx_co_readv,
0a122b
     .bdrv_co_writev         = vhdx_co_writev,
0a122b
     .bdrv_get_info          = vhdx_get_info,
0a122b
+    .bdrv_create            = vhdx_create,
0a122b
+    .create_options         = vhdx_create_options,
0a122b
 };
0a122b
 
0a122b
 static void bdrv_vhdx_init(void)
0a122b
diff --git a/block/vhdx.h b/block/vhdx.h
0a122b
index 245547b..365eca0 100644
0a122b
--- a/block/vhdx.h
0a122b
+++ b/block/vhdx.h
0a122b
@@ -18,6 +18,11 @@
0a122b
 #ifndef BLOCK_VHDX_H
0a122b
 #define BLOCK_VHDX_H
0a122b
 
0a122b
+#define KiB              (1 * 1024)
0a122b
+#define MiB            (KiB * 1024)
0a122b
+#define GiB            (MiB * 1024)
0a122b
+#define TiB ((uint64_t) GiB * 1024)
0a122b
+
0a122b
 /* Structures and fields present in the VHDX file */
0a122b
 
0a122b
 /* The header section has the following blocks,
0a122b
@@ -36,8 +41,9 @@
0a122b
 #define VHDX_HEADER1_OFFSET         (VHDX_HEADER_BLOCK_SIZE * 1)
0a122b
 #define VHDX_HEADER2_OFFSET         (VHDX_HEADER_BLOCK_SIZE * 2)
0a122b
 #define VHDX_REGION_TABLE_OFFSET    (VHDX_HEADER_BLOCK_SIZE * 3)
0a122b
+#define VHDX_REGION_TABLE2_OFFSET   (VHDX_HEADER_BLOCK_SIZE * 4)
0a122b
 
0a122b
-
0a122b
+#define VHDX_HEADER_SECTION_END     (1 * MiB)
0a122b
 /*
0a122b
  * A note on the use of MS-GUID fields.  For more details on the GUID,
0a122b
  * please see: https://en.wikipedia.org/wiki/Globally_unique_identifier.
0a122b
@@ -55,6 +61,7 @@
0a122b
 /* These structures are ones that are defined in the VHDX specification
0a122b
  * document */
0a122b
 
0a122b
+#define VHDX_FILE_SIGNATURE 0x656C696678646876  /* "vhdxfile" in ASCII */
0a122b
 typedef struct VHDXFileIdentifier {
0a122b
     uint64_t    signature;              /* "vhdxfile" in ASCII */
0a122b
     uint16_t    creator[256];           /* optional; utf-16 string to identify
0a122b
@@ -85,6 +92,7 @@ typedef struct QEMU_PACKED MSGUID {
0a122b
 /* The full header is 4KB, although the actual header data is much smaller.
0a122b
  * But for the checksum calculation, it is over the entire 4KB structure,
0a122b
  * not just the defined portion of it */
0a122b
+#define VHDX_HEADER_SIGNATURE 0x64616568
0a122b
 typedef struct QEMU_PACKED VHDXHeader {
0a122b
     uint32_t    signature;              /* "head" in ASCII */
0a122b
     uint32_t    checksum;               /* CRC-32C hash of the whole header */
0a122b
@@ -125,6 +133,7 @@ typedef struct QEMU_PACKED VHDXHeader {
0a122b
 } VHDXHeader;
0a122b
 
0a122b
 /* Header for the region table block */
0a122b
+#define VHDX_REGION_SIGNATURE  0x69676572  /* "regi" in ASCII */
0a122b
 typedef struct QEMU_PACKED VHDXRegionTableHeader {
0a122b
     uint32_t    signature;              /* "regi" in ASCII */
0a122b
     uint32_t    checksum;               /* CRC-32C hash of the 64KB table */
0a122b
@@ -238,6 +247,7 @@ typedef uint64_t VHDXBatEntry;
0a122b
 #define VHDX_METADATA_MAX_ENTRIES 2047  /* not including the header */
0a122b
 #define VHDX_METADATA_TABLE_MAX_SIZE \
0a122b
     (VHDX_METADATA_ENTRY_SIZE * (VHDX_METADATA_MAX_ENTRIES+1))
0a122b
+#define VHDX_METADATA_SIGNATURE 0x617461646174656D  /* "metadata" in ASCII */
0a122b
 typedef struct QEMU_PACKED VHDXMetadataTableHeader {
0a122b
     uint64_t    signature;              /* "metadata" in ASCII */
0a122b
     uint16_t    reserved;
0a122b
@@ -267,6 +277,8 @@ typedef struct QEMU_PACKED VHDXMetadataTableEntry {
0a122b
                                                    If set indicates a fixed
0a122b
                                                    size VHDX file */
0a122b
 #define VHDX_PARAMS_HAS_PARENT           0x02    /* has parent / backing file */
0a122b
+#define VHDX_BLOCK_SIZE_MIN             (1   * MiB)
0a122b
+#define VHDX_BLOCK_SIZE_MAX             (256 * MiB)
0a122b
 typedef struct QEMU_PACKED VHDXFileParameters {
0a122b
     uint32_t    block_size;             /* size of each payload block, always
0a122b
                                            power of 2, <= 256MB and >= 1MB. */
0a122b
@@ -274,6 +286,7 @@ typedef struct QEMU_PACKED VHDXFileParameters {
0a122b
                                            the rest are reserved (see above) */
0a122b
 } VHDXFileParameters;
0a122b
 
0a122b
+#define VHDX_MAX_IMAGE_SIZE  ((uint64_t) 64 * TiB)
0a122b
 typedef struct QEMU_PACKED VHDXVirtualDiskSize {
0a122b
     uint64_t    virtual_disk_size;      /* Size of the virtual disk, in bytes.
0a122b
                                            Must be multiple of the sector size,
0a122b
-- 
0a122b
1.7.1
0a122b