0a122b
From 1d6333da24b0175691d25b06c92a3eb7717a8a78 Mon Sep 17 00:00:00 2001
0a122b
From: Max Reitz <mreitz@redhat.com>
0a122b
Date: Tue, 7 Jan 2014 21:57:11 +0100
0a122b
Subject: [PATCH 06/14] qcow2-cluster: Expand zero clusters
0a122b
0a122b
RH-Author: Max Reitz <mreitz@redhat.com>
0a122b
Message-id: <1389131839-12920-7-git-send-email-mreitz@redhat.com>
0a122b
Patchwork-id: 56542
0a122b
O-Subject: [RHEL-7.0 qemu-kvm PATCH v2 06/14] qcow2-cluster: Expand zero clusters
0a122b
Bugzilla: 1033490
0a122b
RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
0a122b
RH-Acked-by: Fam Zheng <famz@redhat.com>
0a122b
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
0a122b
0a122b
BZ: 1033490
0a122b
0a122b
Add functionality for expanding zero clusters. This is necessary for
0a122b
downgrading the image version to one without zero cluster support.
0a122b
0a122b
For non-backed images, this function may also just discard zero clusters
0a122b
instead of truly expanding them.
0a122b
0a122b
Signed-off-by: Max Reitz <mreitz@redhat.com>
0a122b
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
0a122b
(cherry picked from commit 32b6444d23d0ff618d73e5b766600cd258066169)
0a122b
0a122b
Signed-off-by: Max Reitz <mreitz@redhat.com>
0a122b
0a122b
Conflicts:
0a122b
	block/qcow2-cluster.c
0a122b
0a122b
Conflicts arise because the changes that were omitted from downstream
0a122b
commit ca635f6c3ae10562a2165590bb84667aa61ad12f (since this particular
0a122b
commit had not been backported at that time) have to be included here.
0a122b
0a122b
The same applies to the change in qcow2-cluster.c omitted from
0a122b
downstream commit 0f795e4e9e128f9ff88a128ed8590d2357228b80.
0a122b
---
0a122b
 block/qcow2-cluster.c  | 232 +++++++++++++++++++++++++++++++++++++++++++++++++
0a122b
 block/qcow2-refcount.c |  29 ++++---
0a122b
 block/qcow2.h          |   5 ++
0a122b
 3 files changed, 252 insertions(+), 14 deletions(-)
0a122b
0a122b
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
0a122b
---
0a122b
 block/qcow2-cluster.c  |  232 ++++++++++++++++++++++++++++++++++++++++++++++++
0a122b
 block/qcow2-refcount.c |   29 +++---
0a122b
 block/qcow2.h          |    5 +
0a122b
 3 files changed, 252 insertions(+), 14 deletions(-)
0a122b
0a122b
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
0a122b
index 5d13515..796d7c5 100644
0a122b
--- a/block/qcow2-cluster.c
0a122b
+++ b/block/qcow2-cluster.c
0a122b
@@ -1499,3 +1499,235 @@ fail:
0a122b
 
0a122b
     return ret;
0a122b
 }
0a122b
+
0a122b
+/*
0a122b
+ * Expands all zero clusters in a specific L1 table (or deallocates them, for
0a122b
+ * non-backed non-pre-allocated zero clusters).
0a122b
+ *
0a122b
+ * expanded_clusters is a bitmap where every bit corresponds to one cluster in
0a122b
+ * the image file; a bit gets set if the corresponding cluster has been used for
0a122b
+ * zero expansion (i.e., has been filled with zeroes and is referenced from an
0a122b
+ * L2 table). nb_clusters contains the total cluster count of the image file,
0a122b
+ * i.e., the number of bits in expanded_clusters.
0a122b
+ */
0a122b
+static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
0a122b
+                                      int l1_size, uint8_t *expanded_clusters,
0a122b
+                                      uint64_t nb_clusters)
0a122b
+{
0a122b
+    BDRVQcowState *s = bs->opaque;
0a122b
+    bool is_active_l1 = (l1_table == s->l1_table);
0a122b
+    uint64_t *l2_table = NULL;
0a122b
+    int ret;
0a122b
+    int i, j;
0a122b
+
0a122b
+    if (!is_active_l1) {
0a122b
+        /* inactive L2 tables require a buffer to be stored in when loading
0a122b
+         * them from disk */
0a122b
+        l2_table = qemu_blockalign(bs, s->cluster_size);
0a122b
+    }
0a122b
+
0a122b
+    for (i = 0; i < l1_size; i++) {
0a122b
+        uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
0a122b
+        bool l2_dirty = false;
0a122b
+
0a122b
+        if (!l2_offset) {
0a122b
+            /* unallocated */
0a122b
+            continue;
0a122b
+        }
0a122b
+
0a122b
+        if (is_active_l1) {
0a122b
+            /* get active L2 tables from cache */
0a122b
+            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
0a122b
+                    (void **)&l2_table);
0a122b
+        } else {
0a122b
+            /* load inactive L2 tables from disk */
0a122b
+            ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE,
0a122b
+                    (void *)l2_table, s->cluster_sectors);
0a122b
+        }
0a122b
+        if (ret < 0) {
0a122b
+            goto fail;
0a122b
+        }
0a122b
+
0a122b
+        for (j = 0; j < s->l2_size; j++) {
0a122b
+            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
0a122b
+            int64_t offset = l2_entry & L2E_OFFSET_MASK, cluster_index;
0a122b
+            int cluster_type = qcow2_get_cluster_type(l2_entry);
0a122b
+
0a122b
+            if (cluster_type == QCOW2_CLUSTER_NORMAL) {
0a122b
+                cluster_index = offset >> s->cluster_bits;
0a122b
+                assert((cluster_index >= 0) && (cluster_index < nb_clusters));
0a122b
+                if (expanded_clusters[cluster_index / 8] &
0a122b
+                    (1 << (cluster_index % 8))) {
0a122b
+                    /* Probably a shared L2 table; this cluster was a zero
0a122b
+                     * cluster which has been expanded, its refcount
0a122b
+                     * therefore most likely requires an update. */
0a122b
+                    ret = qcow2_update_cluster_refcount(bs, cluster_index, 1,
0a122b
+                                                        QCOW2_DISCARD_NEVER);
0a122b
+                    if (ret < 0) {
0a122b
+                        goto fail;
0a122b
+                    }
0a122b
+                    /* Since we just increased the refcount, the COPIED flag may
0a122b
+                     * no longer be set. */
0a122b
+                    l2_table[j] = cpu_to_be64(l2_entry & ~QCOW_OFLAG_COPIED);
0a122b
+                    l2_dirty = true;
0a122b
+                }
0a122b
+                continue;
0a122b
+            }
0a122b
+            else if (qcow2_get_cluster_type(l2_entry) != QCOW2_CLUSTER_ZERO) {
0a122b
+                continue;
0a122b
+            }
0a122b
+
0a122b
+            if (!offset) {
0a122b
+                /* not preallocated */
0a122b
+                if (!bs->backing_hd) {
0a122b
+                    /* not backed; therefore we can simply deallocate the
0a122b
+                     * cluster */
0a122b
+                    l2_table[j] = 0;
0a122b
+                    l2_dirty = true;
0a122b
+                    continue;
0a122b
+                }
0a122b
+
0a122b
+                offset = qcow2_alloc_clusters(bs, s->cluster_size);
0a122b
+                if (offset < 0) {
0a122b
+                    ret = offset;
0a122b
+                    goto fail;
0a122b
+                }
0a122b
+            }
0a122b
+
0a122b
+            ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
0a122b
+            if (ret < 0) {
0a122b
+                qcow2_free_clusters(bs, offset, s->cluster_size,
0a122b
+                        QCOW2_DISCARD_ALWAYS);
0a122b
+                goto fail;
0a122b
+            }
0a122b
+
0a122b
+            ret = bdrv_write_zeroes(bs->file, offset / BDRV_SECTOR_SIZE,
0a122b
+                                    s->cluster_sectors, 0);
0a122b
+            if (ret < 0) {
0a122b
+                qcow2_free_clusters(bs, offset, s->cluster_size,
0a122b
+                        QCOW2_DISCARD_ALWAYS);
0a122b
+                goto fail;
0a122b
+            }
0a122b
+
0a122b
+            l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
0a122b
+            l2_dirty = true;
0a122b
+
0a122b
+            cluster_index = offset >> s->cluster_bits;
0a122b
+            assert((cluster_index >= 0) && (cluster_index < nb_clusters));
0a122b
+            expanded_clusters[cluster_index / 8] |= 1 << (cluster_index % 8);
0a122b
+        }
0a122b
+
0a122b
+        if (is_active_l1) {
0a122b
+            if (l2_dirty) {
0a122b
+                qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
0a122b
+                qcow2_cache_depends_on_flush(s->l2_table_cache);
0a122b
+            }
0a122b
+            ret = qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
0a122b
+            if (ret < 0) {
0a122b
+                l2_table = NULL;
0a122b
+                goto fail;
0a122b
+            }
0a122b
+        } else {
0a122b
+            if (l2_dirty) {
0a122b
+                ret = qcow2_pre_write_overlap_check(bs,
0a122b
+                        QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2, l2_offset,
0a122b
+                        s->cluster_size);
0a122b
+                if (ret < 0) {
0a122b
+                    goto fail;
0a122b
+                }
0a122b
+
0a122b
+                ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE,
0a122b
+                        (void *)l2_table, s->cluster_sectors);
0a122b
+                if (ret < 0) {
0a122b
+                    goto fail;
0a122b
+                }
0a122b
+            }
0a122b
+        }
0a122b
+    }
0a122b
+
0a122b
+    ret = 0;
0a122b
+
0a122b
+fail:
0a122b
+    if (l2_table) {
0a122b
+        if (!is_active_l1) {
0a122b
+            qemu_vfree(l2_table);
0a122b
+        } else {
0a122b
+            if (ret < 0) {
0a122b
+                qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
0a122b
+            } else {
0a122b
+                ret = qcow2_cache_put(bs, s->l2_table_cache,
0a122b
+                        (void **)&l2_table);
0a122b
+            }
0a122b
+        }
0a122b
+    }
0a122b
+    return ret;
0a122b
+}
0a122b
+
0a122b
+/*
0a122b
+ * For backed images, expands all zero clusters on the image. For non-backed
0a122b
+ * images, deallocates all non-pre-allocated zero clusters (and claims the
0a122b
+ * allocation for pre-allocated ones). This is important for downgrading to a
0a122b
+ * qcow2 version which doesn't yet support metadata zero clusters.
0a122b
+ */
0a122b
+int qcow2_expand_zero_clusters(BlockDriverState *bs)
0a122b
+{
0a122b
+    BDRVQcowState *s = bs->opaque;
0a122b
+    uint64_t *l1_table = NULL;
0a122b
+    int cluster_to_sector_bits = s->cluster_bits - BDRV_SECTOR_BITS;
0a122b
+    uint64_t nb_clusters;
0a122b
+    uint8_t *expanded_clusters;
0a122b
+    int ret;
0a122b
+    int i, j;
0a122b
+
0a122b
+    nb_clusters = (bs->total_sectors + (1 << cluster_to_sector_bits) - 1)
0a122b
+            >> cluster_to_sector_bits;
0a122b
+    expanded_clusters = g_malloc0((nb_clusters + 7) / 8);
0a122b
+
0a122b
+    ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
0a122b
+                                     expanded_clusters, nb_clusters);
0a122b
+    if (ret < 0) {
0a122b
+        goto fail;
0a122b
+    }
0a122b
+
0a122b
+    /* Inactive L1 tables may point to active L2 tables - therefore it is
0a122b
+     * necessary to flush the L2 table cache before trying to access the L2
0a122b
+     * tables pointed to by inactive L1 entries (else we might try to expand
0a122b
+     * zero clusters that have already been expanded); furthermore, it is also
0a122b
+     * necessary to empty the L2 table cache, since it may contain tables which
0a122b
+     * are now going to be modified directly on disk, bypassing the cache.
0a122b
+     * qcow2_cache_empty() does both for us. */
0a122b
+    ret = qcow2_cache_empty(bs, s->l2_table_cache);
0a122b
+    if (ret < 0) {
0a122b
+        goto fail;
0a122b
+    }
0a122b
+
0a122b
+    for (i = 0; i < s->nb_snapshots; i++) {
0a122b
+        int l1_sectors = (s->snapshots[i].l1_size * sizeof(uint64_t) +
0a122b
+                BDRV_SECTOR_SIZE - 1) / BDRV_SECTOR_SIZE;
0a122b
+
0a122b
+        l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE);
0a122b
+
0a122b
+        ret = bdrv_read(bs->file, s->snapshots[i].l1_table_offset /
0a122b
+                BDRV_SECTOR_SIZE, (void *)l1_table, l1_sectors);
0a122b
+        if (ret < 0) {
0a122b
+            goto fail;
0a122b
+        }
0a122b
+
0a122b
+        for (j = 0; j < s->snapshots[i].l1_size; j++) {
0a122b
+            be64_to_cpus(&l1_table[j]);
0a122b
+        }
0a122b
+
0a122b
+        ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
0a122b
+                                         expanded_clusters, nb_clusters);
0a122b
+        if (ret < 0) {
0a122b
+            goto fail;
0a122b
+        }
0a122b
+    }
0a122b
+
0a122b
+    ret = 0;
0a122b
+
0a122b
+fail:
0a122b
+    g_free(expanded_clusters);
0a122b
+    g_free(l1_table);
0a122b
+    return ret;
0a122b
+}
0a122b
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
0a122b
index 2b72d5e..389a837 100644
0a122b
--- a/block/qcow2-refcount.c
0a122b
+++ b/block/qcow2-refcount.c
0a122b
@@ -601,10 +601,10 @@ fail:
0a122b
  * If the return value is non-negative, it is the new refcount of the cluster.
0a122b
  * If it is negative, it is -errno and indicates an error.
0a122b
  */
0a122b
-static int update_cluster_refcount(BlockDriverState *bs,
0a122b
-                                   int64_t cluster_index,
0a122b
-                                   int addend,
0a122b
-                                   enum qcow2_discard_type type)
0a122b
+int qcow2_update_cluster_refcount(BlockDriverState *bs,
0a122b
+                                  int64_t cluster_index,
0a122b
+                                  int addend,
0a122b
+                                  enum qcow2_discard_type type)
0a122b
 {
0a122b
     BDRVQcowState *s = bs->opaque;
0a122b
     int ret;
0a122b
@@ -733,8 +733,8 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
0a122b
         if (free_in_cluster == 0)
0a122b
             s->free_byte_offset = 0;
0a122b
         if ((offset & (s->cluster_size - 1)) != 0)
0a122b
-            update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
0a122b
-                                    QCOW2_DISCARD_NEVER);
0a122b
+            qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
0a122b
+                                          QCOW2_DISCARD_NEVER);
0a122b
     } else {
0a122b
         offset = qcow2_alloc_clusters(bs, s->cluster_size);
0a122b
         if (offset < 0) {
0a122b
@@ -744,8 +744,8 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
0a122b
         if ((cluster_offset + s->cluster_size) == offset) {
0a122b
             /* we are lucky: contiguous data */
0a122b
             offset = s->free_byte_offset;
0a122b
-            update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
0a122b
-                                    QCOW2_DISCARD_NEVER);
0a122b
+            qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
0a122b
+                                          QCOW2_DISCARD_NEVER);
0a122b
             s->free_byte_offset += size;
0a122b
         } else {
0a122b
             s->free_byte_offset = offset;
0a122b
@@ -754,8 +754,8 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
0a122b
     }
0a122b
 
0a122b
     /* The cluster refcount was incremented, either by qcow2_alloc_clusters()
0a122b
-     * or explicitly by update_cluster_refcount().  Refcount blocks must be
0a122b
-     * flushed before the caller's L2 table updates.
0a122b
+     * or explicitly by qcow2_update_cluster_refcount().  Refcount blocks must
0a122b
+     * be flushed before the caller's L2 table updates.
0a122b
      */
0a122b
     qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
0a122b
     return offset;
0a122b
@@ -896,8 +896,9 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
0a122b
                             break;
0a122b
                         }
0a122b
                         if (addend != 0) {
0a122b
-                            refcount = update_cluster_refcount(bs, cluster_index, addend,
0a122b
-                                                               QCOW2_DISCARD_SNAPSHOT);
0a122b
+                            refcount = qcow2_update_cluster_refcount(bs,
0a122b
+                                    cluster_index, addend,
0a122b
+                                    QCOW2_DISCARD_SNAPSHOT);
0a122b
                         } else {
0a122b
                             refcount = get_refcount(bs, cluster_index);
0a122b
                         }
0a122b
@@ -936,8 +937,8 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
0a122b
 
0a122b
 
0a122b
             if (addend != 0) {
0a122b
-                refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend,
0a122b
-                                                   QCOW2_DISCARD_SNAPSHOT);
0a122b
+                refcount = qcow2_update_cluster_refcount(bs, l2_offset >>
0a122b
+                        s->cluster_bits, addend, QCOW2_DISCARD_SNAPSHOT);
0a122b
             } else {
0a122b
                 refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
0a122b
             }
0a122b
diff --git a/block/qcow2.h b/block/qcow2.h
0a122b
index 5ca6b78..2660483 100644
0a122b
--- a/block/qcow2.h
0a122b
+++ b/block/qcow2.h
0a122b
@@ -422,6 +422,9 @@ int qcow2_update_header(BlockDriverState *bs);
0a122b
 int qcow2_refcount_init(BlockDriverState *bs);
0a122b
 void qcow2_refcount_close(BlockDriverState *bs);
0a122b
 
0a122b
+int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index,
0a122b
+                                  int addend, enum qcow2_discard_type type);
0a122b
+
0a122b
 int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size);
0a122b
 int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
0a122b
     int nb_clusters);
0a122b
@@ -469,6 +472,8 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
0a122b
     int nb_sectors);
0a122b
 int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors);
0a122b
 
0a122b
+int qcow2_expand_zero_clusters(BlockDriverState *bs);
0a122b
+
0a122b
 /* qcow2-snapshot.c functions */
0a122b
 int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info);
0a122b
 int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id);
0a122b
-- 
0a122b
1.7.1
0a122b