From b453cf6be8429f4438d51eb24fcf49e7d9f14db6 Mon Sep 17 00:00:00 2001
From: Hanna Reitz <hreitz@redhat.com>
Date: Tue, 5 Apr 2022 15:46:50 +0200
Subject: [PATCH 04/16] qcow2: Improve refcount structure rebuilding

RH-Author: Hanna Reitz <hreitz@redhat.com>
RH-MergeRequest: 96: qcow2: Improve refcount structure rebuilding
RH-Commit: [1/4] a3606b7abcaebb4930b566e95b1090aead62dfae (hreitz/qemu-kvm-c-9-s)
RH-Bugzilla: 2072379
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Eric Blake <eblake@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>

When rebuilding the refcount structures (when qemu-img check -r found
errors with refcount = 0, but reference count > 0), the new refcount
table defaults to being put at the image file end[1].  There is no good
reason for that except that it means we will not have to rewrite any
refblocks we already wrote to disk.

Changing the code to rewrite those refblocks is not too difficult,
though, so let us do that.  That is beneficial for images on block
devices, where we cannot really write beyond the end of the image file.

Use this opportunity to add extensive comments to the code, and refactor
it a bit, getting rid of the backwards-jumping goto.

[1] Unless there is something allocated in the area pointed to by the
    last refblock, so we have to write that refblock.  In that case, we
    try to put the reftable in there.
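
Roughly, the resulting control flow is (simplified sketch; function and
variable names are those used in the code below, error handling and the
reftable sizing details omitted):

    /* Write refblocks for all clusters that are currently allocated */
    rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters,
                                      0, *nb_clusters,
                                      &on_disk_reftable,
                                      &on_disk_reftable_entries);

    do {
        /* Find space for the reftable in the in-memory refcount table */
        reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
                                              refcount_table, nb_clusters,
                                              &first_free_cluster);

        /* Rewrite the refblocks that cover the reftable's new clusters */
        reftable_size_changed =
            rebuild_refcounts_write_refblocks(bs, refcount_table,
                                              nb_clusters,
                                              reftable_start_cluster,
                                              reftable_end_cluster,
                                              &on_disk_reftable,
                                              &on_disk_reftable_entries);
    } while (reftable_size_changed);

    /* Convert the reftable to big-endian, write it to disk, and point
     * the image header at it */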

Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1519071
Closes: https://gitlab.com/qemu-project/qemu/-/issues/941
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Hanna Reitz <hreitz@redhat.com>
Message-Id: <20220405134652.19278-2-hreitz@redhat.com>
(cherry picked from commit a8c07ec287554dcefd33733f0e5888a281ddc95e)
Signed-off-by: Hanna Reitz <hreitz@redhat.com>
---
 block/qcow2-refcount.c | 332 +++++++++++++++++++++++++++++------------
 1 file changed, 235 insertions(+), 97 deletions(-)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index b91499410c..c5669eaa51 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -2438,111 +2438,140 @@ static int64_t alloc_clusters_imrt(BlockDriverState *bs,
 }
 
 /*
- * Creates a new refcount structure based solely on the in-memory information
- * given through *refcount_table. All necessary allocations will be reflected
- * in that array.
+ * Helper function for rebuild_refcount_structure().
  *
- * On success, the old refcount structure is leaked (it will be covered by the
- * new refcount structure).
+ * Scan the range of clusters [first_cluster, end_cluster) for allocated
+ * clusters and write all corresponding refblocks to disk.  The refblock
+ * and allocation data is taken from the in-memory refcount table
+ * *refcount_table[] (of size *nb_clusters), which is basically one big
+ * (unlimited size) refblock for the whole image.
+ *
+ * For these refblocks, clusters are allocated using said in-memory
+ * refcount table.  Care is taken that these allocations are reflected
+ * in the refblocks written to disk.
+ *
+ * The refblocks' offsets are written into a reftable, which is
+ * *on_disk_reftable_ptr[] (of size *on_disk_reftable_entries_ptr).  If
+ * that reftable is of insufficient size, it will be resized to fit.
+ * This reftable is not written to disk.
+ *
+ * (If *on_disk_reftable_ptr is not NULL, the entries within are assumed
+ * to point to existing valid refblocks that do not need to be allocated
+ * again.)
+ *
+ * Return whether the on-disk reftable array was resized (true/false),
+ * or -errno on error.
  */
-static int rebuild_refcount_structure(BlockDriverState *bs,
-                                      BdrvCheckResult *res,
-                                      void **refcount_table,
-                                      int64_t *nb_clusters)
+static int rebuild_refcounts_write_refblocks(
+        BlockDriverState *bs, void **refcount_table, int64_t *nb_clusters,
+        int64_t first_cluster, int64_t end_cluster,
+        uint64_t **on_disk_reftable_ptr, uint32_t *on_disk_reftable_entries_ptr
+    )
 {
     BDRVQcow2State *s = bs->opaque;
-    int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0;
+    int64_t cluster;
     int64_t refblock_offset, refblock_start, refblock_index;
-    uint32_t reftable_size = 0;
-    uint64_t *on_disk_reftable = NULL;
+    int64_t first_free_cluster = 0;
+    uint64_t *on_disk_reftable = *on_disk_reftable_ptr;
+    uint32_t on_disk_reftable_entries = *on_disk_reftable_entries_ptr;
     void *on_disk_refblock;
-    int ret = 0;
-    struct {
-        uint64_t reftable_offset;
-        uint32_t reftable_clusters;
-    } QEMU_PACKED reftable_offset_and_clusters;
-
-    qcow2_cache_empty(bs, s->refcount_block_cache);
+    bool reftable_grown = false;
+    int ret;
 
-write_refblocks:
-    for (; cluster < *nb_clusters; cluster++) {
+    for (cluster = first_cluster; cluster < end_cluster; cluster++) {
+        /* Check all clusters to find refblocks that contain non-zero entries */
         if (!s->get_refcount(*refcount_table, cluster)) {
             continue;
         }
 
+        /*
+         * This cluster is allocated, so we need to create a refblock
+         * for it.  The data we will write to disk is just the
+         * respective slice from *refcount_table, so it will contain
+         * accurate refcounts for all clusters belonging to this
+         * refblock.  After we have written it, we will therefore skip
+         * all remaining clusters in this refblock.
+         */
+
         refblock_index = cluster >> s->refcount_block_bits;
         refblock_start = refblock_index << s->refcount_block_bits;
 
-        /* Don't allocate a cluster in a refblock already written to disk */
-        if (first_free_cluster < refblock_start) {
-            first_free_cluster = refblock_start;
-        }
-        refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
-                                              nb_clusters, &first_free_cluster);
-        if (refblock_offset < 0) {
-            fprintf(stderr, "ERROR allocating refblock: %s\n",
-                    strerror(-refblock_offset));
-            res->check_errors++;
-            ret = refblock_offset;
-            goto fail;
-        }
+        if (on_disk_reftable_entries > refblock_index &&
+            on_disk_reftable[refblock_index])
+        {
+            /*
+             * We can get here after a `goto write_refblocks`: We have a
+             * reftable from a previous run, and the refblock is already
+             * allocated.  No need to allocate it again.
+             */
+            refblock_offset = on_disk_reftable[refblock_index];
+        } else {
+            int64_t refblock_cluster_index;
 
-        if (reftable_size <= refblock_index) {
-            uint32_t old_reftable_size = reftable_size;
-            uint64_t *new_on_disk_reftable;
+            /* Don't allocate a cluster in a refblock already written to disk */
+            if (first_free_cluster < refblock_start) {
+                first_free_cluster = refblock_start;
+            }
+            refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
+                                                  nb_clusters,
+                                                  &first_free_cluster);
+            if (refblock_offset < 0) {
+                fprintf(stderr, "ERROR allocating refblock: %s\n",
+                        strerror(-refblock_offset));
+                return refblock_offset;
+            }
 
-            reftable_size = ROUND_UP((refblock_index + 1) * REFTABLE_ENTRY_SIZE,
-                                     s->cluster_size) / REFTABLE_ENTRY_SIZE;
-            new_on_disk_reftable = g_try_realloc(on_disk_reftable,
-                                                 reftable_size *
-                                                 REFTABLE_ENTRY_SIZE);
-            if (!new_on_disk_reftable) {
-                res->check_errors++;
-                ret = -ENOMEM;
-                goto fail;
+            refblock_cluster_index = refblock_offset / s->cluster_size;
+            if (refblock_cluster_index >= end_cluster) {
+                /*
+                 * We must write the refblock that holds this refblock's
+                 * refcount
+                 */
+                end_cluster = refblock_cluster_index + 1;
             }
-            on_disk_reftable = new_on_disk_reftable;
 
-            memset(on_disk_reftable + old_reftable_size, 0,
-                   (reftable_size - old_reftable_size) * REFTABLE_ENTRY_SIZE);
+            if (on_disk_reftable_entries <= refblock_index) {
+                on_disk_reftable_entries =
+                    ROUND_UP((refblock_index + 1) * REFTABLE_ENTRY_SIZE,
+                             s->cluster_size) / REFTABLE_ENTRY_SIZE;
+                on_disk_reftable =
+                    g_try_realloc(on_disk_reftable,
+                                  on_disk_reftable_entries *
+                                  REFTABLE_ENTRY_SIZE);
+                if (!on_disk_reftable) {
+                    return -ENOMEM;
+                }
 
-            /* The offset we have for the reftable is now no longer valid;
-             * this will leak that range, but we can easily fix that by running
-             * a leak-fixing check after this rebuild operation */
-            reftable_offset = -1;
-        } else {
-            assert(on_disk_reftable);
-        }
-        on_disk_reftable[refblock_index] = refblock_offset;
+                memset(on_disk_reftable + *on_disk_reftable_entries_ptr, 0,
+                       (on_disk_reftable_entries -
+                        *on_disk_reftable_entries_ptr) *
+                       REFTABLE_ENTRY_SIZE);
 
-        /* If this is apparently the last refblock (for now), try to squeeze the
-         * reftable in */
-        if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits &&
-            reftable_offset < 0)
-        {
-            uint64_t reftable_clusters = size_to_clusters(s, reftable_size *
-                                                          REFTABLE_ENTRY_SIZE);
-            reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
-                                                  refcount_table, nb_clusters,
-                                                  &first_free_cluster);
-            if (reftable_offset < 0) {
-                fprintf(stderr, "ERROR allocating reftable: %s\n",
-                        strerror(-reftable_offset));
-                res->check_errors++;
-                ret = reftable_offset;
-                goto fail;
+                *on_disk_reftable_ptr = on_disk_reftable;
+                *on_disk_reftable_entries_ptr = on_disk_reftable_entries;
+
+                reftable_grown = true;
+            } else {
+                assert(on_disk_reftable);
             }
+            on_disk_reftable[refblock_index] = refblock_offset;
         }
 
+        /* Refblock is allocated, write it to disk */
+
         ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
                                             s->cluster_size, false);
         if (ret < 0) {
             fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
-            goto fail;
+            return ret;
         }
 
-        /* The size of *refcount_table is always cluster-aligned, therefore the
-         * write operation will not overflow */
+        /*
+         * The refblock is simply a slice of *refcount_table.
+         * Note that the size of *refcount_table is always aligned to
+         * whole clusters, so the write operation will not result in
+         * out-of-bounds accesses.
+         */
         on_disk_refblock = (void *)((char *) *refcount_table +
                                     refblock_index * s->cluster_size);
 
@@ -2550,23 +2579,99 @@ write_refblocks:
                           s->cluster_size);
         if (ret < 0) {
             fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
-            goto fail;
+            return ret;
         }
 
-        /* Go to the end of this refblock */
+        /* This refblock is done, skip to its end */
         cluster = refblock_start + s->refcount_block_size - 1;
     }
 
-    if (reftable_offset < 0) {
-        uint64_t post_refblock_start, reftable_clusters;
+    return reftable_grown;
+}
+
+/*
+ * Creates a new refcount structure based solely on the in-memory information
+ * given through *refcount_table (this in-memory information is basically just
+ * the concatenation of all refblocks).  All necessary allocations will be
+ * reflected in that array.
+ *
+ * On success, the old refcount structure is leaked (it will be covered by the
+ * new refcount structure).
+ */
+static int rebuild_refcount_structure(BlockDriverState *bs,
+                                      BdrvCheckResult *res,
+                                      void **refcount_table,
+                                      int64_t *nb_clusters)
+{
+    BDRVQcow2State *s = bs->opaque;
+    int64_t reftable_offset = -1;
+    int64_t reftable_length = 0;
+    int64_t reftable_clusters;
+    int64_t refblock_index;
+    uint32_t on_disk_reftable_entries = 0;
+    uint64_t *on_disk_reftable = NULL;
+    int ret = 0;
+    int reftable_size_changed = 0;
+    struct {
+        uint64_t reftable_offset;
+        uint32_t reftable_clusters;
+    } QEMU_PACKED reftable_offset_and_clusters;
+
+    qcow2_cache_empty(bs, s->refcount_block_cache);
+
+    /*
+     * For each refblock containing entries, we try to allocate a
+     * cluster (in the in-memory refcount table) and write its offset
+     * into on_disk_reftable[].  We then write the whole refblock to
+     * disk (as a slice of the in-memory refcount table).
+     * This is done by rebuild_refcounts_write_refblocks().
+     *
+     * Once we have scanned all clusters, we try to find space for the
+     * reftable.  This will dirty the in-memory refcount table (i.e.
+     * make it differ from the refblocks we have already written), so we
+     * need to run rebuild_refcounts_write_refblocks() again for the
+     * range of clusters where the reftable has been allocated.
+     *
+     * This second run might make the reftable grow again, in which case
+     * we will need to allocate another space for it, which is why we
+     * repeat all this until the reftable stops growing.
+     *
+     * (This loop will terminate, because with every cluster the
+     * reftable grows, it can accomodate a multitude of more refcounts,
+     * so that at some point this must be able to cover the reftable
+     * and all refblocks describing it.)
+     *
+     * We then convert the reftable to big-endian and write it to disk.
+     *
+     * Note that we never free any reftable allocations.  Doing so would
+     * needlessly complicate the algorithm: The eventual second check
+     * run we do will clean up all leaks we have caused.
+     */
+
+    reftable_size_changed =
+        rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters,
+                                          0, *nb_clusters,
+                                          &on_disk_reftable,
+                                          &on_disk_reftable_entries);
+    if (reftable_size_changed < 0) {
+        res->check_errors++;
+        ret = reftable_size_changed;
+        goto fail;
+    }
+
+    /*
+     * There was no reftable before, so rebuild_refcounts_write_refblocks()
+     * must have increased its size (from 0 to something).
+     */
+    assert(reftable_size_changed);
+
+    do {
+        int64_t reftable_start_cluster, reftable_end_cluster;
+        int64_t first_free_cluster = 0;
+
+        reftable_length = on_disk_reftable_entries * REFTABLE_ENTRY_SIZE;
+        reftable_clusters = size_to_clusters(s, reftable_length);
 
-        post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size);
-        reftable_clusters =
-            size_to_clusters(s, reftable_size * REFTABLE_ENTRY_SIZE);
-        /* Not pretty but simple */
-        if (first_free_cluster < post_refblock_start) {
-            first_free_cluster = post_refblock_start;
-        }
         reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
                                               refcount_table, nb_clusters,
                                               &first_free_cluster);
@@ -2578,24 +2683,55 @@ write_refblocks:
             goto fail;
         }
 
-        goto write_refblocks;
-    }
+        /*
+         * We need to update the affected refblocks, so re-run the
+         * write_refblocks loop for the reftable's range of clusters.
+         */
+        assert(offset_into_cluster(s, reftable_offset) == 0);
+        reftable_start_cluster = reftable_offset / s->cluster_size;
+        reftable_end_cluster = reftable_start_cluster + reftable_clusters;
+        reftable_size_changed =
+            rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters,
+                                              reftable_start_cluster,
+                                              reftable_end_cluster,
+                                              &on_disk_reftable,
+                                              &on_disk_reftable_entries);
+        if (reftable_size_changed < 0) {
+            res->check_errors++;
+            ret = reftable_size_changed;
+            goto fail;
+        }
+
+        /*
+         * If the reftable size has changed, we will need to find a new
+         * allocation, repeating the loop.
+         */
+    } while (reftable_size_changed);
 
-    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
+    /* The above loop must have run at least once */
+    assert(reftable_offset >= 0);
+
+    /*
+     * All allocations are done, all refblocks are written, convert the
+     * reftable to big-endian and write it to disk.
+     */
+
+    for (refblock_index = 0; refblock_index < on_disk_reftable_entries;
+         refblock_index++)
+    {
         cpu_to_be64s(&on_disk_reftable[refblock_index]);
     }
 
-    ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset,
-                                        reftable_size * REFTABLE_ENTRY_SIZE,
+    ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset, reftable_length,
                                         false);
     if (ret < 0) {
         fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
         goto fail;
     }
 
-    assert(reftable_size < INT_MAX / REFTABLE_ENTRY_SIZE);
+    assert(reftable_length < INT_MAX);
     ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable,
-                      reftable_size * REFTABLE_ENTRY_SIZE);
+                      reftable_length);
     if (ret < 0) {
         fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
         goto fail;
@@ -2604,7 +2740,7 @@ write_refblocks:
     /* Enter new reftable into the image header */
     reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
     reftable_offset_and_clusters.reftable_clusters =
-        cpu_to_be32(size_to_clusters(s, reftable_size * REFTABLE_ENTRY_SIZE));
+        cpu_to_be32(reftable_clusters);
     ret = bdrv_pwrite_sync(bs->file,
                            offsetof(QCowHeader, refcount_table_offset),
                            &reftable_offset_and_clusters,
@@ -2614,12 +2750,14 @@ write_refblocks:
         goto fail;
     }
 
-    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
+    for (refblock_index = 0; refblock_index < on_disk_reftable_entries;
+         refblock_index++)
+    {
         be64_to_cpus(&on_disk_reftable[refblock_index]);
     }
     s->refcount_table = on_disk_reftable;
     s->refcount_table_offset = reftable_offset;
-    s->refcount_table_size = reftable_size;
+    s->refcount_table_size = on_disk_reftable_entries;
     update_max_refcount_table_index(s);
 
     return 0;
-- 
2.31.1