9ae3a8
From 4413b8524dfa9dc3a6a494a2cf031265d6ef16f3 Mon Sep 17 00:00:00 2001
9ae3a8
From: Max Reitz <mreitz@redhat.com>
9ae3a8
Date: Mon, 4 Nov 2013 22:32:00 +0100
9ae3a8
Subject: [PATCH 07/87] qcow2: Metadata overlap checks
9ae3a8
9ae3a8
RH-Author: Max Reitz <mreitz@redhat.com>
9ae3a8
Message-id: <1383604354-12743-10-git-send-email-mreitz@redhat.com>
9ae3a8
Patchwork-id: 55309
9ae3a8
O-Subject: [RHEL-7.0 qemu-kvm PATCH 09/43] qcow2: Metadata overlap checks
9ae3a8
Bugzilla: 1004347
9ae3a8
RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
9ae3a8
RH-Acked-by: Laszlo Ersek <lersek@redhat.com>
9ae3a8
RH-Acked-by: Fam Zheng <famz@redhat.com>
9ae3a8
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
9ae3a8
9ae3a8
BZ: 1004347
9ae3a8
9ae3a8
Two new functions are added; the first one checks a given range in the
9ae3a8
image file for overlaps with metadata (main header, L1 tables, L2
9ae3a8
tables, refcount table and blocks, snapshot table).
9ae3a8
9ae3a8
The second one should be used immediately before writing to the image
9ae3a8
file as it calls the first function and, upon collision, marks the
9ae3a8
image as corrupt and makes the BDS unusable, thereby preventing
9ae3a8
further access.
9ae3a8
9ae3a8
Both functions take a bitmask argument specifying the structures which
9ae3a8
should be checked for overlaps, making it possible to also check
9ae3a8
metadata writes against colliding with other structures.
9ae3a8
9ae3a8
Signed-off-by: Max Reitz <mreitz@redhat.com>
9ae3a8
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9ae3a8
(cherry picked from commit a40f1c2add4d5f58d594f810fe36cabcf32bc4b0)
9ae3a8
9ae3a8
Signed-off-by: Max Reitz <mreitz@redhat.com>
9ae3a8
---
9ae3a8
 block/qcow2-refcount.c    | 172 ++++++++++++++++++++++++++++++++++++++++++++++
9ae3a8
 block/qcow2.h             |  39 +++++++++++
9ae3a8
 include/monitor/monitor.h |   1 +
9ae3a8
 monitor.c                 |   1 +
9ae3a8
 4 files changed, 213 insertions(+)
9ae3a8
9ae3a8
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
9ae3a8
---
9ae3a8
 block/qcow2-refcount.c    |  172 +++++++++++++++++++++++++++++++++++++++++++++
9ae3a8
 block/qcow2.h             |   39 ++++++++++
9ae3a8
 include/monitor/monitor.h |    1 +
9ae3a8
 monitor.c                 |    1 +
9ae3a8
 4 files changed, 213 insertions(+), 0 deletions(-)
9ae3a8
9ae3a8
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
9ae3a8
index 1244693..310efcc 100644
9ae3a8
--- a/block/qcow2-refcount.c
9ae3a8
+++ b/block/qcow2-refcount.c
9ae3a8
@@ -25,6 +25,8 @@
9ae3a8
 #include "qemu-common.h"
9ae3a8
 #include "block/block_int.h"
9ae3a8
 #include "block/qcow2.h"
9ae3a8
+#include "qemu/range.h"
9ae3a8
+#include "qapi/qmp/types.h"
9ae3a8
 
9ae3a8
 static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size);
9ae3a8
 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
9ae3a8
@@ -1372,3 +1374,173 @@ fail:
9ae3a8
     return ret;
9ae3a8
 }
9ae3a8
 
9ae3a8
+#define overlaps_with(ofs, sz) \
9ae3a8
+    ranges_overlap(offset, size, ofs, sz)
9ae3a8
+
9ae3a8
+/*
9ae3a8
+ * Checks if the given offset into the image file is actually free to use by
9ae3a8
+ * looking for overlaps with important metadata sections (L1/L2 tables etc.),
9ae3a8
+ * i.e. a sanity check without relying on the refcount tables.
9ae3a8
+ *
9ae3a8
+ * The chk parameter specifies exactly what checks to perform (being a bitmask
9ae3a8
+ * of QCow2MetadataOverlap values).
9ae3a8
+ *
9ae3a8
+ * Returns:
9ae3a8
+ * - 0 if writing to this offset will not affect the mentioned metadata
9ae3a8
+ * - a positive QCow2MetadataOverlap value indicating one overlapping section
9ae3a8
+ * - a negative value (-errno) indicating an error while performing a check,
9ae3a8
+ *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
9ae3a8
+ */
9ae3a8
+int qcow2_check_metadata_overlap(BlockDriverState *bs, int chk, int64_t offset,
9ae3a8
+                                 int64_t size)
9ae3a8
+{
9ae3a8
+    BDRVQcowState *s = bs->opaque;
9ae3a8
+    int i, j;
9ae3a8
+
9ae3a8
+    if (!size) {
9ae3a8
+        return 0;
9ae3a8
+    }
9ae3a8
+
9ae3a8
+    if (chk & QCOW2_OL_MAIN_HEADER) {
9ae3a8
+        if (offset < s->cluster_size) {
9ae3a8
+            return QCOW2_OL_MAIN_HEADER;
9ae3a8
+        }
9ae3a8
+    }
9ae3a8
+
9ae3a8
+    /* align range to test to cluster boundaries */
9ae3a8
+    size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
9ae3a8
+    offset = start_of_cluster(s, offset);
9ae3a8
+
9ae3a8
+    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
9ae3a8
+        if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
9ae3a8
+            return QCOW2_OL_ACTIVE_L1;
9ae3a8
+        }
9ae3a8
+    }
9ae3a8
+
9ae3a8
+    if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
9ae3a8
+        if (overlaps_with(s->refcount_table_offset,
9ae3a8
+            s->refcount_table_size * sizeof(uint64_t))) {
9ae3a8
+            return QCOW2_OL_REFCOUNT_TABLE;
9ae3a8
+        }
9ae3a8
+    }
9ae3a8
+
9ae3a8
+    if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
9ae3a8
+        if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
9ae3a8
+            return QCOW2_OL_SNAPSHOT_TABLE;
9ae3a8
+        }
9ae3a8
+    }
9ae3a8
+
9ae3a8
+    if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
9ae3a8
+        for (i = 0; i < s->nb_snapshots; i++) {
9ae3a8
+            if (s->snapshots[i].l1_size &&
9ae3a8
+                overlaps_with(s->snapshots[i].l1_table_offset,
9ae3a8
+                s->snapshots[i].l1_size * sizeof(uint64_t))) {
9ae3a8
+                return QCOW2_OL_INACTIVE_L1;
9ae3a8
+            }
9ae3a8
+        }
9ae3a8
+    }
9ae3a8
+
9ae3a8
+    if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
9ae3a8
+        for (i = 0; i < s->l1_size; i++) {
9ae3a8
+            if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
9ae3a8
+                overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
9ae3a8
+                s->cluster_size)) {
9ae3a8
+                return QCOW2_OL_ACTIVE_L2;
9ae3a8
+            }
9ae3a8
+        }
9ae3a8
+    }
9ae3a8
+
9ae3a8
+    if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
9ae3a8
+        for (i = 0; i < s->refcount_table_size; i++) {
9ae3a8
+            if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
9ae3a8
+                overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
9ae3a8
+                s->cluster_size)) {
9ae3a8
+                return QCOW2_OL_REFCOUNT_BLOCK;
9ae3a8
+            }
9ae3a8
+        }
9ae3a8
+    }
9ae3a8
+
9ae3a8
+    if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
9ae3a8
+        for (i = 0; i < s->nb_snapshots; i++) {
9ae3a8
+            uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
9ae3a8
+            uint32_t l1_sz  = s->snapshots[i].l1_size;
9ae3a8
+            uint64_t *l1 = g_malloc(l1_sz * sizeof(uint64_t));
9ae3a8
+            int ret;
9ae3a8
+
9ae3a8
+            ret = bdrv_read(bs->file, l1_ofs / BDRV_SECTOR_SIZE, (uint8_t *)l1,
9ae3a8
+                            l1_sz * sizeof(uint64_t) / BDRV_SECTOR_SIZE);
9ae3a8
+
9ae3a8
+            if (ret < 0) {
9ae3a8
+                g_free(l1);
9ae3a8
+                return ret;
9ae3a8
+            }
9ae3a8
+
9ae3a8
+            for (j = 0; j < l1_sz; j++) {
9ae3a8
+                if ((l1[j] & L1E_OFFSET_MASK) &&
9ae3a8
+                    overlaps_with(l1[j] & L1E_OFFSET_MASK, s->cluster_size)) {
9ae3a8
+                    g_free(l1);
9ae3a8
+                    return QCOW2_OL_INACTIVE_L2;
9ae3a8
+                }
9ae3a8
+            }
9ae3a8
+
9ae3a8
+            g_free(l1);
9ae3a8
+        }
9ae3a8
+    }
9ae3a8
+
9ae3a8
+    return 0;
9ae3a8
+}
9ae3a8
+
9ae3a8
+static const char *metadata_ol_names[] = {
9ae3a8
+    [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
9ae3a8
+    [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
9ae3a8
+    [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
9ae3a8
+    [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
9ae3a8
+    [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
9ae3a8
+    [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
9ae3a8
+    [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
9ae3a8
+    [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
9ae3a8
+};
9ae3a8
+
9ae3a8
+/*
9ae3a8
+ * First performs a check for metadata overlaps (through
9ae3a8
+ * qcow2_check_metadata_overlap); if that fails with a negative value (error
9ae3a8
+ * while performing a check), that value is returned. If an impending overlap
9ae3a8
+ * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
9ae3a8
+ * and -EIO returned.
9ae3a8
+ *
9ae3a8
+ * Returns 0 if there were neither overlaps nor errors while checking for
9ae3a8
+ * overlaps; or a negative value (-errno) on error.
9ae3a8
+ */
9ae3a8
+int qcow2_pre_write_overlap_check(BlockDriverState *bs, int chk, int64_t offset,
9ae3a8
+                                  int64_t size)
9ae3a8
+{
9ae3a8
+    int ret = qcow2_check_metadata_overlap(bs, chk, offset, size);
9ae3a8
+
9ae3a8
+    if (ret < 0) {
9ae3a8
+        return ret;
9ae3a8
+    } else if (ret > 0) {
9ae3a8
+        int metadata_ol_bitnr = ffs(ret) - 1;
9ae3a8
+        char *message;
9ae3a8
+        QObject *data;
9ae3a8
+
9ae3a8
+        assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
9ae3a8
+
9ae3a8
+        fprintf(stderr, "qcow2: Preventing invalid write on metadata (overlaps "
9ae3a8
+                "with %s); image marked as corrupt.\n",
9ae3a8
+                metadata_ol_names[metadata_ol_bitnr]);
9ae3a8
+        message = g_strdup_printf("Prevented %s overwrite",
9ae3a8
+                metadata_ol_names[metadata_ol_bitnr]);
9ae3a8
+        data = qobject_from_jsonf("{ 'device': %s, 'msg': %s, 'offset': %"
9ae3a8
+                PRId64 ", 'size': %" PRId64 " }", bs->device_name, message,
9ae3a8
+                offset, size);
9ae3a8
+        monitor_protocol_event(QEVENT_BLOCK_IMAGE_CORRUPTED, data);
9ae3a8
+        g_free(message);
9ae3a8
+        qobject_decref(data);
9ae3a8
+
9ae3a8
+        qcow2_mark_corrupt(bs);
9ae3a8
+        bs->drv = NULL; /* make BDS unusable */
9ae3a8
+        return -EIO;
9ae3a8
+    }
9ae3a8
+
9ae3a8
+    return 0;
9ae3a8
+}
9ae3a8
diff --git a/block/qcow2.h b/block/qcow2.h
9ae3a8
index 4297487..86ddb30 100644
9ae3a8
--- a/block/qcow2.h
9ae3a8
+++ b/block/qcow2.h
9ae3a8
@@ -289,6 +289,40 @@ enum {
9ae3a8
     QCOW2_CLUSTER_ZERO
9ae3a8
 };
9ae3a8
 
9ae3a8
+typedef enum QCow2MetadataOverlap {
9ae3a8
+    QCOW2_OL_MAIN_HEADER_BITNR    = 0,
9ae3a8
+    QCOW2_OL_ACTIVE_L1_BITNR      = 1,
9ae3a8
+    QCOW2_OL_ACTIVE_L2_BITNR      = 2,
9ae3a8
+    QCOW2_OL_REFCOUNT_TABLE_BITNR = 3,
9ae3a8
+    QCOW2_OL_REFCOUNT_BLOCK_BITNR = 4,
9ae3a8
+    QCOW2_OL_SNAPSHOT_TABLE_BITNR = 5,
9ae3a8
+    QCOW2_OL_INACTIVE_L1_BITNR    = 6,
9ae3a8
+    QCOW2_OL_INACTIVE_L2_BITNR    = 7,
9ae3a8
+
9ae3a8
+    QCOW2_OL_MAX_BITNR            = 8,
9ae3a8
+
9ae3a8
+    QCOW2_OL_NONE           = 0,
9ae3a8
+    QCOW2_OL_MAIN_HEADER    = (1 << QCOW2_OL_MAIN_HEADER_BITNR),
9ae3a8
+    QCOW2_OL_ACTIVE_L1      = (1 << QCOW2_OL_ACTIVE_L1_BITNR),
9ae3a8
+    QCOW2_OL_ACTIVE_L2      = (1 << QCOW2_OL_ACTIVE_L2_BITNR),
9ae3a8
+    QCOW2_OL_REFCOUNT_TABLE = (1 << QCOW2_OL_REFCOUNT_TABLE_BITNR),
9ae3a8
+    QCOW2_OL_REFCOUNT_BLOCK = (1 << QCOW2_OL_REFCOUNT_BLOCK_BITNR),
9ae3a8
+    QCOW2_OL_SNAPSHOT_TABLE = (1 << QCOW2_OL_SNAPSHOT_TABLE_BITNR),
9ae3a8
+    QCOW2_OL_INACTIVE_L1    = (1 << QCOW2_OL_INACTIVE_L1_BITNR),
9ae3a8
+    /* NOTE: Checking overlaps with inactive L2 tables will result in bdrv
9ae3a8
+     * reads. */
9ae3a8
+    QCOW2_OL_INACTIVE_L2    = (1 << QCOW2_OL_INACTIVE_L2_BITNR),
9ae3a8
+} QCow2MetadataOverlap;
9ae3a8
+
9ae3a8
+/* Perform all overlap checks which don't require disk access */
9ae3a8
+#define QCOW2_OL_CACHED \
9ae3a8
+    (QCOW2_OL_MAIN_HEADER | QCOW2_OL_ACTIVE_L1 | QCOW2_OL_ACTIVE_L2 | \
9ae3a8
+     QCOW2_OL_REFCOUNT_TABLE | QCOW2_OL_REFCOUNT_BLOCK | \
9ae3a8
+     QCOW2_OL_SNAPSHOT_TABLE | QCOW2_OL_INACTIVE_L1)
9ae3a8
+
9ae3a8
+/* The default checks to perform */
9ae3a8
+#define QCOW2_OL_DEFAULT QCOW2_OL_CACHED
9ae3a8
+
9ae3a8
 #define L1E_OFFSET_MASK 0x00ffffffffffff00ULL
9ae3a8
 #define L2E_OFFSET_MASK 0x00ffffffffffff00ULL
9ae3a8
 #define L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL
9ae3a8
@@ -390,6 +424,11 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
9ae3a8
 
9ae3a8
 void qcow2_process_discards(BlockDriverState *bs, int ret);
9ae3a8
 
9ae3a8
+int qcow2_check_metadata_overlap(BlockDriverState *bs, int chk, int64_t offset,
9ae3a8
+                                 int64_t size);
9ae3a8
+int qcow2_pre_write_overlap_check(BlockDriverState *bs, int chk, int64_t offset,
9ae3a8
+                                  int64_t size);
9ae3a8
+
9ae3a8
 /* qcow2-cluster.c functions */
9ae3a8
 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
9ae3a8
                         bool exact_size);
9ae3a8
diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h
9ae3a8
index 1a6cfcf..07b41a6 100644
9ae3a8
--- a/include/monitor/monitor.h
9ae3a8
+++ b/include/monitor/monitor.h
9ae3a8
@@ -47,6 +47,7 @@ typedef enum MonitorEvent {
9ae3a8
     QEVENT_BALLOON_CHANGE,
9ae3a8
     QEVENT_SPICE_MIGRATE_COMPLETED,
9ae3a8
     QEVENT_GUEST_PANICKED,
9ae3a8
+    QEVENT_BLOCK_IMAGE_CORRUPTED,
9ae3a8
 
9ae3a8
     /* Add to 'monitor_event_names' array in monitor.c when
9ae3a8
      * defining new events here */
9ae3a8
diff --git a/monitor.c b/monitor.c
9ae3a8
index deb0dc8..c226acf 100644
9ae3a8
--- a/monitor.c
9ae3a8
+++ b/monitor.c
9ae3a8
@@ -504,6 +504,7 @@ static const char *monitor_event_names[] = {
9ae3a8
     [QEVENT_BALLOON_CHANGE] = "BALLOON_CHANGE",
9ae3a8
     [QEVENT_SPICE_MIGRATE_COMPLETED] = "SPICE_MIGRATE_COMPLETED",
9ae3a8
     [QEVENT_GUEST_PANICKED] = "GUEST_PANICKED",
9ae3a8
+    [QEVENT_BLOCK_IMAGE_CORRUPTED] = "BLOCK_IMAGE_CORRUPTED",
9ae3a8
 };
9ae3a8
 QEMU_BUILD_BUG_ON(ARRAY_SIZE(monitor_event_names) != QEVENT_MAX)
9ae3a8
 
9ae3a8
-- 
9ae3a8
1.7.1
9ae3a8