Blame SOURCES/kvm-raw-posix-The-SEEK_HOLE-code-is-flawed-rewrite-it.patch

958e1b
From 03b3f6befef3ab33a422d4dad9c2b3892e49b686 Mon Sep 17 00:00:00 2001
09b1b5
From: Max Reitz <mreitz@redhat.com>
09b1b5
Date: Tue, 18 Nov 2014 15:30:20 +0100
958e1b
Subject: [PATCH 41/41] raw-posix: The SEEK_HOLE code is flawed, rewrite it
09b1b5
09b1b5
Message-id: <1416324620-16229-8-git-send-email-mreitz@redhat.com>
09b1b5
Patchwork-id: 62442
09b1b5
O-Subject: [RHEL-7.1/7.0.z qemu-kvm PATCH v3 7/7] raw-posix: The SEEK_HOLE code is flawed, rewrite it
958e1b
Bugzilla: 1160237
09b1b5
RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
09b1b5
RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
09b1b5
RH-Acked-by: Markus Armbruster <armbru@redhat.com>
09b1b5
09b1b5
From: Markus Armbruster <armbru@redhat.com>
09b1b5
09b1b5
On systems where SEEK_HOLE in a trailing hole seeks to EOF (Solaris,
09b1b5
but not Linux), try_seek_hole() reports trailing data instead.
09b1b5
09b1b5
Additionally, unlikely lseek() failures are treated badly:
09b1b5
09b1b5
* When SEEK_HOLE fails, try_seek_hole() reports trailing data.  For
09b1b5
  -ENXIO, there's in fact a trailing hole.  Can happen only when
09b1b5
  something truncated the file since we opened it.
09b1b5
09b1b5
* When SEEK_HOLE succeeds, SEEK_DATA fails, and SEEK_END succeeds,
09b1b5
  then try_seek_hole() reports a trailing hole.  This is okay only
09b1b5
  when SEEK_DATA failed with -ENXIO (which means the non-trailing hole
09b1b5
  found by SEEK_HOLE has since become trailing somehow).  For other
09b1b5
  failures (unlikely), it's wrong.
09b1b5
09b1b5
* When SEEK_HOLE succeeds, SEEK_DATA fails, SEEK_END fails (unlikely),
09b1b5
  then try_seek_hole() reports bogus data [-1,start), which its caller
09b1b5
  raw_co_get_block_status() turns into zero sectors of data.  Could
09b1b5
  theoretically lead to infinite loops in code that attempts to scan
09b1b5
  data vs. hole forward.
09b1b5
09b1b5
Rewrite from scratch, with very careful comments.
09b1b5
09b1b5
Signed-off-by: Markus Armbruster <armbru@redhat.com>
09b1b5
Reviewed-by: Max Reitz <mreitz@redhat.com>
09b1b5
Reviewed-by: Eric Blake <eblake@redhat.com>
09b1b5
Signed-off-by: Max Reitz <mreitz@redhat.com>
09b1b5
(cherry picked from commit d1f06fe665acdd7aa7a46a5ef88172c3d7d3028e)
09b1b5
09b1b5
Signed-off-by: Max Reitz <mreitz@redhat.com>
09b1b5
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
09b1b5
---
09b1b5
 block/raw-posix.c | 111 +++++++++++++++++++++++++++++++++++++++++-------------
09b1b5
 1 file changed, 85 insertions(+), 26 deletions(-)
09b1b5
09b1b5
diff --git a/block/raw-posix.c b/block/raw-posix.c
958e1b
index aeb8a97..6a50856 100644
09b1b5
--- a/block/raw-posix.c
09b1b5
+++ b/block/raw-posix.c
958e1b
@@ -1302,28 +1302,86 @@ out:
09b1b5
     return result;
09b1b5
 }
09b1b5
 
09b1b5
-static int try_seek_hole(BlockDriverState *bs, off_t start, off_t *data,
09b1b5
-                         off_t *hole)
09b1b5
+/*
09b1b5
+ * Find allocation range in @bs around offset @start.
09b1b5
+ * May change underlying file descriptor's file offset.
09b1b5
+ * If @start is not in a hole, store @start in @data, and the
09b1b5
+ * beginning of the next hole in @hole, and return 0.
09b1b5
+ * If @start is in a non-trailing hole, store @start in @hole and the
09b1b5
+ * beginning of the next non-hole in @data, and return 0.
09b1b5
+ * If @start is in a trailing hole or beyond EOF, return -ENXIO.
09b1b5
+ * If we can't find out, return a negative errno other than -ENXIO.
09b1b5
+ */
09b1b5
+static int find_allocation(BlockDriverState *bs, off_t start,
09b1b5
+                           off_t *data, off_t *hole)
09b1b5
 {
09b1b5
 #if defined SEEK_HOLE && defined SEEK_DATA
09b1b5
     BDRVRawState *s = bs->opaque;
09b1b5
+    off_t offs;
09b1b5
 
09b1b5
-    *hole = lseek(s->fd, start, SEEK_HOLE);
09b1b5
-    if (*hole == -1) {
09b1b5
-        return -errno;
09b1b5
+    /*
09b1b5
+     * SEEK_DATA cases:
09b1b5
+     * D1. offs == start: start is in data
09b1b5
+     * D2. offs > start: start is in a hole, next data at offs
09b1b5
+     * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
09b1b5
+     *                              or start is beyond EOF
09b1b5
+     *     If the latter happens, the file has been truncated behind
09b1b5
+     *     our back since we opened it.  All bets are off then.
09b1b5
+     *     Treating like a trailing hole is simplest.
09b1b5
+     * D4. offs < 0, errno != ENXIO: we learned nothing
09b1b5
+     */
09b1b5
+    offs = lseek(s->fd, start, SEEK_DATA);
09b1b5
+    if (offs < 0) {
09b1b5
+        return -errno;          /* D3 or D4 */
09b1b5
+    }
09b1b5
+    assert(offs >= start);
09b1b5
+
09b1b5
+    if (offs > start) {
09b1b5
+        /* D2: in hole, next data at offs */
09b1b5
+        *hole = start;
09b1b5
+        *data = offs;
09b1b5
+        return 0;
09b1b5
     }
09b1b5
 
09b1b5
-    if (*hole > start) {
09b1b5
+    /* D1: in data, end not yet known */
09b1b5
+
09b1b5
+    /*
09b1b5
+     * SEEK_HOLE cases:
09b1b5
+     * H1. offs == start: start is in a hole
09b1b5
+     *     If this happens here, a hole has been dug behind our back
09b1b5
+     *     since the previous lseek().
09b1b5
+     * H2. offs > start: either start is in data, next hole at offs,
09b1b5
+     *                   or start is in trailing hole, EOF at offs
09b1b5
+     *     Linux treats trailing holes like any other hole: offs ==
09b1b5
+     *     start.  Solaris seeks to EOF instead: offs > start (blech).
09b1b5
+     *     If that happens here, a hole has been dug behind our back
09b1b5
+     *     since the previous lseek().
09b1b5
+     * H3. offs < 0, errno = ENXIO: start is beyond EOF
09b1b5
+     *     If this happens, the file has been truncated behind our
09b1b5
+     *     back since we opened it.  Treat it like a trailing hole.
09b1b5
+     * H4. offs < 0, errno != ENXIO: we learned nothing
09b1b5
+     *     Pretend we know nothing at all, i.e. "forget" about D1.
09b1b5
+     */
09b1b5
+    offs = lseek(s->fd, start, SEEK_HOLE);
09b1b5
+    if (offs < 0) {
09b1b5
+        return -errno;          /* D1 and (H3 or H4) */
09b1b5
+    }
09b1b5
+    assert(offs >= start);
09b1b5
+
09b1b5
+    if (offs > start) {
09b1b5
+        /*
09b1b5
+         * D1 and H2: either in data, next hole at offs, or it was in
09b1b5
+         * data but is now in a trailing hole.  In the latter case,
09b1b5
+         * all bets are off.  Treating it as if it there was data all
09b1b5
+         * the way to EOF is safe, so simply do that.
09b1b5
+         */
09b1b5
         *data = start;
09b1b5
-    } else {
09b1b5
-        /* On a hole.  We need another syscall to find its end.  */
09b1b5
-        *data = lseek(s->fd, start, SEEK_DATA);
09b1b5
-        if (*data == -1) {
09b1b5
-            *data = lseek(s->fd, 0, SEEK_END);
09b1b5
-        }
09b1b5
+        *hole = offs;
09b1b5
+        return 0;
09b1b5
     }
09b1b5
 
09b1b5
-    return 0;
09b1b5
+    /* D1 and H1 */
09b1b5
+    return -EBUSY;
09b1b5
 #else
09b1b5
     return -ENOTSUP;
09b1b5
 #endif
958e1b
@@ -1368,25 +1426,26 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
09b1b5
         nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
09b1b5
     }
09b1b5
 
09b1b5
-    ret = try_seek_hole(bs, start, &data, &hole);
09b1b5
-    if (ret < 0) {
09b1b5
-        /* Assume everything is allocated. */
09b1b5
-        data = 0;
09b1b5
-        hole = start + nb_sectors * BDRV_SECTOR_SIZE;
09b1b5
-        ret = 0;
09b1b5
-    }
09b1b5
-
09b1b5
-    assert(ret >= 0);
09b1b5
-
09b1b5
-    if (data <= start) {
09b1b5
+    ret = find_allocation(bs, start, &data, &hole);
09b1b5
+    if (ret == -ENXIO) {
09b1b5
+        /* Trailing hole */
09b1b5
+        *pnum = nb_sectors;
09b1b5
+        ret = BDRV_BLOCK_ZERO;
09b1b5
+    } else if (ret < 0) {
09b1b5
+        /* No info available, so pretend there are no holes */
09b1b5
+        *pnum = nb_sectors;
09b1b5
+        ret = BDRV_BLOCK_DATA;
09b1b5
+    } else if (data == start) {
09b1b5
         /* On a data extent, compute sectors to the end of the extent.  */
09b1b5
         *pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE);
09b1b5
-        return ret | BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
09b1b5
+        ret = BDRV_BLOCK_DATA;
09b1b5
     } else {
09b1b5
         /* On a hole, compute sectors to the beginning of the next extent.  */
09b1b5
+        assert(hole == start);
09b1b5
         *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
09b1b5
-        return ret | BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID | start;
09b1b5
+        ret = BDRV_BLOCK_ZERO;
09b1b5
     }
09b1b5
+    return ret | BDRV_BLOCK_OFFSET_VALID | start;
09b1b5
 }
09b1b5
 
09b1b5
 static coroutine_fn BlockDriverAIOCB *raw_aio_discard(BlockDriverState *bs,
09b1b5
-- 
09b1b5
1.8.3.1
09b1b5