olga / rpms / glibc

Forked from rpms/glibc 5 years ago
Clone

Blame SOURCES/glibc-rh1140250.patch

ce426f
commit 7fe9e2e089f4990b7d18d0798f591ab276b15f2b
ce426f
Author: Florian Weimer <fweimer@redhat.com>
ce426f
Date:   Fri Jun 5 10:50:38 2015 +0200
ce426f
ce426f
    posix_fallocate: Emulation fixes and documentation [BZ #15661]
ce426f
    
ce426f
    Handle signed integer overflow correctly.  Detect and reject O_APPEND.
ce426f
    Document drawbacks of emulation.
ce426f
    
ce426f
    This does not completely address bug 15661, but improves the situation
ce426f
    somewhat.
ce426f
ce426f
commit 543ef578c3304661713950b37abd0c916f52ecf0
ce426f
Author: Paul Eggert <eggert@cs.ucla.edu>
ce426f
Date:   Tue Aug 25 23:42:01 2015 -0700
ce426f
ce426f
    Fix broken overflow check in posix_fallocate [BZ 18873]
ce426f
    
ce426f
    * sysdeps/posix/posix_fallocate.c (posix_fallocate):
ce426f
    * sysdeps/posix/posix_fallocate64.c (__posix_fallocate64_l64):
ce426f
    Fix parenthesization typo.
ce426f
ce426f
Index: b/manual/filesys.texi
ce426f
===================================================================
ce426f
--- a/manual/filesys.texi
ce426f
+++ b/manual/filesys.texi
ce426f
@@ -1723,6 +1723,7 @@ modify the attributes of a file.
ce426f
                                  access a file.
ce426f
 * File Times::                  About the time attributes of a file.
ce426f
 * File Size::			Manually changing the size of a file.
ce426f
+* Storage Allocation::          Allocate backing storage for files.
ce426f
 @end menu
ce426f
 
ce426f
 @node Attribute Meanings
ce426f
@@ -3232,6 +3233,99 @@ is a requirement of @code{mmap}.  The pr
ce426f
 real size, and when it has finished a final @code{ftruncate} call should
ce426f
 set the real size of the file.
ce426f
 
ce426f
+@node Storage Allocation
ce426f
+@subsection Storage Allocation
ce426f
+@cindex allocating file storage
ce426f
+@cindex file allocation
ce426f
+@cindex storage allocating
ce426f
+
ce426f
+@cindex file fragmentation
ce426f
+@cindex fragmentation of files
ce426f
+@cindex sparse files
ce426f
+@cindex files, sparse
ce426f
+Most file systems support allocating large files in a non-contiguous
ce426f
+fashion: the file is split into @emph{fragments} which are allocated
ce426f
+sequentially, but the fragments themselves can be scattered across the
ce426f
+disk.  File systems generally try to avoid such fragmentation because it
ce426f
+decreases performance, but if a file gradually increases in size, there
ce426f
+might be no other option than to fragment it.  In addition, many file
ce426f
+systems support @emph{sparse files} with @emph{holes}: regions of null
ce426f
+bytes for which no backing storage has been allocated by the file
ce426f
+system.  When the holes are finally overwritten with data, fragmentation
ce426f
+can occur as well.
ce426f
+
ce426f
+Explicit allocation of storage for yet-unwritten parts of the file can
ce426f
+help the system to avoid fragmentation.  Additionally, if storage
ce426f
+pre-allocation fails, it is possible to report the out-of-disk error
ce426f
+early, often without filling up the entire disk.  However, due to
ce426f
+deduplication, copy-on-write semantics, and file compression, such
ce426f
+pre-allocation may not reliably prevent the out-of-disk-space error from
ce426f
+occurring later.  Checking for write errors is still required, and
ce426f
+writes to memory-mapped regions created with @code{mmap} can still
ce426f
+result in @code{SIGBUS}.
ce426f
+
ce426f
+@deftypefun int posix_fallocate (int @var{fd}, off_t @var{offset}, off_t @var{length})
ce426f
+@safety{@prelim{}@mtsafe{}@assafe{}@acsafe{}}
ce426f
+@c If the file system does not support allocation,
ce426f
+@c @code{posix_fallocate} has a race with file extension (if
ce426f
+@c @var{length} is zero) or with concurrent writes of non-NUL bytes (if
ce426f
+@c @var{length} is positive).
ce426f
+
ce426f
+Allocate backing store for the region of @var{length} bytes starting at
ce426f
+byte @var{offset} in the file for the descriptor @var{fd}.  The file
ce426f
+length is increased to @samp{@var{length} + @var{offset}} if necessary.
ce426f
+
ce426f
+@var{fd} must be a regular file opened for writing, or @code{EBADF} is
ce426f
+returned.  If there is insufficient disk space to fulfill the allocation
ce426f
+request, @code{ENOSPC} is returned.
ce426f
+
ce426f
+@strong{Note:} If @code{fallocate} is not available (because the file
ce426f
+system does not support it), @code{posix_fallocate} is emulated, which
ce426f
+has the following drawbacks:
ce426f
+
ce426f
+@itemize @bullet
ce426f
+@item
ce426f
+It is very inefficient because all file system blocks in the requested
ce426f
+range need to be examined (even if they have been allocated before) and
ce426f
+potentially rewritten.  In contrast, with proper @code{fallocate}
ce426f
+support (see below), the file system can examine the internal file
ce426f
+allocation data structures and eliminate holes directly, maybe even
ce426f
+using unwritten extents (which are pre-allocated but uninitialized on
ce426f
+disk).
ce426f
+
ce426f
+@item
ce426f
+There is a race condition if another thread or process modifies the
ce426f
+underlying file in the to-be-allocated area.  Non-null bytes could be
ce426f
+overwritten with null bytes.
ce426f
+
ce426f
+@item
ce426f
+If @var{fd} has been opened with the @code{O_APPEND} flag, the function
ce426f
+will fail and return the error code @code{EBADF}.
ce426f
+
ce426f
+@item
ce426f
+If @var{length} is zero, @code{ftruncate} is used to increase the file
ce426f
+size as requested, without allocating file system blocks.  There is a
ce426f
+race condition which means that @code{ftruncate} can accidentally
ce426f
+truncate the file if it has been extended concurrently.
ce426f
+@end itemize
ce426f
+
ce426f
+On Linux, if an application does not benefit from emulation or if the
ce426f
+emulation is harmful due to its inherent race conditions, the
ce426f
+application can use the Linux-specific @code{fallocate} function, with a
ce426f
+zero flag argument.  For the @code{fallocate} function, @theglibc{} does
ce426f
+not perform allocation emulation if the file system does not support
ce426f
+allocation.  Instead, an @code{EOPNOTSUPP} is returned to the caller.
ce426f
+
ce426f
+@end deftypefun
ce426f
+
ce426f
+@deftypefun int posix_fallocate64 (int @var{fd}, off64_t @var{offset}, off64_t @var{length})
ce426f
+@safety{@prelim{}@mtsafe{}@assafe{}@acsafe{}}
ce426f
+
ce426f
+This function is a variant of @code{posix_fallocate} which accepts
ce426f
+64-bit file offsets on all platforms.
ce426f
+
ce426f
+@end deftypefun
ce426f
+
ce426f
 @node Making Special Files
ce426f
 @section Making Special Files
ce426f
 @cindex creating special files
ce426f
Index: b/sysdeps/posix/posix_fallocate.c
ce426f
===================================================================
ce426f
--- a/sysdeps/posix/posix_fallocate.c
ce426f
+++ b/sysdeps/posix/posix_fallocate.c
ce426f
@@ -18,26 +18,36 @@
ce426f
 #include <errno.h>
ce426f
 #include <fcntl.h>
ce426f
 #include <unistd.h>
ce426f
+#include <stdint.h>
ce426f
+#include <sys/fcntl.h>
ce426f
 #include <sys/stat.h>
ce426f
 #include <sys/statfs.h>
ce426f
 
ce426f
-/* Reserve storage for the data of the file associated with FD.  */
ce426f
+/* Reserve storage for the data of the file associated with FD.  This
ce426f
+   emulation is far from perfect, but the kernel cannot do much
ce426f
+   better for network file systems, either.  */
ce426f
 
ce426f
 int
ce426f
 posix_fallocate (int fd, __off_t offset, __off_t len)
ce426f
 {
ce426f
   struct stat64 st;
ce426f
-  struct statfs f;
ce426f
 
ce426f
-  /* `off_t' is a signed type.  Therefore we can determine whether
ce426f
-     OFFSET + LEN is too large if it is a negative value.  */
ce426f
   if (offset < 0 || len < 0)
ce426f
     return EINVAL;
ce426f
-  if (offset + len < 0)
ce426f
+
ce426f
+  /* Perform overflow check.  The outer cast relies on a GCC
ce426f
+     extension.  */
ce426f
+  if ((__off_t) ((uint64_t) offset + (uint64_t) len) < 0)
ce426f
     return EFBIG;
ce426f
 
ce426f
-  /* First thing we have to make sure is that this is really a regular
ce426f
-     file.  */
ce426f
+  /* pwrite below will not do the right thing in O_APPEND mode.  */
ce426f
+  {
ce426f
+    int flags = __fcntl (fd, F_GETFL, 0);
ce426f
+    if (flags < 0 || (flags & O_APPEND) != 0)
ce426f
+      return EBADF;
ce426f
+  }
ce426f
+
ce426f
+  /* We have to make sure that this is really a regular file.  */
ce426f
   if (__fxstat64 (_STAT_VER, fd, &st) != 0)
ce426f
     return EBADF;
ce426f
   if (S_ISFIFO (st.st_mode))
ce426f
@@ -47,6 +57,8 @@ posix_fallocate (int fd, __off_t offset,
ce426f
 
ce426f
   if (len == 0)
ce426f
     {
ce426f
+      /* This is racy, but there is no good way to satisfy a
ce426f
+	 zero-length allocation request.  */
ce426f
       if (st.st_size < offset)
ce426f
 	{
ce426f
 	  int ret = __ftruncate (fd, offset);
ce426f
@@ -58,19 +70,36 @@ posix_fallocate (int fd, __off_t offset,
ce426f
       return 0;
ce426f
     }
ce426f
 
ce426f
-  /* We have to know the block size of the filesystem to get at least some
ce426f
-     sort of performance.  */
ce426f
-  if (__fstatfs (fd, &f) != 0)
ce426f
-    return errno;
ce426f
-
ce426f
-  /* Try to play safe.  */
ce426f
-  if (f.f_bsize == 0)
ce426f
-    f.f_bsize = 512;
ce426f
-
ce426f
-  /* Write something to every block.  */
ce426f
-  for (offset += (len - 1) % f.f_bsize; len > 0; offset += f.f_bsize)
ce426f
+  /* Minimize data transfer for network file systems, by issuing
ce426f
+     single-byte write requests spaced by the file system block size.
ce426f
+     (Most local file systems have fallocate support, so this fallback
ce426f
+     code is not used there.)  */
ce426f
+
ce426f
+  unsigned increment;
ce426f
+  {
ce426f
+    struct statfs64 f;
ce426f
+
ce426f
+    if (__fstatfs64 (fd, &f) != 0)
ce426f
+      return errno;
ce426f
+    if (f.f_bsize == 0)
ce426f
+      increment = 512;
ce426f
+    else if (f.f_bsize < 4096)
ce426f
+      increment = f.f_bsize;
ce426f
+    else
ce426f
+      /* NFS does not propagate the block size of the underlying
ce426f
+	 storage and may report a much larger value which would still
ce426f
+	 leave holes after the loop below, so we cap the increment at
ce426f
+	 4096.  */
ce426f
+      increment = 4096;
ce426f
+  }
ce426f
+
ce426f
+  /* Write a null byte to every block.  This is racy; we currently
ce426f
+     lack a better option.  Compare-and-swap against a file mapping
ce426f
+     might address local races, but requires interposition of a
ce426f
+     signal handler to catch SIGBUS.  */
ce426f
+  for (offset += (len - 1) % increment; len > 0; offset += increment)
ce426f
     {
ce426f
-      len -= f.f_bsize;
ce426f
+      len -= increment;
ce426f
 
ce426f
       if (offset < st.st_size)
ce426f
 	{
ce426f
Index: b/sysdeps/posix/posix_fallocate64.c
ce426f
===================================================================
ce426f
--- a/sysdeps/posix/posix_fallocate64.c
ce426f
+++ b/sysdeps/posix/posix_fallocate64.c
ce426f
@@ -18,26 +18,36 @@
ce426f
 #include <errno.h>
ce426f
 #include <fcntl.h>
ce426f
 #include <unistd.h>
ce426f
+#include <stdint.h>
ce426f
+#include <sys/fcntl.h>
ce426f
 #include <sys/stat.h>
ce426f
 #include <sys/statfs.h>
ce426f
 
ce426f
-/* Reserve storage for the data of the file associated with FD.  */
ce426f
+/* Reserve storage for the data of the file associated with FD.  This
ce426f
+   emulation is far from perfect, but the kernel cannot do much
ce426f
+   better for network file systems, either.  */
ce426f
 
ce426f
 int
ce426f
 __posix_fallocate64_l64 (int fd, __off64_t offset, __off64_t len)
ce426f
 {
ce426f
   struct stat64 st;
ce426f
-  struct statfs64 f;
ce426f
 
ce426f
-  /* `off64_t' is a signed type.  Therefore we can determine whether
ce426f
-     OFFSET + LEN is too large if it is a negative value.  */
ce426f
   if (offset < 0 || len < 0)
ce426f
     return EINVAL;
ce426f
-  if (offset + len < 0)
ce426f
+
ce426f
+  /* Perform overflow check.  The outer cast relies on a GCC
ce426f
+     extension.  */
ce426f
+  if ((__off64_t) ((uint64_t) offset + (uint64_t) len) < 0)
ce426f
     return EFBIG;
ce426f
 
ce426f
-  /* First thing we have to make sure is that this is really a regular
ce426f
-     file.  */
ce426f
+  /* pwrite64 below will not do the right thing in O_APPEND mode.  */
ce426f
+  {
ce426f
+    int flags = __fcntl (fd, F_GETFL, 0);
ce426f
+    if (flags < 0 || (flags & O_APPEND) != 0)
ce426f
+      return EBADF;
ce426f
+  }
ce426f
+
ce426f
+  /* We have to make sure that this is really a regular file.  */
ce426f
   if (__fxstat64 (_STAT_VER, fd, &st) != 0)
ce426f
     return EBADF;
ce426f
   if (S_ISFIFO (st.st_mode))
ce426f
@@ -47,6 +57,8 @@ __posix_fallocate64_l64 (int fd, __off64
ce426f
 
ce426f
   if (len == 0)
ce426f
     {
ce426f
+      /* This is racy, but there is no good way to satisfy a
ce426f
+	 zero-length allocation request.  */
ce426f
       if (st.st_size < offset)
ce426f
 	{
ce426f
 	  int ret = __ftruncate64 (fd, offset);
ce426f
@@ -58,19 +70,36 @@ __posix_fallocate64_l64 (int fd, __off64
ce426f
       return 0;
ce426f
     }
ce426f
 
ce426f
-  /* We have to know the block size of the filesystem to get at least some
ce426f
-     sort of performance.  */
ce426f
-  if (__fstatfs64 (fd, &f) != 0)
ce426f
-    return errno;
ce426f
-
ce426f
-  /* Try to play safe.  */
ce426f
-  if (f.f_bsize == 0)
ce426f
-    f.f_bsize = 512;
ce426f
-
ce426f
-  /* Write something to every block.  */
ce426f
-  for (offset += (len - 1) % f.f_bsize; len > 0; offset += f.f_bsize)
ce426f
+  /* Minimize data transfer for network file systems, by issuing
ce426f
+     single-byte write requests spaced by the file system block size.
ce426f
+     (Most local file systems have fallocate support, so this fallback
ce426f
+     code is not used there.)  */
ce426f
+
ce426f
+  unsigned increment;
ce426f
+  {
ce426f
+    struct statfs64 f;
ce426f
+
ce426f
+    if (__fstatfs64 (fd, &f) != 0)
ce426f
+      return errno;
ce426f
+    if (f.f_bsize == 0)
ce426f
+      increment = 512;
ce426f
+    else if (f.f_bsize < 4096)
ce426f
+      increment = f.f_bsize;
ce426f
+    else
ce426f
+      /* NFS clients do not propagate the block size of the underlying
ce426f
+	 storage and may report a much larger value which would still
ce426f
+	 leave holes after the loop below, so we cap the increment at
ce426f
+	 4096.  */
ce426f
+      increment = 4096;
ce426f
+  }
ce426f
+
ce426f
+  /* Write a null byte to every block.  This is racy; we currently
ce426f
+     lack a better option.  Compare-and-swap against a file mapping
ce426f
+     might address local races, but requires interposition of a signal
ce426f
+     handler to catch SIGBUS.  */
ce426f
+  for (offset += (len - 1) % increment; len > 0; offset += increment)
ce426f
     {
ce426f
-      len -= f.f_bsize;
ce426f
+      len -= increment;
ce426f
 
ce426f
       if (offset < st.st_size)
ce426f
 	{