| commit 7fe9e2e089f4990b7d18d0798f591ab276b15f2b |
| Author: Florian Weimer <fweimer@redhat.com> |
| Date: Fri Jun 5 10:50:38 2015 +0200 |
| |
| posix_fallocate: Emulation fixes and documentation [BZ #15661] |
| |
| Handle signed integer overflow correctly. Detect and reject O_APPEND. |
| Document drawbacks of emulation. |
| |
| This does not completely address bug 15661, but improves the situation |
| somewhat. |
| |
| commit 543ef578c3304661713950b37abd0c916f52ecf0 |
| Author: Paul Eggert <eggert@cs.ucla.edu> |
| Date: Tue Aug 25 23:42:01 2015 -0700 |
| |
| Fix broken overflow check in posix_fallocate [BZ 18873] |
| |
| * sysdeps/posix/posix_fallocate.c (posix_fallocate): |
| * sysdeps/posix/posix_fallocate64.c (__posix_fallocate64_l64): |
| Fix parenthesization typo. |
| |
| |
| |
| |
| |
| @@ -1723,6 +1723,7 @@ modify the attributes of a file. |
| access a file. |
| * File Times:: About the time attributes of a file. |
| * File Size:: Manually changing the size of a file. |
| +* Storage Allocation:: Allocate backing storage for files. |
| @end menu |
| |
| @node Attribute Meanings |
| @@ -3232,6 +3233,99 @@ is a requirement of @code{mmap}. The pr |
| real size, and when it has finished a final @code{ftruncate} call should |
| set the real size of the file. |
| |
| +@node Storage Allocation |
| +@subsection Storage Allocation |
| +@cindex allocating file storage |
| +@cindex file allocation |
| +@cindex storage allocating |
| + |
| +@cindex file fragmentation |
| +@cindex fragmentation of files |
| +@cindex sparse files |
| +@cindex files, sparse |
| +Most file systems support allocating large files in a non-contiguous |
| +fashion: the file is split into @emph{fragments} which are allocated |
| +sequentially, but the fragments themselves can be scattered across the |
| +disk. File systems generally try to avoid such fragmentation because it |
| +decreases performance, but if a file gradually increases in size, there |
| +might be no other option than to fragment it. In addition, many file |
| +systems support @emph{sparse files} with @emph{holes}: regions of null |
| +bytes for which no backing storage has been allocated by the file |
| +system. When the holes are finally overwritten with data, fragmentation |
| +can occur as well. |
| + |
| +Explicit allocation of storage for yet-unwritten parts of the file can |
| +help the system to avoid fragmentation. Additionally, if storage |
| +pre-allocation fails, it is possible to report the out-of-disk error |
| +early, often without filling up the entire disk. However, due to |
| +deduplication, copy-on-write semantics, and file compression, such |
| +pre-allocation may not reliably prevent the out-of-disk-space error from |
| +occurring later. Checking for write errors is still required, and |
| +writes to memory-mapped regions created with @code{mmap} can still |
| +result in @code{SIGBUS}. |
| + |
| +@deftypefun int posix_fallocate (int @var{fd}, off_t @var{offset}, off_t @var{length}) |
| +@safety{@prelim{}@mtsafe{}@assafe{}@acsafe{}} |
| +@c If the file system does not support allocation, |
| +@c @code{posix_fallocate} has a race with file extension (if |
| +@c @var{length} is zero) or with concurrent writes of non-NUL bytes (if |
| +@c @var{length} is positive). |
| + |
| +Allocate backing store for the region of @var{length} bytes starting at |
| +byte @var{offset} in the file for the descriptor @var{fd}. The file |
| +length is increased to @samp{@var{length} + @var{offset}} if necessary. |
| + |
| +@var{fd} must be a regular file opened for writing, or @code{EBADF} is |
| +returned. If there is insufficient disk space to fulfill the allocation |
| +request, @code{ENOSPC} is returned. |
| + |
| +@strong{Note:} If @code{fallocate} is not available (because the file |
| +system does not support it), @code{posix_fallocate} is emulated, which |
| +has the following drawbacks: |
| + |
| +@itemize @bullet |
| +@item |
| +It is very inefficient because all file system blocks in the requested |
| +range need to be examined (even if they have been allocated before) and |
| +potentially rewritten. In contrast, with proper @code{fallocate} |
| +support (see below), the file system can examine the internal file |
| +allocation data structures and eliminate holes directly, maybe even |
| +using unwritten extents (which are pre-allocated but uninitialized on |
| +disk). |
| + |
| +@item |
| +There is a race condition if another thread or process modifies the |
| +underlying file in the to-be-allocated area. Non-null bytes could be |
| +overwritten with null bytes. |
| + |
| +@item |
| +If @var{fd} has been opened with the @code{O_APPEND} flag, the function |
| +will fail with an @code{errno} value of @code{EBADF}. |
| + |
| +@item |
| +If @var{length} is zero, @code{ftruncate} is used to increase the file |
| +size as requested, without allocating file system blocks. There is a |
| +race condition which means that @code{ftruncate} can accidentally |
| +truncate the file if it has been extended concurrently. |
| +@end itemize |
| + |
| +On Linux, if an application does not benefit from emulation or if the |
| +emulation is harmful due to its inherent race conditions, the |
| +application can use the Linux-specific @code{fallocate} function, with a |
| +zero flag argument. For the @code{fallocate} function, @theglibc{} does |
| +not perform allocation emulation if the file system does not support |
| +allocation. Instead, an @code{EOPNOTSUPP} is returned to the caller. |
| + |
| +@end deftypefun |
| + |
| +@deftypefun int posix_fallocate64 (int @var{fd}, off64_t @var{length}, off64_t @var{offset}) |
| +@safety{@prelim{}@mtsafe{}@assafe{}@acsafe{}} |
| + |
| +This function is a variant of @code{posix_fallocate64} which accepts |
| +64-bit file offsets on all platforms. |
| + |
| +@end deftypefun |
| + |
| @node Making Special Files |
| @section Making Special Files |
| @cindex creating special files |
| |
| |
| |
| |
| @@ -18,26 +18,36 @@ |
| #include <errno.h> |
| #include <fcntl.h> |
| #include <unistd.h> |
| +#include <stdint.h> |
| +#include <sys/fcntl.h> |
| #include <sys/stat.h> |
| #include <sys/statfs.h> |
| |
| -/* Reserve storage for the data of the file associated with FD. */ |
| +/* Reserve storage for the data of the file associated with FD. This |
| + emulation is far from perfect, but the kernel cannot do not much |
| + better for network file systems, either. */ |
| |
| int |
| posix_fallocate (int fd, __off_t offset, __off_t len) |
| { |
| struct stat64 st; |
| - struct statfs f; |
| |
| - /* `off_t' is a signed type. Therefore we can determine whether |
| - OFFSET + LEN is too large if it is a negative value. */ |
| if (offset < 0 || len < 0) |
| return EINVAL; |
| - if (offset + len < 0) |
| + |
| + /* Perform overflow check. The outer cast relies on a GCC |
| + extension. */ |
| + if ((__off_t) ((uint64_t) offset + (uint64_t) len) < 0) |
| return EFBIG; |
| |
| - /* First thing we have to make sure is that this is really a regular |
| - file. */ |
| + /* pwrite below will not do the right thing in O_APPEND mode. */ |
| + { |
| + int flags = __fcntl (fd, F_GETFL, 0); |
| + if (flags < 0 || (flags & O_APPEND) != 0) |
| + return EBADF; |
| + } |
| + |
| + /* We have to make sure that this is really a regular file. */ |
| if (__fxstat64 (_STAT_VER, fd, &st) != 0) |
| return EBADF; |
| if (S_ISFIFO (st.st_mode)) |
| @@ -47,6 +57,8 @@ posix_fallocate (int fd, __off_t offset, |
| |
| if (len == 0) |
| { |
| + /* This is racy, but there is no good way to satisfy a |
| + zero-length allocation request. */ |
| if (st.st_size < offset) |
| { |
| int ret = __ftruncate (fd, offset); |
| @@ -58,19 +70,36 @@ posix_fallocate (int fd, __off_t offset, |
| return 0; |
| } |
| |
| - /* We have to know the block size of the filesystem to get at least some |
| - sort of performance. */ |
| - if (__fstatfs (fd, &f) != 0) |
| - return errno; |
| - |
| - /* Try to play safe. */ |
| - if (f.f_bsize == 0) |
| - f.f_bsize = 512; |
| - |
| - /* Write something to every block. */ |
| - for (offset += (len - 1) % f.f_bsize; len > 0; offset += f.f_bsize) |
| + /* Minimize data transfer for network file systems, by issuing |
| + single-byte write requests spaced by the file system block size. |
| + (Most local file systems have fallocate support, so this fallback |
| + code is not used there.) */ |
| + |
| + unsigned increment; |
| + { |
| + struct statfs64 f; |
| + |
| + if (__fstatfs64 (fd, &f) != 0) |
| + return errno; |
| + if (f.f_bsize == 0) |
| + increment = 512; |
| + else if (f.f_bsize < 4096) |
| + increment = f.f_bsize; |
| + else |
| + /* NFS does not propagate the block size of the underlying |
| + storage and may report a much larger value which would still |
| + leave holes after the loop below, so we cap the increment at |
| + 4096. */ |
| + increment = 4096; |
| + } |
| + |
| + /* Write a null byte to every block. This is racy; we currently |
| + lack a better option. Compare-and-swap against a file mapping |
| + might additional local races, but requires interposition of a |
| + signal handler to catch SIGBUS. */ |
| + for (offset += (len - 1) % increment; len > 0; offset += increment) |
| { |
| - len -= f.f_bsize; |
| + len -= increment; |
| |
| if (offset < st.st_size) |
| { |
| |
| |
| |
| |
| @@ -18,26 +18,36 @@ |
| #include <errno.h> |
| #include <fcntl.h> |
| #include <unistd.h> |
| +#include <stdint.h> |
| +#include <sys/fcntl.h> |
| #include <sys/stat.h> |
| #include <sys/statfs.h> |
| |
| -/* Reserve storage for the data of the file associated with FD. */ |
| +/* Reserve storage for the data of the file associated with FD. This |
| + emulation is far from perfect, but the kernel cannot do not much |
| + better for network file systems, either. */ |
| |
| int |
| __posix_fallocate64_l64 (int fd, __off64_t offset, __off64_t len) |
| { |
| struct stat64 st; |
| - struct statfs64 f; |
| |
| - /* `off64_t' is a signed type. Therefore we can determine whether |
| - OFFSET + LEN is too large if it is a negative value. */ |
| if (offset < 0 || len < 0) |
| return EINVAL; |
| - if (offset + len < 0) |
| + |
| + /* Perform overflow check. The outer cast relies on a GCC |
| + extension. */ |
| + if ((__off64_t) ((uint64_t) offset + (uint64_t) len) < 0) |
| return EFBIG; |
| |
| - /* First thing we have to make sure is that this is really a regular |
| - file. */ |
| + /* pwrite64 below will not do the right thing in O_APPEND mode. */ |
| + { |
| + int flags = __fcntl (fd, F_GETFL, 0); |
| + if (flags < 0 || (flags & O_APPEND) != 0) |
| + return EBADF; |
| + } |
| + |
| + /* We have to make sure that this is really a regular file. */ |
| if (__fxstat64 (_STAT_VER, fd, &st) != 0) |
| return EBADF; |
| if (S_ISFIFO (st.st_mode)) |
| @@ -47,6 +57,8 @@ __posix_fallocate64_l64 (int fd, __off64 |
| |
| if (len == 0) |
| { |
| + /* This is racy, but there is no good way to satisfy a |
| + zero-length allocation request. */ |
| if (st.st_size < offset) |
| { |
| int ret = __ftruncate64 (fd, offset); |
| @@ -58,19 +70,36 @@ __posix_fallocate64_l64 (int fd, __off64 |
| return 0; |
| } |
| |
| - /* We have to know the block size of the filesystem to get at least some |
| - sort of performance. */ |
| - if (__fstatfs64 (fd, &f) != 0) |
| - return errno; |
| - |
| - /* Try to play safe. */ |
| - if (f.f_bsize == 0) |
| - f.f_bsize = 512; |
| - |
| - /* Write something to every block. */ |
| - for (offset += (len - 1) % f.f_bsize; len > 0; offset += f.f_bsize) |
| + /* Minimize data transfer for network file systems, by issuing |
| + single-byte write requests spaced by the file system block size. |
| + (Most local file systems have fallocate support, so this fallback |
| + code is not used there.) */ |
| + |
| + unsigned increment; |
| + { |
| + struct statfs64 f; |
| + |
| + if (__fstatfs64 (fd, &f) != 0) |
| + return errno; |
| + if (f.f_bsize == 0) |
| + increment = 512; |
| + else if (f.f_bsize < 4096) |
| + increment = f.f_bsize; |
| + else |
| + /* NFS clients do not propagate the block size of the underlying |
| + storage and may report a much larger value which would still |
| + leave holes after the loop below, so we cap the increment at |
| + 4096. */ |
| + increment = 4096; |
| + } |
| + |
| + /* Write a null byte to every block. This is racy; we currently |
| + lack a better option. Compare-and-swap against a file mapping |
| + might address local races, but requires interposition of a signal |
| + handler to catch SIGBUS. */ |
| + for (offset += (len - 1) % increment; len > 0; offset += increment) |
| { |
| - len -= f.f_bsize; |
| + len -= increment; |
| |
| if (offset < st.st_size) |
| { |