SOURCES/lvm2-2_03_06-Fix-rounding-writes-up-to-sector-size.patch

 lib/device/bcache.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 lib/device/dev-io.c | 52 +++++++++++++++++++++++++++++++
 lib/device/device.h |  8 +++--
 lib/label/label.c   | 30 ++++++++++++++++----
 4 files changed, 169 insertions(+), 10 deletions(-)
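The change below limits bcache writes at the last-byte boundary set by lvm and rounds
the truncated length back up to a whole sector (512 or 4096 bytes) so that partial
sectors are never written.  Two guards are added in both the async and sync io paths:
the rounded length is dropped back to the plain limit if it would exceed the requested
(bcache block) size, and the write fails outright if the adjusted length still ends up
larger than the original request.  dev_set_last_byte() is also switched from
dev_get_block_size() to a new dev_get_direct_block_sizes() helper, so the 512/4096
decision is based directly on the BLKPBSZGET/BLKSSZGET values rather than the older,
ambiguous phys_block_size/block_size fields (see the comments added in device.h).

The following is an illustrative sketch only, not part of the patch: it shows the same
round-up arithmetic with concrete numbers (the helper name and the example sizes are
made up for the illustration).

    #include <stdint.h>
    #include <stdio.h>

    /* Mirror of the limit/extra calculation done in _async_issue()/_sync_issue(). */
    static uint64_t round_up_to_sector(uint64_t last_byte_offset, uint64_t offset,
                                       uint64_t block_size, uint64_t sector_size)
    {
            uint64_t limit = last_byte_offset - offset;   /* truncated write length */
            uint64_t extra = 0;

            if (limit % sector_size)
                    extra = sector_size - (limit % sector_size);

            /* Same guard as the patch: never extend past the requested block size. */
            if (limit + extra > block_size)
                    extra = 0;

            return limit + extra;
    }

    int main(void)
    {
            /* A 1000-byte remainder with 4096-byte sectors is rounded up to 4096,
             * which still fits inside an assumed 128K bcache block. */
            printf("%llu\n", (unsigned long long)
                   round_up_to_sector(101000, 100000, 131072, 4096));
            return 0;
    }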
diff --git a/lib/device/bcache.c b/lib/device/bcache.c
index 7b09353..04fbf35 100644
--- a/lib/device/bcache.c
+++ b/lib/device/bcache.c
@@ -169,6 +169,7 @@ static bool _async_issue(struct io_engine *ioe, enum dir d, int fd,
 	sector_t offset;
 	sector_t nbytes;
 	sector_t limit_nbytes;
+	sector_t orig_nbytes;
 	sector_t extra_nbytes = 0;
 
 	if (((uintptr_t) data) & e->page_mask) {
@@ -191,11 +192,41 @@ static bool _async_issue(struct io_engine *ioe, enum dir d, int fd,
 			return false;
 		}
 
+		/*
+		 * If the bcache block offset+len goes beyond where lvm is
+		 * intending to write, then reduce the len being written
+		 * (which is the bcache block size) so we don't write past
+		 * the limit set by lvm.  If after applying the limit, the
+		 * resulting size is not a multiple of the sector size (512
+		 * or 4096) then extend the reduced size to be a multiple of
+		 * the sector size (we don't want to write partial sectors.)
+		 */
 		if (offset + nbytes > _last_byte_offset) {
 			limit_nbytes = _last_byte_offset - offset;
-			if (limit_nbytes % _last_byte_sector_size)
+
+			if (limit_nbytes % _last_byte_sector_size) {
 				extra_nbytes = _last_byte_sector_size - (limit_nbytes % _last_byte_sector_size);
 
+				/*
+				 * adding extra_nbytes to the reduced nbytes (limit_nbytes)
+				 * should make the final write size a multiple of the
+				 * sector size.  This should never result in a final size
+				 * larger than the bcache block size (as long as the bcache
+				 * block size is a multiple of the sector size).
+				 */
+				if (limit_nbytes + extra_nbytes > nbytes) {
+					log_warn("Skip extending write at %llu len %llu limit %llu extra %llu sector_size %llu",
+						 (unsigned long long)offset,
+						 (unsigned long long)nbytes,
+						 (unsigned long long)limit_nbytes,
+						 (unsigned long long)extra_nbytes,
+						 (unsigned long long)_last_byte_sector_size);
+					extra_nbytes = 0;
+				}
+			}
+
+			orig_nbytes = nbytes;
+
 			if (extra_nbytes) {
 				log_debug("Limit write at %llu len %llu to len %llu rounded to %llu",
 					  (unsigned long long)offset,
@@ -210,6 +241,22 @@ static bool _async_issue(struct io_engine *ioe, enum dir d, int fd,
 					  (unsigned long long)limit_nbytes);
 				nbytes = limit_nbytes;
 			}
+
+			/*
+			 * This shouldn't happen, the reduced+extended
+			 * nbytes value should never be larger than the
+			 * bcache block size.
+			 */
+			if (nbytes > orig_nbytes) {
+				log_error("Invalid adjusted write at %llu len %llu adjusted %llu limit %llu extra %llu sector_size %llu",
+					  (unsigned long long)offset,
+					  (unsigned long long)orig_nbytes,
+					  (unsigned long long)nbytes,
+					  (unsigned long long)limit_nbytes,
+					  (unsigned long long)extra_nbytes,
+					  (unsigned long long)_last_byte_sector_size);
+				return false;
+			}
 		}
 	}
 
@@ -403,6 +450,7 @@ static bool _sync_issue(struct io_engine *ioe, enum dir d, int fd,
 		uint64_t nbytes = len;
 		sector_t limit_nbytes = 0;
 		sector_t extra_nbytes = 0;
+		sector_t orig_nbytes = 0;
 
 		if (offset > _last_byte_offset) {
 			log_error("Limit write at %llu len %llu beyond last byte %llu",
@@ -415,9 +463,30 @@ static bool _sync_issue(struct io_engine *ioe, enum dir d, int fd,
 
 		if (offset + nbytes > _last_byte_offset) {
 			limit_nbytes = _last_byte_offset - offset;
-			if (limit_nbytes % _last_byte_sector_size)
+
+			if (limit_nbytes % _last_byte_sector_size) {
 				extra_nbytes = _last_byte_sector_size - (limit_nbytes % _last_byte_sector_size);
 
+				/*
+				 * adding extra_nbytes to the reduced nbytes (limit_nbytes)
+				 * should make the final write size a multiple of the
+				 * sector size.  This should never result in a final size
+				 * larger than the bcache block size (as long as the bcache
+				 * block size is a multiple of the sector size).
+				 */
+				if (limit_nbytes + extra_nbytes > nbytes) {
+					log_warn("Skip extending write at %llu len %llu limit %llu extra %llu sector_size %llu",
+						 (unsigned long long)offset,
+						 (unsigned long long)nbytes,
+						 (unsigned long long)limit_nbytes,
+						 (unsigned long long)extra_nbytes,
+						 (unsigned long long)_last_byte_sector_size);
+					extra_nbytes = 0;
+				}
+			}
+
+			orig_nbytes = nbytes;
+
 			if (extra_nbytes) {
 				log_debug("Limit write at %llu len %llu to len %llu rounded to %llu",
 					  (unsigned long long)offset,
@@ -432,6 +501,22 @@ static bool _sync_issue(struct io_engine *ioe, enum dir d, int fd,
 					  (unsigned long long)limit_nbytes);
 				nbytes = limit_nbytes;
 			}
+
+			/*
+			 * This shouldn't happen, the reduced+extended
+			 * nbytes value should never be larger than the
+			 * bcache block size.
+			 */
+			if (nbytes > orig_nbytes) {
+				log_error("Invalid adjusted write at %llu len %llu adjusted %llu limit %llu extra %llu sector_size %llu",
+					  (unsigned long long)offset,
+					  (unsigned long long)orig_nbytes,
+					  (unsigned long long)nbytes,
+					  (unsigned long long)limit_nbytes,
+					  (unsigned long long)extra_nbytes,
+					  (unsigned long long)_last_byte_sector_size);
+				return false;
+			}
 		}
 
 		where = offset;
diff --git a/lib/device/dev-io.c b/lib/device/dev-io.c
index 3fe2647..5fa0b7a 100644
--- a/lib/device/dev-io.c
+++ b/lib/device/dev-io.c
@@ -250,6 +250,58 @@ static int _dev_discard_blocks(struct device *dev, uint64_t offset_bytes, uint64
 	return 1;
 }
 
+int dev_get_direct_block_sizes(struct device *dev, unsigned int *physical_block_size,
+				unsigned int *logical_block_size)
+{
+	int fd = dev->bcache_fd;
+	int do_close = 0;
+	unsigned int pbs = 0;
+	unsigned int lbs = 0;
+
+	if (dev->physical_block_size || dev->logical_block_size) {
+		*physical_block_size = dev->physical_block_size;
+		*logical_block_size = dev->logical_block_size;
+		return 1;
+	}
+
+	if (fd <= 0) {
+		if (!dev_open_readonly(dev))
+			return 0;
+		fd = dev_fd(dev);
+		do_close = 1;
+	}
+
+	/*
+	 * BLKPBSZGET from kernel comment for blk_queue_physical_block_size:
+	 * "the lowest possible sector size that the hardware can operate on
+	 * without reverting to read-modify-write operations"
+	 */
+	if (ioctl(fd, BLKPBSZGET, &pbs)) {
+		stack;
+		pbs = 0;
+	}
+
+	/*
+	 * BLKSSZGET from kernel comment for blk_queue_logical_block_size:
+	 * "the lowest possible block size that the storage device can address."
+	 */
+	if (ioctl(fd, BLKSSZGET, &lbs)) {
+		stack;
+		lbs = 0;
+	}
+
+	dev->physical_block_size = pbs;
+	dev->logical_block_size = lbs;
+
+	*physical_block_size = pbs;
+	*logical_block_size = lbs;
+
+	if (do_close && !dev_close_immediate(dev))
+		stack;
+
+	return 1;
+}
+
 /*-----------------------------------------------------------------
  * Public functions
  *---------------------------------------------------------------*/
diff --git a/lib/device/device.h b/lib/device/device.h
index 30e1e79..bb65f84 100644
--- a/lib/device/device.h
+++ b/lib/device/device.h
@@ -67,8 +67,10 @@ struct device {
 	/* private */
 	int fd;
 	int open_count;
-	int phys_block_size;
-	int block_size;
+	int phys_block_size;     /* From either BLKPBSZGET or BLKSSZGET, don't use */
+	int block_size;	         /* From BLKBSZGET, returns bdev->bd_block_size, likely set by fs, probably don't use */
+	int physical_block_size; /* From BLKPBSZGET: lowest possible sector size that the hardware can operate on without reverting to read-modify-write operations */
+	int logical_block_size;  /* From BLKSSZGET: lowest possible block size that the storage device can address */
 	int read_ahead;
 	int bcache_fd;
 	uint32_t flags;
@@ -132,6 +134,8 @@ void dev_size_seqno_inc(void);
  * All io should use these routines.
  */
 int dev_get_block_size(struct device *dev, unsigned int *phys_block_size, unsigned int *block_size);
+int dev_get_direct_block_sizes(struct device *dev, unsigned int *physical_block_size,
+                               unsigned int *logical_block_size);
 int dev_get_size(struct device *dev, uint64_t *size);
 int dev_get_read_ahead(struct device *dev, uint32_t *read_ahead);
 int dev_discard_blocks(struct device *dev, uint64_t offset_bytes, uint64_t size_bytes);
diff --git a/lib/label/label.c b/lib/label/label.c
index 4c21d97..72be5ec 100644
--- a/lib/label/label.c
+++ b/lib/label/label.c
@@ -1472,16 +1472,34 @@ bool dev_set_bytes(struct device *dev, uint64_t start, size_t len, uint8_t val)
 
 void dev_set_last_byte(struct device *dev, uint64_t offset)
 {
-	unsigned int phys_block_size = 0;
-	unsigned int block_size = 0;
+	unsigned int physical_block_size = 0;
+	unsigned int logical_block_size = 0;
+	unsigned int bs;
 
-	if (!dev_get_block_size(dev, &phys_block_size, &block_size)) {
+	if (!dev_get_direct_block_sizes(dev, &physical_block_size, &logical_block_size)) {
 		stack;
-		/* FIXME  ASSERT or regular error testing is missing */
-		return;
+		return; /* FIXME: error path ? */
+	}
+
+	if ((physical_block_size == 512) && (logical_block_size == 512))
+		bs = 512;
+	else if ((physical_block_size == 4096) && (logical_block_size == 4096))
+		bs = 4096;
+	else if ((physical_block_size == 512) || (logical_block_size == 512)) {
+		log_debug("Set last byte mixed block sizes physical %u logical %u using 512",
+			  physical_block_size, logical_block_size);
+		bs = 512;
+	} else if ((physical_block_size == 4096) || (logical_block_size == 4096)) {
+		log_debug("Set last byte mixed block sizes physical %u logical %u using 4096",
+			  physical_block_size, logical_block_size);
+		bs = 4096;
+	} else {
+		log_debug("Set last byte mixed block sizes physical %u logical %u using 512",
+			  physical_block_size, logical_block_size);
+		bs = 512;
+		bs = 512;
 	}
 
-	bcache_set_last_byte(scan_bcache, dev->bcache_fd, offset, phys_block_size);
+	bcache_set_last_byte(scan_bcache, dev->bcache_fd, offset, bs);
 }
 
 void dev_unset_last_byte(struct device *dev)
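
Note on the dev-io.c/device.h side of the patch: dev_get_direct_block_sizes() caches and
returns the values of the BLKPBSZGET (physical block size) and BLKSSZGET (logical block
size) ioctls, and dev_set_last_byte() reduces that pair to a single 512 or 4096 sector
size for bcache_set_last_byte(), preferring 512 when either size is 512, then 4096 when
either is 4096, and defaulting to 512 otherwise.  The sketch below only illustrates those
two ioctls on an ordinary block device; it is not lvm code, and the device path is an
arbitrary example.

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>   /* BLKPBSZGET, BLKSSZGET */

    int main(void)
    {
            unsigned int pbs = 0, lbs = 0;
            int fd = open("/dev/sda", O_RDONLY);    /* example device */

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (ioctl(fd, BLKPBSZGET, &pbs))        /* physical: smallest write without read-modify-write */
                    perror("BLKPBSZGET");
            if (ioctl(fd, BLKSSZGET, &lbs))         /* logical: smallest addressable block */
                    perror("BLKSSZGET");

            printf("physical_block_size %u logical_block_size %u\n", pbs, lbs);
            close(fd);
            return 0;
    }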