dcavalca / rpms / mdadm

Forked from rpms/mdadm 3 years ago
Clone

Blame SOURCES/0054-Create-add-support-for-RAID0-layouts.patch

8e8941
From 329dfc28debb58ffe7bd1967cea00fc583139aca Mon Sep 17 00:00:00 2001
8e8941
From: NeilBrown <neilb@suse.de>
8e8941
Date: Mon, 4 Nov 2019 14:27:49 +1100
8e8941
Subject: [RHEL8.2 PATCH 54/61] Create: add support for RAID0 layouts.
8e8941
8e8941
Since Linux 5.4 a layout is needed for RAID0 arrays with
8e8941
varying device sizes.
8e8941
This patch makes the layout of an array visible (via --examine)
8e8941
and sets the layout on newly created arrays.
8e8941
--layout=dangerous
8e8941
can be used to avoid setting a layout so that the array
8e8941
can be used on older kernels.
8e8941
8e8941
Tested-by: dann frazier <dann.frazier@canonical.com>
8e8941
Signed-off-by: NeilBrown <neilb@suse.de>
8e8941
Signed-off-by: Jes Sorensen <jsorensen@fb.com>
8e8941
---
8e8941
 Create.c   | 11 +++++++++++
8e8941
 Detail.c   |  5 +++++
8e8941
 maps.c     | 12 ++++++++++++
8e8941
 md.4       | 14 ++++++++++++++
8e8941
 mdadm.8.in | 30 +++++++++++++++++++++++++++++-
8e8941
 mdadm.c    |  8 ++++++++
8e8941
 mdadm.h    |  8 +++++++-
8e8941
 super0.c   |  6 ++++++
8e8941
 super1.c   | 30 +++++++++++++++++++++++++++++-
8e8941
 9 files changed, 121 insertions(+), 3 deletions(-)
8e8941
8e8941
diff --git a/Create.c b/Create.c
8e8941
index 292f92a..6f84e5b 100644
8e8941
--- a/Create.c
8e8941
+++ b/Create.c
8e8941
@@ -51,6 +51,9 @@ static int default_layout(struct supertype *st, int level, int verbose)
8e8941
 		default: /* no layout */
8e8941
 			layout = 0;
8e8941
 			break;
8e8941
+		case 0:
8e8941
+			layout = RAID0_ORIG_LAYOUT;
8e8941
+			break;
8e8941
 		case 10:
8e8941
 			layout = 0x102; /* near=2, far=1 */
8e8941
 			if (verbose > 0)
8e8941
@@ -950,6 +953,11 @@ int Create(struct supertype *st, char *mddev,
8e8941
 				if (rv) {
8e8941
 					pr_err("ADD_NEW_DISK for %s failed: %s\n",
8e8941
 					       dv->devname, strerror(errno));
8e8941
+					if (errno == EINVAL &&
8e8941
+					    info.array.level == 0) {
8e8941
+						pr_err("Possibly your kernel doesn't support RAID0 layouts.\n");
8e8941
+						pr_err("Either upgrade, or use --layout=dangerous\n");
8e8941
+					}
8e8941
 					goto abort_locked;
8e8941
 				}
8e8941
 				break;
8e8941
@@ -1046,6 +1054,9 @@ int Create(struct supertype *st, char *mddev,
8e8941
 			if (ioctl(mdfd, RUN_ARRAY, &param)) {
8e8941
 				pr_err("RUN_ARRAY failed: %s\n",
8e8941
 				       strerror(errno));
8e8941
+				if (errno == 524 /* ENOTSUP */ &&
8e8941
+				    info.array.level == 0)
8e8941
+					cont_err("Please use --layout=original or --layout=alternate\n");
8e8941
 				if (info.array.chunk_size & (info.array.chunk_size-1)) {
8e8941
 					cont_err("Problem may be that chunk size is not a power of 2\n");
8e8941
 				}
8e8941
diff --git a/Detail.c b/Detail.c
8e8941
index 24fa462..832485f 100644
8e8941
--- a/Detail.c
8e8941
+++ b/Detail.c
8e8941
@@ -525,6 +525,11 @@ int Detail(char *dev, struct context *c)
8e8941
 			printf("            Layout : %s\n",
8e8941
 			       str ? str : "-unknown-");
8e8941
 		}
8e8941
+		if (array.level == 0 && array.layout) {
8e8941
+			str = map_num(r0layout, array.layout);
8e8941
+			printf("            Layout : %s\n",
8e8941
+			       str ? str : "-unknown-");
8e8941
+		}
8e8941
 		if (array.level == 6) {
8e8941
 			str = map_num(r6layout, array.layout);
8e8941
 			printf("            Layout : %s\n",
8e8941
diff --git a/maps.c b/maps.c
8e8941
index 49b7f2c..a4fd279 100644
8e8941
--- a/maps.c
8e8941
+++ b/maps.c
8e8941
@@ -73,6 +73,18 @@ mapping_t r6layout[] = {
8e8941
 	{ NULL, UnSet }
8e8941
 };
8e8941
 
8e8941
+/* raid0 layout is only needed because of a bug in 3.14 which changed
8e8941
+ * the effective layout of raid0 arrays with varying device sizes.
8e8941
+ */
8e8941
+mapping_t r0layout[] = {
8e8941
+	{ "original", RAID0_ORIG_LAYOUT},
8e8941
+	{ "alternate", RAID0_ALT_MULTIZONE_LAYOUT},
8e8941
+	{ "1", 1}, /* aka ORIG */
8e8941
+	{ "2", 2}, /* aka ALT */
8e8941
+	{ "dangerous", 0},
8e8941
+	{ NULL, UnSet},
8e8941
+};
8e8941
+
8e8941
 mapping_t pers[] = {
8e8941
 	{ "linear", LEVEL_LINEAR},
8e8941
 	{ "raid0", 0},
8e8941
diff --git a/md.4 b/md.4
8e8941
index e86707a..6fe2755 100644
8e8941
--- a/md.4
8e8941
+++ b/md.4
8e8941
@@ -193,6 +193,20 @@ smallest device has been exhausted, the RAID0 driver starts
8e8941
 collecting chunks into smaller stripes that only span the drives which
8e8941
 still have remaining space.
8e8941
 
8e8941
+A bug was introduced in Linux 3.14 which changed the layout of blocks in
8e8941
+a RAID0 beyond the region that is striped over all devices.  This bug
8e8941
+does not affect an array with all devices the same size, but can affect
8e8941
+other RAID0 arrays.
8e8941
+
8e8941
+Linux 5.4 (and some stable kernels to which the change was backported)
8e8941
+will not normally assemble such an array as it cannot know which layout
8e8941
+to use.  There is a module parameter "raid0.default_layout" which can be
8e8941
+set to "1" to force the kernel to use the pre-3.14 layout or to "2" to
8e8941
+force it to use the 3.14-and-later layout.  When creating a new RAID0
8e8941
+array,
8e8941
+.I mdadm
8e8941
+will record the chosen layout in the metadata in a way that allows newer
8e8941
+kernels to assemble the array without needing a module parameter.
8e8941
 
8e8941
 .SS RAID1
8e8941
 
8e8941
diff --git a/mdadm.8.in b/mdadm.8.in
8e8941
index 9aec9f4..fc9b6a6 100644
8e8941
--- a/mdadm.8.in
8e8941
+++ b/mdadm.8.in
8e8941
@@ -593,6 +593,8 @@ to change the RAID level in some cases.  See LEVEL CHANGES below.
8e8941
 This option configures the fine details of data layout for RAID5, RAID6,
8e8941
 and RAID10 arrays, and controls the failure modes for
8e8941
 .IR faulty .
8e8941
+It can also be used for working around a kernel bug with RAID0, but generally
8e8941
+doesn't need to be used explicitly.
8e8941
 
8e8941
 The layout of the RAID5 parity block can be one of
8e8941
 .BR left\-asymmetric ,
8e8941
@@ -652,7 +654,7 @@ option to set subsequent failure modes.
8e8941
 "clear" or "none" will remove any pending or periodic failure modes,
8e8941
 and "flush" will clear any persistent faults.
8e8941
 
8e8941
-Finally, the layout options for RAID10 are one of 'n', 'o' or 'f' followed
8e8941
+The layout options for RAID10 are one of 'n', 'o' or 'f' followed
8e8941
 by a small number.  The default is 'n2'.  The supported options are:
8e8941
 
8e8941
 .I 'n'
8e8941
@@ -677,6 +679,32 @@ devices in the array.  It does not need to divide evenly into that
8e8941
 number (e.g. it is perfectly legal to have an 'n2' layout for an array
8e8941
 with an odd number of devices).
8e8941
 
8e8941
+A bug introduced in Linux 3.14 means that RAID0 arrays
8e8941
+.B "with devices of differing sizes"
8e8941
+started using a different layout.  This could lead to
8e8941
+data corruption.  Since Linux 5.4 (and various stable releases that received
8e8941
+backports), the kernel will not accept such an array unless
8e8941
+a layout is explicitly set.  It can be set to
8e8941
+.RB ' original '
8e8941
+or
8e8941
+.RB ' alternate '.
8e8941
+When creating a new array,
8e8941
+.I mdadm
8e8941
+will select
8e8941
+.RB ' original '
8e8941
+by default, so the layout does not normally need to be set.
8e8941
+An array created for either
8e8941
+.RB ' original '
8e8941
+or
8e8941
+.RB ' alternate '
8e8941
+will not be recognized by an (unpatched) kernel prior to 5.4.  To create
8e8941
+a RAID0 array with devices of differing sizes that can be used on an
8e8941
+older kernel, you can set the layout to
8e8941
+.RB ' dangerous '.
8e8941
+This will use whichever layout the running kernel supports, so the data
8e8941
+on the array may become corrupt when changing kernel from pre-3.14 to a
8e8941
+later kernel.
8e8941
+
8e8941
 When an array is converted between RAID5 and RAID6 an intermediate
8e8941
 RAID6 layout is used in which the second parity block (Q) is always on
8e8941
 the last device.  To convert a RAID5 to RAID6 and leave it in this new
8e8941
diff --git a/mdadm.c b/mdadm.c
8e8941
index 1fb8086..e438f9c 100644
8e8941
--- a/mdadm.c
8e8941
+++ b/mdadm.c
8e8941
@@ -550,6 +550,14 @@ int main(int argc, char *argv[])
8e8941
 				pr_err("raid level must be given before layout.\n");
8e8941
 				exit(2);
8e8941
 
8e8941
+			case 0:
8e8941
+				s.layout = map_name(r0layout, optarg);
8e8941
+				if (s.layout == UnSet) {
8e8941
+					pr_err("layout %s not understood for raid0.\n",
8e8941
+						optarg);
8e8941
+					exit(2);
8e8941
+				}
8e8941
+				break;
8e8941
 			case 5:
8e8941
 				s.layout = map_name(r5layout, optarg);
8e8941
 				if (s.layout == UnSet) {
8e8941
diff --git a/mdadm.h b/mdadm.h
8e8941
index 91f1338..9e98778 100644
8e8941
--- a/mdadm.h
8e8941
+++ b/mdadm.h
8e8941
@@ -763,7 +763,8 @@ extern int restore_stripes(int *dest, unsigned long long *offsets,
8e8941
 
8e8941
 extern char *map_num(mapping_t *map, int num);
8e8941
 extern int map_name(mapping_t *map, char *name);
8e8941
-extern mapping_t r5layout[], r6layout[], pers[], modes[], faultylayout[];
8e8941
+extern mapping_t r0layout[], r5layout[], r6layout[],
8e8941
+	pers[], modes[], faultylayout[];
8e8941
 extern mapping_t consistency_policies[], sysfs_array_states[];
8e8941
 
8e8941
 extern char *map_dev_preferred(int major, int minor, int create,
8e8941
@@ -1758,6 +1759,11 @@ char *xstrdup(const char *str);
8e8941
 #define makedev(M,m) (((M)<<8) | (m))
8e8941
 #endif
8e8941
 
8e8941
+enum r0layout {
8e8941
+	RAID0_ORIG_LAYOUT = 1,
8e8941
+	RAID0_ALT_MULTIZONE_LAYOUT = 2,
8e8941
+};
8e8941
+
8e8941
 /* for raid4/5/6 */
8e8941
 #define ALGORITHM_LEFT_ASYMMETRIC	0
8e8941
 #define ALGORITHM_RIGHT_ASYMMETRIC	1
8e8941
diff --git a/super0.c b/super0.c
8e8941
index 6b7c0e3..6af140b 100644
8e8941
--- a/super0.c
8e8941
+++ b/super0.c
8e8941
@@ -1291,6 +1291,12 @@ static int validate_geometry0(struct supertype *st, int level,
8e8941
 	if (*chunk == UnSet)
8e8941
 		*chunk = DEFAULT_CHUNK;
8e8941
 
8e8941
+	if (level == 0 && layout != UnSet) {
8e8941
+		if (verbose)
8e8941
+			pr_err("0.90 metadata does not support layouts for RAID0\n");
8e8941
+		return 0;
8e8941
+	}
8e8941
+
8e8941
 	if (!subdev)
8e8941
 		return 1;
8e8941
 
8e8941
diff --git a/super1.c b/super1.c
8e8941
index 929466d..cedbb53 100644
8e8941
--- a/super1.c
8e8941
+++ b/super1.c
8e8941
@@ -43,7 +43,7 @@ struct mdp_superblock_1 {
8e8941
 
8e8941
 	__u64	ctime;		/* lo 40 bits are seconds, top 24 are microseconds or 0*/
8e8941
 	__u32	level;		/* -4 (multipath), -1 (linear), 0,1,4,5 */
8e8941
-	__u32	layout;		/* only for raid5 currently */
8e8941
+	__u32	layout;		/* used for raid5, raid6, raid10, and raid0 */
8e8941
 	__u64	size;		/* used size of component devices, in 512byte sectors */
8e8941
 
8e8941
 	__u32	chunksize;	/* in 512byte sectors */
8e8941
@@ -144,6 +144,7 @@ struct misc_dev_info {
8e8941
 #define	MD_FEATURE_JOURNAL		512 /* support write journal */
8e8941
 #define	MD_FEATURE_PPL			1024 /* support PPL */
8e8941
 #define	MD_FEATURE_MUTLIPLE_PPLS	2048 /* support for multiple PPLs */
8e8941
+#define	MD_FEATURE_RAID0_LAYOUT		4096 /* layout is meaningful in RAID0 */
8e8941
 #define	MD_FEATURE_ALL			(MD_FEATURE_BITMAP_OFFSET	\
8e8941
 					|MD_FEATURE_RECOVERY_OFFSET	\
8e8941
 					|MD_FEATURE_RESHAPE_ACTIVE	\
8e8941
@@ -155,6 +156,7 @@ struct misc_dev_info {
8e8941
 					|MD_FEATURE_JOURNAL		\
8e8941
 					|MD_FEATURE_PPL			\
8e8941
 					|MD_FEATURE_MULTIPLE_PPLS	\
8e8941
+					|MD_FEATURE_RAID0_LAYOUT	\
8e8941
 					)
8e8941
 
8e8941
 static int role_from_sb(struct mdp_superblock_1 *sb)
8e8941
@@ -498,6 +500,11 @@ static void examine_super1(struct supertype *st, char *homehost)
8e8941
 	printf("         Events : %llu\n",
8e8941
 	       (unsigned long long)__le64_to_cpu(sb->events));
8e8941
 	printf("\n");
8e8941
+	if (__le32_to_cpu(sb->level) == 0 &&
8e8941
+	    (sb->feature_map & __cpu_to_le32(MD_FEATURE_RAID0_LAYOUT))) {
8e8941
+		c = map_num(r0layout, __le32_to_cpu(sb->layout));
8e8941
+		printf("         Layout : %s\n", c?c:"-unknown-");
8e8941
+	}
8e8941
 	if (__le32_to_cpu(sb->level) == 5) {
8e8941
 		c = map_num(r5layout, __le32_to_cpu(sb->layout));
8e8941
 		printf("         Layout : %s\n", c?c:"-unknown-");
8e8941
@@ -1646,6 +1653,7 @@ struct devinfo {
8e8941
 	int fd;
8e8941
 	char *devname;
8e8941
 	long long data_offset;
8e8941
+	unsigned long long dev_size;
8e8941
 	mdu_disk_info_t disk;
8e8941
 	struct devinfo *next;
8e8941
 };
8e8941
@@ -1687,6 +1695,7 @@ static int add_to_super1(struct supertype *st, mdu_disk_info_t *dk,
8e8941
 	di->devname = devname;
8e8941
 	di->disk = *dk;
8e8941
 	di->data_offset = data_offset;
8e8941
+	get_dev_size(fd, NULL, &di->dev_size);
8e8941
 	di->next = NULL;
8e8941
 	*dip = di;
8e8941
 
8e8941
@@ -1888,10 +1897,25 @@ static int write_init_super1(struct supertype *st)
8e8941
 	unsigned long long sb_offset;
8e8941
 	unsigned long long data_offset;
8e8941
 	long bm_offset;
8e8941
+	int raid0_need_layout = 0;
8e8941
 
8e8941
 	for (di = st->info; di; di = di->next) {
8e8941
 		if (di->disk.state & (1 << MD_DISK_JOURNAL))
8e8941
 			sb->feature_map |= __cpu_to_le32(MD_FEATURE_JOURNAL);
8e8941
+		if (sb->level == 0 && sb->layout != 0) {
8e8941
+			struct devinfo *di2 = st->info;
8e8941
+			unsigned long long s1, s2;
8e8941
+			s1 = di->dev_size;
8e8941
+			if (di->data_offset != INVALID_SECTORS)
8e8941
+				s1 -= di->data_offset;
8e8941
+			s1 /= __le32_to_cpu(sb->chunksize);
8e8941
+			s2 = di2->dev_size;
8e8941
+			if (di2->data_offset != INVALID_SECTORS)
8e8941
+				s2 -= di2->data_offset;
8e8941
+			s2 /= __le32_to_cpu(sb->chunksize);
8e8941
+			if (s1 != s2)
8e8941
+				raid0_need_layout = 1;
8e8941
+		}
8e8941
 	}
8e8941
 
8e8941
 	for (di = st->info; di; di = di->next) {
8e8941
@@ -2039,6 +2063,10 @@ static int write_init_super1(struct supertype *st)
8e8941
 			sb->bblog_offset = 0;
8e8941
 		}
8e8941
 
8e8941
+		/* RAID0 needs a layout if devices aren't all the same size */
8e8941
+		if (raid0_need_layout)
8e8941
+			sb->feature_map |= __cpu_to_le32(MD_FEATURE_RAID0_LAYOUT);
8e8941
+
8e8941
 		sb->sb_csum = calc_sb_1_csum(sb);
8e8941
 		rv = store_super1(st, di->fd);
8e8941
 
8e8941
-- 
8e8941
2.7.5
8e8941