Blame SOURCES/0025-mdmon-fix-wrong-array-state-when-disk-fails-during-m.patch

5d5466
From ae7d61e35ec2ab6361c3e509a8db00698ef3396f Mon Sep 17 00:00:00 2001
5d5466
From: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
5d5466
Date: Tue, 7 May 2019 16:08:47 +0200
5d5466
Subject: [RHEL7.8 PATCH V2 25/47] mdmon: fix wrong array state when disk fails
5d5466
 during mdmon startup
5d5466
5d5466
If a member drive disappears and is set faulty by the kernel during
5d5466
mdmon startup, after ss->load_container() but before manage_new(), mdmon
5d5466
will try to re-add the faulty drive to the array and start rebuilding.
5d5466
Metadata on the active drive is updated, but the faulty drive is not
5d5466
removed from the array and is left in a "blocked" state, so any write
5d5466
request to the array will block. If the faulty drive reappears in the
5d5466
system e.g. after a reboot, the array will not assemble because metadata
5d5466
on the drives will be incompatible (at least on imsm).
5d5466
5d5466
Fix this by adding a new option for sysfs_read(): "GET_DEVS_ALL". This
5d5466
is an extension for the "GET_DEVS" option and causes all member devices
5d5466
to be returned, even if the associated block device has been removed.
5d5466
Use this option in manage_new() to include the faulty device on the
5d5466
active_array's devices list. Mdmon will then properly remove the faulty
5d5466
device from the array and update the metadata to reflect the degraded
5d5466
state.
5d5466
5d5466
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
5d5466
Signed-off-by: Jes Sorensen <jsorensen@fb.com>
5d5466
---
5d5466
 managemon.c   |  2 +-
5d5466
 mdadm.h       |  1 +
5d5466
 super-intel.c |  2 +-
5d5466
 sysfs.c       | 23 ++++++++++++++---------
5d5466
 4 files changed, 17 insertions(+), 11 deletions(-)
5d5466
5d5466
diff --git a/managemon.c b/managemon.c
5d5466
index 29b91ba..200cf83 100644
5d5466
--- a/managemon.c
5d5466
+++ b/managemon.c
5d5466
@@ -678,7 +678,7 @@ static void manage_new(struct mdstat_ent *mdstat,
5d5466
 	mdi = sysfs_read(-1, mdstat->devnm,
5d5466
 			 GET_LEVEL|GET_CHUNK|GET_DISKS|GET_COMPONENT|
5d5466
 			 GET_SAFEMODE|GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE|
5d5466
-			 GET_LAYOUT);
5d5466
+			 GET_LAYOUT|GET_DEVS_ALL);
5d5466
 
5d5466
 	if (!mdi)
5d5466
 		return;
5d5466
diff --git a/mdadm.h b/mdadm.h
5d5466
index 705bd9b..427cc52 100644
5d5466
--- a/mdadm.h
5d5466
+++ b/mdadm.h
5d5466
@@ -647,6 +647,7 @@ enum sysfs_read_flags {
5d5466
 	GET_ERROR	= (1 << 24),
5d5466
 	GET_ARRAY_STATE = (1 << 25),
5d5466
 	GET_CONSISTENCY_POLICY	= (1 << 26),
5d5466
+	GET_DEVS_ALL	= (1 << 27),
5d5466
 };
5d5466
 
5d5466
 /* If fd >= 0, get the array it is open on,
5d5466
diff --git a/super-intel.c b/super-intel.c
5d5466
index 2ba045a..4fd5e84 100644
5d5466
--- a/super-intel.c
5d5466
+++ b/super-intel.c
5d5466
@@ -8560,7 +8560,7 @@ static void imsm_set_disk(struct active_array *a, int n, int state)
5d5466
 	disk = get_imsm_disk(super, ord_to_idx(ord));
5d5466
 
5d5466
 	/* check for new failures */
5d5466
-	if (state & DS_FAULTY) {
5d5466
+	if (disk && (state & DS_FAULTY)) {
5d5466
 		if (mark_failure(super, dev, disk, ord_to_idx(ord)))
5d5466
 			super->updates_pending++;
5d5466
 	}
5d5466
diff --git a/sysfs.c b/sysfs.c
5d5466
index df6fdda..2dd9ab6 100644
5d5466
--- a/sysfs.c
5d5466
+++ b/sysfs.c
5d5466
@@ -313,17 +313,22 @@ struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options)
5d5466
 			/* assume this is a stale reference to a hot
5d5466
 			 * removed device
5d5466
 			 */
5d5466
-			free(dev);
5d5466
-			continue;
5d5466
+			if (!(options & GET_DEVS_ALL)) {
5d5466
+				free(dev);
5d5466
+				continue;
5d5466
+			}
5d5466
+		} else {
5d5466
+			sscanf(buf, "%d:%d", &dev->disk.major, &dev->disk.minor);
5d5466
 		}
5d5466
-		sscanf(buf, "%d:%d", &dev->disk.major, &dev->disk.minor);
5d5466
 
5d5466
-		/* special case check for block devices that can go 'offline' */
5d5466
-		strcpy(dbase, "block/device/state");
5d5466
-		if (load_sys(fname, buf, sizeof(buf)) == 0 &&
5d5466
-		    strncmp(buf, "offline", 7) == 0) {
5d5466
-			free(dev);
5d5466
-			continue;
5d5466
+		if (!(options & GET_DEVS_ALL)) {
5d5466
+			/* special case check for block devices that can go 'offline' */
5d5466
+			strcpy(dbase, "block/device/state");
5d5466
+			if (load_sys(fname, buf, sizeof(buf)) == 0 &&
5d5466
+			    strncmp(buf, "offline", 7) == 0) {
5d5466
+				free(dev);
5d5466
+				continue;
5d5466
+			}
5d5466
 		}
5d5466
 
5d5466
 		/* finally add this disk to the array */
5d5466
-- 
5d5466
2.7.5
5d5466