Blame SOURCES/imsmppl-support.patch

dd3a91
commit 2432ce9b3235f34d00ef6c28ef6b624a32b85530
dd3a91
Author: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
dd3a91
Date:   Wed Mar 29 11:54:17 2017 +0200
dd3a91
dd3a91
    imsm: PPL support
dd3a91
    
dd3a91
    Enable creating and assembling IMSM raid5 arrays with PPL. Update the
dd3a91
    IMSM metadata format to include new fields used for PPL.
dd3a91
    
dd3a91
    Add structures for PPL metadata. They are used also by super1 and shared
dd3a91
    with the kernel, so put them in md_p.h.
dd3a91
    
dd3a91
    Write the initial empty PPL header when creating an array. When
dd3a91
    assembling an array with PPL, validate the PPL header and in case it is
dd3a91
    not correct allow to overwrite it if --force was provided.
dd3a91
    
dd3a91
    Write the PPL location and size for a device to the new rdev sysfs
dd3a91
    attributes 'ppl_sector' and 'ppl_size'. Enable PPL in the kernel by
dd3a91
    writing to 'consistency_policy' before the array is activated.
dd3a91
    
dd3a91
    Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
dd3a91
    Signed-off-by: Jes Sorensen <Jes.Sorensen@gmail.com>
dd3a91
dd3a91
diff --git a/Assemble.c b/Assemble.c
dd3a91
index 3da0903..8e55b49 100644
dd3a91
--- a/Assemble.c
dd3a91
+++ b/Assemble.c
dd3a91
@@ -1942,6 +1942,55 @@ int assemble_container_content(struct supertype *st, int mdfd,
dd3a91
 	map_update(NULL, fd2devnm(mdfd), content->text_version,
dd3a91
 		   content->uuid, chosen_name);
dd3a91
 
dd3a91
+	if (content->consistency_policy == CONSISTENCY_POLICY_PPL &&
dd3a91
+	    st->ss->validate_ppl) {
dd3a91
+		content->array.state |= 1;
dd3a91
+		err = 0;
dd3a91
+
dd3a91
+		for (dev = content->devs; dev; dev = dev->next) {
dd3a91
+			int dfd;
dd3a91
+			char *devpath;
dd3a91
+			int ret;
dd3a91
+
dd3a91
+			ret = st->ss->validate_ppl(st, content, dev);
dd3a91
+			if (ret == 0)
dd3a91
+				continue;
dd3a91
+
dd3a91
+			if (ret < 0) {
dd3a91
+				err = 1;
dd3a91
+				break;
dd3a91
+			}
dd3a91
+
dd3a91
+			if (!c->force) {
dd3a91
+				pr_err("%s contains invalid PPL - consider --force or --update-subarray with --update=no-ppl\n",
dd3a91
+					chosen_name);
dd3a91
+				content->array.state &= ~1;
dd3a91
+				avail[dev->disk.raid_disk] = 0;
dd3a91
+				break;
dd3a91
+			}
dd3a91
+
dd3a91
+			/* have --force - overwrite the invalid ppl */
dd3a91
+			devpath = map_dev(dev->disk.major, dev->disk.minor, 0);
dd3a91
+			dfd = dev_open(devpath, O_RDWR);
dd3a91
+			if (dfd < 0) {
dd3a91
+				pr_err("Failed to open %s\n", devpath);
dd3a91
+				err = 1;
dd3a91
+				break;
dd3a91
+			}
dd3a91
+
dd3a91
+			err = st->ss->write_init_ppl(st, content, dfd);
dd3a91
+			close(dfd);
dd3a91
+
dd3a91
+			if (err)
dd3a91
+				break;
dd3a91
+		}
dd3a91
+
dd3a91
+		if (err) {
dd3a91
+			free(avail);
dd3a91
+			return err;
dd3a91
+		}
dd3a91
+	}
dd3a91
+
dd3a91
 	if (enough(content->array.level, content->array.raid_disks,
dd3a91
 		   content->array.layout, content->array.state & 1, avail) == 0) {
dd3a91
 		if (c->export && result)
dd3a91
diff --git a/Makefile b/Makefile
dd3a91
index d1a6ac4..5ff6cc0 100644
dd3a91
--- a/Makefile
dd3a91
+++ b/Makefile
dd3a91
@@ -151,7 +151,7 @@ MON_OBJS = mdmon.o monitor.o managemon.o util.o maps.o mdstat.o sysfs.o \
dd3a91
 	Kill.o sg_io.o dlink.o ReadMe.o super-intel.o \
dd3a91
 	super-mbr.o super-gpt.o \
dd3a91
 	super-ddf.o sha1.o crc32.o msg.o bitmap.o xmalloc.o \
dd3a91
-	platform-intel.o probe_roms.o
dd3a91
+	platform-intel.o probe_roms.o crc32c.o
dd3a91
 
dd3a91
 MON_SRCS = $(patsubst %.o,%.c,$(MON_OBJS))
dd3a91
 
dd3a91
@@ -161,7 +161,8 @@ STATICOBJS = pwgr.o
dd3a91
 ASSEMBLE_SRCS := mdassemble.c Assemble.c Manage.c config.c policy.c dlink.c util.c \
dd3a91
 	maps.c lib.c xmalloc.c \
dd3a91
 	super0.c super1.c super-ddf.c super-intel.c sha1.c crc32.c sg_io.c mdstat.c \
dd3a91
-	platform-intel.c probe_roms.c sysfs.c super-mbr.c super-gpt.c mapfile.c
dd3a91
+	platform-intel.c probe_roms.c sysfs.c super-mbr.c super-gpt.c mapfile.c \
dd3a91
+	crc32c.c
dd3a91
 ASSEMBLE_AUTO_SRCS := mdopen.c
dd3a91
 ASSEMBLE_FLAGS:= $(CFLAGS) -DMDASSEMBLE
dd3a91
 ifdef MDASSEMBLE_AUTO
dd3a91
diff --git a/md_p.h b/md_p.h
dd3a91
index dc9fec1..358a28c 100644
dd3a91
--- a/md_p.h
dd3a91
+++ b/md_p.h
dd3a91
@@ -267,4 +267,29 @@ struct r5l_meta_block {
dd3a91
 #define R5LOG_VERSION 0x1
dd3a91
 #define R5LOG_MAGIC 0x6433c509
dd3a91
 
dd3a91
+struct ppl_header_entry {
dd3a91
+	__u64 data_sector;	/* raid sector of the new data */
dd3a91
+	__u32 pp_size;		/* length of partial parity */
dd3a91
+	__u32 data_size;	/* length of data */
dd3a91
+	__u32 parity_disk;	/* member disk containing parity */
dd3a91
+	__u32 checksum;		/* checksum of this entry's partial parity */
dd3a91
+} __attribute__ ((__packed__));
dd3a91
+
dd3a91
+#define PPL_HEADER_SIZE 4096
dd3a91
+#define PPL_HDR_RESERVED 512
dd3a91
+#define PPL_HDR_ENTRY_SPACE \
dd3a91
+	(PPL_HEADER_SIZE - PPL_HDR_RESERVED - 4 * sizeof(__u32) - sizeof(__u64))
dd3a91
+#define PPL_HDR_MAX_ENTRIES \
dd3a91
+	(PPL_HDR_ENTRY_SPACE / sizeof(struct ppl_header_entry))
dd3a91
+
dd3a91
+struct ppl_header {
dd3a91
+	__u8 reserved[PPL_HDR_RESERVED];/* reserved space, fill with 0xff */
dd3a91
+	__u32 signature;		/* signature (family number of volume) */
dd3a91
+	__u32 padding;			/* zero pad */
dd3a91
+	__u64 generation;		/* generation number of the header */
dd3a91
+	__u32 entries_count;		/* number of entries in entry array */
dd3a91
+	__u32 checksum;			/* checksum of the header */
dd3a91
+	struct ppl_header_entry entries[PPL_HDR_MAX_ENTRIES];
dd3a91
+} __attribute__ ((__packed__));
dd3a91
+
dd3a91
 #endif
dd3a91
diff --git a/mdadm.h b/mdadm.h
dd3a91
index b52d4d3..d222cc3 100644
dd3a91
--- a/mdadm.h
dd3a91
+++ b/mdadm.h
dd3a91
@@ -300,6 +300,8 @@ struct mdinfo {
dd3a91
 		#define MaxSector  (~0ULL) /* resync/recovery complete position */
dd3a91
 	};
dd3a91
 	long			bitmap_offset;	/* 0 == none, 1 == a file */
dd3a91
+	unsigned int		ppl_size;
dd3a91
+	unsigned long long	ppl_sector;
dd3a91
 	unsigned long		safe_mode_delay; /* ms delay to mark clean */
dd3a91
 	int			new_level, delta_disks, new_layout, new_chunk;
dd3a91
 	int			errors;
dd3a91
@@ -1074,6 +1076,10 @@ extern struct superswitch {
dd3a91
 	/* write initial empty PPL on device */
dd3a91
 	int (*write_init_ppl)(struct supertype *st, struct mdinfo *info, int fd);
dd3a91
 
dd3a91
+	/* validate ppl before assemble */
dd3a91
+	int (*validate_ppl)(struct supertype *st, struct mdinfo *info,
dd3a91
+			    struct mdinfo *disk);
dd3a91
+
dd3a91
 	/* records new bad block in metadata */
dd3a91
 	int (*record_bad_block)(struct active_array *a, int n,
dd3a91
 					unsigned long long sector, int length);
dd3a91
diff --git a/super-intel.c b/super-intel.c
dd3a91
index 2d92c8e..87fec8b 100644
dd3a91
--- a/super-intel.c
dd3a91
+++ b/super-intel.c
dd3a91
@@ -102,6 +102,7 @@ struct imsm_disk {
dd3a91
 #define SPARE_DISK      __cpu_to_le32(0x01)  /* Spare */
dd3a91
 #define CONFIGURED_DISK __cpu_to_le32(0x02)  /* Member of some RaidDev */
dd3a91
 #define FAILED_DISK     __cpu_to_le32(0x04)  /* Permanent failure */
dd3a91
+#define JOURNAL_DISK    __cpu_to_le32(0x2000000) /* Device marked as Journaling Drive */
dd3a91
 	__u32 status;			 /* 0xF0 - 0xF3 */
dd3a91
 	__u32 owner_cfg_num; /* which config 0,1,2... owns this disk */
dd3a91
 	__u32 total_blocks_hi;		 /* 0xF4 - 0xF5 total blocks hi */
dd3a91
@@ -155,6 +156,9 @@ struct imsm_vol {
dd3a91
 #define MIGR_STATE_CHANGE 4
dd3a91
 #define MIGR_REPAIR 5
dd3a91
 	__u8  migr_type;	/* Initializing, Rebuilding, ... */
dd3a91
+#define RAIDVOL_CLEAN          0
dd3a91
+#define RAIDVOL_DIRTY          1
dd3a91
+#define RAIDVOL_DSRECORD_VALID 2
dd3a91
 	__u8  dirty;
dd3a91
 	__u8  fs_state;		/* fast-sync state for CnG (0xff == disabled) */
dd3a91
 	__u16 verify_errors;	/* number of mismatches */
dd3a91
@@ -190,7 +194,24 @@ struct imsm_dev {
dd3a91
 	__u16 cache_policy;
dd3a91
 	__u8  cng_state;
dd3a91
 	__u8  cng_sub_state;
dd3a91
-#define IMSM_DEV_FILLERS 10
dd3a91
+	__u16 my_vol_raid_dev_num; /* Used in Unique volume Id for this RaidDev */
dd3a91
+
dd3a91
+	/* NVM_EN */
dd3a91
+	__u8 nv_cache_mode;
dd3a91
+	__u8 nv_cache_flags;
dd3a91
+
dd3a91
+	/* Unique Volume Id of the NvCache Volume associated with this volume */
dd3a91
+	__u32 nvc_vol_orig_family_num;
dd3a91
+	__u16 nvc_vol_raid_dev_num;
dd3a91
+
dd3a91
+#define RWH_OFF 0
dd3a91
+#define RWH_DISTRIBUTED 1
dd3a91
+#define RWH_JOURNALING_DRIVE 2
dd3a91
+	__u8  rwh_policy; /* Raid Write Hole Policy */
dd3a91
+	__u8  jd_serial[MAX_RAID_SERIAL_LEN]; /* Journal Drive serial number */
dd3a91
+	__u8  filler1;
dd3a91
+
dd3a91
+#define IMSM_DEV_FILLERS 3
dd3a91
 	__u32 filler[IMSM_DEV_FILLERS];
dd3a91
 	struct imsm_vol vol;
dd3a91
 } __attribute__ ((packed));
dd3a91
@@ -257,6 +278,9 @@ static char *map_state_str[] = { "normal", "uninitialized", "degraded", "failed"
dd3a91
 #define UNIT_SRC_IN_CP_AREA 1   /* Source data for curr_migr_unit has
dd3a91
 				 *  already been migrated and must
dd3a91
 				 *  be recovered from checkpoint area */
dd3a91
+
dd3a91
+#define PPL_ENTRY_SPACE (128 * 1024) /* Size of the PPL, without the header */
dd3a91
+
dd3a91
 struct migr_record {
dd3a91
 	__u32 rec_status;	    /* Status used to determine how to restart
dd3a91
 				     * migration in case it aborts
dd3a91
@@ -1288,6 +1312,11 @@ static int is_failed(struct imsm_disk *disk)
dd3a91
 	return (disk->status & FAILED_DISK) == FAILED_DISK;
dd3a91
 }
dd3a91
 
dd3a91
+static int is_journal(struct imsm_disk *disk)
dd3a91
+{
dd3a91
+	return (disk->status & JOURNAL_DISK) == JOURNAL_DISK;
dd3a91
+}
dd3a91
+
dd3a91
 /* try to determine how much space is reserved for metadata from
dd3a91
  * the last get_extents() entry on the smallest active disk,
dd3a91
  * otherwise fallback to the default
dd3a91
@@ -1477,7 +1506,17 @@ static void print_imsm_dev(struct intel_super *super,
dd3a91
 				   blocks_per_migr_unit(super, dev));
dd3a91
 	}
dd3a91
 	printf("\n");
dd3a91
-	printf("    Dirty State : %s\n", dev->vol.dirty ? "dirty" : "clean");
dd3a91
+	printf("    Dirty State : %s\n", (dev->vol.dirty & RAIDVOL_DIRTY) ?
dd3a91
+					 "dirty" : "clean");
dd3a91
+	printf("     RWH Policy : ");
dd3a91
+	if (dev->rwh_policy == RWH_OFF)
dd3a91
+		printf("off\n");
dd3a91
+	else if (dev->rwh_policy == RWH_DISTRIBUTED)
dd3a91
+		printf("PPL distributed\n");
dd3a91
+	else if (dev->rwh_policy == RWH_JOURNALING_DRIVE)
dd3a91
+		printf("PPL journaling drive\n");
dd3a91
+	else
dd3a91
+		printf("<unknown:%d>\n", dev->rwh_policy);
dd3a91
 }
dd3a91
 
dd3a91
 static void print_imsm_disk(struct imsm_disk *disk,
dd3a91
@@ -1496,9 +1535,10 @@ static void print_imsm_disk(struct imsm_disk *disk,
dd3a91
 		printf("  Disk%02d Serial : %s\n", index, str);
dd3a91
 	else
dd3a91
 		printf("    Disk Serial : %s\n", str);
dd3a91
-	printf("          State :%s%s%s\n", is_spare(disk) ? " spare" : "",
dd3a91
-					    is_configured(disk) ? " active" : "",
dd3a91
-					    is_failed(disk) ? " failed" : "");
dd3a91
+	printf("          State :%s%s%s%s\n", is_spare(disk) ? " spare" : "",
dd3a91
+					      is_configured(disk) ? " active" : "",
dd3a91
+					      is_failed(disk) ? " failed" : "",
dd3a91
+					      is_journal(disk) ? " journal" : "");
dd3a91
 	printf("             Id : %08x\n", __le32_to_cpu(disk->scsi_id));
dd3a91
 	sz = total_blocks(disk) - reserved;
dd3a91
 	printf("    Usable Size : %llu%s\n",
dd3a91
@@ -3114,6 +3154,15 @@ static unsigned long long imsm_component_size_aligment_check(int level,
dd3a91
 	return component_size;
dd3a91
 }
dd3a91
 
dd3a91
+static unsigned long long get_ppl_sector(struct intel_super *super, int dev_idx)
dd3a91
+{
dd3a91
+	struct imsm_dev *dev = get_imsm_dev(super, dev_idx);
dd3a91
+	struct imsm_map *map = get_imsm_map(dev, MAP_0);
dd3a91
+
dd3a91
+	return pba_of_lba0(map) +
dd3a91
+	       (num_data_stripes(map) * map->blocks_per_strip);
dd3a91
+}
dd3a91
+
dd3a91
 static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info, char *dmap)
dd3a91
 {
dd3a91
 	struct intel_super *super = st->sb;
dd3a91
@@ -3140,7 +3189,7 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
dd3a91
 	info->array.utime	  = 0;
dd3a91
 	info->array.chunk_size	  =
dd3a91
 		__le16_to_cpu(map_to_analyse->blocks_per_strip) << 9;
dd3a91
-	info->array.state	  = !dev->vol.dirty;
dd3a91
+	info->array.state	  = !(dev->vol.dirty & RAIDVOL_DIRTY);
dd3a91
 	info->custom_array_size   = __le32_to_cpu(dev->size_high);
dd3a91
 	info->custom_array_size   <<= 32;
dd3a91
 	info->custom_array_size   |= __le32_to_cpu(dev->size_low);
dd3a91
@@ -3221,10 +3270,20 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
dd3a91
 	memset(info->uuid, 0, sizeof(info->uuid));
dd3a91
 	info->recovery_start = MaxSector;
dd3a91
 
dd3a91
+	if (info->array.level == 5 && dev->rwh_policy == RWH_DISTRIBUTED) {
dd3a91
+		info->consistency_policy = CONSISTENCY_POLICY_PPL;
dd3a91
+		info->ppl_sector = get_ppl_sector(super, super->current_vol);
dd3a91
+		info->ppl_size = (PPL_HEADER_SIZE + PPL_ENTRY_SPACE) >> 9;
dd3a91
+	} else if (info->array.level <= 0) {
dd3a91
+		info->consistency_policy = CONSISTENCY_POLICY_NONE;
dd3a91
+	} else {
dd3a91
+		info->consistency_policy = CONSISTENCY_POLICY_RESYNC;
dd3a91
+	}
dd3a91
+
dd3a91
 	info->reshape_progress = 0;
dd3a91
 	info->resync_start = MaxSector;
dd3a91
 	if ((map_to_analyse->map_state == IMSM_T_STATE_UNINITIALIZED ||
dd3a91
-	    dev->vol.dirty) &&
dd3a91
+	    !(info->array.state & 1)) &&
dd3a91
 	    imsm_reshape_blocks_arrays_changes(super) == 0) {
dd3a91
 		info->resync_start = 0;
dd3a91
 	}
dd3a91
@@ -3451,7 +3510,8 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *
dd3a91
 		 * found the 'most fresh' version of the metadata
dd3a91
 		 */
dd3a91
 		info->disk.state |= is_failed(disk) ? (1 << MD_DISK_FAULTY) : 0;
dd3a91
-		info->disk.state |= is_spare(disk) ? 0 : (1 << MD_DISK_SYNC);
dd3a91
+		info->disk.state |= (is_spare(disk) || is_journal(disk)) ?
dd3a91
+				    0 : (1 << MD_DISK_SYNC);
dd3a91
 	}
dd3a91
 
dd3a91
 	/* only call uuid_from_super_imsm when this disk is part of a populated container,
dd3a91
@@ -3906,7 +3966,7 @@ load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd)
dd3a91
 		 */
dd3a91
 		if (is_failed(&dl->disk))
dd3a91
 			dl->index = -2;
dd3a91
-		else if (is_spare(&dl->disk))
dd3a91
+		else if (is_spare(&dl->disk) || is_journal(&dl->disk))
dd3a91
 			dl->index = -1;
dd3a91
 	}
dd3a91
 
dd3a91
@@ -5303,6 +5363,20 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
dd3a91
 	}
dd3a91
 	mpb->num_raid_devs++;
dd3a91
 
dd3a91
+	if (s->consistency_policy == UnSet ||
dd3a91
+	    s->consistency_policy == CONSISTENCY_POLICY_RESYNC ||
dd3a91
+	    s->consistency_policy == CONSISTENCY_POLICY_NONE) {
dd3a91
+		dev->rwh_policy = RWH_OFF;
dd3a91
+	} else if (s->consistency_policy == CONSISTENCY_POLICY_PPL) {
dd3a91
+		dev->rwh_policy = RWH_DISTRIBUTED;
dd3a91
+	} else {
dd3a91
+		free(dev);
dd3a91
+		free(dv);
dd3a91
+		pr_err("imsm does not support consistency policy %s\n",
dd3a91
+		       map_num(consistency_policies, s->consistency_policy));
dd3a91
+		return 0;
dd3a91
+	}
dd3a91
+
dd3a91
 	dv->dev = dev;
dd3a91
 	dv->index = super->current_vol;
dd3a91
 	dv->next = super->devlist;
dd3a91
@@ -5927,11 +6001,146 @@ static int mgmt_disk(struct supertype *st)
dd3a91
 
dd3a91
 	return 0;
dd3a91
 }
dd3a91
+#endif
dd3a91
+
dd3a91
+__u32 crc32c_le(__u32 crc, unsigned char const *p, size_t len);
dd3a91
+
dd3a91
+static int write_init_ppl_imsm(struct supertype *st, struct mdinfo *info, int fd)
dd3a91
+{
dd3a91
+	struct intel_super *super = st->sb;
dd3a91
+	void *buf;
dd3a91
+	struct ppl_header *ppl_hdr;
dd3a91
+	int ret;
dd3a91
+
dd3a91
+	ret = posix_memalign(&buf, 4096, PPL_HEADER_SIZE);
dd3a91
+	if (ret) {
dd3a91
+		pr_err("Failed to allocate PPL header buffer\n");
dd3a91
+		return ret;
dd3a91
+	}
dd3a91
+
dd3a91
+	memset(buf, 0, PPL_HEADER_SIZE);
dd3a91
+	ppl_hdr = buf;
dd3a91
+	memset(ppl_hdr->reserved, 0xff, PPL_HDR_RESERVED);
dd3a91
+	ppl_hdr->signature = __cpu_to_le32(super->anchor->orig_family_num);
dd3a91
+	ppl_hdr->checksum = __cpu_to_le32(~crc32c_le(~0, buf, PPL_HEADER_SIZE));
dd3a91
+
dd3a91
+	if (lseek64(fd, info->ppl_sector * 512, SEEK_SET) < 0) {
dd3a91
+		ret = errno;
dd3a91
+		perror("Failed to seek to PPL header location");
dd3a91
+	}
dd3a91
+
dd3a91
+	if (!ret && write(fd, buf, PPL_HEADER_SIZE) != PPL_HEADER_SIZE) {
dd3a91
+		ret = errno;
dd3a91
+		perror("Write PPL header failed");
dd3a91
+	}
dd3a91
+
dd3a91
+	if (!ret)
dd3a91
+		fsync(fd);
dd3a91
+
dd3a91
+	free(buf);
dd3a91
+	return ret;
dd3a91
+}
dd3a91
+
dd3a91
+static int validate_ppl_imsm(struct supertype *st, struct mdinfo *info,
dd3a91
+			     struct mdinfo *disk)
dd3a91
+{
dd3a91
+	struct intel_super *super = st->sb;
dd3a91
+	struct dl *d;
dd3a91
+	void *buf;
dd3a91
+	int ret = 0;
dd3a91
+	struct ppl_header *ppl_hdr;
dd3a91
+	__u32 crc;
dd3a91
+	struct imsm_dev *dev;
dd3a91
+	struct imsm_map *map;
dd3a91
+	__u32 idx;
dd3a91
+
dd3a91
+	if (disk->disk.raid_disk < 0)
dd3a91
+		return 0;
dd3a91
+
dd3a91
+	if (posix_memalign(&buf, 4096, PPL_HEADER_SIZE)) {
dd3a91
+		pr_err("Failed to allocate PPL header buffer\n");
dd3a91
+		return -1;
dd3a91
+	}
dd3a91
+
dd3a91
+	dev = get_imsm_dev(super, info->container_member);
dd3a91
+	map = get_imsm_map(dev, MAP_X);
dd3a91
+	idx = get_imsm_disk_idx(dev, disk->disk.raid_disk, MAP_X);
dd3a91
+	d = get_imsm_dl_disk(super, idx);
dd3a91
+
dd3a91
+	if (!d || d->index < 0 || is_failed(&d->disk))
dd3a91
+		goto out;
dd3a91
+
dd3a91
+	if (lseek64(d->fd, info->ppl_sector * 512, SEEK_SET) < 0) {
dd3a91
+		perror("Failed to seek to PPL header location");
dd3a91
+		ret = -1;
dd3a91
+		goto out;
dd3a91
+	}
dd3a91
+
dd3a91
+	if (read(d->fd, buf, PPL_HEADER_SIZE) != PPL_HEADER_SIZE) {
dd3a91
+		perror("Read PPL header failed");
dd3a91
+		ret = -1;
dd3a91
+		goto out;
dd3a91
+	}
dd3a91
+
dd3a91
+	ppl_hdr = buf;
dd3a91
+
dd3a91
+	crc = __le32_to_cpu(ppl_hdr->checksum);
dd3a91
+	ppl_hdr->checksum = 0;
dd3a91
+
dd3a91
+	if (crc != ~crc32c_le(~0, buf, PPL_HEADER_SIZE)) {
dd3a91
+		dprintf("Wrong PPL header checksum on %s\n",
dd3a91
+			d->devname);
dd3a91
+		ret = 1;
dd3a91
+	}
dd3a91
+
dd3a91
+	if (!ret && (__le32_to_cpu(ppl_hdr->signature) !=
dd3a91
+		      super->anchor->orig_family_num)) {
dd3a91
+		dprintf("Wrong PPL header signature on %s\n",
dd3a91
+			d->devname);
dd3a91
+		ret = 1;
dd3a91
+	}
dd3a91
+
dd3a91
+out:
dd3a91
+	free(buf);
dd3a91
+
dd3a91
+	if (ret == 1 && map->map_state == IMSM_T_STATE_UNINITIALIZED)
dd3a91
+		return st->ss->write_init_ppl(st, info, d->fd);
dd3a91
+
dd3a91
+	return ret;
dd3a91
+}
dd3a91
+
dd3a91
+#ifndef MDASSEMBLE
dd3a91
+
dd3a91
+static int write_init_ppl_imsm_all(struct supertype *st, struct mdinfo *info)
dd3a91
+{
dd3a91
+	struct intel_super *super = st->sb;
dd3a91
+	struct dl *d;
dd3a91
+	int ret = 0;
dd3a91
+
dd3a91
+	if (info->consistency_policy != CONSISTENCY_POLICY_PPL ||
dd3a91
+	    info->array.level != 5)
dd3a91
+		return 0;
dd3a91
+
dd3a91
+	for (d = super->disks; d ; d = d->next) {
dd3a91
+		if (d->index < 0 || is_failed(&d->disk))
dd3a91
+			continue;
dd3a91
+
dd3a91
+		ret = st->ss->write_init_ppl(st, info, d->fd);
dd3a91
+		if (ret)
dd3a91
+			break;
dd3a91
+	}
dd3a91
+
dd3a91
+	return ret;
dd3a91
+}
dd3a91
 
dd3a91
 static int write_init_super_imsm(struct supertype *st)
dd3a91
 {
dd3a91
 	struct intel_super *super = st->sb;
dd3a91
 	int current_vol = super->current_vol;
dd3a91
+	int rv = 0;
dd3a91
+	struct mdinfo info;
dd3a91
+
dd3a91
+	getinfo_super_imsm(st, &info, NULL);
dd3a91
 
dd3a91
 	/* we are done with current_vol reset it to point st at the container */
dd3a91
 	super->current_vol = -1;
dd3a91
@@ -5939,24 +6148,29 @@ static int write_init_super_imsm(struct supertype *st)
dd3a91
 	if (st->update_tail) {
dd3a91
 		/* queue the recently created array / added disk
dd3a91
 		 * as a metadata update */
dd3a91
-		int rv;
dd3a91
 
dd3a91
 		/* determine if we are creating a volume or adding a disk */
dd3a91
 		if (current_vol < 0) {
dd3a91
 			/* in the mgmt (add/remove) disk case we are running
dd3a91
 			 * in mdmon context, so don't close fd's
dd3a91
 			 */
dd3a91
-			return mgmt_disk(st);
dd3a91
-		} else
dd3a91
-			rv = create_array(st, current_vol);
dd3a91
-
dd3a91
-		return rv;
dd3a91
+			rv = mgmt_disk(st);
dd3a91
+		} else {
dd3a91
+			rv = write_init_ppl_imsm_all(st, &info);
dd3a91
+			if (!rv)
dd3a91
+				rv = create_array(st, current_vol);
dd3a91
+		}
dd3a91
 	} else {
dd3a91
 		struct dl *d;
dd3a91
 		for (d = super->disks; d; d = d->next)
dd3a91
 			Kill(d->devname, NULL, 0, -1, 1);
dd3a91
-		return write_super_imsm(st, 1);
dd3a91
+		if (current_vol >= 0)
dd3a91
+			rv = write_init_ppl_imsm_all(st, &info);
dd3a91
+		if (!rv)
dd3a91
+			rv = write_super_imsm(st, 1);
dd3a91
 	}
dd3a91
+
dd3a91
+	return rv;
dd3a91
 }
dd3a91
 #endif
dd3a91
 
dd3a91
@@ -7375,7 +7589,8 @@ static struct mdinfo *container_content_imsm(struct supertype *st, char *subarra
dd3a91
 			 *
dd3a91
 			 * FIXME handle dirty degraded
dd3a91
 			 */
dd3a91
-			if ((skip || recovery_start == 0) && !dev->vol.dirty)
dd3a91
+			if ((skip || recovery_start == 0) &&
dd3a91
+			    !(dev->vol.dirty & RAIDVOL_DIRTY))
dd3a91
 				this->resync_start = MaxSector;
dd3a91
 			if (skip)
dd3a91
 				continue;
dd3a91
@@ -7410,9 +7625,12 @@ static struct mdinfo *container_content_imsm(struct supertype *st, char *subarra
dd3a91
 				info_d->component_size =
dd3a91
 						num_data_stripes(map) *
dd3a91
 						map->blocks_per_strip;
dd3a91
+				info_d->ppl_sector = this->ppl_sector;
dd3a91
+				info_d->ppl_size = this->ppl_size;
dd3a91
 			} else {
dd3a91
 				info_d->component_size = blocks_per_member(map);
dd3a91
 			}
dd3a91
+			info_d->consistency_policy = this->consistency_policy;
dd3a91
 
dd3a91
 			info_d->bb.supported = 1;
dd3a91
 			get_volume_badblocks(super->bbm_log, ord_to_idx(ord),
dd3a91
@@ -7928,12 +8146,16 @@ mark_checkpoint:
dd3a91
 
dd3a91
 skip_mark_checkpoint:
dd3a91
 	/* mark dirty / clean */
dd3a91
-	if (dev->vol.dirty != !consistent) {
dd3a91
+	if (((dev->vol.dirty & RAIDVOL_DIRTY) && consistent) ||
dd3a91
+	    (!(dev->vol.dirty & RAIDVOL_DIRTY) && !consistent)) {
dd3a91
 		dprintf("imsm: mark '%s'\n", consistent ? "clean" : "dirty");
dd3a91
-		if (consistent)
dd3a91
-			dev->vol.dirty = 0;
dd3a91
-		else
dd3a91
-			dev->vol.dirty = 1;
dd3a91
+		if (consistent) {
dd3a91
+			dev->vol.dirty = RAIDVOL_CLEAN;
dd3a91
+		} else {
dd3a91
+			dev->vol.dirty = RAIDVOL_DIRTY;
dd3a91
+			if (dev->rwh_policy == RWH_DISTRIBUTED)
dd3a91
+				dev->vol.dirty |= RAIDVOL_DSRECORD_VALID;
dd3a91
+		}
dd3a91
 		super->updates_pending++;
dd3a91
 	}
dd3a91
 
dd3a91
@@ -8445,6 +8667,11 @@ static struct mdinfo *imsm_activate_spare(struct active_array *a,
dd3a91
 		di->component_size = a->info.component_size;
dd3a91
 		di->container_member = inst;
dd3a91
 		di->bb.supported = 1;
dd3a91
+		if (dev->rwh_policy == RWH_DISTRIBUTED) {
dd3a91
+			di->consistency_policy = CONSISTENCY_POLICY_PPL;
dd3a91
+			di->ppl_sector = get_ppl_sector(super, inst);
dd3a91
+			di->ppl_size = (PPL_HEADER_SIZE + PPL_ENTRY_SPACE) >> 9;
dd3a91
+		}
dd3a91
 		super->random = random32();
dd3a91
 		di->next = rv;
dd3a91
 		rv = di;
dd3a91
@@ -11600,6 +11827,9 @@ struct superswitch super_imsm = {
dd3a91
 	.container_content = container_content_imsm,
dd3a91
 	.validate_container = validate_container_imsm,
dd3a91
 
dd3a91
+	.write_init_ppl = write_init_ppl_imsm,
dd3a91
+	.validate_ppl	= validate_ppl_imsm,
dd3a91
+
dd3a91
 	.external	= 1,
dd3a91
 	.name = "imsm",
dd3a91
 
dd3a91
diff --git a/sysfs.c b/sysfs.c
dd3a91
index 53589a7..2a91ba0 100644
dd3a91
--- a/sysfs.c
dd3a91
+++ b/sysfs.c
dd3a91
@@ -689,6 +689,16 @@ int sysfs_set_array(struct mdinfo *info, int vers)
dd3a91
 		 * once the reshape completes.
dd3a91
 		 */
dd3a91
 	}
dd3a91
+
dd3a91
+	if (info->consistency_policy == CONSISTENCY_POLICY_PPL) {
dd3a91
+		if (sysfs_set_str(info, NULL, "consistency_policy",
dd3a91
+				  map_num(consistency_policies,
dd3a91
+					  info->consistency_policy))) {
dd3a91
+			pr_err("This kernel does not support PPL\n");
dd3a91
+			return 1;
dd3a91
+		}
dd3a91
+	}
dd3a91
+
dd3a91
 	return rv;
dd3a91
 }
dd3a91
 
dd3a91
@@ -720,6 +730,10 @@ int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int resume)
dd3a91
 	rv = sysfs_set_num(sra, sd, "offset", sd->data_offset);
dd3a91
 	rv |= sysfs_set_num(sra, sd, "size", (sd->component_size+1) / 2);
dd3a91
 	if (sra->array.level != LEVEL_CONTAINER) {
dd3a91
+		if (sd->consistency_policy == CONSISTENCY_POLICY_PPL) {
dd3a91
+			rv |= sysfs_set_num(sra, sd, "ppl_sector", sd->ppl_sector);
dd3a91
+			rv |= sysfs_set_num(sra, sd, "ppl_size", sd->ppl_size);
dd3a91
+		}
dd3a91
 		if (sd->recovery_start == MaxSector)
dd3a91
 			/* This can correctly fail if array isn't started,
dd3a91
 			 * yet, so just ignore status for now.