dcavalca / rpms / mdadm

Forked from rpms/mdadm 3 years ago
Clone

Blame SOURCES/raid-check

1f6b6a
#!/bin/bash
#
# This script reads its configuration from /etc/sysconfig/raid-check.
# Please use that file to enable/disable this script or to set the
# type of check you wish performed.

# We might be on a kernel with no raid support at all; exit if so.
if [ ! -f /proc/mdstat ]; then
    exit 0
fi

# And exit if we haven't been set up properly.
if [ ! -f /etc/sysconfig/raid-check ]; then
    exit 0
fi
. /etc/sysconfig/raid-check
1f6b6a
1f6b6a
# Wait until no more than arg1 arrays in the arg2 list are busy.
#   $1 - maximum number of arrays allowed to still be non-idle
#   $2 - whitespace-separated list of md device names (e.g. "md0 md1")
# Polls each device's sync_action once a minute until enough of them
# report "idle".
waitbusy() {
    local threshold=$(($1 + 1))
    local dev_list="$2"
    local dev sync_action busy

    while true; do
	busy=0
	# $dev_list is intentionally unquoted so it word-splits into
	# individual device names.
	for dev in $dev_list; do
	    # An array stopped while we waited no longer has a sysfs
	    # entry; treat it as idle instead of counting it busy
	    # forever (the original would spin on the empty string).
	    [ -f "/sys/block/$dev/md/sync_action" ] || continue
	    sync_action=$(cat "/sys/block/$dev/md/sync_action")
	    if [ "$sync_action" != "idle" ]; then
		busy=$((busy + 1))
	    fi
	done
	[ "$busy" -lt "$threshold" ] && break
	sleep 60
    done
}
1f6b6a
1f6b6a
# Nothing to do unless the admin explicitly enabled the check.
if [ "$ENABLED" != "yes" ]; then
    exit 0
fi

# Only "check" and "repair" are valid operations; bail out quietly on
# anything else (including an unset $CHECK).
if [ "$CHECK" != "check" ] && [ "$CHECK" != "repair" ]; then
    exit 0
fi

# Translate the configured $NICE level into renice/ionice argument
# strings; both stay empty for "high"-less/unknown settings except
# where noted below.
ionice=""
renice=""
if [ "$NICE" = "high" ]; then
    renice="-n -5"
elif [ "$NICE" = "low" ]; then
    renice="-n 5"
    ionice="-c2 -n7"
elif [ "$NICE" = "idle" ]; then
    renice="-n 15"
    ionice="-c3"
fi
1f6b6a
1f6b6a
# Collect the names of all active md arrays from /proc/mdstat.
active_list=$(grep "^md.*: active" /proc/mdstat | cut -f 1 -d ' ')
[ -z "$active_list" ] && exit 0

# check[dev]  - the action ("check" or "repair") to request per device
# dev_list    - every device we will act on
# check_list  - the subset being checked (not repaired); only these
#               have their mismatch_cnt examined afterwards
declare -A check
dev_list=""
check_list=""
for dev in $active_list; do
    # Honor the admin's skip list from /etc/sysconfig/raid-check.
    echo "$SKIP_DEVS" | grep -qw "$dev" && continue
    if [ -f "/sys/block/$dev/md/sync_action" ]; then
	# Only perform the checks on idle, healthy arrays, but delay
	# actually writing the check field until the next loop so we
	# don't switch currently idle arrays to active, which happens
	# when two or more arrays are on the same physical disk
	array_state=$(cat "/sys/block/$dev/md/array_state")
	if [ "$array_state" != "clean" ] && [ "$array_state" != "active" ]; then
	    continue
	fi
	sync_action=$(cat "/sys/block/$dev/md/sync_action")
	if [ "$sync_action" != "idle" ]; then
	    continue
	fi
	# Per-device REPAIR_DEVS/CHECK_DEVS overrides win over the
	# global $CHECK setting; CHECK_DEVS takes precedence of the two.
	ck=""
	echo "$REPAIR_DEVS" | grep -qw "$dev" && ck="repair"
	echo "$CHECK_DEVS" | grep -qw "$dev" && ck="check"
	[ -z "$ck" ] && ck=$CHECK
	dev_list="$dev_list $dev"
	check[$dev]=$ck
	[ "$ck" = "check" ] && check_list="$check_list $dev"
    fi
done
[ -z "$dev_list" ] && exit 0
1f6b6a
1f6b6a
# Kick off the requested action on each array, throttled to at most
# $MAXCONCURRENT simultaneous scrubs when that is configured.
for dev in $dev_list; do
    # Only run $MAXCONCURRENT checks at a time
    if [ -n "$MAXCONCURRENT" ]; then
	waitbusy $((MAXCONCURRENT - 1)) "$dev_list"
    fi
    echo "${check[$dev]}" > "/sys/block/$dev/md/sync_action"

    # The kernel spawns a [mdN_resync] thread for the scrub; poll for
    # up to a minute (10 x 6s) for it to appear so we can adjust its
    # priority.
    resync_pid=""
    wait=10
    while [ "$wait" -gt 0 ] && [ -z "$resync_pid" ]; do
	sleep 6
	wait=$((wait - 1))
	resync_pid=$(ps -ef | awk -v mddev="$dev" 'BEGIN { pattern = "^\\[" mddev "_resync]$" } $8 ~ pattern { print $2 }')
    done
    # Apply the configured CPU and I/O priorities to the resync thread.
    # $renice/$ionice are intentionally unquoted: each holds multiple
    # option words (e.g. "-n 5").
    if [ -n "$resync_pid" ]; then
	[ -n "$renice" ] && renice $renice -p "$resync_pid" >&/dev/null
	[ -n "$ionice" ] && ionice $ionice -p "$resync_pid" >&/dev/null
    fi
done
1f6b6a
# Repairs are fire-and-forget; only "check" results need examining.
[ -z "$check_list" ] && exit 0

# Wait for every check to finish before reading the results.
waitbusy 0 "$check_list"

for dev in $check_list; do
	# Due to the fact that raid1/10 writes in the kernel are unbuffered,
	# a raid1 array can have non-0 mismatch counts even when the
	# array is healthy.  These non-0 counts will only exist in
	# transient data areas where they don't pose a problem.  However,
	# since we can't tell the difference between a non-0 count that
	# is just in transient data or a non-0 count that signifies a
	# real problem, simply don't check the mismatch_cnt on raid1
	# devices as it's providing far too many false positives.  But by
	# leaving the raid1 device in the check list and performing the
	# check, we still catch and correct any bad sectors there might
	# be in the device.
	raid_lvl=$(cat "/sys/block/$dev/md/level")
	if [ "$raid_lvl" = "raid1" ] || [ "$raid_lvl" = "raid10" ]; then
	    continue
	fi
	# Only read mismatch_cnt after the level check — no point in a
	# sysfs read for devices we are going to skip anyway.
	mismatch_cnt=$(cat "/sys/block/$dev/md/mismatch_cnt")
	if [ "$mismatch_cnt" -ne 0 ]; then
		echo "WARNING: mismatch_cnt is not 0 on /dev/$dev"
	fi
done
1f6b6a