|
|
5d5466 |
#!/bin/bash
|
|
|
5d5466 |
#
|
|
|
5d5466 |
# This script reads its configuration from /etc/sysconfig/raid-check
|
|
|
5d5466 |
# Please use that file to enable/disable this script or to set the
|
|
|
5d5466 |
# type of check you wish performed.
|
|
|
5d5466 |
|
|
|
5d5466 |
# Without /proc/mdstat the running kernel has no raid support at all,
# so there is nothing for this script to do.
if [ ! -f /proc/mdstat ]; then
    exit 0
fi

# A missing configuration file means we were never set up properly;
# leave quietly in that case as well.
if [ ! -f /etc/sysconfig/raid-check ]; then
    exit 0
fi

# Pull the settings into the current shell.
. /etc/sysconfig/raid-check
|
|
|
5d5466 |
|
|
|
5d5466 |
#######################################
# Wait until no more than $1 of the md arrays listed in $2 are busy.
#
# An array counts as busy when its md/sync_action sysfs attribute holds
# anything other than "idle".  The list is polled again every 60 seconds
# until the number of busy arrays has dropped far enough.
#
# Arrays whose sync_action attribute cannot be read (for example because
# the array was stopped while we were waiting) are NOT counted as busy;
# counting them would keep this function waiting forever.
#
# Arguments:
#   $1 - highest number of busy arrays that is still acceptable
#   $2 - whitespace-separated list of md device names (e.g. "md0 md1")
#   $3 - optional: sysfs block directory to inspect (default /sys/block)
# Returns:
#   0 as soon as at most $1 arrays from the list are busy
#######################################
waitbusy() {
    local threshold=$(($1 + 1))
    local dev_list="$2"
    local sysfs_root="${3:-/sys/block}"
    local busy dev attr sync_action

    while true; do
        busy=0
        # $dev_list is intentionally unquoted: it is a space-separated list.
        for dev in $dev_list; do
            attr="$sysfs_root/$dev/md/sync_action"
            # An array that vanished must not be mistaken for a running sync.
            [ -r "$attr" ] || continue
            sync_action=$(cat "$attr" 2>/dev/null) || continue
            if [ "$sync_action" != "idle" ]; then
                busy=$((busy + 1))
            fi
        done
        if [ "$busy" -lt "$threshold" ]; then
            return 0
        fi
        sleep 60
    done
}
|
|
|
5d5466 |
|
|
|
5d5466 |
# Do nothing unless the configuration explicitly enables the script.
if [ "$ENABLED" != "yes" ]; then
    exit 0
fi

# CHECK is the default action for arrays without a per-device override.
# Only "check" and "repair" are accepted; anything else (including an
# unset value) ends the script here.
case "$CHECK" in
    check | repair)
        ;;
    *)
        exit 0
        ;;
esac
|
|
|
5d5466 |
|
|
|
5d5466 |
# Translate the NICE setting into argument strings for renice(1) and
# ionice(1).  Both strings stay empty for an unset or unrecognised NICE
# value; "high" only changes the CPU priority and leaves ionice empty.
renice=""
ionice=""
if [ "$NICE" = "high" ]; then
    renice="-n -5"
elif [ "$NICE" = "low" ]; then
    renice="-n 5"
    # per ionice(1): class 2 (best-effort) at its lowest priority level
    ionice="-c2 -n7"
elif [ "$NICE" = "idle" ]; then
    renice="-n 15"
    # per ionice(1): class 3 (idle)
    ionice="-c3"
fi
|
|
|
5d5466 |
|
|
|
5d5466 |
# Names of all arrays that /proc/mdstat lists as active: the first
# space-separated field of every line matching "md... : active".
active_list=$(grep "^md.*: active" /proc/mdstat | cut -f 1 -d ' ')
if [ -z "$active_list" ]; then
    exit 0
fi

# check[<dev>]  sync action that will be requested for that array
# dev_list      every array selected for processing
# check_list    the subset of dev_list whose action is "check"
declare -A check
dev_list=""
check_list=""

for dev in $active_list; do
    # Arrays named in SKIP_DEVS are never touched.
    if echo $SKIP_DEVS | grep -w $dev >/dev/null 2>&1; then
        continue
    fi

    # Ignore arrays that do not expose a sync_action attribute.
    [ -f /sys/block/$dev/md/sync_action ] || continue

    # Only idle, healthy arrays are selected here.  The action itself is
    # written in a later loop, so that starting a check on one array does
    # not switch a currently idle array to active before we have looked
    # at it (which happens when two or more arrays share a physical disk).
    array_state=$(cat /sys/block/$dev/md/array_state)
    case "$array_state" in
        clean | active)
            ;;
        *)
            continue
            ;;
    esac

    sync_action=$(cat /sys/block/$dev/md/sync_action)
    [ "$sync_action" = idle ] || continue

    # Per-device overrides: REPAIR_DEVS is consulted first and CHECK_DEVS
    # second, so an array named in both ends up with "check".  Arrays in
    # neither list get the global CHECK action.
    ck=""
    if echo $REPAIR_DEVS | grep -w $dev >/dev/null 2>&1; then
        ck="repair"
    fi
    if echo $CHECK_DEVS | grep -w $dev >/dev/null 2>&1; then
        ck="check"
    fi
    if [ -z "$ck" ]; then
        ck=$CHECK
    fi

    dev_list="$dev_list $dev"
    check[$dev]=$ck
    if [ "$ck" = "check" ]; then
        check_list="$check_list $dev"
    fi
done

# Nothing qualified: we are done.
if [ -z "$dev_list" ]; then
    exit 0
fi
|
|
|
5d5466 |
|
|
|
5d5466 |
for dev in $dev_list; do
    # Only run $MAXCONCURRENT checks at a time: before starting another
    # one, wait until fewer than MAXCONCURRENT selected arrays are busy.
    [ -z "$MAXCONCURRENT" ] || waitbusy $((MAXCONCURRENT - 1)) "$dev_list"

    # Request the action chosen for this array.
    echo "${check[$dev]}" > /sys/block/$dev/md/sync_action

    # Look for the PID (column 2 of `ps -ef`) of the process whose command
    # column is exactly "[<dev>_resync]".  Poll at most 10 times with
    # 6 seconds between attempts and stop as soon as a PID is found.
    resync_pid=""
    wait=10
    while [ $wait -gt 0 ] && [ -z "$resync_pid" ]; do
        sleep 6
        wait=$((wait - 1))
        resync_pid=$(ps -ef | awk -v mddev=$dev 'BEGIN { pattern = "^\\[" mddev "_resync]$" } $8 ~ pattern { print $2 }')
    done

    # Apply the configured CPU and I/O priorities to that process, if any.
    # $renice / $ionice are left unquoted on purpose: each holds several
    # command-line arguments.
    if [ -n "$resync_pid" ]; then
        if [ -n "$renice" ]; then
            renice $renice -p $resync_pid >/dev/null 2>&1
        fi
        if [ -n "$ionice" ]; then
            ionice $ionice -p $resync_pid >/dev/null 2>&1
        fi
    fi
done
|
|
|
5d5466 |
# Only arrays that were given a "check" are reported on.
if [ -z "$check_list" ]; then
    exit 0
fi

# Block until every one of those arrays is idle again.
waitbusy 0 "$check_list"

for dev in $check_list; do
    mismatch_cnt=$(cat /sys/block/$dev/md/mismatch_cnt)

    # raid1/10 writes in the kernel are unbuffered, so a perfectly healthy
    # raid1 array can show a non-0 mismatch count.  Such counts only exist
    # in transient data areas, where they are harmless.  Since a count
    # caused by transient data cannot be told apart from one that signals
    # a real problem, the mismatch_cnt of raid1/raid10 devices is simply
    # not evaluated: it produces far too many false positives.  Those
    # devices still stay in the check list, because performing the check
    # catches and corrects any bad sectors there might be in the device.
    raid_lvl=$(cat /sys/block/$dev/md/level)
    case "$raid_lvl" in
        raid1 | raid10)
            continue
            ;;
    esac

    if [ "$mismatch_cnt" -ne 0 ]; then
        echo "WARNING: mismatch_cnt is not 0 on /dev/$dev"
    fi
done
|
|
|
5d5466 |
|