#!/bin/sh
#
# report issues in the collection of PCP archives managed by
# pmlogger_check and pmlogger_daily
#

. /etc/pcp.conf

# All temporary files live in a private directory created by mktemp, so
# their names cannot be predicted (or pre-created as symlinks) by other
# users of /var/tmp.  $tmp remains a filename prefix: temporary files
# are always named $tmp.<tag>
#
tmpdir=`mktemp -d /var/tmp/check-archives.XXXXXX`
if [ -z "$tmpdir" -o ! -d "$tmpdir" ]
then
    # 4 is the "fatal botch in this script or environment" status
    echo "`basename $0`: Error: cannot create temporary directory below /var/tmp"
    exit 4
fi
tmp=$tmpdir/tmp

# the exit status is kept in a file, starting out as 0 (no error)
echo 0 >$tmp.sts
trap "_exit; exit \$sts" 0 1 2 3 15

# Cleanup for the trap above: load the saved exit status from $tmp.sts
# into $sts (the trap exits with it), then remove the temporary
# directory and everything in it
#
_exit()
{
    sts=`cat $tmp.sts`
    rm -rf "$tmpdir"
}

# need to salt away the biggest status seen so far ... it is kept in
# the file $tmp.sts and is only ever raised, never lowered
# $1 values
# 0  no error
# 1  warning
# 2  error in archives' status
# 4  fatal botch in this script or environment
#
# If $tmp.sts is missing, empty or does not hold a number the saved
# status is taken to be 0, so the new status is still recorded.
# Always returns 0.
#
_set_sts()
{
    _sts=`cat $tmp.sts 2>/dev/null`
    case "$_sts"
    in
	''|*[!0-9]*)
		# nothing usable saved so far
		_sts=0
		;;
    esac
    if [ "$_sts" -lt "$1" ]
    then
	echo "$1" >$tmp.sts
    fi
    return 0
}

# Filter: copy the filename(s) on stdin to stdout with a trailing
# compression suffix removed.  The suffixes are tried in the fixed
# order .xz .lzma .bz2 .bz .gz .Z .z and each substitution works on
# the result of the one before it.
#
_strip_compress_suffix()
{
    sed -e '
s/\.xz$//
s/\.lzma$//
s/\.bz2$//
s/\.bz$//
s/\.gz$//
s/\.Z$//
s/\.z$//
'
}

# name of this script, used as the prefix for fatal error messages
progname=`basename $0`

# The pmlogger directory below $PCP_LOG_DIR has to exist and we have to
# be able to chdir into it; failing either one is fatal (status 4).
# The checks that follow use pathnames relative to that directory.
#
[ -d $PCP_LOG_DIR/pmlogger ] || {
    echo "$progname: Error: $PCP_LOG_DIR/pmlogger does not exist"
    _set_sts 4
    exit
}

cd $PCP_LOG_DIR/pmlogger || {
    echo "$progname: Error: cannot chdir to $PCP_LOG_DIR/pmlogger"
    _set_sts 4
    exit
}

# Collect the regular files named pmlogger_* in the current directory
# (no descent into subdirectories) that were modified within the last
# day; the list is left in $tmp.tmp
#
find pmlogger_* -maxdepth 0 -type f -mtime -1 >$tmp.tmp

# Each expected timestamp file has to be in that list, otherwise warn
# and show whatever files of that name do exist
#
for stamp in pmlogger_daily.stamp # TODO pmlogger_daily_report.stamp
do
    grep "^$stamp\$" <$tmp.tmp >/dev/null && continue
    echo
    echo "Warning: no $stamp file in the last 24 hours"
    ls -l $stamp* 2>&1 | sed -e 's/^/    /'
    _set_sts 1
done

# remove "expected" lines from logs and report if anything remains
#
# $tmp.tmp holds the names of the recently modified pmlogger_* files.
# For each one that is not a timestamp file, the lines known to be
# benign are filtered out and the remainder is saved in $tmp.left; if
# anything remains the whole log is listed and the warning status is
# raised.
#
# NB: the filtered copy must not be written to $tmp.tmp, because the
# sed feeding this loop may still be reading the list from that file
#
sed <$tmp.tmp \
    -e '/stamp$/d' \
    -e '/stamp.prev$/d' \
| while read log
do
    case $log
    in
	pmlogger_check.log*)
		sed <$log >$tmp.left \
		    -e '/^pmlogger_check: \[.*[0-9]]$/d' \
		    -e '/^Restarting primary pmlogger for host .* done$/d' \
		    -e '/^Latest folio created for .*[0-9]$/d' \
		    -e '/^Duplicate archive basename \.\.\. rename .*\*$/d' \
		    -e '/^\.\.\. logging for host ".*" unchanged$/d' \
		# end
		;;

	pmlogger_daily-K.log*)
		sed <$log >$tmp.left \
		    -e '/^pmlogger_daily: \[.*[0-9]]$/d' \
		    -e '/Error: no pmlogger instance running for host/d' \
		    -e '/^\.\.\. logging for host ".*" unchanged$/d' \
		# end
		;;

	pmlogger_daily_report.log*)
		sed <$log >$tmp.left \
		    -e '/PM_ERR_NAME Unknown metric name/d' \
		# end
		;;

	*)
		# no filter for this log, so every line is unexpected
		cat $log >$tmp.left
		;;
    esac
    if [ -s $tmp.left ]
    then
	echo
	echo "Warning: Unexpected lines (!) in $log"
	ls -l $log 2>&1 | sed -e 's/^/    /'
	# diff -e emits ed commands (Nd or N,Md) for the lines that
	# were filtered out as expected; turn each one into a sed
	# command that tags those same lines with a leading "s"
	#
	diff -e $log $tmp.left \
	| sed -n >$tmp.sed \
	    -e '/^[0-9][0-9]*d/s/d.*/s\/^\/s\//p' \
	    -e '/^[0-9][0-9]*,[0-9][0-9]*d/s/d.*/s\/^\/s\//p' \
	# end
	if [ -s $tmp.sed ]
	then
	    # some expected lines
	    :
	else
	    # all unexpected lines ... every line gets the "s" tag, so
	    # the log is listed below without any "!" flags
	    # NOTE(review): confirm that dropping the flags in this case
	    # is intended
	    echo "s/^/s/" >$tmp.sed
	fi
	# list the log: untagged (unexpected) lines end up prefixed
	# with "! ", tagged (expected) lines with two spaces, and then
	# everything is indented by four more spaces
	#
	sed -e 's/^/  /' <$log \
	| sed -f $tmp.sed \
	| sed \
	    -e '/^ /s/ /!/' \
	    -e '/^s/s/s / /' \
	    -e 's/^/    /' \
	# end
	# only a log with unexpected lines earns the warning status
	_set_sts 1
    fi
done

# Record that the archive currently being checked has a problem: the
# first call for an archive prints a blank separator line and creates
# $tmp.err, later calls (while $tmp.err exists) do nothing
#
_flag_err()
{
    if [ ! -f $tmp.err ]
    then
	echo
	touch $tmp.err
    fi
}

# traverse each directory below here ... assume it contains PCP
# archives of interest
#
# For each directory (other than SaveLogs*) report
# - archives with a missing index, metadata or data volume file
# - archives for which pmlogcheck -w produces output
# - for the last 31 days, once the first archive has been seen: days
#   with no archive, with more than one archive, or with an archive
#   whose base name is not in the merged YYYYMMDD form
#
find * -maxdepth 0 -type d \
| sed \
    -e '/^SaveLogs/d' \
| while read dir
do
    if cd $dir
    then
	:
    else
	echo "$progname: Error: cannot chdir to $PCP_LOG_DIR/pmlogger/$dir"
	_set_sts 4
	exit
    fi

    echo
    echo "=== $dir ==="

    # all the files of interest begin YYYY or YY (old school) ... when
    # nothing matches, echo hands the pattern back unchanged
    #
    if [ "`echo [0-9]*`" = '[0-9]*' ]
    then
	echo "Warning: no PCP archives here?"
	_set_sts 1
	cd ..
	continue
    fi

    # reduce the file names to the sorted set of unique archive base
    # names: drop any compression suffix, then the trailing volume
    # number, .index or .meta
    #
    ls [0-9]* \
    | _strip_compress_suffix \
    | sed \
	-e 's/\.[0-9][0-9]*$//' \
	-e 's/\.index$//' \
	-e 's/\.meta$//' \
    | sort \
    | uniq >$tmp.list

    # for each archive base name in $tmp.list expect ...
    #
    for base in `cat $tmp.list`
    do
	# ... an index file
	# NOTE(review): this was once described as "warn if missing",
	# but it is reported as an Error and raises status 2 like the
	# other missing files; confirm which is wanted
	#
	rm -f $tmp.err
	if [ "`echo $base.index*`" = "$base.index*" ]
	then
	    _flag_err
	    echo "Error: $base.index is missing"
	fi

	# ... and a metadata file (error if missing)
	#
	if [ "`echo $base.meta*`" = "$base.meta*" ]
	then
	    _flag_err
	    echo "Error: $base.meta is missing"
	fi

	# ... and data volume 0 file (error if missing)
	#
	if [ -f $base.0 -o -f "`echo $base.0.*`" ]
	then
	    # we expect all the volumes from 0, 1, 2, ... N
	    # where N is the maximum volume seen for this base name
	    #
	    # N is worked out once from all of the candidate files, and
	    # only names of the form <base>.<number> (after removing any
	    # compression suffix) are considered, so the files of a
	    # longer base name that begins with $base. are not mistaken
	    # for data volumes
	    #
	    maxvol=`ls -d $base.[0-9]* 2>/dev/null \
		    | _strip_compress_suffix \
		    | sed -n -e 's/^'"$base"'\.\([0-9][0-9]*\)$/\1/p' \
		    | sort -rn \
		    | head -1`
	    if [ -z "$maxvol" ]
	    then
		echo "Botch: could not get maxvol for base=$base"
		_set_sts 4
		exit
	    fi

	    # already tested for volume 0 above, and volume $maxvol was
	    # found in the listing, so just the ones in between (if any)
	    #
	    vol=1
	    while [ $vol -lt $maxvol ]
	    do
		if [ -f $base.$vol -o -f "`echo $base.$vol.*`" ]
		then
		    :
		else
		    _flag_err
		    echo "Error: $base.$vol: data volume is missing"
		fi
		vol=`expr $vol + 1`
	    done
	else
	    _flag_err
	    echo "Error: $base.0 is missing"
	fi

	if [ -f $tmp.err ]
	then
	    # something is missing, show what is there
	    ls -l $base* 2>&1 | sed -e 's/^/    /'
	    _set_sts 2
	else
	    # all the bits-n-pieces appear to be in place, what does
	    # pmlogcheck make of it?
	    #
	    if pmlogcheck -w $base >$tmp.tmp
	    then
		:
	    else
		echo "Botch: pmlogcheck -w $base failed!"
		_set_sts 4
		exit
	    fi
	    # any output at all from pmlogcheck is treated as a failure
	    if [ -s $tmp.tmp ]
	    then
		echo "Error: $base: pmlogcheck failure"
		sed -e 's/^/    /' <$tmp.tmp
		_set_sts 2
	    fi
	fi

    done

    # Now, go back 31 days ... find the first archive and then expect
    # to see an archive for each date between then and yesterday.
    # If the archive basename is not YYYYMMDD, then a merge has been
    # missed, so report this also.
    #
    # $tmp.found is created when the first date with an archive is
    # seen; missing dates are only reported after that
    #
    rm -f $tmp.found
    back=31
    while [ "$back" -gt 0 ]
    do
	date=`pmdate -${back}d '%Y%m%d'`
	if [ -z "$date" ]
	then
	    echo "Botch: pmdate -${back}d failed!"
	    _set_sts 4
	    exit
	fi
	grep "$date" $tmp.list >$tmp.tmp
	if [ -s $tmp.tmp ]
	then
	    [ -f $tmp.found ] || touch $tmp.found
	    # expect exactly one archive
	    #
	    narch="`wc -l <$tmp.tmp | sed -e 's/[ 	]//g'`"
	    if [ "$narch" -ne 1 ]
	    then
		echo "Error: multiple archives for $date, merge not done?"
		ls -l $date* 2>&1 | sed -e 's/^/    /'
		_set_sts 2
	    else
		# and expect it to match post-daily base name format
		# YYYYMMDD, not YYYYMMDD.HH.MM or YYYYMMDD.HH.MM-seq#
		#
		if grep "^$date\$" $tmp.list >/dev/null
		then
		    :
		else
		    echo "Error: pmlogger_daily not run for $date?"
		    ls -l $date* 2>&1 | sed -e 's/^/    /'
		    # reported as an Error, so raise the status the
		    # same way the other archive Errors do
		    _set_sts 2
		fi
	    fi
	else
	    if [ -f $tmp.found ]
	    then
		echo "Error: no archive(s) for $date"
		_set_sts 2
	    fi
	fi
	back=`expr $back - 1`
    done

    cd ..
done
