#!/bin/bash
# randomly soft offline pages
# random_offline options
# -t seconds   runtime in seconds (default unlimited)
# -m max-pages maximum pages to tie up before unpoisoning
# -s seed      random seed
# Note: running this for too long may still run out of memory
# because unpoison cannot completely undo what soft offline
# does to larger free memory areas (TBD in the kernel)
# Author: Andi Kleen

# fixme: uses time seed, non reproducible

#mount -t debugfs none /debug

# Locate the tree root three directory levels above the current working
# directory (so the script has to be started from its own directory) and
# source lib/mce.sh from it.  The path is quoted so that a checkout
# located under a directory with whitespace in its name still works.
ROOT=$(cd ../../../; pwd)
. "$ROOT/lib/mce.sh"

# Defaults.  -m, -s and -t override THRESH, SEED and RUNTIME respectively;
# DEBUG is filled in later with the debugfs mount point.
THRESH=1000
SEED=
RUNTIME=
DEBUG=

# Report a fatal error and terminate the script.
# Arguments: the words of the error message.
# Outputs:   "ERROR: <message>" on stderr.
# Exits with status 1 so that a caller can tell the run did not succeed.
fail() {
	echo "ERROR: $*" >&2
	exit 1
}

# Print the option summary on stdout, then abort through fail().
# Arguments: $1 - the option character reported by getopts.
usage() {
	# printf instead of "echo --": bash's builtin echo does not treat "--"
	# as end-of-options and would print it in front of every option line.
	printf '%s\n' \
		"Usage:" \
		"random_offline options" \
		"-t seconds   runtime in seconds (default unlimited)" \
		"-m max-pages maximum pages to tie up before unpoisoning" \
		"-s seed      random seed"
	fail "Invalid option $1"
}

# Parse the command line: -t sets RUNTIME, -m sets THRESH, -s sets SEED.
# Anything else prints the usage text and aborts.
while getopts "t:m:s:" option ; do
	case "$option" in
	t) RUNTIME=$OPTARG ;;
	m) THRESH=$OPTARG ;;
	s) SEED=$OPTARG ;;
	# Quoted: getopts reports errors as "?", which would otherwise be
	# glob-expanded against single-character file names in the cwd.
	*) usage "$option" ;;
	esac
done

# Preconditions: must be root, debugfs must be available, and the kernel
# must expose both the soft-offline and the unpoison interfaces.
[ "$(whoami)" != root ] && fail "Not root"
check_debugfs
# Third field of the first mount(8) output line that mentions debugfs.
DEBUG=$(mount | grep debugfs | cut -d ' ' -f3 | head -n 1)
# If hwpoison_inject is available as a module, make sure it is loaded
# (the debugfs hwpoison directory is missing while it is not).
if modinfo hwpoison_inject > /dev/null 2>&1 ; then
	if [ ! -d "$DEBUG/hwpoison/" ] ; then
		modprobe hwpoison_inject || fail "module hwpoison_inject isn't supported ?"
	fi
fi
[ ! -w /sys/devices/system/memory/soft_offline_page ] && fail "No soft offlining support in kernel"
[ ! -w "$DEBUG/hwpoison/unpoison-pfn" ] && fail "no unpoison support in kernel"

# Print the highest page frame number covered by a "System RAM" range.
# Arguments: $1 - firmware memmap directory to scan
#                 (optional, default /sys/firmware/memmap)
# Outputs:   the largest <end address>/4096 in decimal on stdout;
#            nothing if no usable "System RAM" entry is found.
end_of_memory() {
	local memmap=${1:-/sys/firmware/memmap}
	local entry kind end pfn max=

	for entry in "$memmap"/* ; do
		# Also skips the literal pattern left behind by an unmatched glob.
		[ -r "$entry/type" ] && [ -r "$entry/end" ] || continue

		kind=$(< "$entry/type")
		[ "$kind" = "System RAM" ] || continue

		end=$(< "$entry/end")
		end=${end#0x}
		# Ignore anything that is not a plain hex number.
		case "$end" in
		''|*[!0-9a-fA-F]*) continue ;;
		esac

		# Byte address -> page frame number (4096-byte pages).
		pfn=$(( 16#$end / 4096 ))
		if [ -z "$max" ] || [ "$pfn" -gt "$max" ] ; then
			max=$pfn
		fi
	done

	if [ -n "$max" ] ; then
		echo "$max"
	fi
}

# Upper bound for the page frame numbers picked by the main loop.
E=$(end_of_memory)
# The main loop reduces its random numbers modulo E; an empty, zero or
# non-numeric value would make every iteration fail, so stop right here.
case "$E" in
''|0|*[!0-9]*) fail "cannot determine end of memory" ;;
esac

echo "soft offlining pages upto $E"

# Unpoison every page listed in ./offlined.
# Globals:   DEBUG (read) - debugfs mount point
#            utotal, usuccess (updated) - attempt / success counters
# Files:     reads ./offlined (one hex byte address per line);
#            appends "<address> <status>" to ./unpoison-failed for every
#            page whose write to unpoison-pfn fails.
# Does nothing when ./offlined does not exist.
unpoison() {
	local addr pfn rc

	if [ ! -f offlined ] ; then
		return
	fi

	echo unpoisioning
	while read -r addr ; do
		(( utotal++ ))
		# Drop the trailing three hex zeros: byte address -> PFN.
		pfn=${addr%000}
		if echo "$pfn" > "$DEBUG/hwpoison/unpoison-pfn" ; then
			(( usuccess++ ))
		else
			# Save the status at once; any later command overwrites $?.
			rc=$?
			echo "$addr $rc" >> unpoison-failed
			echo "unpoisioning $addr failed: $rc"
		fi
	done < offlined
	echo done
	echo
}

# Run unpoison when the script exits.
trap unpoison 0

# Seed bash's random generator; without -s the current time is the seed.
: "${SEED:=$(date +%s)}"
RANDOM=$SEED
echo "Using random seed $SEED"

start=$(date +%s)

# Counters: offlining attempts (total/success/failed) and unpoison
# attempts (utotal/usuccess/ufailed).
total=0 success=0 failed=0
utotal=0 usuccess=0 ufailed=0

# HardwareCorrupted line before the run, shown again in the final report.
cbefore=$(grep HardwareCorrupted /proc/meminfo)

# k counts offlining attempts since the last unpoison pass.
k=0
rm -f offlined unpoison-failed
# Main loop: soft-offline one random page per iteration, unpoison the
# accumulated pages after every THRESH+1 attempts, and stop once RUNTIME
# seconds have passed (runs until interrupted when -t was not given).
while true ; do
	# Draw in this shell, not inside $( ): a command substitution is a
	# subshell and never advances this shell's generator, so the -s seed
	# could not make runs reproducible.  Two 15-bit draws are combined
	# because a single $RANDOM (0..32767) cannot reach pages beyond the
	# first 32768 whatever the value of E.
	R=$(( (RANDOM << 15) | RANDOM ))
	# Byte address of the chosen page, formatted as 0x<UPPERCASE HEX>.
	printf -v T '0x%X' $(( (R % E) * 4096 ))
	#echo "p $T"
	(( total++ ))
	if echo "$T" 2>/dev/null >/sys/devices/system/memory/soft_offline_page ; then
		echo "$T" >> offlined
		(( success++ ))
	else
		#echo offlining $T failed $?
		(( failed++ ))
	fi
	#echo -n .

	(( k++ ))
	if [ "$k" -gt "$THRESH" ] ; then
		unpoison
		(( k = 0 ))
		# -f: the list does not exist if no page was offlined in this round.
		rm -f offlined
	fi

	if [ -n "$RUNTIME" ] ; then
		(( DIFF = $(date +%s) - start ))
		if [ "$DIFF" -gt "$RUNTIME" ] ; then
			echo time over
			# NOTE(review): this clears the exit trap, so pages still
			# listed in ./offlined are not unpoisoned on a timed exit.
			# Confirm that this is intended.
			trap 0
			break
		fi
	fi
done

# Final report.  The number of failed unpoison attempts is the line count
# of unpoison-failed, if that file exists.
if [ -f unpoison-failed ] ; then
	# wc prints "<count> <name>"; keep the first field only.
	read -r ufailed _ < <(wc -l unpoison-failed)
fi
printf '%s\n' \
	"soft-poison: success $success failed $failed of total $total" \
	"unpoison-failed: success $usuccess failed $ufailed of total $utotal" \
	"poisoned before: $cbefore"
printf '%s' "poisoned after: "
grep HardwareCorrupted /proc/meminfo

### TODO: define automatic success/failure criteria (the script currently only prints statistics)

