Blob Blame History Raw
From 90b595650d7d8a6f6a69a9f7060c6406aa731c18 Mon Sep 17 00:00:00 2001
From: "Fabio M. Di Nitto" <fdinitto@redhat.com>
Date: Wed, 28 Jul 2021 10:08:10 +0200
Subject: [PATCH] Add storage-mon pacemaker health check

Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>
---
 .gitignore               |  41 ++++++
 configure.ac             |   1 +
 doc/man/Makefile.am      |   3 +-
 heartbeat/Makefile.am    |  17 +--
 heartbeat/storage-mon.in | 263 +++++++++++++++++++++++++++++++++++++++
 tools/Makefile.am        |   5 +-
 tools/storage_mon.c      | 263 +++++++++++++++++++++++++++++++++++++++
 7 files changed, 583 insertions(+), 10 deletions(-)
 create mode 100644 heartbeat/storage-mon.in
 create mode 100644 tools/storage_mon.c

diff --git a/.gitignore b/.gitignore
index 38d3566205..f7277bf04e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,6 +45,46 @@ heartbeat/ocf-shellfuncs
 heartbeat/send_ua
 heartbeat/shellfuncs
 heartbeat/*.pyc
+heartbeat/AoEtarget
+heartbeat/CTDB
+heartbeat/ManageRAID
+heartbeat/ManageVE
+heartbeat/Squid
+heartbeat/SysInfo
+heartbeat/aws-vpc-route53
+heartbeat/azure-events
+heartbeat/clvm
+heartbeat/conntrackd
+heartbeat/dnsupdate
+heartbeat/dummypy
+heartbeat/eDir88
+heartbeat/fio
+heartbeat/galera
+heartbeat/gcp-pd-move
+heartbeat/gcp-vpc-move-ip
+heartbeat/gcp-vpc-move-route
+heartbeat/gcp-vpc-move-vip
+heartbeat/iSCSILogicalUnit
+heartbeat/iSCSITarget
+heartbeat/jira
+heartbeat/kamailio
+heartbeat/lxc
+heartbeat/lxd-info
+heartbeat/machine-info
+heartbeat/mariadb
+heartbeat/mpathpersist
+heartbeat/nfsnotify
+heartbeat/openstack-info
+heartbeat/rabbitmq-cluster
+heartbeat/redis
+heartbeat/rsyslog
+heartbeat/sg_persist
+heartbeat/slapd
+heartbeat/smb-share
+heartbeat/storage-mon
+heartbeat/sybaseASE
+heartbeat/syslog-ng
+heartbeat/vsftpd
 include/agent_config.h
 include/config.h
 include/config.h.in
@@ -61,6 +101,7 @@ systemd/resource-agents.conf
 tools/findif
 tools/ocf-tester
 tools/send_arp
+tools/storage_mon
 tools/tickle_tcp
 tools/ocft/README
 tools/ocft/README.zh_CN
diff --git a/configure.ac b/configure.ac
index 717fb95432..c125df98f6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1002,6 +1002,7 @@ AC_CONFIG_FILES([heartbeat/rsyslog], [chmod +x heartbeat/rsyslog])
 AC_CONFIG_FILES([heartbeat/smb-share], [chmod +x heartbeat/smb-share])
 AC_CONFIG_FILES([heartbeat/sg_persist], [chmod +x heartbeat/sg_persist])
 AC_CONFIG_FILES([heartbeat/slapd], [chmod +x heartbeat/slapd])
+AC_CONFIG_FILES([heartbeat/storage-mon], [chmod +x heartbeat/storage-mon])
 AC_CONFIG_FILES([heartbeat/sybaseASE], [chmod +x heartbeat/sybaseASE])
 AC_CONFIG_FILES([heartbeat/syslog-ng], [chmod +x heartbeat/syslog-ng])
 AC_CONFIG_FILES([heartbeat/vsftpd], [chmod +x heartbeat/vsftpd])
diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am
index 947d83cb2b..97904ccb16 100644
--- a/doc/man/Makefile.am
+++ b/doc/man/Makefile.am
@@ -138,6 +138,7 @@ man_MANS                = ocf_heartbeat_AoEtarget.7 \
                           ocf_heartbeat_mariadb.7 \
                           ocf_heartbeat_mdraid.7 \
                           ocf_heartbeat_minio.7 \
+                          ocf_heartbeat_mpathpersist.7 \
                           ocf_heartbeat_mysql.7 \
                           ocf_heartbeat_mysql-proxy.7 \
                           ocf_heartbeat_nagios.7 \
@@ -175,7 +176,7 @@ man_MANS                = ocf_heartbeat_AoEtarget.7 \
                           ocf_heartbeat_smb-share.7 \
                           ocf_heartbeat_sybaseASE.7 \
                           ocf_heartbeat_sg_persist.7 \
-                          ocf_heartbeat_mpathpersist.7 \
+                          ocf_heartbeat_storage-mon.7 \
                           ocf_heartbeat_symlink.7 \
                           ocf_heartbeat_syslog-ng.7 \
                           ocf_heartbeat_tomcat.7 \
diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am
index 9af44cc127..5d52d211f2 100644
--- a/heartbeat/Makefile.am
+++ b/heartbeat/Makefile.am
@@ -32,22 +32,22 @@ ocfdir		        = $(OCF_RA_DIR_PREFIX)/heartbeat
 dtddir			= $(datadir)/$(PACKAGE_NAME)
 dtd_DATA		= ra-api-1.dtd metadata.rng
 
+ocf_PROGRAMS		=
+
 if USE_IPV6ADDR_AGENT
-ocf_PROGRAMS           = IPv6addr
-else
-ocf_PROGRAMS           =
+ocf_PROGRAMS		+= IPv6addr
 endif
 
+halib_PROGRAMS		=
+
 if IPV6ADDR_COMPATIBLE
-halib_PROGRAMS         = send_ua
-else
-halib_PROGRAMS         =
+halib_PROGRAMS		+= send_ua
 endif
 
 IPv6addr_SOURCES        = IPv6addr.c IPv6addr_utils.c
-send_ua_SOURCES         = send_ua.c IPv6addr_utils.c
-
 IPv6addr_LDADD          = -lplumb $(LIBNETLIBS)
+
+send_ua_SOURCES         = send_ua.c IPv6addr_utils.c
 send_ua_LDADD           = $(LIBNETLIBS)

 osp_SCRIPTS	     =  nova-compute-wait	\
@@ -170,6 +170,7 @@ ocf_SCRIPTS	      = AoEtarget		\
 			mpathpersist		\
 			slapd			\
+			storage-mon		\
 			sybaseASE		\
 			symlink			\
 			syslog-ng		\
 			tomcat			\
diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in
new file mode 100644
index 0000000000..5b289fe554
--- /dev/null
+++ b/heartbeat/storage-mon.in
@@ -0,0 +1,263 @@
+#!@BASH_SHELL@
+#
+# Copyright (C) 2021 Red Hat, Inc.  All rights reserved.
+#
+# Authors: Christine Caulfield <ccaulfie@redhat.com>
+#          Fabio M. Di Nitto <fdinitto@redhat.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like.  Any license provided herein, whether implied or
+# otherwise, applies only to this software file.  Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
+#
+
+#
+# Checks storage I/O status of all given drives and writes the #health-storage
+# status into the CIB.
+# Implementation is heavily based on ocf:pacemaker:HealthSMART.
+#
+# It reads a single block from a random location on each device and reports any errors returned.
+# If the I/O hangs, that is reported as well (bear in mind that this may also hang the C helper in
+# some instances).
+#
+# It's worth making a note in the RA description that the smartmon RA is also recommended (this
+# does not replace it), and that Pacemaker health checking should be configured.
+#
+# https://clusterlabs.org/pacemaker/doc/2.1/Pacemaker_Explained/singlehtml/index.html#tracking-node-health
+
+#######################################################################
+
+#######################################################################
+# Initialization:
+
+: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
+. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
+
+# Helper binary that performs the I/O test, and the tool used to publish the health attribute
+STORAGEMON=$HA_BIN/storage_mon
+ATTRDUP=/usr/sbin/attrd_updater
+
+OCF_RESKEY_CRM_meta_interval_default="0"
+OCF_RESKEY_io_timeout_default="10"
+OCF_RESKEY_inject_errors_default=""
+OCF_RESKEY_state_file_default="${HA_RSCTMP%%/}/storage-mon-${OCF_RESOURCE_INSTANCE}.state"
+
+# Apply the defaults to unset or empty parameters; every variable is listed explicitly to make static analysis happy
+: ${OCF_RESKEY_CRM_meta_interval:=${OCF_RESKEY_CRM_meta_interval_default}}
+: ${OCF_RESKEY_drives:=""}
+: ${OCF_RESKEY_io_timeout:=${OCF_RESKEY_io_timeout_default}}
+: ${OCF_RESKEY_inject_errors:=${OCF_RESKEY_inject_errors_default}}
+: ${OCF_RESKEY_state_file:=${OCF_RESKEY_state_file_default}}
+
+#######################################################################
+# Print the OCF resource-agent metadata (parameters, defaults, actions) as XML on stdout.
+meta_data() {
+	cat <<END
+<?xml version="1.0"?>
+<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
+<resource-agent name="storage-mon">
+<version>1.0</version>
+
+<longdesc lang="en">
+System health agent that checks the storage I/O status of the given drives and
+updates the #health-storage attribute. Usage is highly recommended in combination
+with the HealthSMART monitoring agent. The agent currently supports a maximum of 25
+devices per instance.
+</longdesc>
+<shortdesc lang="en">storage I/O health status</shortdesc>
+
+<parameters>
+
+<parameter name="state_file" unique="1">
+<longdesc lang="en">
+Location to store the resource state in.
+</longdesc>
+<shortdesc lang="en">State file</shortdesc>
+<content type="string" default="${OCF_RESKEY_state_file_default}" />
+</parameter>
+
+<parameter name="drives" unique="1" required="1">
+<longdesc lang="en">
+The drive(s) to check as a SPACE separated list. Enter the full path to the device, e.g. "/dev/sda".
+</longdesc>
+<shortdesc lang="en">Drives to check</shortdesc>
+<content type="string" default="" />
+</parameter>
+
+<parameter name="io_timeout" unique="0">
+<longdesc lang="en">
+Specify disk I/O timeout in seconds. Minimum 1, recommended 10 (default).
+</longdesc>
+<shortdesc lang="en">Disk I/O timeout</shortdesc>
+<content type="integer" default="${OCF_RESKEY_io_timeout_default}" />
+</parameter>
+
+<parameter name="inject_errors" unique="0">
+<longdesc lang="en">
+Used only for testing! Specify % of I/O errors to simulate drives failures.
+</longdesc>
+<shortdesc lang="en">Specify % of I/O errors to simulate drives failures</shortdesc>
+<content type="integer" default="${OCF_RESKEY_inject_errors_default}" />
+</parameter>
+
+</parameters>
+
+<actions>
+<action name="start"        timeout="10s" />
+<action name="stop"         timeout="120s" />
+<action name="monitor"      timeout="120s" interval="30s" start-delay="0s" />
+<action name="meta-data"    timeout="5s" />
+<action name="validate-all" timeout="10s" />
+</actions>
+</resource-agent>
+END
+	return $OCF_SUCCESS
+}
+
+#######################################################################
+
+storage-mon_usage() {
+	# Print the usage text on stdout and return the status passed as $1
+	printf '%s\n' \
+		"usage: $0 {start|stop|monitor|validate-all|meta-data}" \
+		"" \
+		"Expects to have a fully populated OCF RA-compliant environment set."
+	return $1
+}
+
+# Validate the helper binary and the agent parameters; shared by all actions.
+# Never returns on a problem: exits with OCF_ERR_INSTALLED if the helper or a
+# configured drive is missing, and with OCF_ERR_CONFIGURED if a parameter is
+# absent, not a number, or out of range.
+storage-mon_init() {
+	#Test for presence of storage_mon helper
+	if [ ! -x "$STORAGEMON" ] ; then
+		ocf_log err "${STORAGEMON} not installed."
+		exit $OCF_ERR_INSTALLED
+	fi
+
+	i=0
+	for DRIVE in ${OCF_RESKEY_drives}; do
+		if [ ! -e "$DRIVE" ] ; then
+			ocf_log err "${DRIVE} not found on the system"
+			exit $OCF_ERR_INSTALLED
+		fi
+		i=$((i + 1))
+	done
+
+	# "drives" is required: without it the helper fails and the node would be
+	# flagged unhealthy instead of the resource being reported as misconfigured.
+	if [ "$i" -eq 0 ]; then
+		ocf_log err "No drives configured for this agent."
+		exit $OCF_ERR_CONFIGURED
+	fi
+
+	if [ "$i" -gt "25" ]; then
+		ocf_log err "Too many drives ($i) configured for this agent. Max 25."
+		exit $OCF_ERR_CONFIGURED
+	fi
+
+	# "[ x -ge 1 ]" fails with status 2 on non-numeric input, so negating it
+	# rejects garbage as well as values below the minimum; a plain "-lt"
+	# test would let non-numeric values slip through as valid.
+	if ! [ "${OCF_RESKEY_io_timeout}" -ge 1 ] 2>/dev/null; then
+		ocf_log err "Minimum timeout is 1. Recommended 10 (default)."
+		exit $OCF_ERR_CONFIGURED
+	fi
+
+	if [ -n "${OCF_RESKEY_inject_errors}" ]; then
+		if ! [ "${OCF_RESKEY_inject_errors}" -ge 1 ] 2>/dev/null || [ "${OCF_RESKEY_inject_errors}" -gt 100 ]; then
+			ocf_log err "Inject errors % has to be a value between 1 and 100."
+			exit $OCF_ERR_CONFIGURED
+		fi
+	fi
+}
+
+storage-mon_monitor() {
+	storage-mon_init
+
+	# Report the resource as cleanly stopped (NOT RUNNING) when no state
+	# file exists; the I/O test only runs for a started resource, and its
+	# outcome is published as a health status rather than as a failure.
+
+	[ -f "${OCF_RESKEY_state_file}" ] || return $OCF_NOT_RUNNING
+
+	# Build the helper arguments: each drive is tested with a score of 1
+	helper_args=""
+	for DRIVE in ${OCF_RESKEY_drives}; do
+		helper_args="$helper_args --device $DRIVE --score 1"
+	done
+	helper_args="$helper_args --timeout ${OCF_RESKEY_io_timeout}"
+	if [ -n "${OCF_RESKEY_inject_errors}" ]; then
+		helper_args="$helper_args --inject-errors-percent ${OCF_RESKEY_inject_errors}"
+	fi
+
+	# A non-zero exit from the helper turns the health status red
+	if $STORAGEMON $helper_args; then
+		status="green"
+	else
+		status="red"
+	fi
+
+	# Publish the status through attrd_updater
+	"$ATTRDUP" -n "#health-${OCF_RESOURCE_INSTANCE}" -U "$status" -d "5s"
+	return $OCF_SUCCESS
+}
+
+storage-mon_start() {
+	# Nothing to do if the monitor already reports the resource as running
+	storage-mon_monitor
+	[ $? -ne $OCF_SUCCESS ] || return $OCF_SUCCESS
+	# Otherwise create the state file; touch's status is the function's result
+	touch "${OCF_RESKEY_state_file}"
+}
+
+storage-mon_stop() {
+	# Drop the state file only when the monitor reports the resource as
+	# running; the function itself always returns success
+	storage-mon_monitor
+	[ $? -ne $OCF_SUCCESS ] || rm "${OCF_RESKEY_state_file}"
+	return $OCF_SUCCESS
+}
+
+storage-mon_validate() {
+	# Shared parameter checks first
+	storage-mon_init
+
+	# Prove that the directory holding the state file is writable by
+	# creating, then removing, a scratch file named after our PID
+	scratch_file="$(dirname "${OCF_RESKEY_state_file}")/$$"
+	if ! touch "$scratch_file"; then
+		return $OCF_ERR_CONFIGURED
+	fi
+	rm "$scratch_file"
+	return $OCF_SUCCESS
+}
+
+# Run the requested action; start, stop and monitor map directly onto the
+# storage-mon_<action> functions. The script exits with the action's status.
+case "$__OCF_ACTION" in
+	start|stop|monitor)	"storage-mon_${__OCF_ACTION}";;
+	validate-all)	storage-mon_validate;;
+	meta-data)	meta_data;;
+	usage|help)	storage-mon_usage $OCF_SUCCESS;;
+	*)		storage-mon_usage $OCF_ERR_UNIMPLEMENTED;;
+esac
+rc=$?
+ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
+exit $rc
+# vim: set filetype=sh:
diff --git a/tools/Makefile.am b/tools/Makefile.am
index 1186967cfb..83ff43651d 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -29,7 +29,8 @@ EXTRA_DIST		= ocf-tester.8 sfex_init.8
 
 sbin_PROGRAMS		= 
 sbin_SCRIPTS		= ocf-tester
-halib_PROGRAMS		= findif
+halib_PROGRAMS		= findif \
+			  storage_mon
 
 man8_MANS		= ocf-tester.8
 
@@ -67,6 +68,8 @@ sfex_stat_LDADD		= $(GLIBLIB) -lplumb -lplumbgpl
 
 findif_SOURCES		= findif.c
 
+storage_mon_SOURCES	= storage_mon.c
+
 if BUILD_TICKLE
 halib_PROGRAMS		+= tickle_tcp
 tickle_tcp_SOURCES	= tickle_tcp.c
diff --git a/tools/storage_mon.c b/tools/storage_mon.c
new file mode 100644
index 0000000000..7b65bb4191
--- /dev/null
+++ b/tools/storage_mon.c
@@ -0,0 +1,263 @@
+#include <stdio.h>
+#include <getopt.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <syslog.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+#ifdef __FreeBSD__
+#include <sys/disk.h>
+#endif
+
+#define MAX_DEVICES 25
+#define DEFAULT_TIMEOUT 10
+/* Print the command-line help for program `name` to stream `f`. */
+static void usage(char *name, FILE *f)
+{
+	fprintf(f, "usage: %s [-hv] [-d <device>]... [-s <score>]... [-t <secs>]\n"
+		"      --device <dev>  device to test, up to %d instances\n"
+		"      --score  <n>    score if device fails the test. Must match --device count\n"
+		"      --timeout <n>   max time to wait for a device test to come back. in seconds (default %d)\n"
+		"      --inject-errors-percent <n> Generate EIO errors <n>%% of the time (for testing only)\n"
+		"      --verbose        emit extra output to stdout\n"
+		"      --help           print this messages\n", name, MAX_DEVICES, DEFAULT_TIMEOUT);
+}
+
+/*
+ * Check one device: read one 512-byte block at a pseudo-random, sector-aligned
+ * offset. Never returns: exits the process with 0 on success, non-zero on failure.
+ */
+static void *test_device(const char *device, int verbose, int inject_error_percent)
+{
+	uint64_t devsize;
+	int device_fd, res;
+	off_t seek_spot = 0;
+	char buffer[512];
+
+	if (verbose) {
+		printf("Testing device %s\n", device);
+	}
+
+	device_fd = open(device, O_RDONLY);
+	if (device_fd < 0) {
+		fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno));
+		exit(-1);
+	}
+#ifdef __FreeBSD__
+	res = ioctl(device_fd, DIOCGMEDIASIZE, &devsize);
+#else
+	res = ioctl(device_fd, BLKGETSIZE64, &devsize);
+#endif
+	if (res != 0) {
+		fprintf(stderr, "Failed to stat %s: %s\n", device, strerror(errno));
+		close(device_fd);
+		exit(-1);
+	}
+	if (verbose) {
+		fprintf(stderr, "%s: size=%llu\n", device, (unsigned long long)devsize);
+	}
+	/* Don't fret about real randomness */
+	srand(time(NULL) + getpid());
+	/* Pick a random place on the device - sector aligned. Devices of 1024
+	 * bytes or less are read at offset 0: the modulus would be zero or wrap. */
+	if (devsize > 1024) {
+		seek_spot = (rand() % (devsize-1024)) & 0xFFFFFFFFFFFFFE00;
+	}
+	res = lseek(device_fd, seek_spot, SEEK_SET);
+	if (res < 0) {
+		fprintf(stderr, "Failed to seek %s: %s\n", device, strerror(errno));
+		close(device_fd);
+		exit(-1);
+	}
+	if (verbose) {
+		printf("%s: reading from pos %lld\n", device, (long long)seek_spot);
+	}
+	res = read(device_fd, buffer, sizeof(buffer));
+	if (res < 0) {
+		fprintf(stderr, "Failed to read %s: %s\n", device, strerror(errno));
+		close(device_fd);
+		exit(-1);
+	}
+	if (res < (int)sizeof(buffer)) {
+		fprintf(stderr, "Failed to read %zu bytes from %s, got %d\n", sizeof(buffer), device, res);
+		close(device_fd);
+		exit(-1);
+	}
+	/* Fake an error */
+	if (inject_error_percent && ((rand() % 100) < inject_error_percent)) {
+		fprintf(stderr, "People, please fasten your seatbelts, injecting errors!\n");
+		close(device_fd);
+		exit(-1);
+	}
+	/* After a failed close() the descriptor state is unspecified: do not close it again */
+	if (close(device_fd) != 0) {
+		fprintf(stderr, "Failed to close %s: %s\n", device, strerror(errno));
+		exit(-1);
+	}
+	if (verbose) {
+		printf("%s: done\n", device);
+	}
+	exit(0);
+}
+/*
+ * Fork one tester per device, poll the children until all are done or the
+ * timeout expires, and return the summed score of failed or hung devices.
+ */
+int main(int argc, char *argv[])
+{
+	char *devices[MAX_DEVICES];
+	int scores[MAX_DEVICES];
+	pid_t test_forks[MAX_DEVICES];
+	size_t device_count = 0;
+	size_t score_count = 0;
+	size_t finished_count = 0;
+	int timeout = DEFAULT_TIMEOUT;
+	struct timespec ts;
+	time_t start_time;
+	size_t i;
+	int final_score = 0;
+	int opt, option_index;
+	int verbose = 0;
+	int inject_error_percent = 0;
+	struct option long_options[] = {
+		{"timeout", required_argument, 0, 't' },
+		{"device",  required_argument, 0, 'd' },
+		{"score",   required_argument, 0, 's' },
+		{"inject-errors-percent",   required_argument, 0, 0 },
+		{"verbose", no_argument, 0, 'v' },
+		{"help",    no_argument, 0,       'h' },
+		{0,         0,           0,        0  }
+	};
+	while ( (opt = getopt_long(argc, argv, "hvt:d:s:",
+				   long_options, &option_index)) != -1 ) {
+		switch (opt) {
+			case 0: /* Long-only options */
+				if (strcmp(long_options[option_index].name, "inject-errors-percent") == 0) {
+					inject_error_percent = atoi(optarg);
+					if (inject_error_percent < 1 || inject_error_percent > 100) {
+						fprintf(stderr, "inject_error_percent should be between 1 and 100\n");
+						return -1;
+					}
+				}
+				break;
+			case 'd':
+				if (device_count < MAX_DEVICES) {
+					devices[device_count++] = optarg; /* points into argv, which outlives the run: no copy needed */
+				} else {
+					fprintf(stderr, "too many devices, max is %d\n", MAX_DEVICES);
+					return -1;
+				}
+				break;
+			case 's':
+				if (score_count < MAX_DEVICES) { /* scores[] is bounded by its own counter, not the device count */
+					int score = atoi(optarg);
+					if (score < 1 || score > 10) {
+						fprintf(stderr, "Score must be between 1 and 10 inclusive\n");
+						return -1;
+					}
+					scores[score_count++] = score;
+				} else {
+					fprintf(stderr, "too many scores, max is %d\n", MAX_DEVICES);
+					return -1;
+				}
+				break;
+			case 'v':
+				verbose++;
+				break;
+			case 't':
+				timeout = atoi(optarg);
+				if (timeout < 1) {
+					fprintf(stderr, "invalid timeout %d. Min 1, recommended %d (default)\n", timeout, DEFAULT_TIMEOUT);
+					return -1;
+				}
+				break;
+			case 'h':
+				usage(argv[0], stdout);
+				return 0;
+			default:
+				usage(argv[0], stderr);
+				return -1;
+		}
+	}
+	if (device_count == 0) {
+		fprintf(stderr, "No devices to test, use the -d  or --device argument\n");
+		return -1;
+	}
+
+	if (device_count != score_count) {
+		fprintf(stderr, "There must be the same number of devices and scores\n");
+		return -1;
+	}
+
+	openlog("storage_mon", 0, LOG_DAEMON);
+
+	memset(test_forks, 0, sizeof(test_forks));
+	for (i=0; i<device_count; i++) {
+		test_forks[i] = fork();
+		if (test_forks[i] < 0) {
+			fprintf(stderr, "Error spawning fork for %s: %s\n", devices[i], strerror(errno));
+			syslog(LOG_ERR, "Error spawning fork for %s: %s\n", devices[i], strerror(errno));
+			/* Just test the devices we have */
+			break;
+		}
+		/* child */
+		if (test_forks[i] == 0) {
+			test_device(devices[i], verbose, inject_error_percent);
+		}
+	}
+
+	/* See if they have finished; a monotonic clock keeps the timeout immune to wall-clock jumps */
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	start_time = ts.tv_sec;
+
+	while ((finished_count < device_count) && ((start_time + timeout) > ts.tv_sec)) {
+		for (i=0; i<device_count; i++) {
+			int wstatus;
+			pid_t w;
+
+			if (test_forks[i] > 0) {
+				w = waitpid(test_forks[i], &wstatus, WUNTRACED | WNOHANG | WCONTINUED);
+				if (w < 0) {
+					fprintf(stderr, "waitpid on %s failed: %s\n", devices[i], strerror(errno));
+					return -1;
+				}
+				if (w == test_forks[i]) {
+					if (WIFEXITED(wstatus) || WIFSIGNALED(wstatus)) {
+						/* A reaped child is finished whatever its status: clear its
+						 * pid so it is neither waited for nor scored again. */
+						if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus) != 0) {
+							syslog(LOG_ERR, "Error reading from device %s", devices[i]);
+							final_score += scores[i];
+						}
+						finished_count++;
+						test_forks[i] = 0;
+					}
+				}
+			}
+		}
+		usleep(100000);
+		clock_gettime(CLOCK_MONOTONIC, &ts);
+	}
+
+	/* See which children have not finished */
+	for (i=0; i<device_count; i++) {
+		if (test_forks[i] != 0) {
+			syslog(LOG_ERR, "Reading from device %s did not complete in %d seconds timeout", devices[i], timeout);
+			fprintf(stderr, "Thread for device %s did not complete in time\n", devices[i]);
+			final_score += scores[i];
+		}
+	}
+
+	if (verbose) {
+		printf("Final score is %d\n", final_score);
+	}
+	return final_score;
+}