diff --git a/SOURCES/bz1509319-storage-mon-new-ra.patch b/SOURCES/bz1509319-storage-mon-new-ra.patch new file mode 100644 index 0000000..7406eb2 --- /dev/null +++ b/SOURCES/bz1509319-storage-mon-new-ra.patch @@ -0,0 +1,714 @@ +From 90b595650d7d8a6f6a69a9f7060c6406aa731c18 Mon Sep 17 00:00:00 2001 +From: "Fabio M. Di Nitto" <fdinitto@redhat.com> +Date: Wed, 28 Jul 2021 10:08:10 +0200 +Subject: [PATCH] Add storage-mon pacemaker health check + +Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com> +--- + .gitignore | 41 ++++++ + configure.ac | 1 + + doc/man/Makefile.am | 3 +- + heartbeat/Makefile.am | 17 +-- + heartbeat/storage-mon.in | 263 +++++++++++++++++++++++++++++++++++++++ + tools/Makefile.am | 5 +- + tools/storage_mon.c | 263 +++++++++++++++++++++++++++++++++++++++ + 7 files changed, 583 insertions(+), 10 deletions(-) + create mode 100644 heartbeat/storage-mon.in + create mode 100644 tools/storage_mon.c + +diff --git a/.gitignore b/.gitignore +index 38d3566205..f7277bf04e 100644 +--- a/.gitignore ++++ b/.gitignore +@@ -45,6 +45,46 @@ heartbeat/ocf-shellfuncs + heartbeat/send_ua + heartbeat/shellfuncs + heartbeat/*.pyc ++heartbeat/AoEtarget ++heartbeat/CTDB ++heartbeat/ManageRAID ++heartbeat/ManageVE ++heartbeat/Squid ++heartbeat/SysInfo ++heartbeat/aws-vpc-route53 ++heartbeat/azure-events ++heartbeat/clvm ++heartbeat/conntrackd ++heartbeat/dnsupdate ++heartbeat/dummypy ++heartbeat/eDir88 ++heartbeat/fio ++heartbeat/galera ++heartbeat/gcp-pd-move ++heartbeat/gcp-vpc-move-ip ++heartbeat/gcp-vpc-move-route ++heartbeat/gcp-vpc-move-vip ++heartbeat/iSCSILogicalUnit ++heartbeat/iSCSITarget ++heartbeat/jira ++heartbeat/kamailio ++heartbeat/lxc ++heartbeat/lxd-info ++heartbeat/machine-info ++heartbeat/mariadb ++heartbeat/mpathpersist ++heartbeat/nfsnotify ++heartbeat/openstack-info ++heartbeat/rabbitmq-cluster ++heartbeat/redis ++heartbeat/rsyslog ++heartbeat/sg_persist ++heartbeat/slapd ++heartbeat/smb-share ++heartbeat/storage-mon ++heartbeat/sybaseASE 
++heartbeat/syslog-ng ++heartbeat/vsftpd + include/agent_config.h + include/config.h + include/config.h.in +@@ -61,6 +101,7 @@ systemd/resource-agents.conf + tools/findif + tools/ocf-tester + tools/send_arp ++tools/storage_mon + tools/tickle_tcp + tools/ocft/README + tools/ocft/README.zh_CN +diff --git a/configure.ac b/configure.ac +index 717fb95432..c125df98f6 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -1002,6 +1002,7 @@ AC_CONFIG_FILES([heartbeat/rsyslog], [chmod +x heartbeat/rsyslog]) + AC_CONFIG_FILES([heartbeat/smb-share], [chmod +x heartbeat/smb-share]) + AC_CONFIG_FILES([heartbeat/sg_persist], [chmod +x heartbeat/sg_persist]) + AC_CONFIG_FILES([heartbeat/slapd], [chmod +x heartbeat/slapd]) ++AC_CONFIG_FILES([heartbeat/storage-mon], [chmod +x heartbeat/storage-mon]) + AC_CONFIG_FILES([heartbeat/sybaseASE], [chmod +x heartbeat/sybaseASE]) + AC_CONFIG_FILES([heartbeat/syslog-ng], [chmod +x heartbeat/syslog-ng]) + AC_CONFIG_FILES([heartbeat/vsftpd], [chmod +x heartbeat/vsftpd]) +diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am +index 947d83cb2b..97904ccb16 100644 +--- a/doc/man/Makefile.am ++++ b/doc/man/Makefile.am +@@ -138,6 +138,7 @@ man_MANS = ocf_heartbeat_AoEtarget.7 \ + ocf_heartbeat_mariadb.7 \ + ocf_heartbeat_mdraid.7 \ + ocf_heartbeat_minio.7 \ ++ ocf_heartbeat_mpathpersist.7 \ + ocf_heartbeat_mysql.7 \ + ocf_heartbeat_mysql-proxy.7 \ + ocf_heartbeat_nagios.7 \ +@@ -175,7 +176,7 @@ man_MANS = ocf_heartbeat_AoEtarget.7 \ + ocf_heartbeat_smb-share.7 \ + ocf_heartbeat_sybaseASE.7 \ + ocf_heartbeat_sg_persist.7 \ +- ocf_heartbeat_mpathpersist.7 \ ++ ocf_heartbeat_storage-mon.7 \ + ocf_heartbeat_symlink.7 \ + ocf_heartbeat_syslog-ng.7 \ + ocf_heartbeat_tomcat.7 \ +diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am +index 9af44cc127..5d52d211f2 100644 +--- a/heartbeat/Makefile.am ++++ b/heartbeat/Makefile.am +@@ -32,22 +32,22 @@ ocfdir = $(OCF_RA_DIR_PREFIX)/heartbeat + dtddir = $(datadir)/$(PACKAGE_NAME) + dtd_DATA = ra-api-1.dtd 
metadata.rng + ++ocf_PROGRAMS = ++ + if USE_IPV6ADDR_AGENT +-ocf_PROGRAMS = IPv6addr +-else +-ocf_PROGRAMS = ++ocf_PROGRAMS += IPv6addr + endif + ++halib_PROGRAMS = ++ + if IPV6ADDR_COMPATIBLE +-halib_PROGRAMS = send_ua +-else +-halib_PROGRAMS = ++halib_PROGRAMS += send_ua + endif + + IPv6addr_SOURCES = IPv6addr.c IPv6addr_utils.c +-send_ua_SOURCES = send_ua.c IPv6addr_utils.c +- + IPv6addr_LDADD = -lplumb $(LIBNETLIBS) ++ ++send_ua_SOURCES = send_ua.c IPv6addr_utils.c + send_ua_LDADD = $(LIBNETLIBS) + + osp_SCRIPTS = nova-compute-wait \ +@@ -170,6 +170,7 @@ ocf_SCRIPTS = AoEtarget \ + mpathpersist \ + slapd \ ++ storage-mon \ + sybaseASE \ + symlink \ + syslog-ng \ + tomcat \ +diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in +new file mode 100644 +index 0000000000..5b289fe554 +--- /dev/null ++++ b/heartbeat/storage-mon.in +@@ -0,0 +1,263 @@ ++#!@BASH_SHELL@ ++# ++# Copyright (C) 2021 Red Hat, Inc. All rights reserved. ++# ++# Authors: Christine Caulfield <ccaulfie@redhat.com> ++# Fabio M. Di Nitto <fdinitto@redhat.com> ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of version 2 of the GNU General Public License as ++# published by the Free Software Foundation. ++# ++# This program is distributed in the hope that it would be useful, but ++# WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. ++# ++# Further, this software is distributed without any warranty that it is ++# free of the rightful claim of any third person regarding infringement ++# or the like. Any license provided herein, whether implied or ++# otherwise, applies only to this software file. Patent licenses, if ++# any, provided herein do not apply to combinations of this program with ++# other software, or any other product whatsoever. 
++# ++# You should have received a copy of the GNU General Public License ++# along with this program; if not, write the Free Software Foundation, ++# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. ++# ++ ++# ++# Checks storage I/O status of all given drives and writes the #health-storage ++# status into the CIB ++# Implementation is heavily based on ocf:pacemaker:HealthSMART ++# ++# It sends a single block of IO to a random location on the device and reports any errors returned. ++# If the IO hangs, that will also be returned. (bear in mind that may also hang the C app in some ++# instances). ++# ++# It's worth making a note in the RA description that the smartmon RA is also recommended (this ++# does not replace it), and that Pacemaker health checking should be configured. ++# ++# https://clusterlabs.org/pacemaker/doc/2.1/Pacemaker_Explained/singlehtml/index.html#tracking-node-health ++ ++####################################################################### ++ ++####################################################################### ++# Initialization: ++ ++: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} ++. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ++ ++# ++STORAGEMON=$HA_BIN/storage_mon ++ATTRDUP=/usr/sbin/attrd_updater ++ ++OCF_RESKEY_CRM_meta_interval_default="0" ++OCF_RESKEY_io_timeout_default="10" ++OCF_RESKEY_inject_errors_default="" ++OCF_RESKEY_state_file_default="${HA_RSCTMP%%/}/storage-mon-${OCF_RESOURCE_INSTANCE}.state" ++ ++# Explicitly list all environment variables used, to make static analysis happy ++: ${OCF_RESKEY_CRM_meta_interval:=${OCF_RESKEY_CRM_meta_interval_default}} ++: ${OCF_RESKEY_drives:=""} ++: ${OCF_RESKEY_io_timeout:=${OCF_RESKEY_io_timeout_default}} ++: ${OCF_RESKEY_inject_errors:=${OCF_RESKEY_inject_errors_default}} ++: ${OCF_RESKEY_state_file:=${OCF_RESKEY_state_file_default}} ++ ++####################################################################### ++ ++meta_data() { ++ cat <<END ++<?xml version="1.0"?> ++<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> ++<resource-agent name="storage-mon"> ++<version>1.0</version> ++ ++<longdesc lang="en"> ++System health agent that checks the storage I/O status of the given drives and ++updates the #health-storage attribute. Usage is highly recommended in combination ++with storage-mon monitoring agent. The agent currently support a maximum of 25 ++devices per instance. ++</longdesc> ++<shortdesc lang="en">storage I/O health status</shortdesc> ++ ++<parameters> ++ ++<parameter name="state_file" unique="1"> ++<longdesc lang="en"> ++Location to store the resource state in. ++</longdesc> ++<shortdesc lang="en">State file</shortdesc> ++<content type="string" default="${OCF_RESKEY_state_file_default}" /> ++</parameter> ++ ++<parameter name="drives" unique="1" required="1"> ++<longdesc lang="en"> ++The drive(s) to check as a SPACE separated list. Enter the full path to the device, e.g. "/dev/sda". 
++</longdesc> ++<shortdesc lang="en">Drives to check</shortdesc> ++<content type="string" default="" /> ++</parameter> ++ ++<parameter name="io_timeout" unique="0"> ++<longdesc lang="en"> ++Specify disk I/O timeout in seconds. Minimum 1, recommeded 10 (default). ++</longdesc> ++<shortdesc lang="en">Disk I/O timeout</shortdesc> ++<content type="integer" default="${OCF_RESKEY_io_timeout_default}" /> ++</parameter> ++ ++<parameter name="inject_errors" unique="0"> ++<longdesc lang="en"> ++Used only for testing! Specify % of I/O errors to simulate drives failures. ++</longdesc> ++<shortdesc lang="en">Specify % of I/O errors to simulate drives failures</shortdesc> ++<content type="integer" default="${OCF_RESKEY_inject_errors_default}" /> ++</parameter> ++ ++</parameters> ++ ++<actions> ++<action name="start" timeout="10s" /> ++<action name="stop" timeout="120s" /> ++<action name="monitor" timeout="120s" interval="30s" start-delay="0s" /> ++<action name="meta-data" timeout="5s" /> ++<action name="validate-all" timeout="10s" /> ++</actions> ++</resource-agent> ++END ++ return $OCF_SUCCESS ++} ++ ++####################################################################### ++ ++storage-mon_usage() { ++ cat <<END ++usage: $0 {start|stop|monitor|validate-all|meta-data} ++ ++Expects to have a fully populated OCF RA-compliant environment set. ++END ++ return $1 ++} ++ ++storage-mon_init() { ++ #Test for presence of storage_mon helper ++ if [ ! -x "$STORAGEMON" ] ; then ++ ocf_log err "${STORAGEMON} not installed." ++ exit $OCF_ERR_INSTALLED ++ fi ++ ++ i=0 ++ for DRIVE in ${OCF_RESKEY_drives}; do ++ if [ ! -e "$DRIVE" ] ; then ++ ocf_log err "${DRIVE} not found on the system" ++ exit $OCF_ERR_INSTALLED ++ fi ++ i=$((i + 1)) ++ done ++ ++ if [ "$i" -gt "25" ]; then ++ ocf_log err "Too many drives ($i) configured for this agent. Max 25." ++ exit $OCF_ERR_CONFIGURED ++ fi ++ ++ if [ "${OCF_RESKEY_io_timeout}" -lt "1" ]; then ++ ocf_log err "Minimum timeout is 1. 
Recommended 10 (default)." ++ exit $OCF_ERR_CONFIGURED ++ fi ++ ++ if [ -n "${OCF_RESKEY_inject_errors}" ]; then ++ if [ "${OCF_RESKEY_inject_errors}" -lt "1" ] || [ "${OCF_RESKEY_inject_errors}" -gt "100" ]; then ++ ocf_log err "Inject errors % has to be a value between 1 and 100." ++ exit $OCF_ERR_CONFIGURED ++ fi ++ fi ++} ++ ++storage-mon_validate() { ++ storage-mon_init ++ ++ # Is the state directory writable? ++ state_dir=$(dirname "$OCF_RESKEY_state_file") ++ touch "$state_dir/$$" ++ if [ $? -ne 0 ]; then ++ return $OCF_ERR_CONFIGURED ++ fi ++ rm "$state_dir/$$" ++ ++ return $OCF_SUCCESS ++} ++ ++storage-mon_monitor() { ++ storage-mon_init ++ ++ # Monitor _MUST!_ differentiate correctly between running ++ # (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING). ++ # That is THREE states, not just yes/no. ++ ++ if [ ! -f "${OCF_RESKEY_state_file}" ]; then ++ return $OCF_NOT_RUNNING ++ fi ++ ++ # generate command line ++ cmdline="" ++ for DRIVE in ${OCF_RESKEY_drives}; do ++ cmdline="$cmdline --device $DRIVE --score 1" ++ done ++ cmdline="$cmdline --timeout ${OCF_RESKEY_io_timeout}" ++ if [ -n "${OCF_RESKEY_inject_errors}" ]; then ++ cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}" ++ fi ++ $STORAGEMON $cmdline ++ if [ $? -ne 0 ]; then ++ status="red" ++ else ++ status="green" ++ fi ++ ++ "$ATTRDUP" -n "#health-${OCF_RESOURCE_INSTANCE}" -U "$status" -d "5s" ++ return $OCF_SUCCESS ++} ++ ++storage-mon_start() { ++ storage-mon_monitor ++ if [ $? -eq $OCF_SUCCESS ]; then ++ return $OCF_SUCCESS ++ fi ++ touch "${OCF_RESKEY_state_file}" ++} ++ ++storage-mon_stop() { ++ storage-mon_monitor ++ if [ $? -eq $OCF_SUCCESS ]; then ++ rm "${OCF_RESKEY_state_file}" ++ fi ++ return $OCF_SUCCESS ++} ++ ++storage-mon_validate() { ++ storage-mon_init ++ ++ # Is the state directory writable? ++ state_dir=$(dirname "${OCF_RESKEY_state_file}") ++ touch "$state_dir/$$" ++ if [ $? 
-ne 0 ]; then ++ return $OCF_ERR_CONFIGURED ++ fi ++ rm "$state_dir/$$" ++ ++ return $OCF_SUCCESS ++} ++ ++case "$__OCF_ACTION" in ++ start) storage-mon_start;; ++ stop) storage-mon_stop;; ++ monitor) storage-mon_monitor;; ++ validate-all) storage-mon_validate;; ++ meta-data) meta_data;; ++ usage|help) storage-mon_usage $OCF_SUCCESS;; ++ *) storage-mon_usage $OCF_ERR_UNIMPLEMENTED;; ++esac ++rc=$? ++ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" ++exit $rc ++# vim: set filetype=sh: +diff --git a/tools/Makefile.am b/tools/Makefile.am +index 1186967cfb..83ff43651d 100644 +--- a/tools/Makefile.am ++++ b/tools/Makefile.am +@@ -29,7 +29,8 @@ EXTRA_DIST = ocf-tester.8 sfex_init.8 + + sbin_PROGRAMS = + sbin_SCRIPTS = ocf-tester +-halib_PROGRAMS = findif ++halib_PROGRAMS = findif \ ++ storage_mon + + man8_MANS = ocf-tester.8 + +@@ -67,6 +68,8 @@ sfex_stat_LDADD = $(GLIBLIB) -lplumb -lplumbgpl + + findif_SOURCES = findif.c + ++storage_mon_SOURCES = storage_mon.c ++ + if BUILD_TICKLE + halib_PROGRAMS += tickle_tcp + tickle_tcp_SOURCES = tickle_tcp.c +diff --git a/tools/storage_mon.c b/tools/storage_mon.c +new file mode 100644 +index 0000000000..7b65bb4191 +--- /dev/null ++++ b/tools/storage_mon.c +@@ -0,0 +1,263 @@ ++#include <stdio.h> ++#include <getopt.h> ++#include <stdlib.h> ++#include <stdint.h> ++#include <syslog.h> ++#include <unistd.h> ++#include <errno.h> ++#include <string.h> ++#include <fcntl.h> ++#include <time.h> ++#include <sys/types.h> ++#include <sys/wait.h> ++#include <sys/stat.h> ++#include <sys/ioctl.h> ++#include <sys/mount.h> ++#ifdef __FreeBSD__ ++#include <sys/disk.h> ++#endif ++ ++#define MAX_DEVICES 25 ++#define DEFAULT_TIMEOUT 10 ++ ++static void usage(char *name, FILE *f) ++{ ++ fprintf(f, "usage: %s [-hv] [-d <device>]... [-s <score>]... [-t <secs>]\n", name); ++ fprintf(f, " --device <dev> device to test, up to %d instances\n", MAX_DEVICES); ++ fprintf(f, " --score <n> score if device fails the test. 
Must match --device count\n"); ++ fprintf(f, " --timeout <n> max time to wait for a device test to come back. in seconds (default %d)\n", DEFAULT_TIMEOUT); ++ fprintf(f, " --inject-errors-percent <n> Generate EIO errors <n>%% of the time (for testing only)\n"); ++ fprintf(f, " --verbose emit extra output to stdout\n"); ++ fprintf(f, " --help print this messages\n"); ++} ++ ++/* Check one device: read one 512-byte block at a random, sector-aligned offset. Never returns; every path ends in exit(), 0 on success and -1 on any failure */ ++static void *test_device(const char *device, int verbose, int inject_error_percent) ++{ ++ uint64_t devsize; ++ int device_fd; ++ int res; ++ off_t seek_spot; ++ char buffer[512]; ++ ++ if (verbose) { ++ printf("Testing device %s\n", device); ++ } ++ ++ device_fd = open(device, O_RDONLY); ++ if (device_fd < 0) { ++ fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno)); ++ exit(-1); ++ } ++#ifdef __FreeBSD__ ++ res = ioctl(device_fd, DIOCGMEDIASIZE, &devsize); ++#else ++ res = ioctl(device_fd, BLKGETSIZE64, &devsize); ++#endif ++ if (res != 0) { ++ fprintf(stderr, "Failed to stat %s: %s\n", device, strerror(errno)); ++ close(device_fd); ++ exit(-1); ++ } ++ if (verbose) { ++ fprintf(stderr, "%s: size=%llu\n", device, (unsigned long long)devsize); /* uint64_t is not size_t on 32-bit targets */ ++ } ++ /* Don't fret about real randomness */ ++ srand(time(NULL) + getpid()); ++ /* Pick a random place on the device - sector aligned */ ++ seek_spot = (devsize > 1024) ? ((rand() % (devsize-1024)) & 0xFFFFFFFFFFFFFE00) : 0; /* devsize <= 1024 would underflow or divide by zero */ ++ res = lseek(device_fd, seek_spot, SEEK_SET); ++ if (res < 0) { ++ fprintf(stderr, "Failed to seek %s: %s\n", device, strerror(errno)); ++ close(device_fd); ++ exit(-1); ++ } ++ ++ if (verbose) { ++ printf("%s: reading from pos %lld\n", device, (long long)seek_spot); ++ } ++ ++ res = read(device_fd, buffer, sizeof(buffer)); ++ if (res < 0) { ++ fprintf(stderr, "Failed to read %s: %s\n", device, strerror(errno)); ++ close(device_fd); ++ exit(-1); ++ } ++ if (res < (int)sizeof(buffer)) { ++ fprintf(stderr, "Failed to read %zu bytes from %s, got %d\n", sizeof(buffer), device, res); ++ close(device_fd); ++ exit(-1); ++ } ++ ++ /* Fake an error */ ++ 
if (inject_error_percent && ((rand() % 100) < inject_error_percent)) { ++ fprintf(stderr, "People, please fasten your seatbelts, injecting errors!\n"); ++ close(device_fd); ++ exit(-1); ++ } ++ res = close(device_fd); ++ if (res != 0) { ++ fprintf(stderr, "Failed to close %s: %s\n", device, strerror(errno)); ++ close(device_fd); ++ exit(-1); ++ } ++ ++ if (verbose) { ++ printf("%s: done\n", device); ++ } ++ exit(0); ++} ++/* Fork one test_device() child per --device, poll the children until --timeout expires, and return the summed scores of devices that failed or did not finish (-1 on usage or waitpid errors) */ ++int main(int argc, char *argv[]) ++{ ++ char *devices[MAX_DEVICES]; ++ int scores[MAX_DEVICES]; ++ pid_t test_forks[MAX_DEVICES]; ++ size_t device_count = 0; ++ size_t score_count = 0; ++ size_t finished_count = 0; ++ int timeout = DEFAULT_TIMEOUT; ++ struct timespec ts; ++ time_t start_time; ++ size_t i; ++ int final_score = 0; ++ int opt, option_index; ++ int verbose = 0; ++ int inject_error_percent = 0; ++ struct option long_options[] = { ++ {"timeout", required_argument, 0, 't' }, ++ {"device", required_argument, 0, 'd' }, ++ {"score", required_argument, 0, 's' }, ++ {"inject-errors-percent", required_argument, 0, 0 }, ++ {"verbose", no_argument, 0, 'v' }, ++ {"help", no_argument, 0, 'h' }, ++ {0, 0, 0, 0 } ++ }; ++ while ( (opt = getopt_long(argc, argv, "hvt:d:s:", ++ long_options, &option_index)) != -1 ) { ++ switch (opt) { ++ case 0: /* Long-only options */ ++ if (strcmp(long_options[option_index].name, "inject-errors-percent") == 0) { ++ inject_error_percent = atoi(optarg); ++ if (inject_error_percent < 1 || inject_error_percent > 100) { ++ fprintf(stderr, "inject_error_percent should be between 1 and 100\n"); ++ return -1; ++ } ++ } ++ break; ++ case 'd': ++ if (device_count < MAX_DEVICES) { ++ devices[device_count++] = strdup(optarg); ++ } else { ++ fprintf(stderr, "too many devices, max is %d\n", MAX_DEVICES); ++ return -1; ++ } ++ break; ++ case 's': ++ if (score_count < MAX_DEVICES) { /* bound scores[] by its own counter, not by device_count */ ++ int score = atoi(optarg); ++ if (score < 1 || score > 10) { ++ fprintf(stderr, "Score must be between 1 and 10 inclusive\n"); ++ return -1; ++ } 
++ scores[score_count++] = score; ++ } else { ++ fprintf(stderr, "too many scores, max is %d\n", MAX_DEVICES); ++ return -1; ++ } ++ break; ++ case 'v': ++ verbose++; ++ break; ++ case 't': ++ timeout = atoi(optarg); ++ if (timeout < 1) { ++ fprintf(stderr, "invalid timeout %d. Min 1, recommended %d (default)\n", timeout, DEFAULT_TIMEOUT); ++ return -1; ++ } ++ break; ++ case 'h': ++ usage(argv[0], stdout); ++ break; ++ default: ++ usage(argv[0], stderr); ++ break; ++ } ++ ++ } ++ if (device_count == 0) { ++ fprintf(stderr, "No devices to test, use the -d or --device argument\n"); ++ return -1; ++ } ++ ++ if (device_count != score_count) { ++ fprintf(stderr, "There must be the same number of devices and scores\n"); ++ return -1; ++ } ++ ++ openlog("storage_mon", 0, LOG_DAEMON); ++ ++ memset(test_forks, 0, sizeof(test_forks)); ++ for (i=0; i<device_count; i++) { ++ test_forks[i] = fork(); ++ if (test_forks[i] < 0) { ++ fprintf(stderr, "Error spawning fork for %s: %s\n", devices[i], strerror(errno)); ++ syslog(LOG_ERR, "Error spawning fork for %s: %s\n", devices[i], strerror(errno)); ++ /* Just test the devices we have */ ++ break; ++ } ++ /* child */ ++ if (test_forks[i] == 0) { ++ test_device(devices[i], verbose, inject_error_percent); ++ } ++ } ++ ++ /* See if they have finished */ ++ clock_gettime(CLOCK_REALTIME, &ts); ++ start_time = ts.tv_sec; ++ ++ while ((finished_count < device_count) && ((start_time + timeout) > ts.tv_sec)) { ++ for (i=0; i<device_count; i++) { ++ int wstatus; ++ pid_t w; ++ ++ if (test_forks[i] > 0) { ++ w = waitpid(test_forks[i], &wstatus, WUNTRACED | WNOHANG | WCONTINUED); ++ if (w < 0) { ++ fprintf(stderr, "waitpid on %s failed: %s\n", devices[i], strerror(errno)); ++ return -1; ++ } ++ ++ if (w == test_forks[i]) { ++ if (WIFEXITED(wstatus)) { ++ if (WEXITSTATUS(wstatus) != 0) { ++ syslog(LOG_ERR, "Error reading from device %s", devices[i]); ++ final_score += scores[i]; ++ } ++ /* Reaped either way: polling it again would fail with ECHILD and the score would be lost */ ++ finished_count++; ++ test_forks[i] = 0; ++ } ++ 
} ++ } ++ } ++ ++ usleep(100000); ++ ++ clock_gettime(CLOCK_REALTIME, &ts); ++ } ++ ++ /* See which threads have not finished */ ++ for (i=0; i<device_count; i++) { ++ if (test_forks[i] != 0) { ++ syslog(LOG_ERR, "Reading from device %s did not complete in %d seconds timeout", devices[i], timeout); ++ fprintf(stderr, "Thread for device %s did not complete in time\n", devices[i]); ++ final_score += scores[i]; ++ } ++ } ++ ++ if (verbose) { ++ printf("Final score is %d\n", final_score); ++ } ++ return final_score; ++} diff --git a/SPECS/resource-agents.spec b/SPECS/resource-agents.spec index 66f2005..1b9d9c6 100644 --- a/SPECS/resource-agents.spec +++ b/SPECS/resource-agents.spec @@ -70,7 +70,7 @@ Name: resource-agents Summary: Open Source HA Reusable Cluster Resource Scripts Version: 4.1.1 -Release: 97%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} +Release: 98%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} License: GPLv2+ and LGPLv2+ URL: https://github.com/ClusterLabs/resource-agents %if 0%{?fedora} || 0%{?centos_version} || 0%{?rhel} @@ -282,6 +282,7 @@ Patch190: bz1957765-gcp-vpc-move-vip-retry.patch Patch191: bz1969968-lvmlockd-remove-with_cmirrord.patch Patch192: bz1972035-LVM-activate-fix-drop-in.patch Patch193: bz1972743-podman-fix-container-creation-race.patch +Patch194: bz1509319-storage-mon-new-ra.patch # bundle patches Patch1000: 7-gcp-bundled.patch @@ -648,6 +649,7 @@ exit 1 %patch191 -p1 %patch192 -p1 %patch193 -p1 +%patch194 -p1 -F2 chmod 755 heartbeat/nova-compute-wait chmod 755 heartbeat/NovaEvacuate @@ -1229,6 +1231,11 @@ ccs_update_schema > /dev/null 2>&1 ||: %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm %changelog +* Mon Aug 30 2021 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.1.1-98 +- storage-mon: new resource agent + + Resolves: rhbz#1509319 + * Thu Jun 17 2021 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.1.1-97 - podman: fix possible race during container 
creation