diff --git a/SOURCES/bz1509319-storage-mon-new-ra.patch b/SOURCES/bz1509319-storage-mon-new-ra.patch new file mode 100644 index 0000000..7406eb2 --- /dev/null +++ b/SOURCES/bz1509319-storage-mon-new-ra.patch @@ -0,0 +1,714 @@ +From 90b595650d7d8a6f6a69a9f7060c6406aa731c18 Mon Sep 17 00:00:00 2001 +From: "Fabio M. Di Nitto" +Date: Wed, 28 Jul 2021 10:08:10 +0200 +Subject: [PATCH] Add storage-mon pacemaker health check + +Signed-off-by: Fabio M. Di Nitto +--- + .gitignore | 41 ++++++ + configure.ac | 1 + + doc/man/Makefile.am | 3 +- + heartbeat/Makefile.am | 17 +-- + heartbeat/storage-mon.in | 263 +++++++++++++++++++++++++++++++++++++++ + tools/Makefile.am | 5 +- + tools/storage_mon.c | 263 +++++++++++++++++++++++++++++++++++++++ + 7 files changed, 583 insertions(+), 10 deletions(-) + create mode 100644 heartbeat/storage-mon.in + create mode 100644 tools/storage_mon.c + +diff --git a/.gitignore b/.gitignore +index 38d3566205..f7277bf04e 100644 +--- a/.gitignore ++++ b/.gitignore +@@ -45,6 +45,46 @@ heartbeat/ocf-shellfuncs + heartbeat/send_ua + heartbeat/shellfuncs + heartbeat/*.pyc ++heartbeat/AoEtarget ++heartbeat/CTDB ++heartbeat/ManageRAID ++heartbeat/ManageVE ++heartbeat/Squid ++heartbeat/SysInfo ++heartbeat/aws-vpc-route53 ++heartbeat/azure-events ++heartbeat/clvm ++heartbeat/conntrackd ++heartbeat/dnsupdate ++heartbeat/dummypy ++heartbeat/eDir88 ++heartbeat/fio ++heartbeat/galera ++heartbeat/gcp-pd-move ++heartbeat/gcp-vpc-move-ip ++heartbeat/gcp-vpc-move-route ++heartbeat/gcp-vpc-move-vip ++heartbeat/iSCSILogicalUnit ++heartbeat/iSCSITarget ++heartbeat/jira ++heartbeat/kamailio ++heartbeat/lxc ++heartbeat/lxd-info ++heartbeat/machine-info ++heartbeat/mariadb ++heartbeat/mpathpersist ++heartbeat/nfsnotify ++heartbeat/openstack-info ++heartbeat/rabbitmq-cluster ++heartbeat/redis ++heartbeat/rsyslog ++heartbeat/sg_persist ++heartbeat/slapd ++heartbeat/smb-share ++heartbeat/storage-mon ++heartbeat/sybaseASE ++heartbeat/syslog-ng ++heartbeat/vsftpd + 
include/agent_config.h + include/config.h + include/config.h.in +@@ -61,6 +101,7 @@ systemd/resource-agents.conf + tools/findif + tools/ocf-tester + tools/send_arp ++tools/storage_mon + tools/tickle_tcp + tools/ocft/README + tools/ocft/README.zh_CN +diff --git a/configure.ac b/configure.ac +index 717fb95432..c125df98f6 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -1002,6 +1002,7 @@ AC_CONFIG_FILES([heartbeat/rsyslog], [chmod +x heartbeat/rsyslog]) + AC_CONFIG_FILES([heartbeat/smb-share], [chmod +x heartbeat/smb-share]) + AC_CONFIG_FILES([heartbeat/sg_persist], [chmod +x heartbeat/sg_persist]) + AC_CONFIG_FILES([heartbeat/slapd], [chmod +x heartbeat/slapd]) ++AC_CONFIG_FILES([heartbeat/storage-mon], [chmod +x heartbeat/storage-mon]) + AC_CONFIG_FILES([heartbeat/sybaseASE], [chmod +x heartbeat/sybaseASE]) + AC_CONFIG_FILES([heartbeat/syslog-ng], [chmod +x heartbeat/syslog-ng]) + AC_CONFIG_FILES([heartbeat/vsftpd], [chmod +x heartbeat/vsftpd]) +diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am +index 947d83cb2b..97904ccb16 100644 +--- a/doc/man/Makefile.am ++++ b/doc/man/Makefile.am +@@ -138,6 +138,7 @@ man_MANS = ocf_heartbeat_AoEtarget.7 \ + ocf_heartbeat_mariadb.7 \ + ocf_heartbeat_mdraid.7 \ + ocf_heartbeat_minio.7 \ ++ ocf_heartbeat_mpathpersist.7 \ + ocf_heartbeat_mysql.7 \ + ocf_heartbeat_mysql-proxy.7 \ + ocf_heartbeat_nagios.7 \ +@@ -175,7 +176,7 @@ man_MANS = ocf_heartbeat_AoEtarget.7 \ + ocf_heartbeat_smb-share.7 \ + ocf_heartbeat_sybaseASE.7 \ + ocf_heartbeat_sg_persist.7 \ +- ocf_heartbeat_mpathpersist.7 \ ++ ocf_heartbeat_storage-mon.7 \ + ocf_heartbeat_symlink.7 \ + ocf_heartbeat_syslog-ng.7 \ + ocf_heartbeat_tomcat.7 \ +diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am +index 9af44cc127..5d52d211f2 100644 +--- a/heartbeat/Makefile.am ++++ b/heartbeat/Makefile.am +@@ -32,22 +32,22 @@ ocfdir = $(OCF_RA_DIR_PREFIX)/heartbeat + dtddir = $(datadir)/$(PACKAGE_NAME) + dtd_DATA = ra-api-1.dtd metadata.rng + ++ocf_PROGRAMS = ++ + if 
USE_IPV6ADDR_AGENT +-ocf_PROGRAMS = IPv6addr +-else +-ocf_PROGRAMS = ++ocf_PROGRAMS += IPv6addr + endif + ++halib_PROGRAMS = ++ + if IPV6ADDR_COMPATIBLE +-halib_PROGRAMS = send_ua +-else +-halib_PROGRAMS = ++halib_PROGRAMS += send_ua + endif + + IPv6addr_SOURCES = IPv6addr.c IPv6addr_utils.c +-send_ua_SOURCES = send_ua.c IPv6addr_utils.c +- + IPv6addr_LDADD = -lplumb $(LIBNETLIBS) ++ ++send_ua_SOURCES = send_ua.c IPv6addr_utils.c + send_ua_LDADD = $(LIBNETLIBS) + + osp_SCRIPTS = nova-compute-wait \ +@@ -170,6 +170,7 @@ ocf_SCRIPTS = AoEtarget \ + mpathpersist \ + slapd \ ++ storage-mon \ + sybaseASE \ + symlink \ + syslog-ng \ + tomcat \ +diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in +new file mode 100644 +index 0000000000..5b289fe554 +--- /dev/null ++++ b/heartbeat/storage-mon.in +@@ -0,0 +1,263 @@ ++#!@BASH_SHELL@ ++# ++# Copyright (C) 2021 Red Hat, Inc. All rights reserved. ++# ++# Authors: Christine Caulfield ++# Fabio M. Di Nitto ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of version 2 of the GNU General Public License as ++# published by the Free Software Foundation. ++# ++# This program is distributed in the hope that it would be useful, but ++# WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. ++# ++# Further, this software is distributed without any warranty that it is ++# free of the rightful claim of any third person regarding infringement ++# or the like. Any license provided herein, whether implied or ++# otherwise, applies only to this software file. Patent licenses, if ++# any, provided herein do not apply to combinations of this program with ++# other software, or any other product whatsoever. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program; if not, write the Free Software Foundation, ++# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. 
++# ++ ++# ++# Checks storage I/O status of all given drives and writes the #health-storage ++# status into the CIB ++# Implementation is heavily based on ocf:pacemaker:HealthSMART ++# ++# It sends a single block of IO to a random location on the device and reports any errors returned. ++# If the IO hangs, that will also be returned. (bear in mind that may also hang the C app in some ++# instances). ++# ++# It's worth making a note in the RA description that the smartmon RA is also recommended (this ++# does not replace it), and that Pacemaker health checking should be configured. ++# ++# https://clusterlabs.org/pacemaker/doc/2.1/Pacemaker_Explained/singlehtml/index.html#tracking-node-health ++ ++####################################################################### ++ ++####################################################################### ++# Initialization: ++ ++: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} ++. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ++ ++# ++STORAGEMON=$HA_BIN/storage_mon ++ATTRDUP=/usr/sbin/attrd_updater ++ ++OCF_RESKEY_CRM_meta_interval_default="0" ++OCF_RESKEY_io_timeout_default="10" ++OCF_RESKEY_inject_errors_default="" ++OCF_RESKEY_state_file_default="${HA_RSCTMP%%/}/storage-mon-${OCF_RESOURCE_INSTANCE}.state" ++ ++# Explicitly list all environment variables used, to make static analysis happy ++: ${OCF_RESKEY_CRM_meta_interval:=${OCF_RESKEY_CRM_meta_interval_default}} ++: ${OCF_RESKEY_drives:=""} ++: ${OCF_RESKEY_io_timeout:=${OCF_RESKEY_io_timeout_default}} ++: ${OCF_RESKEY_inject_errors:=${OCF_RESKEY_inject_errors_default}} ++: ${OCF_RESKEY_state_file:=${OCF_RESKEY_state_file_default}} ++ ++####################################################################### ++ ++meta_data() { ++ cat < ++ ++ ++1.0 ++ ++ ++System health agent that checks the storage I/O status of the given drives and ++updates the #health-storage attribute. Usage is highly recommended in combination ++with storage-mon monitoring agent. 
The agent currently support a maximum of 25 ++devices per instance. ++ ++storage I/O health status ++ ++ ++ ++ ++ ++Location to store the resource state in. ++ ++State file ++ ++ ++ ++ ++ ++The drive(s) to check as a SPACE separated list. Enter the full path to the device, e.g. "/dev/sda". ++ ++Drives to check ++ ++ ++ ++ ++ ++Specify disk I/O timeout in seconds. Minimum 1, recommeded 10 (default). ++ ++Disk I/O timeout ++ ++ ++ ++ ++ ++Used only for testing! Specify % of I/O errors to simulate drives failures. ++ ++Specify % of I/O errors to simulate drives failures ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++END ++ return $OCF_SUCCESS ++} ++ ++####################################################################### ++ ++storage-mon_usage() { ++ cat < ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef __FreeBSD__ ++#include ++#endif ++ ++#define MAX_DEVICES 25 ++#define DEFAULT_TIMEOUT 10 ++ ++static void usage(char *name, FILE *f) ++{ ++ fprintf(f, "usage: %s [-hv] [-d ]... [-s ]... [-t ]\n", name); ++ fprintf(f, " --device device to test, up to %d instances\n", MAX_DEVICES); ++ fprintf(f, " --score score if device fails the test. Must match --device count\n"); ++ fprintf(f, " --timeout max time to wait for a device test to come back. 
in seconds (default %d)\n", DEFAULT_TIMEOUT); ++ fprintf(f, " --inject-errors-percent Generate EIO errors %% of the time (for testing only)\n"); ++ fprintf(f, " --verbose emit extra output to stdout\n"); ++ fprintf(f, " --help print this messages\n"); ++} ++ ++/* Check one device */ ++static void *test_device(const char *device, int verbose, int inject_error_percent) ++{ ++ uint64_t devsize; ++ int device_fd; ++ int res; ++ off_t seek_spot; ++ char buffer[512]; ++ ++ if (verbose) { ++ printf("Testing device %s\n", device); ++ } ++ ++ device_fd = open(device, O_RDONLY); ++ if (device_fd < 0) { ++ fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno)); ++ exit(-1); ++ } ++#ifdef __FreeBSD__ ++ res = ioctl(device_fd, DIOCGMEDIASIZE, &devsize); ++#else ++ res = ioctl(device_fd, BLKGETSIZE64, &devsize); ++#endif ++ if (res != 0) { ++ fprintf(stderr, "Failed to stat %s: %s\n", device, strerror(errno)); ++ close(device_fd); ++ exit(-1); ++ } ++ if (verbose) { ++ fprintf(stderr, "%s: size=%zu\n", device, devsize); ++ } ++ /* Don't fret about real randomness */ ++ srand(time(NULL) + getpid()); ++ /* Pick a random place on the device - sector aligned */ ++ seek_spot = (rand() % (devsize-1024)) & 0xFFFFFFFFFFFFFE00; ++ res = lseek(device_fd, seek_spot, SEEK_SET); ++ if (res < 0) { ++ fprintf(stderr, "Failed to seek %s: %s\n", device, strerror(errno)); ++ close(device_fd); ++ exit(-1); ++ } ++ ++ if (verbose) { ++ printf("%s: reading from pos %ld\n", device, seek_spot); ++ } ++ ++ res = read(device_fd, buffer, sizeof(buffer)); ++ if (res < 0) { ++ fprintf(stderr, "Failed to read %s: %s\n", device, strerror(errno)); ++ close(device_fd); ++ exit(-1); ++ } ++ if (res < (int)sizeof(buffer)) { ++ fprintf(stderr, "Failed to read %ld bytes from %s, got %d\n", sizeof(buffer), device, res); ++ close(device_fd); ++ exit(-1); ++ } ++ ++ /* Fake an error */ ++ if (inject_error_percent && ((rand() % 100) < inject_error_percent)) { ++ fprintf(stderr, "People, please fasten your 
seatbelts, injecting errors!\n"); ++ close(device_fd); ++ exit(-1); ++ } ++ res = close(device_fd); ++ if (res != 0) { ++ fprintf(stderr, "Failed to close %s: %s\n", device, strerror(errno)); ++ close(device_fd); ++ exit(-1); ++ } ++ ++ if (verbose) { ++ printf("%s: done\n", device); ++ } ++ exit(0); ++} ++ ++int main(int argc, char *argv[]) ++{ ++ char *devices[MAX_DEVICES]; ++ int scores[MAX_DEVICES]; ++ pid_t test_forks[MAX_DEVICES]; ++ size_t device_count = 0; ++ size_t score_count = 0; ++ size_t finished_count = 0; ++ int timeout = DEFAULT_TIMEOUT; ++ struct timespec ts; ++ time_t start_time; ++ size_t i; ++ int final_score = 0; ++ int opt, option_index; ++ int verbose = 0; ++ int inject_error_percent = 0; ++ struct option long_options[] = { ++ {"timeout", required_argument, 0, 't' }, ++ {"device", required_argument, 0, 'd' }, ++ {"score", required_argument, 0, 's' }, ++ {"inject-errors-percent", required_argument, 0, 0 }, ++ {"verbose", no_argument, 0, 'v' }, ++ {"help", no_argument, 0, 'h' }, ++ {0, 0, 0, 0 } ++ }; ++ while ( (opt = getopt_long(argc, argv, "hvt:d:s:", ++ long_options, &option_index)) != -1 ) { ++ switch (opt) { ++ case 0: /* Long-only options */ ++ if (strcmp(long_options[option_index].name, "inject-errors-percent") == 0) { ++ inject_error_percent = atoi(optarg); ++ if (inject_error_percent < 1 || inject_error_percent > 100) { ++ fprintf(stderr, "inject_error_percent should be between 1 and 100\n"); ++ return -1; ++ } ++ } ++ break; ++ case 'd': ++ if (device_count < MAX_DEVICES) { ++ devices[device_count++] = strdup(optarg); ++ } else { ++ fprintf(stderr, "too many devices, max is %d\n", MAX_DEVICES); ++ return -1; ++ } ++ break; ++ case 's': ++ if (score_count < MAX_DEVICES) { /* scores[] is indexed by score_count, so that counter must bound the write */ ++ int score = atoi(optarg); ++ if (score < 1 || score > 10) { ++ fprintf(stderr, "Score must be between 1 and 10 inclusive\n"); ++ return -1; ++ } ++ scores[score_count++] = score; ++ } else { ++ fprintf(stderr, "too many scores, max is %d\n", MAX_DEVICES); ++ 
return -1; ++ } ++ break; ++ case 'v': ++ verbose++; ++ break; ++ case 't': ++ timeout = atoi(optarg); ++ if (timeout < 1) { ++ fprintf(stderr, "invalid timeout %d. Min 1, recommended %d (default)\n", timeout, DEFAULT_TIMEOUT); ++ return -1; ++ } ++ break; ++ case 'h': ++ usage(argv[0], stdout); ++ break; ++ default: ++ usage(argv[0], stderr); ++ break; ++ } ++ ++ } ++ if (device_count == 0) { ++ fprintf(stderr, "No devices to test, use the -d or --device argument\n"); ++ return -1; ++ } ++ ++ if (device_count != score_count) { ++ fprintf(stderr, "There must be the same number of devices and scores\n"); ++ return -1; ++ } ++ ++ openlog("storage_mon", 0, LOG_DAEMON); ++ ++ memset(test_forks, 0, sizeof(test_forks)); ++ for (i=0; i ts.tv_sec)) { ++ for (i=0; i 0) { ++ w = waitpid(test_forks[i], &wstatus, WUNTRACED | WNOHANG | WCONTINUED); ++ if (w < 0) { ++ fprintf(stderr, "waitpid on %s failed: %s\n", devices[i], strerror(errno)); ++ return -1; ++ } ++ ++ if (w == test_forks[i]) { ++ if (WIFEXITED(wstatus)) { ++ if (WEXITSTATUS(wstatus) == 0) { ++ finished_count++; ++ test_forks[i] = 0; ++ } else { ++ syslog(LOG_ERR, "Error reading from device %s", devices[i]); ++ final_score += scores[i]; ++ } ++ } ++ } ++ } ++ } ++ ++ usleep(100000); ++ ++ clock_gettime(CLOCK_REALTIME, &ts); ++ } ++ ++ /* See which threads have not finished */ ++ for (i=0; i. 
++ ++=head1 VARIABLES ++ ++Here are the variables exported by this module: ++ ++=over ++ ++=item $INITDIR ++ ++=item $HA_DIR ++ ++=item $HA_RCDIR ++ ++=item $HA_CONFDIR ++ ++=item $HA_CF ++ ++=item $HA_VARLIB ++ ++=item $HA_RSCTMP ++ ++=item $HA_RSCTMP_OLD ++ ++=item $HA_FIFO ++ ++=item $HA_BIN ++ ++=item $HA_SBIN_DIR ++ ++=item $HA_DATEFMT ++ ++=item $HA_DEBUGLOG ++ ++=item $HA_RESOURCEDIR ++ ++=item $HA_DOCDIR ++ ++=item $__SCRIPT_NAME ++ ++=item $HA_VARRUN ++ ++=item $HA_VARLOCK ++ ++=item $ocf_prefix ++ ++=item $ocf_exec_prefix ++ ++=back ++ ++=cut ++ ++package OCF_Directories; ++ ++use strict; ++use warnings; ++use 5.008; ++use File::Basename; ++ ++BEGIN { ++ use Exporter; ++ ++ ++ our $VERSION = 'v2.3.0'; ++ our @ISA = ('Exporter'); ++ our @EXPORT = qw( ++ $INITDIR ++ $HA_DIR ++ $HA_RCDIR ++ $HA_CONFDIR ++ $HA_CF ++ $HA_VARLIB ++ $HA_RSCTMP ++ $HA_RSCTMP_OLD ++ $HA_FIFO ++ $HA_BIN ++ $HA_SBIN_DIR ++ $HA_DATEFMT ++ $HA_DEBUGLOG ++ $HA_RESOURCEDIR ++ $HA_DOCDIR ++ $__SCRIPT_NAME ++ $HA_VARRUN ++ $HA_VARLOCK ++ $ocf_prefix ++ $ocf_exec_prefix ++ ); ++ our @EXPORT_OK = ( @EXPORT ); ++} ++ ++our $INITDIR = ( $ENV{'INITDIR'} || '/etc/init.d' ); ++our $HA_DIR = ( $ENV{'HA_DIR'} || '/etc/ha.d' ); ++our $HA_RCDIR = ( $ENV{'HA_RCDIR'} || '/etc/ha.d/rc.d' ); ++our $HA_CONFDIR = ( $ENV{'HA_CONFDIR'} || '/etc/ha.d/conf' ); ++our $HA_CF = ( $ENV{'HA_CF'} || '/etc/ha.d/ha.cf' ); ++our $HA_VARLIB = ( $ENV{'HA_VARLIB'} || '/var/lib/heartbeat' ); ++our $HA_RSCTMP = ( $ENV{'HA_RSCTMP'} || '/run/resource-agents' ); ++our $HA_RSCTMP_OLD = ( $ENV{'HA_RSCTMP_OLD'} || '/var/run/heartbeat/rsctmp' ); ++our $HA_FIFO = ( $ENV{'HA_FIFO'} || '/var/lib/heartbeat/fifo' ); ++our $HA_BIN = ( $ENV{'HA_BIN'} || '/usr/libexec/heartbeat' ); ++our $HA_SBIN_DIR = ( $ENV{'HA_SBIN_DIR'} || '/usr/sbin' ); ++our $HA_DATEFMT = ( $ENV{'HA_DATEFMT'} || '%b %d %T ' ); ++our $HA_DEBUGLOG = ( $ENV{'HA_DEBUGLOG'} || '/dev/null' ); ++our $HA_RESOURCEDIR = ( $ENV{'HA_RESOURCEDIR'}|| '/etc/ha.d/resource.d' ); 
++our $HA_DOCDIR = ( $ENV{'HA_DOCDIR'} || '/usr/share/doc/heartbeat' ); ++our $__SCRIPT_NAME = ( $ENV{'__SCRIPT_NAME'} || fileparse($0) ); ++our $HA_VARRUN = ( $ENV{'HA_VARRUN'} || '/var/run' ); ++our $HA_VARLOCK = ( $ENV{'HA_VARLOCK'} || '/var/lock/subsys' ); ++our $ocf_prefix = '/usr'; ++our $ocf_exec_prefix = '/usr'; ++ ++1; ++ ++=head1 COPYRIGHT AND LICENSE ++ ++Copyright (C) 2016: Jehan-Guillaume de Rorthais and Mael Rimbault. ++ ++Licensed under the PostgreSQL License. ++ +diff --color -uNr a/heartbeat/OCF_Functions.pm b/heartbeat/OCF_Functions.pm +--- a/heartbeat/OCF_Functions.pm 1970-01-01 01:00:00.000000000 +0100 ++++ b/heartbeat/OCF_Functions.pm 2021-04-13 13:37:35.621267404 +0200 +@@ -0,0 +1,631 @@ ++#!/usr/bin/perl ++# This program is open source, licensed under the PostgreSQL License. ++# For license terms, see the LICENSE file. ++# ++# Copyright (C) 2016-2020: Jehan-Guillaume de Rorthais and Mael Rimbault ++ ++=head1 NAME ++ ++OCF_Functions - helper subroutines for OCF agent ++ ++=head1 SYNOPSIS ++ ++ use FindBin; ++ use lib "$FindBin::RealBin/../../lib/heartbeat/"; ++ ++ use OCF_Functions; ++ ++=head1 DESCRIPTION ++ ++This module has been ported from the ocf-shellfuncs shell script of the ++resource-agents project. See L. ++ ++=head1 VARIABLE ++ ++The only variable exported by this module is C<__OCF_ACTION>. 
++ ++=head1 SUBROUTINES ++ ++Here are the subroutines ported from ocf-shellfuncs and exported by this module: ++ ++=over ++ ++=item ha_debug ++ ++=item ha_log ++ ++=item hadate ++ ++=item ocf_is_clone ++ ++=item ocf_is_ms ++ ++=item ocf_is_probe ++ ++=item ocf_is_root ++ ++=item ocf_is_true ++ ++=item ocf_is_ver ++ ++=item ocf_local_nodename ++ ++=item ocf_log ++ ++=item ocf_exit_reason ++ ++=item ocf_maybe_random ++ ++=item ocf_ver2num ++ ++=item ocf_ver_complete_level ++ ++=item ocf_ver_level ++ ++=item ocf_version_cmp ++ ++=item set_logtag ++ ++=back ++ ++Here are the subroutines only existing in the perl module but not in the ++ocf-shellfuncs script: ++ ++=over ++ ++=item ocf_notify_env ++ ++=back ++ ++=cut ++ ++package OCF_Functions; ++ ++use strict; ++use warnings; ++use 5.008; ++use POSIX qw( strftime setlocale LC_ALL ); ++use English; ++ ++use FindBin; ++use lib "$FindBin::RealBin/../../lib/heartbeat/"; ++ ++use OCF_ReturnCodes; ++use OCF_Directories; ++ ++BEGIN { ++ use Exporter; ++ ++ our $VERSION = 'v2.3.0'; ++ our @ISA = ('Exporter'); ++ our @EXPORT = qw( ++ $__OCF_ACTION ++ ocf_is_root ++ ocf_maybe_random ++ ocf_is_true ++ hadate ++ set_logtag ++ ha_log ++ ha_debug ++ ocf_log ++ ocf_exit_reason ++ ocf_is_probe ++ ocf_is_clone ++ ocf_is_ms ++ ocf_is_ver ++ ocf_ver2num ++ ocf_ver_level ++ ocf_ver_complete_level ++ ocf_version_cmp ++ ocf_local_nodename ++ ocf_notify_env ++ ); ++ our @EXPORT_OK = ( @EXPORT ); ++} ++ ++our $__OCF_ACTION; ++ ++sub ocf_is_root { ++ return $EUID == 0; ++} ++ ++sub ocf_maybe_random { ++ return int( rand( 32767 ) ); ++} ++ ++sub ocf_is_true { ++ my $v = shift; ++ return ( defined $v and $v =~ /^(?:yes|true|1|YES|TRUE|ja|on|ON)$/ ); ++} ++ ++sub hadate { ++ return strftime( $HA_DATEFMT, localtime ); ++} ++ ++sub set_logtag { ++ ++ return if defined $ENV{'HA_LOGTAG'} and $ENV{'HA_LOGTAG'} ne ''; ++ ++ if ( defined $ENV{'OCF_RESOURCE_INSTANCE'} and $ENV{'OCF_RESOURCE_INSTANCE'} ne '' ) { ++ $ENV{'HA_LOGTAG'} = 
"$__SCRIPT_NAME($ENV{'OCF_RESOURCE_INSTANCE'})[$PID]"; ++ } ++ else { ++ $ENV{'HA_LOGTAG'}="${__SCRIPT_NAME}[$PID]"; ++ } ++} ++ ++sub __ha_log { ++ my $ignore_stderr = 0; ++ my $loglevel = ''; ++ ++ if ( $_[0] eq '--ignore-stderr' ) { ++ $ignore_stderr = 1; ++ shift; ++ } ++ ++ $ENV{'HA_LOGFACILITY'} = '' if not defined $ENV{'HA_LOGFACILITY'} ++ or $ENV{'HA_LOGFACILITY'} eq 'none'; ++ ++ # if we're connected to a tty, then output to stderr ++ if ( -t STDERR ) { ++ # FIXME ++ # T.N.: this was ported with the bug on $loglevel being empty ++ # and never set before the test here... ++ if ( defined $ENV{'HA_debug'} ++ and $ENV{'HA_debug'} == 0 ++ and $loglevel eq 'debug' ++ ) { ++ return 0; ++ } ++ elsif ( $ignore_stderr ) { ++ # something already printed this error to stderr, so ignore ++ return 0; ++ } ++ if ( defined $ENV{'HA_LOGTAG'} and $ENV{'HA_LOGTAG'} ne '' ) { ++ printf STDERR "%s: %s\n", $ENV{'HA_LOGTAG'}, join ' ', @ARG; ++ } ++ else { ++ printf STDERR "%s\n", join ' ', @ARG; ++ } ++ return 0; ++ } ++ ++ set_logtag(); ++ ++ if ( defined $ENV{'HA_LOGD'} and $ENV{'HA_LOGD'} eq 'yes' ) { ++ system 'ha_logger', '-t', $ENV{'HA_LOGTAG'}, @ARG; ++ return 0 if ( $? 
>> 8 ) == 0; ++ } ++ ++ unless ( $ENV{'HA_LOGFACILITY'} eq '' ) { ++ # logging through syslog ++ # loglevel is unknown, use 'notice' for now ++ $loglevel = 'notice'; ++ for ( "@ARG" ) { ++ if ( /ERROR/ ) { ++ $loglevel = 'err'; ++ } ++ elsif ( /WARN/ ) { ++ $loglevel = 'warning'; ++ } ++ elsif (/INFO|info/ ) { ++ $loglevel = 'info'; ++ } ++ } ++ ++ system 'logger', '-t', $ENV{'HA_LOGTAG'}, '-p', ++ "$ENV{'HA_LOGFACILITY'}.$loglevel", @ARG; ++ } ++ ++ if ( defined $ENV{'HA_LOGFILE'} and $ENV{'HA_LOGFILE'} ne '' ) { ++ # appending to $HA_LOGFILE ++ open my $logfile, '>>', $ENV{'HA_LOGFILE'}; ++ printf $logfile "%s: %s %s\n", $ENV{'HA_LOGTAG'}, hadate(), ++ join (' ', @ARG); ++ close $logfile; ++ } ++ ++ # appending to stderr ++ printf STDERR "%s %s\n", hadate(), join ' ', @ARG ++ if (not defined $ENV{'HA_LOGFACILITY'} or $ENV{'HA_LOGFACILITY'} eq '') ++ and (not defined $ENV{'HA_LOGFILE'} or $ENV{'HA_LOGFILE'} eq '' ) ++ and not $ignore_stderr; ++ ++ if ( defined $ENV{'HA_DEBUGLOG'} and $ENV{'HA_DEBUGLOG'} ne '' ++ and $ENV{'HA_LOGFILE'} ne $ENV{'HA_DEBUGLOG'} ++ ) { ++ # appending to $HA_DEBUGLOG ++ open my $logfile, '>>', $ENV{'HA_DEBUGLOG'}; ++ printf $logfile "%s: %s %s\n", $ENV{'HA_LOGTAG'}, hadate(), ++ join (' ', @ARG); ++ close $logfile; ++ } ++} ++ ++sub ha_log { ++ return __ha_log( @ARG ); ++} ++ ++sub ha_debug { ++ ++ return 0 if defined $ENV{'HA_debug'} and $ENV{'HA_debug'} == 0; ++ ++ if ( -t STDERR ) { ++ if ( defined $ENV{'HA_LOGTAG'} and $ENV{'HA_LOGTAG'} ne '' ) { ++ printf STDERR "%s: %s\n", $ENV{'HA_LOGTAG'}, join ' ', @ARG; ++ } ++ else { ++ printf STDERR "%s\n", join ' ', @ARG; ++ } ++ ++ return 0; ++ } ++ ++ set_logtag(); ++ ++ if ( defined $ENV{'HA_LOGD'} and $ENV{'HA_LOGD'} eq 'yes' ) { ++ system 'ha_logger', '-t', $ENV{'HA_LOGTAG'}, '-D', 'ha-debug', @ARG; ++ return 0 if ( $? 
>> 8 ) == 0; ++ } ++ ++ $ENV{'HA_LOGFACILITY'} = '' if not defined $ENV{'HA_LOGFACILITY'} ++ or $ENV{'HA_LOGFACILITY'} eq 'none'; ++ ++ unless ( $ENV{'HA_LOGFACILITY'} eq '' ) { ++ # logging through syslog ++ ++ system 'logger', '-t', $ENV{'HA_LOGTAG'}, '-p', ++ "$ENV{'HA_LOGFACILITY'}.debug", @ARG; ++ } ++ ++ if ( defined $ENV{'HA_DEBUGLOG'} and -f $ENV{'HA_DEBUGLOG'} ) { ++ my $logfile; ++ # appending to $HA_DEBUGLOG ++ open $logfile, '>>', $ENV{'HA_DEBUGLOG'}; ++ printf $logfile "%s: %s %s\n", $ENV{'HA_LOGTAG'}, hadate(), ++ join (' ', @ARG); ++ close $logfile; ++ } ++ ++ # appending to stderr ++ printf STDERR "%s: %s %s\n", $ENV{'HA_LOGTAG'}, hadate(), join ' ', @ARG ++ if (not defined $ENV{'HA_LOGFACILITY'} or $ENV{'HA_LOGFACILITY'} eq '') ++ and (not defined $ENV{'HA_DEBUGLOG'} or $ENV{'HA_DEBUGLOG'} eq '' ); ++} ++ ++# ++# ocf_log: log messages from the resource agent ++# This function is slightly different from its equivalent in ocf-shellfuncs.in ++# as it behaves like printf. ++# Arguments: ++# * __OCF_PRIO: log level ++# * __OCF_MSG: printf-like format string ++# * all other arguments are values for the printf-like format string ++# ++sub ocf_log { ++ my $__OCF_PRIO; ++ my $__OCF_MSG; ++ ++ # TODO: Revisit and implement internally. 
++ if ( scalar @ARG < 2 ) { ++ ocf_log ( 'err', "Not enough arguments [%d] to ocf_log", scalar @ARG ); ++ } ++ ++ $__OCF_PRIO = shift; ++ $__OCF_MSG = shift; ++ $__OCF_MSG = sprintf $__OCF_MSG, @ARG; ++ ++ for ( $__OCF_PRIO ) { ++ if ( /crit/ ) { $__OCF_PRIO = 'CRIT' } ++ elsif ( /err/ ) { $__OCF_PRIO = 'ERROR' } ++ elsif ( /warn/ ) { $__OCF_PRIO = 'WARNING' } ++ elsif ( /info/ ) { $__OCF_PRIO = 'INFO' } ++ elsif ( /debug/ ) { $__OCF_PRIO = 'DEBUG' } ++ else { $__OCF_PRIO =~ tr/[a-z]/[A-Z]/ } ++ } ++ ++ if ( $__OCF_PRIO eq 'DEBUG' ) { ++ ha_debug( "$__OCF_PRIO: $__OCF_MSG"); ++ } ++ else { ++ ha_log( "$__OCF_PRIO: $__OCF_MSG"); ++ } ++} ++ ++ ++# ++# ocf_exit_reason: print exit error string to stderr and log ++# Usage: Allows the OCF script to provide a string ++# describing why the exit code was returned. ++# Arguments: reason - required, The string that represents ++# why the error occured. ++# ++sub ocf_exit_reason { ++ my $cookie = $ENV{'OCF_EXIT_REASON_PREFIX'} || 'ocf-exit-reason:'; ++ my $fmt; ++ my $msg; ++ ++ # No argument is likely not intentional. ++ # Just one argument implies a printf format string of just "%s". ++ # "Least surprise" in case some interpolated string from variable ++ # expansion or other contains a percent sign. ++ # More than one argument: first argument is going to be the format string. ++ ocf_log ( 'err', 'Not enough arguments [%d] to ocf_exit_reason', ++ scalar @ARG ) if scalar @ARG < 1; ++ ++ $fmt = shift; ++ $msg = sprintf $fmt, @ARG; ++ ++ print STDERR "$cookie$msg\n"; ++ __ha_log( '--ignore-stderr', "ERROR: $msg" ); ++} ++ ++# returns true if the CRM is currently running a probe. A probe is ++# defined as a monitor operation with a monitoring interval of zero. ++sub ocf_is_probe { ++ return ( $__OCF_ACTION eq 'monitor' ++ and $ENV{'OCF_RESKEY_CRM_meta_interval'} == 0 ); ++} ++ ++# returns true if the resource is configured as a clone. 
This is ++# defined as a resource where the clone-max meta attribute is present, ++# and set to greater than zero. ++sub ocf_is_clone { ++ return ( defined $ENV{'OCF_RESKEY_CRM_meta_clone_max'} ++ and $ENV{'OCF_RESKEY_CRM_meta_clone_max'} > 0 ); ++} ++ ++# returns true if the resource is configured as a multistate ++# (master/slave) resource. This is defined as a resource where the ++# master-max meta attribute is present, and set to greater than zero. ++sub ocf_is_ms { ++ return ( defined $ENV{'OCF_RESKEY_CRM_meta_master_max'} ++ and $ENV{'OCF_RESKEY_CRM_meta_master_max'} > 0 ); ++} ++ ++# version check functions ++# allow . and - to delimit version numbers ++# max version number is 999 ++# letters and such are effectively ignored ++# ++sub ocf_is_ver { ++ return $ARG[0] =~ /^[0-9][0-9.-]*[0-9]$/; ++} ++ ++sub ocf_ver2num { ++ my $v = 0; ++ ++ $v = $v * 1000 + $1 while $ARG[0] =~ /(\d+)/g; ++ ++ return $v; ++} ++ ++sub ocf_ver_level { ++ my $v = () = $ARG[0] =~ /(\d+)/g; ++ return $v; ++} ++ ++sub ocf_ver_complete_level { ++ my $ver = shift; ++ my $level = shift; ++ my $i = 0; ++ ++ for ( my $i = 0; $i < $level; $i++ ) { ++ $ver .= '.0'; # pad one missing level; "$ver.0" would append the whole version again ++ } ++ ++ return $ver; ++} ++ ++# usage: ocf_version_cmp VER1 VER2 ++# version strings can contain digits, dots, and dashes ++# must start and end with a digit ++# returns: ++# 0: VER1 smaller (older) than VER2 ++# 1: versions equal ++# 2: VER1 greater (newer) than VER2 ++# 3: bad format ++sub ocf_version_cmp { ++ my $v1 = shift; ++ my $v2 = shift; ++ my $v1_level; ++ my $v2_level; ++ my $level_diff; ++ ++ return 3 unless ocf_is_ver( $v1 ); ++ return 3 unless ocf_is_ver( $v2 ); ++ ++ $v1_level = ocf_ver_level( $v1 ); ++ $v2_level = ocf_ver_level( $v2 ); ++ ++ if ( $v1_level < $v2_level ) { ++ $level_diff = $v2_level - $v1_level; ++ $v1 = ocf_ver_complete_level( $v1, $level_diff ); ++ } ++ elsif ( $v1_level > $v2_level ) { ++ $level_diff = $v1_level - $v2_level; ++ $v2 = ocf_ver_complete_level( $v2, $level_diff ); ++ } ++ ++ 
$v1 = ocf_ver2num( $v1 ); ++ $v2 = ocf_ver2num( $v2 ); ++ ++ if ( $v1 == $v2 ) { return 1; } ++ elsif ( $v1 < $v2 ) { return 0; } ++ ++ return 2; # -1 would look funny in shell ;-) ( T.N. not in perl ;) ) ++} ++ ++sub ocf_local_nodename { ++ # use crm_node -n for pacemaker > 1.1.8 ++ my $nodename; ++ ++ qx{ which pacemakerd > /dev/null 2>&1 }; ++ if ( $? == 0 ) { ++ my $version; ++ my $ret = qx{ pacemakerd -\$ }; ++ ++ $ret =~ /Pacemaker ([\d.]+)/; ++ $version = $1; ++ ++ if ( ocf_version_cmp( $version, '1.1.8' ) == 2 ) { ++ qx{ which crm_node > /dev/null 2>&1 }; ++ $nodename = qx{ crm_node -n } if $? == 0; ++ } ++ } ++ else { ++ # otherwise use uname -n ++ $nodename = qx { uname -n }; ++ } ++ ++ chomp $nodename; ++ return $nodename; ++} ++ ++# Parse and returns the notify environment variables in a convenient structure ++# Returns undef if the action is not a notify ++# Returns undef if the resource is neither a clone or a multistate one ++sub ocf_notify_env { ++ my $i; ++ my %notify_env; ++ ++ return undef unless $__OCF_ACTION eq 'notify'; ++ ++ return undef unless ocf_is_clone() or ocf_is_ms(); ++ ++ %notify_env = ( ++ 'type' => $ENV{'OCF_RESKEY_CRM_meta_notify_type'} || '', ++ 'operation' => $ENV{'OCF_RESKEY_CRM_meta_notify_operation'} || '', ++ 'active' => [ ], ++ 'inactive' => [ ], ++ 'start' => [ ], ++ 'stop' => [ ], ++ ); ++ ++ for my $action ( qw{ active start stop } ) { ++ next unless ++ defined $ENV{"OCF_RESKEY_CRM_meta_notify_${action}_resource"} ++ and defined $ENV{"OCF_RESKEY_CRM_meta_notify_${action}_uname"}; ++ ++ $i = 0; ++ $notify_env{ $action }[$i++]{'rsc'} = $_ foreach split /\s+/ => ++ $ENV{"OCF_RESKEY_CRM_meta_notify_${action}_resource"}; ++ ++ $i = 0; ++ $notify_env{ $action }[$i++]{'uname'} = $_ foreach split /\s+/ => ++ $ENV{"OCF_RESKEY_CRM_meta_notify_${action}_uname"}; ++ } ++ ++ # notify_nactive_uname doesn't exists. 
See: ++ # http://lists.clusterlabs.org/pipermail/developers/2017-January/000406.html ++ if ( defined $ENV{"OCF_RESKEY_CRM_meta_notify_inactive_resource"} ) { ++ $i = 0; ++ $notify_env{'inactive'}[$i++]{'rsc'} = $_ foreach split /\s+/ => ++ $ENV{"OCF_RESKEY_CRM_meta_notify_inactive_resource"}; ++ } ++ ++ # exit if the resource is not a mutistate one ++ return %notify_env unless ocf_is_ms(); ++ ++ for my $action ( qw{ master slave promote demote } ) { ++ $notify_env{ $action } = [ ]; ++ ++ next unless ++ defined $ENV{"OCF_RESKEY_CRM_meta_notify_${action}_resource"} ++ and defined $ENV{"OCF_RESKEY_CRM_meta_notify_${action}_uname"}; ++ ++ $i = 0; ++ $notify_env{ $action }[$i++]{'rsc'} = $_ foreach split /\s+/ => ++ $ENV{"OCF_RESKEY_CRM_meta_notify_${action}_resource"}; ++ ++ $i = 0; ++ $notify_env{ $action }[$i++]{'uname'} = $_ foreach split /\s+/ => ++ $ENV{"OCF_RESKEY_CRM_meta_notify_${action}_uname"}; ++ } ++ ++ # Fix active and inactive fields for Pacemaker version < 1.1.16 ++ # ie. crm_feature_set < 3.0.11 ++ # See http://lists.clusterlabs.org/pipermail/developers/2016-August/000265.html ++ # and git commit a6713c5d40327eff8549e7f596501ab1785b8765 ++ if ( ++ ocf_version_cmp( $ENV{"OCF_RESKEY_crm_feature_set"}, '3.0.11' ) == 0 ++ ) { ++ $notify_env{ 'active' } = [ ++ @{ $notify_env{ 'master' } }, ++ @{ $notify_env{ 'slave' } } ++ ]; ++ } ++ ++ return %notify_env; ++} ++ ++$__OCF_ACTION = $ARGV[0]; ++ ++# Return to sanity for the agents... 
++ ++undef $ENV{'LC_ALL'}; ++$ENV{'LC_ALL'} = 'C'; ++setlocale( LC_ALL, 'C' ); ++undef $ENV{'LANG'}; ++undef $ENV{'LANGUAGE'}; ++ ++$ENV{'OCF_ROOT'} = '/usr/lib/ocf' ++ unless defined $ENV{'OCF_ROOT'} and $ENV{'OCF_ROOT'} ne ''; ++ ++# old ++undef $ENV{'OCF_FUNCTIONS_DIR'} ++ if defined $ENV{'OCF_FUNCTIONS_DIR'} ++ and $ENV{'OCF_FUNCTIONS_DIR'} eq "$ENV{'OCF_ROOT'}/resource.d/heartbeat"; ++ ++# Define OCF_RESKEY_CRM_meta_interval in case it isn't already set, ++# to make sure that ocf_is_probe() always works ++$ENV{'OCF_RESKEY_CRM_meta_interval'} = 0 ++ unless defined $ENV{'OCF_RESKEY_CRM_meta_interval'}; ++ ++# Strip the OCF_RESKEY_ prefix from this particular parameter (default to 0 when it is unset or empty) ++if ( defined $ENV{'OCF_RESKEY_OCF_CHECK_LEVEL'} ++ and $ENV{'OCF_RESKEY_OCF_CHECK_LEVEL'} ne '' ++) { ++ $ENV{'OCF_CHECK_LEVEL'} = $ENV{'OCF_RESKEY_OCF_CHECK_LEVEL'}; ++} ++else { ++ $ENV{'OCF_CHECK_LEVEL'} = 0; ++} ++ ++unless ( -d $ENV{'OCF_ROOT'} ) { ++ ha_log( "ERROR: OCF_ROOT points to non-directory $ENV{'OCF_ROOT'}." ); ++ $! = $OCF_ERR_GENERIC; ++ die; ++} ++ ++$ENV{'OCF_RESOURCE_TYPE'} = $__SCRIPT_NAME ++ unless defined $ENV{'OCF_RESOURCE_TYPE'} ++ and $ENV{'OCF_RESOURCE_TYPE'} ne ''; ++ ++unless ( defined $ENV{'OCF_RA_VERSION_MAJOR'} ++ and $ENV{'OCF_RA_VERSION_MAJOR'} ne '' ++) { ++ # We are being invoked as an init script. ++ # Fill in some things with reasonable values. ++ $ENV{'OCF_RESOURCE_INSTANCE'} = 'default'; ++ return 1; ++} ++ ++$ENV{'OCF_RESOURCE_INSTANCE'} = "undef" if $__OCF_ACTION eq 'meta-data'; ++ ++unless ( defined $ENV{'OCF_RESOURCE_INSTANCE'} ++ and $ENV{'OCF_RESOURCE_INSTANCE'} ne '' ++) { ++ ha_log( "ERROR: Need to tell us our resource instance name." ); ++ $! = $OCF_ERR_ARGS; ++ die; ++} ++ ++1; ++ ++ ++=head1 COPYRIGHT AND LICENSE ++ ++Copyright (C) 2016: Jehan-Guillaume de Rorthais and Mael Rimbault. ++ ++Licensed under the PostgreSQL License. 
+diff --color -uNr a/heartbeat/OCF_ReturnCodes.pm b/heartbeat/OCF_ReturnCodes.pm +--- a/heartbeat/OCF_ReturnCodes.pm 1970-01-01 01:00:00.000000000 +0100 ++++ b/heartbeat/OCF_ReturnCodes.pm 2021-04-13 13:37:35.621267404 +0200 +@@ -0,0 +1,97 @@ ++#!/usr/bin/perl ++# This program is open source, licensed under the PostgreSQL License. ++# For license terms, see the LICENSE file. ++# ++# Copyright (C) 2016-2020: Jehan-Guillaume de Rorthais and Mael Rimbault ++ ++=head1 NAME ++ ++OCF_ReturnCodes - Common varibales for the OCF Resource Agents supplied by ++heartbeat. ++ ++=head1 SYNOPSIS ++ ++ use FindBin; ++ use lib "$FindBin::RealBin/../../lib/heartbeat/"; ++ ++ use OCF_ReturnCodes; ++ ++=head1 DESCRIPTION ++ ++This module has been ported from the ocf-retrurncodes shell script of the ++resource-agents project. See L. ++ ++=head1 VARIABLES ++ ++Here are the variables exported by this module: ++ ++=over ++ ++=item $OCF_SUCCESS ++ ++=item $OCF_ERR_GENERIC ++ ++=item $OCF_ERR_ARGS ++ ++=item $OCF_ERR_UNIMPLEMENTED ++ ++=item $OCF_ERR_PERM ++ ++=item $OCF_ERR_INSTALLED ++ ++=item $OCF_ERR_CONFIGURED ++ ++=item $OCF_NOT_RUNNING ++ ++=item $OCF_RUNNING_MASTER ++ ++=item $OCF_FAILED_MASTER ++ ++=back ++ ++=cut ++ ++package OCF_ReturnCodes; ++ ++use strict; ++use warnings; ++use 5.008; ++ ++BEGIN { ++ use Exporter; ++ ++ our $VERSION = 'v2.3.0'; ++ our @ISA = ('Exporter'); ++ our @EXPORT = qw( ++ $OCF_SUCCESS ++ $OCF_ERR_GENERIC ++ $OCF_ERR_ARGS ++ $OCF_ERR_UNIMPLEMENTED ++ $OCF_ERR_PERM ++ $OCF_ERR_INSTALLED ++ $OCF_ERR_CONFIGURED ++ $OCF_NOT_RUNNING ++ $OCF_RUNNING_MASTER ++ $OCF_FAILED_MASTER ++ ); ++ our @EXPORT_OK = ( @EXPORT ); ++} ++ ++our $OCF_SUCCESS = 0; ++our $OCF_ERR_GENERIC = 1; ++our $OCF_ERR_ARGS = 2; ++our $OCF_ERR_UNIMPLEMENTED = 3; ++our $OCF_ERR_PERM = 4; ++our $OCF_ERR_INSTALLED = 5; ++our $OCF_ERR_CONFIGURED = 6; ++our $OCF_NOT_RUNNING = 7; ++our $OCF_RUNNING_MASTER = 8; ++our $OCF_FAILED_MASTER = 9; ++ ++1; ++ ++=head1 COPYRIGHT AND LICENSE ++ ++Copyright 
(C) 2016: Jehan-Guillaume de Rorthais and Mael Rimbault. ++ ++Licensed under the PostgreSQL License. +diff --color -uNr a/heartbeat/pgsqlms b/heartbeat/pgsqlms +--- a/heartbeat/pgsqlms 1970-01-01 01:00:00.000000000 +0100 ++++ b/heartbeat/pgsqlms 2021-04-13 13:37:40.934280411 +0200 +@@ -0,0 +1,2308 @@ ++#!/usr/bin/perl ++# This program is open source, licensed under the PostgreSQL License. ++# For license terms, see the LICENSE file. ++# ++# Copyright (C) 2016-2020: Jehan-Guillaume de Rorthais and Mael Rimbault ++ ++=head1 NAME ++ ++ocf_heartbeat_pgsqlms - A PostgreSQL multi-state resource agent for Pacemaker ++ ++=head1 SYNOPSIS ++ ++B [start | stop | monitor | promote | demote | notify | reload | methods | meta-data | validate-all] ++ ++=head1 DESCRIPTION ++ ++Resource script for PostgreSQL in replication. It manages PostgreSQL servers using streaming replication as an HA resource. ++ ++=cut ++ ++use strict; ++use warnings; ++use 5.008; ++ ++use POSIX qw(locale_h); ++use Scalar::Util qw(looks_like_number); ++use File::Spec; ++use File::Temp; ++use Data::Dumper; ++ ++my $OCF_FUNCTIONS_DIR; ++BEGIN { ++ $OCF_FUNCTIONS_DIR = defined $ENV{'OCF_FUNCTIONS_DIR'} ? 
"$ENV{'OCF_FUNCTIONS_DIR'}" : "$ENV{'OCF_ROOT'}/lib/heartbeat"; ++} ++use lib "$OCF_FUNCTIONS_DIR"; ++ ++use OCF_ReturnCodes; ++use OCF_Directories; ++use OCF_Functions; ++ ++our $VERSION = 'v2.3.0'; ++our $PROGRAM = 'pgsqlms'; ++ ++# OCF environment ++my $OCF_RESOURCE_INSTANCE = $ENV{'OCF_RESOURCE_INSTANCE'}; ++my $OCF_RUNNING_SLAVE = $OCF_SUCCESS; ++my %OCF_NOTIFY_ENV = ocf_notify_env() if $__OCF_ACTION eq 'notify'; ++ ++# Default parameters values ++my $system_user_default = "postgres"; ++my $bindir_default = "/usr/bin"; ++my $pgdata_default = "/var/lib/pgsql/data"; ++my $pghost_default = "/tmp"; ++my $pgport_default = 5432; ++my $start_opts_default = ""; ++my $maxlag_default = "0"; ++ ++# Set default values if not found in environment ++my $system_user = $ENV{'OCF_RESKEY_system_user'} || $system_user_default; ++my $bindir = $ENV{'OCF_RESKEY_bindir'} || $bindir_default; ++my $pgdata = $ENV{'OCF_RESKEY_pgdata'} || $pgdata_default; ++my $datadir = $ENV{'OCF_RESKEY_datadir'} || $pgdata; ++my $pghost = $ENV{'OCF_RESKEY_pghost'} || $pghost_default; ++my $pgport = $ENV{'OCF_RESKEY_pgport'} || $pgport_default; ++my $start_opts = $ENV{'OCF_RESKEY_start_opts'} || $start_opts_default; ++my $maxlag = $ENV{'OCF_RESKEY_maxlag'} || $maxlag_default; ++my $recovery_tpl = $ENV{'OCF_RESKEY_recovery_template'} ++ || "$pgdata/recovery.conf.pcmk"; ++ ++ ++# PostgreSQL commands path ++my $POSTGRES = "$bindir/postgres"; ++my $PGCTL = "$bindir/pg_ctl"; ++my $PGPSQL = "$bindir/psql"; ++my $PGCTRLDATA = "$bindir/pg_controldata"; ++my $PGISREADY = "$bindir/pg_isready"; ++my $PGWALDUMP = "$bindir/pg_waldump"; ++ ++# pacemaker commands path ++my $CRM_MASTER = "$HA_SBIN_DIR/crm_master --lifetime forever"; ++my $CRM_NODE = "$HA_SBIN_DIR/crm_node"; ++my $CRM_RESOURCE = "$HA_SBIN_DIR/crm_resource"; ++my $ATTRD_PRIV = "$HA_SBIN_DIR/attrd_updater --private --lifetime reboot"; ++ ++# Global vars ++my $nodename; ++my $exit_code = 0; ++# numeric pgsql versions ++my $PGVERNUM; ++my $PGVER_93 = 90300; 
++my $PGVER_10 = 100000; ++my $PGVER_12 = 120000; ++ ++# Run a query using psql. ++# ++# This function returns an array with psql return code as first element and ++# the result as second one. ++# ++sub _query { ++ my $query = shift; ++ my $res = shift; ++ my $connstr = "dbname=postgres"; ++ my $RS = chr(30); # ASCII RS (record separator) ++ my $FS = chr(3); # ASCII ETX (end of text) ++ my $postgres_uid = getpwnam( $system_user ); ++ my $oldeuid = $>; ++ my $tmpfile; ++ my @res; ++ my $ans; ++ my $pid; ++ my $rc; ++ ++ unless ( defined $res and defined $query and $query ne '' ) { ++ ocf_log( 'debug', '_query: wrong parameters!' ); ++ return -1; ++ } ++ ++ unless ( $tmpfile = File::Temp->new( ++ TEMPLATE => 'pgsqlms-XXXXXXXX', ++ DIR => $HA_RSCTMP ++ ) ) ++ { ++ ocf_exit_reason( 'Could not create or write in a temp file' ); ++ exit $OCF_ERR_INSTALLED; ++ } ++ ++ print $tmpfile $query; ++ chmod 0644, $tmpfile; ++ ++ ocf_log( 'debug', '_query: %s', $query ); ++ ++ # Change the effective user to the given system_user so after forking ++ # the given uid to the process should allow psql to connect w/o password ++ $> = $postgres_uid; ++ ++ # Forking + piping ++ $pid = open(my $KID, "-|"); ++ ++ if ( $pid == 0 ) { # child ++ exec $PGPSQL, '--set', 'ON_ERROR_STOP=1', '-qXAtf', $tmpfile, ++ '-R', $RS, '-F', $FS, '--port', $pgport, '--host', $pghost, ++ $connstr; ++ } ++ ++ # parent ++ $> = $oldeuid; ++ ++ { ++ local $/; ++ $ans = <$KID>; ++ } ++ ++ close $KID; ++ $rc = $? >> 8; ++ ++ ocf_log( 'debug', '_query: psql return code: %d', $rc ); ++ ++ if ( defined $ans ) { ++ chop $ans; ++ ++ push @{ $res }, [ split(chr(3) => $_, -1) ] ++ foreach split (chr(30) => $ans, -1); ++ ++ ocf_log( 'debug', '_query: @res: %s', ++ Data::Dumper->new( [ $res ] )->Terse(1)->Dump ); ++ } ++ ++ # Possible return codes: ++ # -1: wrong parameters ++ # 0: OK ++ # 1: failed to get resources (memory, missing file, ...) 
++ # 2: unable to connect ++ # 3: query failed ++ return $rc; ++} ++ ++# Get the last received location on a standby ++# if the first argument is true, returns the value as decimal ++# if the first argument is false, returns the value as LSN ++# Returns undef if query failed ++sub _get_last_received_lsn { ++ my ( $dec ) = @_; ++ my $pg_last_wal_receive_lsn = 'pg_last_wal_receive_lsn()'; ++ my $pg_wal_lsn_diff = 'pg_wal_lsn_diff'; ++ my $query; ++ my $rc; ++ my @rs; ++ ++ if ( $PGVERNUM < $PGVER_10 ) { ++ $pg_last_wal_receive_lsn = 'pg_last_xlog_receive_location()'; ++ $pg_wal_lsn_diff = 'pg_xlog_location_diff'; ++ } ++ ++ if ( $dec ) { ++ $query = "SELECT $pg_wal_lsn_diff( $pg_last_wal_receive_lsn, '0/0' )"; ++ } ++ else { ++ $query = "SELECT $pg_last_wal_receive_lsn"; ++ } ++ ++ $rc = _query( $query, \@rs ); ++ ++ return $rs[0][0] if $rc == 0 and $rs[0][0]; ++ ++ ocf_log( 'err', 'Could not query last received LSN (%s)', $rc ) if $rc != 0; ++ ocf_log( 'err', 'No values for last received LSN' ) ++ if $rc == 0 and not $rs[0][0]; ++ ++ return undef; ++} ++ ++# Get the master score for each connected standby ++# Returns directly the result set of the query or exit with an error. ++# Exits with OCF_ERR_GENERIC if the query failed ++sub _get_lag_scores { ++ my $pg_current_wal_lsn = 'pg_current_wal_lsn()'; ++ my $pg_wal_lsn_diff = 'pg_wal_lsn_diff'; ++ my $write_lsn = 'write_lsn'; ++ my $query; ++ my $rc; ++ my @rs; ++ ++ if ( $PGVERNUM < $PGVER_10 ) { ++ $pg_current_wal_lsn = 'pg_current_xlog_location()'; ++ $pg_wal_lsn_diff = 'pg_xlog_location_diff'; ++ $write_lsn = 'write_location'; ++ } ++ ++ # We check locations of connected standbies by querying the ++ # "pg_stat_replication" view. ++ # The row_number applies on the result set ordered on write_location ASC so ++ # the highest row_number should be given to the closest node from the ++ # master, then the lowest node name (alphanumeric sort) in case of equality. 
++ # The result set itself is order by priority DESC to process best known ++ # candidate first. ++ $query = qq{ ++ SELECT application_name, priority, location, state, current_lag ++ FROM ( ++ SELECT application_name, ++ (1000 - ( ++ row_number() OVER ( ++ PARTITION BY state IN ('startup', 'backup') ++ ORDER BY location ASC, application_name ASC ++ ) - 1 ++ ) * 10 ++ ) * CASE WHEN ( $maxlag > 0 ++ AND current_lag > $maxlag) ++ THEN -1 ++ ELSE 1 ++ END AS priority, ++ location, state, current_lag ++ FROM ( ++ SELECT application_name, $write_lsn AS location, state, ++ $pg_wal_lsn_diff($pg_current_wal_lsn, $write_lsn) AS current_lag ++ FROM pg_stat_replication ++ ) AS s2 ++ ) AS s1 ++ ORDER BY priority DESC ++ }; ++ ++ $rc = _query( $query, \@rs ); ++ ++ if ( $rc != 0 ) { ++ ocf_exit_reason( 'Query to get standby locations failed (%d)', $rc ); ++ exit $OCF_ERR_GENERIC; ++ } ++ ++ return \@rs; ++} ++ ++# get the timeout for the current action given from environment var ++# Returns timeout as integer ++# undef if unknown ++sub _get_action_timeout { ++ my $timeout = $ENV{'OCF_RESKEY_CRM_meta_timeout'} / 1000; ++ ++ ocf_log( 'debug', '_get_action_timeout: known timeout: %s', ++ defined $timeout ? $timeout : 'undef' ); ++ ++ return $timeout if defined $timeout and $timeout =~ /^\d+$/; ++ ++ return undef; ++} ++ ++# Get, parse and return the value of the given private attribute name ++# Returns an empty string if not found. ++sub _get_priv_attr { ++ my ( $name, $node ) = @_; ++ my $val = ''; ++ my $node_arg = ''; ++ my $ans; ++ ++ $node = '' unless defined $node; ++ $name = "$name-$OCF_RESOURCE_INSTANCE"; ++ ++ $node_arg= "--node $node" if $node ne ''; ++ ++ $ans = qx{ $ATTRD_PRIV --name "$name" --query $node_arg }; ++ ++ $ans =~ m/^name=".*" host=".*" value="(.*)"$/; ++ ++ $val = $1 if defined $1; ++ ++ ocf_log( 'debug', '_get_priv_attr: value of "%s"%s is "%s"', $name, ++ ( $node ? 
" on \"$node\"": ""), ++ $val ); ++ ++ return $val; ++} ++ ++# Set the given private attribute name to the given value ++# As setting an attribute is asynchronous, this will return as soon as the ++# attribute is really set by attrd and available. ++sub _set_priv_attr { ++ my ( $name, $val ) = @_; ++ my $name_instance = "$name-$OCF_RESOURCE_INSTANCE"; ++ ++ ocf_log( 'debug', '_set_priv_attr: set "%s=%s"...', $name_instance, $val ); ++ ++ qx{ $ATTRD_PRIV --name "$name_instance" --update "$val" }; ++ ++ # give attr name without the resource instance name as _get_priv_attr adds ++ # it as well ++ while ( _get_priv_attr( $name ) ne $val ) { ++ ocf_log( 'debug', '_set_priv_attr: waiting attrd ack for "%s"...', $name_instance ); ++ select( undef, undef, undef, 0.1 ); ++ } ++ ++ return; ++} ++ ++# Delete the given private attribute. ++# As setting an attribute is asynchronous, this will return as soon as the ++# attribute is really deleted by attrd. ++sub _delete_priv_attr { ++ my ( $name ) = @_; ++ my $name_instance = "$name-$OCF_RESOURCE_INSTANCE"; ++ ++ ocf_log( 'debug', '_delete_priv_attr: delete "%s"...', $name_instance ); ++ ++ qx{ $ATTRD_PRIV --name "$name_instance" --delete }; ++ ++ # give attr name without the resource instance name as _get_priv_attr adds ++ # it as well ++ while ( _get_priv_attr( $name ) ne '' ) { ++ ocf_log( 'debug', '_delete_priv_attr: waiting attrd ack for "%s"...', ++ $name_instance ); ++ select( undef, undef, undef, 0.1 ); ++ } ++ ++ return; ++} ++ ++# Get, parse and return the resource master score on given node. ++# Returns an empty string if not found. ++# Returns undef on crm_master call on error ++sub _get_master_score { ++ my ( $node ) = @_; ++ my $node_arg = ''; ++ my $score; ++ ++ $node_arg = sprintf '--node "%s"', $node if defined $node and $node ne ''; ++ ++ $score = qx{ $CRM_MASTER --quiet --get-value $node_arg 2> /dev/null }; ++ ++ return '' unless $? 
== 0 and defined $score; ++ ++ chomp $score; ++ ++ return $score; ++} ++ ++# Set the master score of the local node or the optionally given node. ++# As setting an attribute is asynchronous, this will return as soon as the ++# attribute is really set by attrd and available everywhere. ++sub _set_master_score { ++ my ( $score, $node ) = @_; ++ my $node_arg = ''; ++ my $tmp; ++ ++ $node_arg = sprintf '--node "%s"', $node if defined $node and $node ne ''; ++ ++ qx{ $CRM_MASTER $node_arg --quiet --update "$score" }; ++ ++ while ( ( $tmp = _get_master_score( $node ) ) ne $score ) { ++ ocf_log( 'debug', ++ '_set_master_score: waiting to set score to "%s" (currently "%s")...', ++ $score, $tmp ); ++ select(undef, undef, undef, 0.1); ++ } ++ ++ return; ++} ++ ++# _master_score_exists ++# This subroutine checks if a master score is set for one of the relative clones ++# in the cluster and the score is greater or equal of 0. ++# Returns 1 if at least one master score >= 0 is found. ++# Returns 0 otherwise ++sub _master_score_exists { ++ my @partition_nodes = split /\s+/ => qx{ $CRM_NODE --partition }; ++ ++ foreach my $node ( @partition_nodes ) { ++ my $score = _get_master_score( $node ); ++ ++ return 1 if defined $score and $score ne '' and $score > -1; ++ } ++ ++ return 0; ++} ++ ++# Check if the current transiation is a recover of a master clone on given node. ++sub _is_master_recover { ++ my ( $n ) = @_; ++ ++ return ( ++ scalar grep { $_->{'uname'} eq $n } @{ $OCF_NOTIFY_ENV{'master'} } ++ and scalar grep { $_->{'uname'} eq $n } @{ $OCF_NOTIFY_ENV{'promote'} } ++ ); ++} ++ ++# Check if the current transition is a recover of a slave clone on given node. ++sub _is_slave_recover { ++ my ( $n ) = @_; ++ ++ return ( ++ scalar grep { $_->{'uname'} eq $n } @{ $OCF_NOTIFY_ENV{'slave'} } ++ and scalar grep { $_->{'uname'} eq $n } @{ $OCF_NOTIFY_ENV{'start'} } ++ ); ++} ++ ++# check if th current transition is a switchover to the given node. 
++sub _is_switchover {
++    my ( $n ) = @_;
++    # True when the single current master is demoted (not stopped) while the
++    # slave $n is promoted. Sizes are checked before reading master[0], as
++    # that read would autovivify an empty list and defeat the check.
++    return 0 if scalar @{ $OCF_NOTIFY_ENV{'master'} } != 1
++        or scalar @{ $OCF_NOTIFY_ENV{'demote'} } != 1
++        or scalar @{ $OCF_NOTIFY_ENV{'promote'} } != 1;
++    my $old = $OCF_NOTIFY_ENV{'master'}[0]{'uname'};
++    return (
++        scalar grep { $_->{'uname'} eq $old } @{ $OCF_NOTIFY_ENV{'demote'} }
++        and scalar grep { $_->{'uname'} eq $n } @{ $OCF_NOTIFY_ENV{'slave'} }
++        and scalar grep { $_->{'uname'} eq $n } @{ $OCF_NOTIFY_ENV{'promote'} }
++        and not scalar grep { $_->{'uname'} eq $old } @{ $OCF_NOTIFY_ENV{'stop'} }
++    );
++}
++
++# Run the given command as the "system_user" given as parameter and return
++# its exit code. It basically forks and seteuid/setuid away from root.
++# Exits with OCF_ERR_GENERIC if fork fails; the child exits 127 if exec fails.
++sub _runas {
++    my @cmd = @_;
++    my (undef, undef, $postgres_uid, $postgres_gid ) = getpwnam( $system_user );
++    my $pid = fork;
++
++    # fork failed: undef == 0 is true, the parent must not enter the child code
++    unless ( defined $pid ) {
++        ocf_exit_reason( 'Could not fork to run "%s": %s', $cmd[0], $! );
++        exit $OCF_ERR_GENERIC;
++    }
++
++    if ( $pid == 0 ) { # in child
++        $) = "$postgres_gid $postgres_gid";
++        while ( my ( undef, undef, $gid, $members ) = getgrent ) {
++            $) .= " $gid" if grep { $system_user eq $_ } split /\s+/, $members
++        }
++        $( = $postgres_gid;
++        $< = $> = $postgres_uid;
++        # exec only returns on failure: leave without running the parent code
++        exec @cmd or POSIX::_exit( 127 );
++    }
++
++    ocf_log( 'debug', '_runas: launching as "%s" command "%s"', $system_user,
++        join(' ', @cmd) );
++    waitpid $pid, 0;
++    return $? >> 8;
++}
++
++# Check if instance is listening on the given host/port.
++#
++sub _pg_isready {
++    # Wait 60s longer than the action timeout (24h when it is unknown), so
++    # Pacemaker gives up before we do and takes the decisions
++    my $wait = 60 + ( _get_action_timeout() || 24 * 3600 );
++
++    # pg_isready error codes handed back to the caller:
++    #   1: ping rejected (usually when instance is in startup, in crash
++    #      recovery, in warm standby, or when a shutdown is in progress)
++    #   2: no response, usually means the instance is down
++    #   3: no attempt, probably a syntax error, should not happen
++    return _runas(
++        $PGISREADY, '-h', $pghost, '-p', $pgport, '-d', 'postgres', '-t', $wait );
++}
++
++# Check the postmaster.pid file and the postmaster process.
++# WARNING: a missing postmaster.pid and a pid file whose process is no longer
++# alive give the same code, so they can not be told apart. It should be ok
++# though, as this is considered a hard error from monitor.
++#
++sub _pg_ctl_status {
++    # pg_ctl error codes handed back to the caller:
++    #   3: postmaster.pid file does not exist OR it does but the process
++    #      with the PID found in the file is not alive
++    my @status_cmd = ( $PGCTL, '--pgdata', $pgdata, 'status' );
++
++    return _runas( @status_cmd );
++}
++
++# Start the local instance using pg_ctl
++#
++sub _pg_ctl_start {
++    # Wait 60s longer than the action timeout (24h when it is unknown), so
++    # Pacemaker gives up before we do and takes the decisions
++    my $wait = 60 + ( _get_action_timeout() || 24 * 3600 );
++
++    # Hand the extra postgres options over with -o only when some are set
++    my @extra = $start_opts eq '' ? () : ( '-o', $start_opts );
++
++    return _runas( $PGCTL, '--pgdata', $pgdata, '-w', '--timeout', $wait,
++        'start', @extra );
++}
++
++# Enable the Standby mode.
++#
++# Up to v11, creates the recovery.conf file based on the given template.
++# Since v12, creates standby.signal.
++sub _enable_recovery {
++    # Takes no argument. Exits with OCF_ERR_CONFIGURED on any file error.
++    my $fh;
++    my $content = '';
++    my $standby_file = "$datadir/standby.signal";
++    my (undef, undef, $uid, $gid) = getpwnam($system_user);
++
++    if ( $PGVERNUM < $PGVER_12 ) {
++        $standby_file = "$datadir/recovery.conf";
++
++        ocf_log( 'debug',
++            '_enable_recovery: get replication configuration from the template file "%s"',
++            $recovery_tpl );
++
++        # Create the recovery.conf file to start the instance as a secondary.
++        # NOTE: the recovery.conf is supposed to be set up so the secondary can
++        # connect to the primary instance, eg. using a virtual IP address. With no
++        # primary available at startup, secondaries will complain about failing to
++        # connect. We live with this: reloading it would require a standby restart.
++        # FIXME how would the reload help us in this case ?
++        unless ( defined open( $fh, '<', $recovery_tpl ) ) {
++            ocf_exit_reason( 'Could not open file "%s": %s', $recovery_tpl, $! );
++            exit $OCF_ERR_CONFIGURED;
++        }
++
++        # Copy all parameters from the template file
++        while (my $line = <$fh>) {
++            chomp $line;
++            $content .= "$line\n";
++        }
++        close $fh;
++    }
++
++    ocf_log( 'debug', '_enable_recovery: write the standby file "%s"', $standby_file );
++
++    unless ( open( $fh, '>', $standby_file ) ) {
++        ocf_exit_reason( 'Could not open file "%s": %s', $standby_file, $! );
++        exit $OCF_ERR_CONFIGURED;
++    }
++
++    # Write the content (empty for standby.signal); close is checked as it flushes
++    unless ( print( $fh $content ) and close( $fh ) ) {
++        ocf_exit_reason( 'Could not write file "%s": %s', $standby_file, $! );
++        exit $OCF_ERR_CONFIGURED;
++    }
++
++    unless ( chown $uid, $gid, $standby_file ) {
++        ocf_exit_reason( 'Could not set owner of "%s"', $standby_file );
++        exit $OCF_ERR_CONFIGURED;
++    };
++}
++
++# Parse and return various informations about the local PostgreSQL instance as
++# reported by its controldata file.
++#
++# WARNING: the status is NOT updated in case of crash.
++# ++# This sub exit the script with an error on failure ++sub _get_controldata { ++ my %controldata; ++ my $ans; ++ ++ $ans = qx{ $PGCTRLDATA "$datadir" 2>/dev/null }; ++ ++ # Parse the output of pg_controldata. ++ # This output is quite stable between pg versions, but we might need to sort ++ # it at some point if things are moving in there... ++ $ans =~ m{ ++ # get the current state ++ ^\QDatabase cluster state\E:\s+(.*?)\s*$ ++ .* ++ # Get the latest known REDO location ++ ^\QLatest checkpoint's REDO location\E:\s+([/0-9A-F]+)\s*$ ++ .* ++ # Get the latest known TL ++ ^\QLatest checkpoint's TimeLineID\E:\s+(\d+)\s*$ ++ .* ++ # Get the wal level ++ # NOTE: pg_controldata output changed with PostgreSQL 9.5, so we need to ++ # account for both syntaxes ++ ^(?:\QCurrent \E)?\Qwal_level setting\E:\s+(.*?)\s*$ ++ }smx; ++ ++ $controldata{'state'} = $1 if defined $1; ++ $controldata{'redo'} = $2 if defined $2; ++ $controldata{'tl'} = $3 if defined $3; ++ $controldata{'wal_level'} = $4 if defined $4; ++ ++ ocf_log( 'debug', ++ "_get_controldata: found: %s", ++ Data::Dumper->new( [ \%controldata ] )->Terse(1)->Dump ); ++ ++ return %controldata if defined $controldata{'state'} ++ and defined $controldata{'tl'} ++ and defined $controldata{'redo'} ++ and defined $controldata{'wal_level'}; ++ ++ ocf_exit_reason( 'Could not read all datas from controldata file for "%s"', ++ $datadir ); ++ ++ ocf_log( 'debug', ++ "_get_controldata: controldata file: %s", ++ Data::Dumper->new( [ \%controldata ] )->Terse(1)->Dump, $ans ); ++ ++ exit $OCF_ERR_ARGS; ++} ++ ++# Pead major version from datadir/PG_VERSION and return it as numeric version ++sub _get_pg_version { ++ my $fh; ++ my $PGVERSION; ++ my $PGVERNUM; ++ ++ # check PG_VERSION ++ if ( ! 
-s "$datadir/PG_VERSION" ) { ++ ocf_exit_reason( 'PG_VERSION does not exist in "%s"', $datadir ); ++ exit $OCF_ERR_ARGS; ++ } ++ ++ unless ( open( $fh, '<', "$datadir/PG_VERSION" ) ) { ++ ocf_exit_reason( "Could not open file \"$datadir/PG_VERSION\": $!" ); ++ exit $OCF_ERR_ARGS; ++ } ++ ++ read( $fh, $PGVERSION, 32 ); ++ close $fh; ++ ++ chomp $PGVERSION; ++ ++ $PGVERSION =~ /^(\d+)(?:\.(\d+))?$/; ++ $PGVERNUM = $1 * 10000; ++ $PGVERNUM += $2 * 100 if $1 < 10; # no 2nd num in the major version from v10 ++ ++ return $PGVERNUM; ++} ++ ++# Use pg_controldata to check the state of the PostgreSQL server. This ++# function returns codes depending on this state, so we can find whether the ++# instance is a primary or a secondary, or use it to detect any inconsistency ++# that could indicate the instance has crashed. ++# ++sub _controldata_to_ocf { ++ my %cdata = _get_controldata(); ++ ++ while ( 1 ) { ++ ocf_log( 'debug', '_controldata: instance "%s" state is "%s"', ++ $OCF_RESOURCE_INSTANCE, $cdata{'state'} ); ++ ++ # Instance should be running as a primary. ++ return $OCF_RUNNING_MASTER if $cdata{'state'} eq "in production"; ++ ++ # Instance should be running as a secondary. ++ # This state includes warm standby (rejects connections attempts, ++ # including pg_isready) ++ return $OCF_SUCCESS if $cdata{'state'} eq "in archive recovery"; ++ ++ ++ # The instance should be stopped. ++ # We don't care if it was a primary or secondary before, because we ++ # always start instances as secondaries, and then promote if necessary. ++ return $OCF_NOT_RUNNING if $cdata{'state'} eq "shut down" ++ or $cdata{'state'} eq "shut down in recovery"; ++ ++ # The state is "in crash recovery", "starting up" or "shutting down". ++ # This state should be transitional, so we wait and loop to check if ++ # it changes. ++ # If it does not, pacemaker will eventually abort with a timeout. 
++ ocf_log( 'debug', ++ '_controldata: waiting for transitionnal state "%s" to finish', ++ $cdata{'state'} ); ++ sleep 1; ++ %cdata = _get_controldata(); ++ } ++ ++ # If we reach this point, something went really wrong with this code or ++ # pg_controldata. ++ ocf_exit_reason( 'Unable get instance "%s" state using pg_controldata', ++ $OCF_RESOURCE_INSTANCE ); ++ ++ return $OCF_ERR_INSTALLED ; ++} ++ ++# Check the write_location of all secondaries, and adapt their master score so ++# that the instance closest to the master will be the selected candidate should ++# a promotion be triggered. ++# NOTE: This is only a hint to pacemaker! The selected candidate to promotion ++# actually re-check it is the best candidate and force a re-election by failing ++# if a better one exists. This avoid a race condition between the call of the ++# monitor action and the promotion where another slave might have catchup faster ++# with the master. ++# NOTE: we cannot directly use the write_location, neither a lsn_diff value as ++# promotion score as Pacemaker considers any value greater than 1,000,000 as ++# INFINITY. ++# ++# This sub must be executed from a master monitor action. ++# ++sub _check_locations { ++ my $partition_nodes; ++ my $node_score; ++ my $row_num; ++ my $row; ++ my @rs; ++ ++ # Set the master score if not already done ++ $node_score = _get_master_score(); ++ _set_master_score( '1001' ) unless $node_score eq '1001'; ++ ++ # Ask crm_node what nodes are present in our current cluster partition ++ $partition_nodes = qx{ $CRM_NODE --partition }; ++ ++ @rs = @{ _get_lag_scores() }; ++ ++ $row_num = scalar @rs; ++ ++ # If no lag are reported at this point, it means that there is no ++ # secondary instance connected. 
++ ocf_log( 'warning', 'No secondary connected to the master' ) ++ if $row_num == 0; ++ ++ # For each standby connected, set their master score based on the following ++ # rule: the first known node/application, with the highest priority and ++ # an acceptable state. ++ while ( $row = shift @rs ) { ++ ++ if ( $partition_nodes !~ /$row->[0]/ ) { ++ ocf_log( 'info', 'Ignoring unknown application_name/node "%s"', ++ $row->[0] ); ++ next; ++ } ++ ++ if ( $row->[0] eq $nodename ) { ++ ocf_log( 'warning', 'Streaming replication with myself!' ); ++ next; ++ } ++ ++ $node_score = _get_master_score( $row->[0] ); ++ ++ if ( $row->[3] =~ /^\s*(?:startup|backup)\s*$/ ) { ++ # We exclude any standby being in state backup (pg_basebackup) or ++ # startup (new standby or failing standby) ++ ocf_log( 'info', 'Forbidding promotion on "%s" in state "%s"', ++ $row->[0], $row->[3] ); ++ ++ _set_master_score( '-1', $row->[0] ) unless $node_score eq '-1'; ++ } ++ else { ++ ocf_log( 'debug', ++ '_check_locations: checking "%s" promotion ability (current_score: %s, priority: %s, location: %s, lag: %s)', ++ $row->[0], $node_score, $row->[1], $row->[2], $row->[4] ); ++ ++ if ( $node_score ne $row->[1] ) { ++ if ( $row->[1] < -1 ) { ++ ocf_log( 'info', 'Update score of "%s" from %s to %s because replication lag (%s) is higher than given maxlag (%s).', ++ $row->[0], $node_score, $row->[1], $row->[4], $maxlag ); ++ } ++ else { ++ ocf_log( 'info', 'Update score of "%s" from %s to %s because of a change in the replication lag (%s).', ++ $row->[0], $node_score, $row->[1], $row->[4] ); ++ } ++ _set_master_score( $row->[1], $row->[0] ); ++ } ++ else { ++ ocf_log( 'debug', ++ '_check_locations: "%s" keeps its current score of %s', ++ $row->[0], $row->[1] ); ++ } ++ } ++ ++ # Remove this node from the known nodes list. 
++ $partition_nodes =~ s/(?:^|\s)$row->[0](?:\s|$)/ /g; ++ } ++ ++ $partition_nodes =~ s/(?:^\s+)|(?:\s+$)//g; ++ ++ # If there are still nodes in "partition_nodes", it means there is no ++ # corresponding line in "pg_stat_replication". ++ # Exclude these nodes that are not part of the cluster at this ++ # point. ++ foreach my $node (split /\s+/ => $partition_nodes) { ++ # Exclude the current node. ++ next if $node eq $nodename; ++ ++ # do not warn if the master score is already set to -1000. ++ # this avoid log flooding (gh #138) ++ $node_score = _get_master_score( $node ); ++ next if $node_score eq '-1000'; ++ ++ ocf_log( 'warning', '"%s" is not connected to the primary', $node ); ++ _set_master_score( '-1000', $node ); ++ } ++ ++ return $OCF_SUCCESS; ++} ++ ++# _check_switchover ++# check if the pgsql switchover to the localnode is safe. ++# This is supposed to be called **after** the master has been stopped or demoted. ++# This sub checks if the local standby received the shutdown checkpoint from the ++# old master to make sure it can take over the master role and the old master ++# will be able to catchup as a standby after. ++# ++# Returns 0 if switchover is safe ++# Returns 1 if swithcover is not safe ++# Returns 2 for internal error ++sub _check_switchover { ++ my $has_sht_chk = 0; ++ my $last_redo; ++ my $last_lsn; ++ my $ans; ++ my $rc; ++ my $tl; ++ my %cdata; ++ ++ $PGWALDUMP = "$bindir/pg_xlogdump" if $PGVERNUM < $PGVER_10; ++ ++ ocf_log( 'info', 'Switchover in progress from "%s" to "%s".' ++ .' Need to check the last record in WAL', ++ $OCF_NOTIFY_ENV{'demote'}[0]{'uname'}, $nodename ); ++ ++ # check if we received the shutdown checkpoint of the master during its ++ # demote process. ++ # We need the last local checkpoint LSN and the last received LSN from ++ # master to check in the WAL between these adresses if we have a ++ # "checkpoint shutdown" using pg_xlogdump/pg_waldump. 
++ # ++ # Force a checkpoint to make sure the controldata shows the very last TL ++ # and the master's shutdown checkpoint ++ _query( q{ CHECKPOINT }, {} ); ++ %cdata = _get_controldata(); ++ $tl = $cdata{'tl'}; ++ $last_redo = $cdata{'redo'}; ++ ++ # Get the last received LSN from master ++ $last_lsn = _get_last_received_lsn(); ++ ++ unless ( defined $last_lsn ) { ++ ocf_exit_reason( 'Could not fetch last received LSN!' ); ++ ++ return 2; ++ } ++ ++ $ans = qx{ $PGWALDUMP --path "$datadir" --timeline "$tl" \\ ++ --start "$last_redo" --end "$last_lsn" 2>&1 }; ++ $rc = $?; ++ ++ ocf_log( 'debug', ++ '_check_switchover: %s rc: "%s", tl: "%s", last_chk: %s, last_lsn: %s, output: "%s"', ++ $PGWALDUMP, $rc, $tl, $last_redo, $last_lsn, $ans ++ ); ++ ++ if ( $rc == 0 and ++ $ans =~ m{^rmgr: XLOG.*desc: (?i:checkpoint)(?::|_SHUTDOWN) redo [0-9A-F/]+; tli $tl;.*; shutdown$}m ++ ) { ++ ocf_log( 'info', 'Slave received the shutdown checkpoint' ); ++ return 0; ++ } ++ ++ ocf_exit_reason( ++ 'Did not receive the shutdown checkpoint from the old master!' ); ++ ++ return 1; ++} ++ ++# Check to confirm if the instance is really started as _pg_isready stated and ++# check if the instance is primary or secondary. ++# ++sub _confirm_role { ++ my $is_in_recovery; ++ my $rc; ++ my @rs; ++ ++ $rc = _query( "SELECT pg_is_in_recovery()", \@rs ); ++ ++ $is_in_recovery = $rs[0][0]; ++ ++ if ( $rc == 0 ) { ++ # The query was executed, check the result. ++ if ( $is_in_recovery eq 't' ) { ++ # The instance is a secondary. ++ ocf_log( 'debug', "_confirm_role: instance $OCF_RESOURCE_INSTANCE is a secondary"); ++ return $OCF_SUCCESS; ++ } ++ elsif ( $is_in_recovery eq 'f' ) { ++ # The instance is a primary. ++ ocf_log( 'debug', "_confirm_role: instance $OCF_RESOURCE_INSTANCE is a primary"); ++ # Check lsn diff with current slaves if any ++ _check_locations() if $__OCF_ACTION eq 'monitor'; ++ return $OCF_RUNNING_MASTER; ++ } ++ ++ # This should not happen, raise a hard configuration error. 
++ ocf_exit_reason( ++ 'Unexpected result from query to check if "%s" is a primary or a secondary: "%s"', ++ $OCF_RESOURCE_INSTANCE, $is_in_recovery ); ++ ++ return $OCF_ERR_CONFIGURED; ++ } ++ elsif ( $rc == 1 or $rc == 2 ) { ++ # psql cound not connect to the instance. ++ # As pg_isready reported the instance was listening, this error ++ # could be a max_connection saturation. Just report a soft error. ++ ocf_exit_reason( 'psql could not connect to instance "%s"', ++ $OCF_RESOURCE_INSTANCE ); ++ return $OCF_ERR_GENERIC; ++ } ++ ++ # The query failed (rc: 3) or bad parameters (rc: -1). ++ # This should not happen, raise a hard configuration error. ++ ocf_exit_reason( ++ 'The query to check if instance "%s" is a primary or a secondary failed (rc: %d)', ++ $OCF_RESOURCE_INSTANCE, $rc ); ++ ++ return $OCF_ERR_CONFIGURED; ++} ++ ++ ++# Check to confirm if the instance is really stopped as _pg_isready stated ++# and if it was propertly shut down. ++# ++sub _confirm_stopped { ++ my $pgctlstatus_rc; ++ my $controldata_rc; ++ ++ # Check the postmaster process status. ++ $pgctlstatus_rc = _pg_ctl_status(); ++ ++ if ( $pgctlstatus_rc == 0 ) { ++ # The PID file exists and the process is available. ++ # That should not be the case, return an error. ++ ocf_exit_reason( ++ 'Instance "%s" is not listening, but the process referenced in postmaster.pid exists', ++ $OCF_RESOURCE_INSTANCE ); ++ return $OCF_ERR_GENERIC; ++ } ++ ++ # The PID file does not exist or the process is not available. ++ ocf_log( 'debug', ++ '_confirm_stopped: no postmaster process found for instance "%s"', ++ $OCF_RESOURCE_INSTANCE ); ++ ++ if ( -f "$datadir/backup_label" ) { ++ # We are probably on a freshly built secondary that was not started yet. ++ ocf_log( 'debug', ++ '_confirm_stopped: backup_label file exists: probably on a never started secondary', ++ ); ++ return $OCF_NOT_RUNNING; ++ } ++ ++ # Continue the check with pg_controldata. 
++ $controldata_rc = _controldata_to_ocf(); ++ if ( $controldata_rc == $OCF_RUNNING_MASTER ) { ++ # The controldata has not been updated to "shutdown". ++ # It should mean we had a crash on a primary instance. ++ ocf_exit_reason( ++ 'Instance "%s" controldata indicates a running primary instance, the instance has probably crashed', ++ $OCF_RESOURCE_INSTANCE ); ++ return $OCF_FAILED_MASTER; ++ } ++ elsif ( $controldata_rc == $OCF_SUCCESS ) { ++ # The controldata has not been updated to "shutdown in recovery". ++ # It should mean we had a crash on a secondary instance. ++ # There is no "FAILED_SLAVE" return code, so we return a generic error. ++ ocf_exit_reason( ++ 'Instance "%s" controldata indicates a running secondary instance, the instance has probably crashed', ++ $OCF_RESOURCE_INSTANCE ); ++ return $OCF_ERR_GENERIC; ++ } ++ elsif ( $controldata_rc == $OCF_NOT_RUNNING ) { ++ # The controldata state is consistent, the instance was probably ++ # propertly shut down. ++ ocf_log( 'debug', ++ '_confirm_stopped: instance "%s" controldata indicates that the instance was propertly shut down', ++ $OCF_RESOURCE_INSTANCE ); ++ return $OCF_NOT_RUNNING; ++ } ++ ++ # Something went wrong with the controldata check. ++ ocf_exit_reason( ++ 'Could not get instance "%s" status from controldata (returned: %d)', ++ $OCF_RESOURCE_INSTANCE, $controldata_rc ); ++ ++ return $OCF_ERR_GENERIC; ++} ++ ++############################################################ ++#### OCF FUNCS ++ ++ ++ ++=head1 SUPPORTED PARAMETERS ++ ++=over ++ ++=item B ++ ++Location of the PGDATA of your instance ++ ++(optional, string, default "/var/lib/pgsql/data") ++ ++=item B ++ ++The socket directory or IP address to use to connect to the local instance ++ ++(optional, string, default "/tmp") ++ ++=item B ++ ++The port to connect to the local instance ++ ++(optional, integer, default "5432") ++ ++=item B ++ ++Location of the PostgreSQL binaries. 
++ ++(optional, string, default "/usr/bin") ++ ++=item B ++ ++The system owner of your instance's process ++ ++(optional, string, default "postgres") ++ ++=item B ++ ++B for PostgreSQL 11 and below. ++ ++The local template that will be copied as the C file. ++This template file must exist on all nodes. ++ ++With PostgreSQL 12 and higher, the cluster will refuse to start if this ++parameter is set or a template file is found. ++ ++(optional, string, default "$PGDATA/recovery.conf.pcmk") ++ ++=item B ++ ++Maximum lag allowed on a standby before we set a negative master score on it. ++The calculation is based on the difference between the current xlog location on ++the master and the write location on the standby. ++ ++(optional, integer, default "0" disables this feature) ++ ++=item B ++ ++Path to the directory set in C from your postgresql.conf file. ++This parameter has the same default as PostgreSQL itself: the C parameter ++value. ++ ++Unless you have a special PostgreSQL setup and you understand this parameter, ++B ++ ++(optional, string, default to the value of C) ++ ++=item B ++ ++Additional arguments given to the postgres process on startup. See ++"postgres --help" for available options. Useful when the postgresql.conf file ++is not in the data directory (PGDATA), e.g.: ++ ++ -c config_file=/etc/postgresql/9.3/main/postgresql.conf ++ ++(optional, string, default "") ++ ++=back ++ ++=cut ++ ++sub ocf_meta_data { ++ print qq{ ++ ++ ++ 1.0 ++ ++ ++ Resource script for PostgreSQL in replication. It manages PostgreSQL servers using streaming replication as an HA resource. ++ ++ Manages PostgreSQL servers in replication ++ ++ ++ ++ System user account used to run the PostgreSQL server ++ ++ PostgreSQL system User ++ ++ ++ ++ ++ ++ Path to the directory storing the PostgreSQL binaries. The agent uses psql, pg_isready, pg_controldata and pg_ctl. ++ ++ Path to the PostgreSQL binaries ++ ++ ++ ++ ++ ++ Path to the data directory, e.g.
PGDATA ++ ++ Path to the data directory ++ ++ ++ ++ ++ ++ Path to the directory set in data_directory from your postgresql.conf file. This parameter ++ has the same default than PostgreSQL itself: the pgdata parameter value. Unless you have a ++ special PostgreSQL setup and you understand this parameter, ignore it. ++ ++ Path to the directory set in data_directory from your postgresql.conf file ++ ++ ++ ++ ++ ++ Host IP address or unix socket folder the instance is listening on. ++ ++ Instance IP or unix socket folder ++ ++ ++ ++ ++ ++ Port the instance is listening on. ++ ++ Instance port ++ ++ ++ ++ ++ ++ Maximum lag allowed on a standby before we set a negative master score on it. The calculation ++ is based on the difference between the current LSN on the master and the LSN ++ written on the standby. ++ This parameter must be a valid positive number as described in PostgreSQL documentation. ++ See: https://www.postgresql.org/docs/current/static/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC ++ ++ Maximum write lag before we mark a standby as inappropriate to promote ++ ++ ++ ++ ++ ++ Path to the recovery.conf template. This file is simply copied to \$PGDATA ++ before starting the instance as slave. ++ ONLY for PostgreSQL 11 and bellow. This parameter is IGNORED for ++ PostgreSQL 12 and higher. The cluster will refuse to start if a template ++ file is found. ++ ++ Path to the recovery.conf template for PostgreSQL 11 and older. ++ ++ ++ ++ ++ ++ Additionnal arguments given to the postgres process on startup. ++ See "postgres --help" for available options. Usefull when the ++ postgresql.conf file is not in the data directory (PGDATA), eg.: ++ "-c config_file=/etc/postgresql/9.3/main/postgresql.conf". ++ ++ Additionnal arguments given to the postgres process on startup. 
++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ }; ++ return $OCF_SUCCESS; ++} ++ ++ ++=head1 SUPPORTED ACTIONS ++ ++This resource agent supports the following actions (operations): ++ ++=over ++ ++=item B ++ ++Starts the resource. Suggested minimum timeout: 60. ++ ++=item B ++ ++Stops the resource. Suggested minimum timeout: 60. ++ ++=item B ++ ++Suggested minimum timeout: 20. ++ ++=item B ++ ++Promotes the resource to the Master role. Suggested minimum timeout: 30. ++ ++=item B ++ ++Demotes the resource to the Slave role. Suggested minimum timeout: 120. ++ ++=item B ++ ++Performs a detailed status check. Suggested minimum timeout: 10. ++Suggested interval: 15. ++ ++=item B ++ ++Performs a detailed status check. Suggested minimum timeout: 10. ++Suggested interval: 16. ++ ++=item B ++ ++Suggested minimum timeout: 60 ++ ++=item B ++ ++Retrieves resource agent metadata (internal use only). ++Suggested minimum timeout: 5. ++ ++=item B ++ ++Suggested minimum timeout: 5. ++ ++=item B ++ ++Performs a validation of the resource configuration. ++Suggested minimum timeout: 5. 
++ ++=back ++ ++=cut ++ ++sub ocf_methods { ++ print q{ ++ start ++ stop ++ reload ++ promote ++ demote ++ monitor ++ notify ++ methods ++ meta-data ++ validate-all ++ }; ++ ++ return $OCF_SUCCESS; ++} ++ ++############################################################ ++#### RA FUNCS ++ ++sub pgsql_validate_all { ++ my $fh; ++ my $ans = ''; ++ my %cdata; ++ ++ unless ( ++ ocf_version_cmp( $ENV{"OCF_RESKEY_crm_feature_set"}, '3.0.9' ) == 2 ++ ) { ++ ocf_exit_reason( ++ 'PAF %s is compatible with Pacemaker 1.1.13 and greater', ++ $VERSION ++ ); ++ return $OCF_ERR_INSTALLED; ++ } ++ ++ # check notify=true ++ $ans = qx{ $CRM_RESOURCE --resource "$OCF_RESOURCE_INSTANCE" \\ ++ --meta --get-parameter notify 2>/dev/null }; ++ chomp $ans; ++ unless ( lc($ans) =~ /^true$|^on$|^yes$|^y$|^1$/ ) { ++ ocf_exit_reason( ++ 'You must set meta parameter notify=true for your master resource' ++ ); ++ return $OCF_ERR_INSTALLED; ++ } ++ ++ # check master-max=1 ++ unless ( ++ defined $ENV{'OCF_RESKEY_CRM_meta_master_max'} ++ and $ENV{'OCF_RESKEY_CRM_meta_master_max'} eq '1' ++ ) { ++ ocf_exit_reason( ++ 'You must set meta parameter master-max=1 for your master resource' ++ ); ++ return $OCF_ERR_INSTALLED; ++ } ++ ++ if ( $PGVERNUM >= $PGVER_12 ) { ++ # check PostgreSQL setup: checks related to v12 and after ++ my $guc; ++ ++ # recovery.conf template must not exists ++ if ( -f $recovery_tpl ) { ++ ocf_exit_reason( ++ 'Recovery template file "%s" is forbidden for PostgreSQL 12 and above', ++ $recovery_tpl ); ++ exit $OCF_ERR_ARGS; ++ } ++ ++ # WARNING: you MUST put -C as first argument to bypass the root check ++ $guc = qx{ $POSTGRES -C recovery_target_timeline -D "$pgdata" $start_opts}; ++ chomp $guc; ++ unless ( $guc eq 'latest' ) { ++ ocf_exit_reason( ++ q{Parameter "recovery_target_timeline" MUST be set to 'latest'. } . 
++ q{It is currently set to '%s'}, $guc ); ++ return $OCF_ERR_ARGS; ++ } ++ ++ $guc = qx{ $POSTGRES -C primary_conninfo -D "$pgdata" $start_opts}; ++ unless ($guc =~ /\bapplication_name='?$nodename'?\b/) { ++ ocf_exit_reason( ++ q{Parameter "primary_conninfo" MUST contain 'application_name=%s'. }. ++ q{It is currently set to '%s'}, $nodename, $guc ); ++ return $OCF_ERR_ARGS; ++ } ++ } ++ else { ++ my @content; ++ ++ # check recovery template ++ if ( ! -f $recovery_tpl ) { ++ ocf_exit_reason( 'Recovery template file "%s" does not exist', ++ $recovery_tpl ); ++ return $OCF_ERR_ARGS; ++ } ++ ++ # check content of the recovery template file ++ unless ( open( $fh, '<', $recovery_tpl ) ) { ++ ocf_exit_reason( 'Could not open file "%s": %s', $recovery_tpl, $! ); ++ return $OCF_ERR_ARGS; ++ } ++ @content = <$fh>; ++ close $fh; ++ ++ ++ unless ( grep /^\s*standby_mode\s*=\s*'?on'?\s*$/, @content ) { ++ ocf_exit_reason( ++ 'Recovery template file must contain "standby_mode = on"' ); ++ return $OCF_ERR_ARGS; ++ } ++ ++ unless ( grep /^\s*recovery_target_timeline\s*=\s*'?latest'?\s*$/, @content ) { ++ ocf_exit_reason( ++ "Recovery template file must contain \"recovery_target_timeline = 'latest'\"" ++ ); ++ return $OCF_ERR_ARGS; ++ } ++ ++ unless ( ++ grep /^\s*primary_conninfo\s*=.*['\s]application_name=$nodename['\s]/, ++ @content ++ ) { ++ ocf_exit_reason( ++ 'Recovery template file must contain in primary_conninfo parameter "application_name=%s"', ++ $nodename ); ++ return $OCF_ERR_ARGS; ++ } ++ } ++ ++ unless ( looks_like_number($maxlag) ) { ++ ocf_exit_reason( 'maxlag is not a number: "%s"', $maxlag ); ++ return $OCF_ERR_INSTALLED; ++ } ++ ++ # check system user ++ unless ( defined getpwnam $system_user ) { ++ ocf_exit_reason( 'System user "%s" does not exist', $system_user ); ++ return $OCF_ERR_ARGS; ++ } ++ ++ # require 9.3 minimum ++ if ( $PGVERNUM < $PGVER_93 ) { ++ ocf_exit_reason( "Require 9.3 and more" ); ++ return $OCF_ERR_INSTALLED; ++ } ++ ++ # check binaries ++ 
unless ( -x $PGCTL and -x $PGPSQL and -x $PGCTRLDATA and -x $PGISREADY ++ and ( -x $PGWALDUMP or -x "$bindir/pg_xlogdump") ++ ) { ++ ocf_exit_reason( ++ "Missing one or more binary. Check following path: %s, %s, %s, %s, %s or %s", ++ $PGCTL, $PGPSQL, $PGCTRLDATA, $PGISREADY, $PGWALDUMP, "$bindir/pg_xlogdump" ); ++ return $OCF_ERR_ARGS; ++ } ++ ++ # require wal_level >= hot_standby ++ %cdata = _get_controldata(); ++ unless ( $cdata{'wal_level'} =~ m{hot_standby|logical|replica} ) { ++ ocf_exit_reason( ++ 'wal_level must be one of "hot_standby", "logical" or "replica"' ); ++ return $OCF_ERR_ARGS; ++ } ++ ++ return $OCF_SUCCESS; ++} ++ ++ ++# Start the PostgreSQL instance as a *secondary* ++# ++sub pgsql_start { ++ my $rc = pgsql_monitor(); ++ my %cdata = _get_controldata(); ++ my $prev_state = $cdata{'state'}; ++ ++ # Instance must be running as secondary or being stopped. ++ # Anything else is an error. ++ if ( $rc == $OCF_SUCCESS ) { ++ ocf_log( 'info', 'Instance "%s" already started', ++ $OCF_RESOURCE_INSTANCE ); ++ return $OCF_SUCCESS; ++ } ++ elsif ( $rc != $OCF_NOT_RUNNING ) { ++ ocf_exit_reason( 'Unexpected state for instance "%s" (returned %d)', ++ $OCF_RESOURCE_INSTANCE, $rc ); ++ return $OCF_ERR_GENERIC; ++ } ++ ++ # ++ # From here, the instance is NOT running for sure. ++ # ++ ++ ocf_log( 'debug', ++ 'pgsql_start: instance "%s" is not running, starting it as a secondary', ++ $OCF_RESOURCE_INSTANCE ); ++ ++ # Must start as a standby, so enable recovery. ++ _enable_recovery(); ++ ++ # Start the instance as a secondary. ++ $rc = _pg_ctl_start(); ++ ++ if ( $rc == 0 ) { ++ ++ # Wait for the start to finish. ++ sleep 1 while ( $rc = pgsql_monitor() ) == $OCF_NOT_RUNNING; ++ ++ if ( $rc == $OCF_SUCCESS ) { ++ ocf_log( 'info', 'Instance "%s" started', $OCF_RESOURCE_INSTANCE ); ++ ++ # Check if a master score exists in the cluster. 
++ # During the very first start of the cluster, no master score will ++ # exists on any of the existing slaves, unless an admin designated ++ # one of them using crm_master. If no master exists the cluster will ++ # not promote a master among the slaves. ++ # To solve this situation, we check if there is at least one master ++ # score existing on one node in the cluster. Do nothing if at least ++ # one master score is found among the clones of the resource. If no ++ # master score exists, set a score of 1 only if the resource was a ++ # shut downed master before the start. ++ if ( $prev_state eq "shut down" and not _master_score_exists() ) { ++ ocf_log( 'info', 'No master score around. Set mine to 1' ); ++ ++ _set_master_score( '1' ); ++ } ++ ++ return $OCF_SUCCESS; ++ } ++ ++ ocf_exit_reason( ++ 'Instance "%s" is not running as a slave (returned %d)', ++ $OCF_RESOURCE_INSTANCE, $rc ); ++ ++ return $OCF_ERR_GENERIC; ++ } ++ ++ ocf_exit_reason( 'Instance "%s" failed to start (rc: %d)', ++ $OCF_RESOURCE_INSTANCE, $rc ); ++ ++ return $OCF_ERR_GENERIC; ++} ++ ++# Stop the PostgreSQL instance ++# ++sub pgsql_stop { ++ my $rc; ++ my $state; ++ my $pidfile = "$datadir/postmaster.pid"; ++ # Add 60s to the timeout or use a 24h timeout fallback to make sure ++ # Pacemaker will give up before us and take decisions ++ my $timeout = ( _get_action_timeout() || 60*60*24 ) + 60; ++ ++ # Instance must be running as secondary or primary or being stopped. ++ # Anything else is an error. ++ $rc = pgsql_monitor(); ++ if ( $rc == $OCF_NOT_RUNNING ) { ++ ocf_log( 'info', 'Instance "%s" already stopped', ++ $OCF_RESOURCE_INSTANCE ); ++ return $OCF_SUCCESS; ++ } ++ elsif ( $rc != $OCF_SUCCESS and $rc != $OCF_RUNNING_MASTER ) { ++ ocf_exit_reason( 'Unexpected state for instance "%s" (returned %d)', ++ $OCF_RESOURCE_INSTANCE, $rc ); ++ return $OCF_ERR_GENERIC; ++ } ++ ++ # ++ # From here, the instance is running for sure. 
++ # ++ ++ ocf_log( 'debug', 'pgsql_stop: instance "%s" is running, stopping it', ++ $OCF_RESOURCE_INSTANCE ); ++ ++ # Try to quit with proper shutdown. ++ ++ ++ $rc = _runas( $PGCTL, '--pgdata', $pgdata, '-w', '--timeout', $timeout, ++ '-m', 'fast', 'stop' ); ++ ++ if ( $rc == 0 ) { ++ # Wait for the stop to finish. ++ sleep 1 while ( $rc = pgsql_monitor() ) != $OCF_NOT_RUNNING ; ++ ++ ocf_log( 'info', 'Instance "%s" stopped', $OCF_RESOURCE_INSTANCE ); ++ ++ return $OCF_SUCCESS; ++ } ++ ++ ocf_exit_reason( 'Instance "%s" failed to stop', $OCF_RESOURCE_INSTANCE ); ++ return $OCF_ERR_GENERIC; ++} ++ ++# Monitor the PostgreSQL instance ++# ++sub pgsql_monitor { ++ my $pgisready_rc; ++ my $controldata_rc; ++ ++ ocf_log( 'debug', 'pgsql_monitor: monitor is a probe' ) if ocf_is_probe(); ++ ++ # First check, verify if the instance is listening. ++ $pgisready_rc = _pg_isready(); ++ ++ if ( $pgisready_rc == 0 ) { ++ # The instance is listening. ++ # We confirm that the instance is up and return if it is a primary or a ++ # secondary ++ ocf_log( 'debug', 'pgsql_monitor: instance "%s" is listening', ++ $OCF_RESOURCE_INSTANCE ); ++ return _confirm_role(); ++ } ++ ++ if ( $pgisready_rc == 1 ) { ++ # The attempt was rejected. ++ # This could happen in several cases: ++ # - at startup ++ # - during shutdown ++ # - during crash recovery ++ # - if instance is a warm standby ++ # Except for the warm standby case, this should be a transitional state. ++ # We try to confirm using pg_controldata. ++ ocf_log( 'debug', ++ 'pgsql_monitor: instance "%s" rejects connections - checking again...', ++ $OCF_RESOURCE_INSTANCE ); ++ $controldata_rc = _controldata_to_ocf(); ++ ++ if ( $controldata_rc == $OCF_RUNNING_MASTER ++ or $controldata_rc == $OCF_SUCCESS ++ ) { ++ # This state indicates that pg_isready check should succeed. ++ # We check again. 
++ ocf_log( 'debug', ++ 'pgsql_monitor: instance "%s" controldata shows a running status', ++ $OCF_RESOURCE_INSTANCE ); ++ ++ $pgisready_rc = _pg_isready(); ++ if ( $pgisready_rc == 0 ) { ++ # Consistent with pg_controdata output. ++ # We can check if the instance is primary or secondary ++ ocf_log( 'debug', 'pgsql_monitor: instance "%s" is listening', ++ $OCF_RESOURCE_INSTANCE ); ++ return _confirm_role(); ++ } ++ ++ # Still not consistent, raise an error. ++ # NOTE: if the instance is a warm standby, we end here. ++ # TODO raise an hard error here ? ++ ocf_exit_reason( ++ 'Instance "%s" controldata is not consistent with pg_isready (returned: %d)', ++ $OCF_RESOURCE_INSTANCE, $pgisready_rc ); ++ ocf_log( 'info', ++ 'If this instance is in warm standby, this resource agent only supports hot standby', ++ $OCF_RESOURCE_INSTANCE, $pgisready_rc ); ++ ++ return $OCF_ERR_GENERIC; ++ } ++ ++ if ( $controldata_rc == $OCF_NOT_RUNNING ) { ++ # This state indicates that pg_isready check should fail with rc 2. ++ # We check again. ++ $pgisready_rc = _pg_isready(); ++ if ( $pgisready_rc == 2 ) { ++ # Consistent with pg_controdata output. ++ # We check the process status using pg_ctl status and check ++ # if it was propertly shut down using pg_controldata. ++ ocf_log( 'debug', ++ 'pgsql_monitor: instance "%s" is not listening', ++ $OCF_RESOURCE_INSTANCE ); ++ return _confirm_stopped(); ++ } ++ # Still not consistent, raise an error. ++ # TODO raise an hard error here ? ++ ocf_exit_reason( ++ 'Instance "%s" controldata is not consistent with pg_isready (returned: %d)', ++ $OCF_RESOURCE_INSTANCE, $pgisready_rc ); ++ ++ return $OCF_ERR_GENERIC; ++ } ++ ++ # Something went wrong with the controldata check, hard fail. ++ ocf_exit_reason( ++ 'Could not get instance "%s" status from controldata (returned: %d)', ++ $OCF_RESOURCE_INSTANCE, $controldata_rc ); ++ ++ return $OCF_ERR_INSTALLED; ++ } ++ ++ elsif ( $pgisready_rc == 2 ) { ++ # The instance is not listening. 
++ # We check the process status using pg_ctl status and check ++ # if it was propertly shut down using pg_controldata. ++ ocf_log( 'debug', 'pgsql_monitor: instance "%s" is not listening', ++ $OCF_RESOURCE_INSTANCE ); ++ return _confirm_stopped(); ++ } ++ ++ elsif ( $pgisready_rc == 3 ) { ++ # No attempt was done, probably a syntax error. ++ # Hard configuration error, we don't want to retry or failover here. ++ ocf_exit_reason( ++ 'Unknown error while checking if instance "%s" is listening (returned %d)', ++ $OCF_RESOURCE_INSTANCE, $pgisready_rc ); ++ ++ return $OCF_ERR_CONFIGURED; ++ } ++ ++ ocf_exit_reason( 'Unexpected result when checking instance "%s" status', ++ $OCF_RESOURCE_INSTANCE ); ++ ++ return $OCF_ERR_GENERIC; ++} ++ ++ ++# Demote the PostgreSQL instance from primary to secondary ++# To demote a PostgreSQL instance, we must: ++# * stop it gracefully ++# * create recovery.conf with standby_mode = on ++# * start it ++# ++sub pgsql_demote { ++ my $rc; ++ ++ $rc = pgsql_monitor(); ++ ++ # Running as primary. Normal, expected behavior. ++ if ( $rc == $OCF_RUNNING_MASTER ) { ++ ocf_log( 'debug', 'pgsql_demote: "%s" currently running as a primary', ++ $OCF_RESOURCE_INSTANCE ) ; ++ } ++ elsif ( $rc == $OCF_SUCCESS ) { ++ # Already running as secondary. Nothing to do. ++ ocf_log( 'debug', ++ 'pgsql_demote: "%s" currently running as a secondary', ++ $OCF_RESOURCE_INSTANCE ); ++ return $OCF_SUCCESS; ++ } ++ elsif ( $rc == $OCF_NOT_RUNNING ) { ++ # Instance is stopped. Nothing to do. ++ ocf_log( 'debug', 'pgsql_demote: "%s" currently shut down', ++ $OCF_RESOURCE_INSTANCE ); ++ } ++ elsif ( $rc == $OCF_ERR_CONFIGURED ) { ++ # We actually prefer raising a hard or fatal error instead of leaving ++ # the CRM abording its transition for a new one because of a soft error. ++ # The hard error will force the CRM to move the resource immediately. 
++ return $OCF_ERR_CONFIGURED; ++ } ++ else { ++ return $OCF_ERR_GENERIC; ++ } ++ ++ # TODO we need to make sure at least one slave is connected!! ++ ++ # WARNING if the resource state is stopped instead of master, the ocf ra dev ++ # rsc advises to return OCF_ERR_GENERIC, misleading the CRM in a loop where ++ # it computes transitions of demote(failing)->stop->start->promote actions ++ # until failcount == migration-threshold. ++ # This is a really ugly trick to keep going with the demote action if the ++ # rsc is already stopped gracefully. ++ # See discussion "CRM trying to demote a stopped resource" on ++ # developers@clusterlabs.org ++ unless ( $rc == $OCF_NOT_RUNNING ) { ++ # Add 60s to the timeout or use a 24h timeout fallback to make sure ++ # Pacemaker will give up before us and take decisions ++ my $timeout = ( _get_action_timeout() || 60*60*24 ) + 60; ++ ++ # WARNING the instance **MUST** be stopped gracefully. ++ # Do **not** use pg_stop() or service or systemctl here as these ++ # commands might force-stop the PostgreSQL instance using immediate ++ # after some timeout and return success, which is misleading. ++ ++ $rc = _runas( $PGCTL, '--pgdata', $pgdata, '--mode', 'fast', '-w', ++ '--timeout', $timeout , 'stop' ); ++ ++ # No need to wait for stop to complete, this is handled in pg_ctl ++ # using -w option. ++ unless ( $rc == 0 ) { ++ ocf_exit_reason( 'Failed to stop "%s" using pg_ctl (returned %d)', ++ $OCF_RESOURCE_INSTANCE, $rc ); ++ return $OCF_ERR_GENERIC; ++ } ++ ++ # Double check that the instance is stopped correctly. ++ $rc = pgsql_monitor(); ++ unless ( $rc == $OCF_NOT_RUNNING ) { ++ ocf_exit_reason( ++ 'Unexpected "%s" state: monitor status (%d) disagree with pg_ctl return code', ++ $OCF_RESOURCE_INSTANCE, $rc ); ++ return $OCF_ERR_GENERIC; ++ } ++ } ++ ++ # ++ # At this point, the instance **MUST** be stopped gracefully. ++ # ++ ++ # Note: We do not need to handle the recovery.conf file here as pgsql_start ++ # deals with that itself.
Equally, no need to wait for the start to complete ++ # here, handled in pgsql_start. ++ $rc = pgsql_start(); ++ if ( $rc == $OCF_SUCCESS ) { ++ ocf_log( 'info', 'pgsql_demote: "%s" started as a secondary', ++ $OCF_RESOURCE_INSTANCE ); ++ return $OCF_SUCCESS; ++ } ++ ++ # NOTE: No need to double check the instance state as pgsql_start already use ++ # pgsql_monitor to check the state before returning. ++ ++ ocf_exit_reason( 'Starting "%s" as a standby failed (returned %d)', ++ $OCF_RESOURCE_INSTANCE, $rc ); ++ return $OCF_ERR_GENERIC; ++} ++ ++ ++# Promote the secondary instance to primary ++# ++sub pgsql_promote { ++ my $rc; ++ my $cancel_switchover; ++ ++ $rc = pgsql_monitor(); ++ ++ if ( $rc == $OCF_SUCCESS ) { ++ # Running as slave. Normal, expected behavior. ++ ocf_log( 'debug', 'pgsql_promote: "%s" currently running as a standby', ++ $OCF_RESOURCE_INSTANCE ); ++ } ++ elsif ( $rc == $OCF_RUNNING_MASTER ) { ++ # Already a master. Unexpected, but not a problem. ++ ocf_log( 'info', '"%s" already running as a primary', ++ $OCF_RESOURCE_INSTANCE ); ++ return $OCF_SUCCESS; ++ } ++ elsif ( $rc == $OCF_NOT_RUNNING ) { # INFO this is not supposed to happen. ++ # Currently not running. Need to start before promoting. ++ ocf_log( 'info', '"%s" currently not running, starting it', ++ $OCF_RESOURCE_INSTANCE ); ++ ++ $rc = pgsql_start(); ++ if ( $rc != $OCF_SUCCESS ) { ++ ocf_exit_reason( 'Failed to start the instance "%s"', ++ $OCF_RESOURCE_INSTANCE ); ++ return $OCF_ERR_GENERIC; ++ } ++ } ++ else { ++ ocf_exit_reason( 'Unexpected error, cannot promote "%s"', ++ $OCF_RESOURCE_INSTANCE ); ++ return $OCF_ERR_GENERIC; ++ } ++ ++ # ++ # At this point, the instance **MUST** be started as a secondary. 
++ # ++ ++ # Cancel the switchover if it has been considered not safe during the ++ # pre-promote action ++ $cancel_switchover = _get_priv_attr('cancel_switchover'); ++ if ( $cancel_switchover ) { # if not empty or not 0 ++ ocf_exit_reason( 'Switchover has been canceled from pre-promote action' ); ++ ++ _delete_priv_attr( 'cancel_switchover' ); ++ ++ return $OCF_ERR_GENERIC if $cancel_switchover eq '1'; ++ return $OCF_ERR_ARGS; # ban the resource from the node if we have an ++ # internal error during _check_switchover ++ } ++ ++ # Do not check for a better candidate if we try to recover the master ++ # Recover of a master is detected during the pre-promote action. It sets the ++ # private attribute 'recover_master' to '1' if this is a master recover. ++ if ( _get_priv_attr( 'recover_master' ) eq '1' ) { ++ ocf_log( 'info', 'Recovering old master, no election needed'); ++ } ++ else { ++ ++ # The promotion is occurring on the best known candidate (highest ++ # master score), as chosen by pacemaker during the last working monitor ++ # on previous master (see pgsql_monitor/_check_locations subs). ++ # To avoid any race condition between the last monitor action on the ++ # previous master and the **real** most up-to-date standby, we ++ # set each standby location during the "pre-promote" action, and stored ++ # them using the "lsn_location" resource attribute. ++ # ++ # The best standby to promote would have the highest known LSN. If the ++ # current resource is not the best one, we need to modify the master ++ # scores accordingly, and abort the current promotion. 
++ ocf_log( 'debug', ++ 'pgsql_promote: checking if current node is the best candidate for promotion' ); ++ ++ # Exclude nodes that are known to be unavailable (not in the current ++ # partition) using the "crm_node" command ++ my @active_nodes = split /\s+/ => _get_priv_attr( 'nodes' ); ++ my $node_to_promote = ''; ++ my $ans; ++ my $max_tl; ++ my $max_lsn; ++ my $node_tl; ++ my $node_lsn; ++ my $wal_num; ++ my $wal_off; ++ ++ # Get the "lsn_location" attribute value for the current node, as set ++ # during the "pre-promote" action. ++ # It should be the greatest among the secondary instances. ++ $ans = _get_priv_attr( 'lsn_location' ); ++ ++ if ( $ans eq '' ) { ++ # This should not happen as the "lsn_location" attribute should have ++ # been updated during the "pre-promote" action. ++ ocf_exit_reason( 'Can not get current node LSN location' ); ++ return $OCF_ERR_GENERIC; ++ } ++ ++ chomp $ans; ++ ( $max_tl, $max_lsn ) = split /#/, $ans; ++ ++ ocf_log( 'debug', 'pgsql_promote: current node TL#LSN location: %s#%s', ++ $max_tl, $max_lsn ); ++ ++ # Now we compare with the other available nodes. ++ foreach my $node ( @active_nodes ) { ++ # We exclude the current node from the check. ++ next if $node eq $nodename; ++ ++ # Get the "lsn_location" attribute value for the node, as set during ++ # the "pre-promote" action. ++ # This is implemented as a loop as private attributes are asynchronously ++ # available from other nodes. ++ # see: https://github.com/ClusterLabs/PAF/issues/131 ++ # NOTE: if a node did not set its lsn_location for some reason, this will end ++ # with a timeout and the whole promotion will start again. 
++ WAIT_FOR_LSN: { ++ $ans = _get_priv_attr( 'lsn_location', $node ); ++ if ( $ans eq '' ) { ++ ocf_log( 'info', 'pgsql_promote: waiting for LSN from %s', $node ); ++ select( undef, undef, undef, 0.1 ); ++ redo WAIT_FOR_LSN; ++ } ++ } ++ ++ chomp $ans; ++ ( $node_tl, $node_lsn ) = split /#/, $ans; ++ ++ ocf_log( 'debug', ++ 'pgsql_promote: comparing with "%s": TL#LSN is %s#%s', ++ $node, $node_tl, $node_lsn ); ++ ++ # If the node has a higher LSN, select it as a best candidate to ++ # promotion and keep looping to check the TL/LSN of other nodes. ++ if ( $node_tl > $max_tl ++ or ( $node_tl == $max_tl and $node_lsn > $max_lsn ) ++ ) { ++ ocf_log( 'debug', ++ 'pgsql_promote: "%s" is a better candidate to promote (%s#%s > %s#%s)', ++ $node, $node_tl, $node_lsn, $max_tl, $max_lsn ); ++ $node_to_promote = $node; ++ $max_tl = $node_tl; ++ $max_lsn = $node_lsn; ++ } ++ } ++ ++ # If any node has been selected, we adapt the master scores accordingly ++ # and break the current promotion. ++ if ( $node_to_promote ne '' ) { ++ ocf_exit_reason( ++ '%s is the best candidate to promote, aborting current promotion', ++ $node_to_promote ); ++ ++ # Reset current node master score. ++ _set_master_score( '1' ); ++ ++ # Set promotion candidate master score. ++ _set_master_score( '1000', $node_to_promote ); ++ ++ # We fail the promotion to trigger another promotion transition ++ # with the new scores. ++ return $OCF_ERR_GENERIC; ++ } ++ ++ # Else, we will keep on promoting the current node. ++ } ++ ++ unless ( ++ # Promote the instance on the current node. ++ _runas( $PGCTL, '--pgdata', $pgdata, '-w', 'promote' ) == 0 ) ++ { ++ ocf_exit_reason( 'Error during promotion command' ); ++ return $OCF_ERR_GENERIC; ++ } ++ ++ # The instance promotion is asynchronous, so we need to wait for this ++ # process to complete. 
++ while ( pgsql_monitor() != $OCF_RUNNING_MASTER ) { ++ ocf_log( 'info', 'Waiting for the promote to complete' ); ++ sleep 1; ++ } ++ ++ ocf_log( 'info', 'Promote complete' ); ++ ++ return $OCF_SUCCESS; ++} ++ ++# This action is called **before** the actual promotion when a failing master is ++# considered unreclaimable, recoverable or a new master must be promoted ++# (switchover or first start). ++# As every "notify" action, it is executed almost simultaneously on all ++# available nodes. ++sub pgsql_notify_pre_promote { ++ my $rc; ++ my $node_tl; ++ my $node_lsn; ++ my %cdata; ++ my %active_nodes; ++ my $attr_nodes; ++ ++ ocf_log( 'info', 'Promoting instance on node "%s"', ++ $OCF_NOTIFY_ENV{'promote'}[0]{'uname'} ); ++ ++ # No need to do an election between slaves if this is recovery of the master ++ if ( _is_master_recover( $OCF_NOTIFY_ENV{'promote'}[0]{'uname'} ) ) { ++ ocf_log( 'warning', 'This is a master recovery!' ); ++ ++ _set_priv_attr( 'recover_master', '1' ) ++ if $OCF_NOTIFY_ENV{'promote'}[0]{'uname'} eq $nodename; ++ ++ return $OCF_SUCCESS; ++ } ++ ++ # Environment cleanup! ++ _delete_priv_attr( 'lsn_location' ); ++ _delete_priv_attr( 'recover_master' ); ++ _delete_priv_attr( 'nodes' ); ++ _delete_priv_attr( 'cancel_switchover' ); ++ ++ # check for the last received entry of WAL from the master if we are ++ # the designated slave to promote ++ if ( _is_switchover( $nodename ) and scalar ++ grep { $_->{'uname'} eq $nodename } @{ $OCF_NOTIFY_ENV{'promote'} } ++ ) { ++ $rc = _check_switchover(); ++ ++ unless ( $rc == 0 ) { ++ # Shortcut the election process as the switchover will be ++ # canceled ++ _set_priv_attr( 'cancel_switchover', $rc ); ++ return $OCF_SUCCESS; # return code is ignored during notify ++ } ++ ++ # If the sub keeps going, that means the switchover is safe. ++ # Keep going with the election process in case the switchover was ++ # instruct to the wrong node. ++ # FIXME: should we allow a switchover to a lagging slave? 
++ } ++ ++ # We need to trigger an election between existing slaves to promote the best ++ # one based on its current LSN location. Each node sets a private attribute ++ # "lsn_location" with its TL and LSN location. ++ # ++ # During the following promote action, the designated standby for ++ # promotion uses these attributes to check if the instance to be promoted ++ # is the best one, so we can avoid a race condition between the last ++ # successful monitor on the previous master and the current promotion. ++ ++ # As we can not break the transition from a notification action, we check ++ # during the promotion if each node TL and LSN are valid. ++ ++ # Force a checkpoint to make sure the controldata shows the very last TL ++ _query( q{ CHECKPOINT }, {} ); ++ %cdata = _get_controldata(); ++ $node_lsn = _get_last_received_lsn( 'in decimal' ); ++ ++ unless ( defined $node_lsn ) { ++ ocf_log( 'warning', 'Unknown current node LSN' ); ++ # Return codes are ignored during notifications... ++ return $OCF_SUCCESS; ++ } ++ ++ $node_lsn = "$cdata{'tl'}#$node_lsn"; ++ ++ ocf_log( 'info', 'Current node TL#LSN: %s', $node_lsn ); ++ ++ # Set the "lsn_location" attribute value for this node so we can use it ++ # during the following "promote" action. ++ _set_priv_attr( 'lsn_location', $node_lsn ); ++ ++ ocf_log( 'warning', 'Could not set the current node LSN' ) ++ if $? != 0 ; ++ ++ # If this node is the future master, keep track of the slaves that ++ # received the same notification to compare our LSN with them during ++ # promotion ++ if ( $OCF_NOTIFY_ENV{'promote'}[0]{'uname'} eq $nodename ) { ++ # Build the list of active nodes: ++ # master + slave + start - stop ++ # FIXME: Deal with rsc started during the same transaction but **after** ++ # the promotion ?
++ $active_nodes{ $_->{'uname'} }++ foreach @{ $OCF_NOTIFY_ENV{'active'} }, ++ @{ $OCF_NOTIFY_ENV{'start'} }; ++ $active_nodes{ $_->{'uname'} }-- foreach @{ $OCF_NOTIFY_ENV{'stop'} }; ++ ++ $attr_nodes = join " " ++ => grep { $active_nodes{$_} > 0 } keys %active_nodes; ++ ++ _set_priv_attr( 'nodes', $attr_nodes ); ++ } ++ ++ return $OCF_SUCCESS; ++} ++ ++# This action is called after a promote action. ++sub pgsql_notify_post_promote { ++ ++ # We have a new master (or the previous one recovered). ++ # Environment cleanup! ++ _delete_priv_attr( 'lsn_location' ); ++ _delete_priv_attr( 'recover_master' ); ++ _delete_priv_attr( 'nodes' ); ++ _delete_priv_attr( 'cancel_switchover' ); ++ ++ return $OCF_SUCCESS; ++} ++ ++# This is called before a demote occurs. ++sub pgsql_notify_pre_demote { ++ my $rc; ++ my %cdata; ++ ++ # do nothing if the local node will not be demoted ++ return $OCF_SUCCESS unless scalar ++ grep { $_->{'uname'} eq $nodename } @{ $OCF_NOTIFY_ENV{'demote'} }; ++ ++ $rc = pgsql_monitor(); ++ ++ # do nothing if this is not a master recovery ++ return $OCF_SUCCESS unless _is_master_recover( $nodename ) ++ and $rc == $OCF_FAILED_MASTER; ++ ++ # in case of master crash, we need to detect if the CRM tries to recover ++ # the master clone. The usual transition is to do: ++ # demote->stop->start->promote ++ # ++ # There are multiple flaws with this transition: ++ # * the 1st and 2nd actions will fail because the instance is in ++ # OCF_FAILED_MASTER step ++ # * the usual start action is dangerous as the instance will start with ++ # a recovery.conf instead of entering a normal recovery process ++ # ++ # To avoid this, we try to start the instance in recovery from here. ++ # If it success, at least it will be demoted correctly with a normal ++ # status. If it fails, it will be catched up in next steps. ++ ++ ocf_log( 'info', 'Trying to start failing master "%s"...', ++ $OCF_RESOURCE_INSTANCE ); ++ ++ # Either the instance managed to start or it couldn't. 
++ # We rely on the pg_ctk '-w' switch to take care of this. If it couldn't ++ # start, this error will be catched up later during the various checks ++ _pg_ctl_start(); ++ ++ %cdata = _get_controldata(); ++ ++ ocf_log( 'info', 'State is "%s" after recovery attempt', $cdata{'state'} ); ++ ++ return $OCF_SUCCESS; ++} ++ ++# This is called before a stop occurs. ++sub pgsql_notify_pre_stop { ++ my $rc; ++ my %cdata; ++ ++ # do nothing if the local node will not be stopped ++ return $OCF_SUCCESS unless scalar ++ grep { $_->{'uname'} eq $nodename } @{ $OCF_NOTIFY_ENV{'stop'} }; ++ ++ $rc = _controldata_to_ocf(); ++ ++ # do nothing if this is not a slave recovery ++ return $OCF_SUCCESS unless _is_slave_recover( $nodename ) ++ and $rc == $OCF_RUNNING_SLAVE; ++ ++ # in case of slave crash, we need to detect if the CRM tries to recover ++ # the slaveclone. The usual transition is to do: stop->start ++ # ++ # This transition can no twork because the instance is in ++ # OCF_ERR_GENERIC step. So the stop action will fail, leading most ++ # probably to fencing action. ++ # ++ # To avoid this, we try to start the instance in recovery from here. ++ # If it success, at least it will be stopped correctly with a normal ++ # status. If it fails, it will be catched up in next steps. ++ ++ ocf_log( 'info', 'Trying to start failing slave "%s"...', ++ $OCF_RESOURCE_INSTANCE ); ++ ++ # Either the instance managed to start or it couldn't. ++ # We rely on the pg_ctk '-w' switch to take care of this. If it couldn't ++ # start, this error will be catched up later during the various checks ++ _pg_ctl_start(); ++ ++ %cdata = _get_controldata(); ++ ++ ocf_log( 'info', 'State is "%s" after recovery attempt', $cdata{'state'} ); ++ ++ return $OCF_SUCCESS; ++} ++ ++# Notify type actions, called on all available nodes before (pre) and after ++# (post) other actions, like promote, start, ... 
++# ++sub pgsql_notify { ++ my $type_op; ++ ++ ocf_log( 'debug', "pgsql_notify: environment variables: %s", ++ Data::Dumper->new( [ \%OCF_NOTIFY_ENV ] )->Sortkeys(1)->Terse(1)->Dump ); ++ ++ return unless %OCF_NOTIFY_ENV; ++ ++ $type_op = "$OCF_NOTIFY_ENV{'type'}-$OCF_NOTIFY_ENV{'operation'}"; ++ ++ for ( $type_op ) { ++ if ( /^pre-promote$/ ) { return pgsql_notify_pre_promote() } ++ elsif ( /^post-promote$/ ) { return pgsql_notify_post_promote() } ++ elsif ( /^pre-demote$/ ) { return pgsql_notify_pre_demote() } ++ elsif ( /^pre-stop$/ ) { return pgsql_notify_pre_stop() } ++ } ++ ++ return $OCF_SUCCESS; ++} ++ ++# Action used to allow for online modification of resource parameters value. ++# ++sub pgsql_reload { ++ ++ # No action necessary, the action declaration is enough to inform pacemaker ++ # that the modification of any non-unique parameter can be applied without ++ # having to restart the resource. ++ ocf_log( 'info', 'Instance "%s" reloaded', $OCF_RESOURCE_INSTANCE ); ++ return $OCF_SUCCESS; ++ ++} ++ ++############################################################ ++#### MAIN ++ ++exit ocf_meta_data() if $__OCF_ACTION eq 'meta-data'; ++exit ocf_methods() if $__OCF_ACTION eq 'methods'; ++ ++# Avoid "could not change directory" when executing commands as "system-user". ++chdir File::Spec->tmpdir(); ++ ++# mandatory sanity checks ++# check pgdata ++if ( ! -d $pgdata ) { ++ ocf_exit_reason( 'PGDATA "%s" does not exist', $pgdata ); ++ exit $OCF_ERR_ARGS; ++} ++ ++# check datadir ++if ( ! -d $datadir ) { ++ ocf_exit_reason( 'data_directory "%s" does not exist', $datadir ); ++ exit $OCF_ERR_ARGS; ++} ++ ++# Set PostgreSQL version ++$PGVERNUM = _get_pg_version(); ++ ++# Set current node name. 
++$nodename = ocf_local_nodename(); ++ ++$exit_code = pgsql_validate_all(); ++ ++exit $exit_code if $exit_code != $OCF_SUCCESS or $__OCF_ACTION eq 'validate-all'; ++ ++# Run action ++for ( $__OCF_ACTION ) { ++ if ( /^start$/ ) { $exit_code = pgsql_start() } ++ elsif ( /^stop$/ ) { $exit_code = pgsql_stop() } ++ elsif ( /^monitor$/ ) { $exit_code = pgsql_monitor() } ++ elsif ( /^promote$/ ) { $exit_code = pgsql_promote() } ++ elsif ( /^demote$/ ) { $exit_code = pgsql_demote() } ++ elsif ( /^notify$/ ) { $exit_code = pgsql_notify() } ++ elsif ( /^reload$/ ) { $exit_code = pgsql_reload() } ++ else { $exit_code = $OCF_ERR_UNIMPLEMENTED } ++} ++ ++exit $exit_code; ++ ++ ++=head1 EXAMPLE CRM SHELL ++ ++The following is an example configuration for a pgsqlms resource using the ++crm(8) shell: ++ ++ primitive pgsqld pgsqlms \ ++ params pgdata="/var/lib/postgresql/9.6/main" \ ++ bindir="/usr/lib/postgresql/9.6/bin" \ ++ pghost="/var/run/postgresql" \ ++ recovery_template="/etc/postgresql/9.6/main/recovery.conf.pcmk" \ ++ start_opts="-c config_file=/etc/postgresql/9.6/main/postgresql.conf" \ ++ op start timeout=60s \ ++ op stop timeout=60s \ ++ op promote timeout=30s \ ++ op demote timeout=120s \ ++ op monitor interval=15s timeout=10s role="Master" \ ++ op monitor interval=16s timeout=10s role="Slave" \ ++ op notify timeout=60s ++ ++ ms pgsql-ha pgsqld meta notify=true ++ ++ ++=head1 EXAMPLE PCS ++ ++The following is an example configuration for a pgsqlms resource using pcs(8): ++ ++ pcs resource create pgsqld ocf:heartbeat:pgsqlms \ ++ bindir=/usr/pgsql-9.6/bin pgdata=/var/lib/pgsql/9.6/data \ ++ op start timeout=60s \ ++ op stop timeout=60s \ ++ op promote timeout=30s \ ++ op demote timeout=120s \ ++ op monitor interval=15s timeout=10s role="Master" \ ++ op monitor interval=16s timeout=10s role="Slave" \ ++ op notify timeout=60s --master notify=true ++ ++=head1 SEE ALSO ++ ++http://clusterlabs.org/ ++ ++=head1 AUTHOR ++ ++Jehan-Guillaume de Rorthais and Mael Rimbault. 
++ ++=cut +diff --color -uNr a/paf_LICENSE b/paf_LICENSE +--- a/paf_LICENSE 1970-01-01 01:00:00.000000000 +0100 ++++ b/paf_LICENSE 2021-04-14 09:16:39.083555835 +0200 +@@ -0,0 +1,19 @@ ++Copyright (c) 2016-2020, Jehan-Guillaume de Rorthais, Mael Rimbault. ++ ++Permission to use, copy, modify, and distribute this software and its ++documentation for any purpose, without fee, and without a written agreement ++is hereby granted, provided that the above copyright notice and this ++paragraph and the following two paragraphs appear in all copies. ++ ++IN NO EVENT SHALL THE AUTHOR OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR ++DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING ++LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS ++DOCUMENTATION, EVEN IF THE AUTHOR OR DISTRIBUTORS HAVE BEEN ADVISED OF THE ++POSSIBILITY OF SUCH DAMAGE. ++ ++THE AUTHOR AND DISTRIBUTORS SPECIFICALLY DISCLAIMS ANY WARRANTIES, ++INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY ++AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ++ON AN "AS IS" BASIS, AND THE AUTHOR AND DISTRIBUTORS HAS NO OBLIGATIONS TO ++PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. ++ +diff --color -uNr a/paf_README.md b/paf_README.md +--- a/paf_README.md 1970-01-01 01:00:00.000000000 +0100 ++++ b/paf_README.md 2021-04-14 09:18:57.450968048 +0200 +@@ -0,0 +1,86 @@ ++# PostgreSQL Automatic Failover ++ ++High-Availability for Postgres, based on industry references Pacemaker and ++Corosync. ++ ++## Description ++ ++Pacemaker is nowadays the industry reference for High Availability. In the same ++fashion as for Systemd, all Linux distributions moved (or are moving) to this ++unique Pacemaker+Corosync stack, removing all other existing high availability ++stacks (CMAN, RGManager, OpenAIS, ...).
It is able to detect failure on various ++services and automatically decide to failover the failing resource to another ++node when possible. ++ ++To be able to manage a specific service resource, Pacemaker interacts with it ++through a so-called "Resource Agent". Resource agents must comply with the OCF ++specification, which defines what they must implement (start, stop, promote, ++etc), how they should behave and inform Pacemaker of their results. ++ ++PostgreSQL Automatic Failover is a new OCF resource Agent dedicated to ++PostgreSQL. Its original wish is to keep a clear limit between the Pacemaker ++administration and the PostgreSQL one, to keep things simple, documented and ++yet powerful. ++ ++Once your PostgreSQL cluster is built using internal streaming replication, PAF is ++able to expose to Pacemaker what is the current status of the PostgreSQL ++instance on each node: master, slave, stopped, catching up, etc. Should a ++failure occur on the master, Pacemaker will try to recover it by default. ++Should the failure be non-recoverable, PAF allows the slaves to be able to ++elect the best of them (the closest one to the old master) and promote it as ++the new master. All of this thanks to the robust, feature-full and most ++importantly experienced project: Pacemaker. ++ ++For information about how to install this agent, see `INSTALL.md`. ++ ++## Setup and requirements ++ ++PAF supports PostgreSQL 9.3 and higher. It has been extensively tested under ++CentOS 6 and 7 in various scenarios. ++ ++PAF has been written to give to the administrator the maximum control ++over their PostgreSQL configuration and architecture. Thus, you are 100% ++responsible for the master/slave creations and their setup. The agent ++will NOT edit your setup.
It only requires you to follow these pre-requisites: ++ ++ * slave __must__ be in hot_standby (accept read-only connections) ; ++ * the following parameters __must__ be configured in the appropriate place : ++ * `standby_mode = on` (for PostgreSQL 11 and before) ++ * `recovery_target_timeline = 'latest'` ++ * `primary_conninfo` with `application_name` set to the node name as seen ++ in Pacemaker. ++ * these last parameters have been merged inside the instance configuration ++ file with PostgreSQL 12. For PostgreSQL 11 and before, you __must__ ++ provide a `recovery.conf` template file. ++ ++When setting up the resource in Pacemaker, here are the available parameters you ++can set: ++ ++ * `bindir`: location of the PostgreSQL binaries (default: `/usr/bin`) ++ * `pgdata`: location of the PGDATA of your instance (default: ++ `/var/lib/pgsql/data`) ++ * `datadir`: path to the directory set in `data_directory` from your ++ postgresql.conf file. This parameter has the same default as PostgreSQL ++ itself: the `pgdata` parameter value. Unless you have a special PostgreSQL ++ setup and you understand this parameter, __ignore it__ ++ * `pghost`: the socket directory or IP address to use to connect to the ++ local instance (default: `/tmp` or `/var/run/postgresql` for DEBIAN) ++ * `pgport`: the port to connect to the local instance (default: `5432`) ++ * `recovery_template`: __only__ for PostgreSQL 11 and before. The local ++ template that will be copied as the `PGDATA/recovery.conf` file. This ++ file must not exist on any node for PostgreSQL 12 and after. ++ (default: `$PGDATA/recovery.conf.pcmk`) ++ * `start_opts`: Additional arguments given to the postgres process on startup. ++ See "postgres --help" for available options.
Useful when the postgresql.conf ++ file is not in the data directory (PGDATA), eg.: ++ `-c config_file=/etc/postgresql/9.3/main/postgresql.conf` ++ * `system_user`: the system owner of your instance's process (default: ++ `postgres`) ++ * `maxlag`: maximum lag allowed on a standby before we set a negative master ++ score on it. The calculation is based on the difference between the current ++ xlog location on the master and the write location on the standby. ++ (default: 0, which disables this feature) ++ ++For a demonstration about how to setup a cluster, see ++[http://clusterlabs.github.io/PAF/documentation.html](http://clusterlabs.github.io/PAF/documentation.html). ++ diff --git a/SOURCES/bz1891883-ethmonitor-vlan-fix.patch b/SOURCES/bz1891883-ethmonitor-vlan-fix.patch new file mode 100644 index 0000000..ffe74d1 --- /dev/null +++ b/SOURCES/bz1891883-ethmonitor-vlan-fix.patch @@ -0,0 +1,25 @@ +From 7f7ca75100a846242ff1510fd9bcf299cd3d00eb Mon Sep 17 00:00:00 2001 +From: Aleksei Burlakov +Date: Mon, 26 Oct 2020 13:25:45 +0100 +Subject: [PATCH] ethmonitor: is_interface: RE matches vlan names + +Vlan names end not with : but are suffixed with the @devices-name +--- + heartbeat/ethmonitor | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/heartbeat/ethmonitor b/heartbeat/ethmonitor +index e791fbe9d..cf0321ab4 100755 +--- a/heartbeat/ethmonitor ++++ b/heartbeat/ethmonitor +@@ -230,8 +230,8 @@ is_interface() { + # + # List interfaces but exclude FreeS/WAN ipsecN virtual interfaces + # +- local iface=`$IP2UTIL -o -f inet addr show | grep " $1 " \ +- | cut -d ' ' -f2 | sort -u | grep -v '^ipsec[0-9][0-9]*$'` ++ local iface=`$IP2UTIL -o -f link addr show | grep -e " $1[:@]" \ ++ | cut -d ' ' -f2 | tr -d ':' | cut -d '@' -f1 | sort -u | grep -v '^ipsec[0-9][0-9]*$'` + [ "$iface" != "" ] + } + diff --git a/SOURCES/bz1902045-iface-vlan-vlan-not-unique.patch b/SOURCES/bz1902045-iface-vlan-vlan-not-unique.patch new file mode 100644 index 
0000000..b8448cf --- /dev/null +++ b/SOURCES/bz1902045-iface-vlan-vlan-not-unique.patch @@ -0,0 +1,40 @@ +From 3dd051ed56418dc241417ea02e59db3982b7b92c Mon Sep 17 00:00:00 2001 +From: Oliver Freyermuth +Date: Thu, 26 Nov 2020 10:25:01 +0100 +Subject: [PATCH] heartbeat/iface-vlan: vlan_{interface,id} do not have to be + unique. + +Machines commonly have several vlan_id attached to one interface, +and may also have a vlan_id attached to several interfaces. + +vlan_name will still be unique, usual names are: +- bond_in.83@bond_in +- bond_in.84@bond_in + +fixes #1581 +--- + heartbeat/iface-vlan | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/heartbeat/iface-vlan b/heartbeat/iface-vlan +index cbe7e86da..d0481373c 100755 +--- a/heartbeat/iface-vlan ++++ b/heartbeat/iface-vlan +@@ -89,7 +89,7 @@ vlan_meta_data() { + + + +- ++ + + Define the interface where VLAN should be attached. + +@@ -99,7 +99,7 @@ vlan_meta_data() { + + + +- ++ + + Define the VLAN ID. It has to be a value between 0 and 4094. 
+ diff --git a/SOURCES/bz1920698-podman-return-not-running-probe.patch b/SOURCES/bz1920698-podman-return-not-running-probe.patch new file mode 100644 index 0000000..b8420f5 --- /dev/null +++ b/SOURCES/bz1920698-podman-return-not-running-probe.patch @@ -0,0 +1,42 @@ +From 6877b20a83cb691884996bf77385259388fdebb2 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Wed, 3 Mar 2021 17:06:12 +0100 +Subject: [PATCH] podman: return OCF_NOT_RUNNING when monitor cmd fails (not + running) + +--- + heartbeat/podman | 21 +++++++++++++-------- + 1 file changed, 13 insertions(+), 8 deletions(-) + +diff --git a/heartbeat/podman b/heartbeat/podman +index 82ea14624..5b707f3f5 100755 +--- a/heartbeat/podman ++++ b/heartbeat/podman +@@ -204,14 +204,19 @@ monitor_cmd_exec() + # 125: no container with name or ID ${CONTAINER} found + # 126: container state improper (not running) + # 127: any other error +- if [ $rc -eq 125 ] || [ $rc -eq 126 ]; then +- rc=$OCF_NOT_RUNNING +- elif [ $rc -ne 0 ]; then +- ocf_exit_reason "monitor cmd failed (rc=$rc), output: $out" +- rc=$OCF_ERR_GENERIC +- else +- ocf_log debug "monitor cmd passed: exit code = $rc" +- fi ++ # 255: podman 2+: container not running ++ case "$rc" in ++ 125|126|255) ++ rc=$OCF_NOT_RUNNING ++ ;; ++ 0) ++ ocf_log debug "monitor cmd passed: exit code = $rc" ++ ;; ++ *) ++ ocf_exit_reason "monitor cmd failed (rc=$rc), output: $out" ++ rc=$OCF_ERR_GENERIC ++ ;; ++ esac + + return $rc + } diff --git a/SOURCES/bz1924363-nfsserver-error-check-unmount.patch b/SOURCES/bz1924363-nfsserver-error-check-unmount.patch new file mode 100644 index 0000000..e77e92d --- /dev/null +++ b/SOURCES/bz1924363-nfsserver-error-check-unmount.patch @@ -0,0 +1,57 @@ +From dc4fc6fb51481e62c763212129e7dbae4cb663fd Mon Sep 17 00:00:00 2001 +From: Reid Wahl +Date: Tue, 2 Feb 2021 17:55:40 -0800 +Subject: [PATCH] nfsserver: Error-check unbind_tree + +Fail to stop if unmounting rpcpipefs_dir or /var/lib/nfs fails. 
+ +Resolves: RHBZ#1924363 + +Signed-off-by: Reid Wahl +--- + heartbeat/nfsserver | 23 ++++++++++++++++++++--- + 1 file changed, 20 insertions(+), 3 deletions(-) + +diff --git a/heartbeat/nfsserver b/heartbeat/nfsserver +index 80d20676b..96b19abe3 100755 +--- a/heartbeat/nfsserver ++++ b/heartbeat/nfsserver +@@ -465,9 +465,20 @@ unbind_tree () + sleep 1 + i=$((i + 1)) + done ++ ++ if mount | grep -q " on $OCF_RESKEY_rpcpipefs_dir "; then ++ ocf_log err "Failed to unmount $OCF_RESKEY_rpcpipefs_dir" ++ return $OCF_ERR_GENERIC ++ fi ++ + if is_bound /var/lib/nfs; then +- umount /var/lib/nfs ++ if ! umount /var/lib/nfs; then ++ ocf_log err "Failed to unmount /var/lib/nfs" ++ return $OCF_ERR_GENERIC ++ fi + fi ++ ++ return $OCF_SUCCESS + } + + binary_status() +@@ -836,8 +847,14 @@ nfsserver_stop () + esac + + unbind_tree +- ocf_log info "NFS server stopped" +- return 0 ++ rc=$? ++ if [ "$rc" -ne $OCF_SUCCESS ]; then ++ ocf_exit_reason "Failed to unmount a bind mount" ++ else ++ ocf_log info "NFS server stopped" ++ fi ++ ++ return $rc + } + + nfsserver_validate () diff --git a/SOURCES/bz1932863-VirtualDomain-fix-pid-status.patch b/SOURCES/bz1932863-VirtualDomain-fix-pid-status.patch new file mode 100644 index 0000000..95f2f96 --- /dev/null +++ b/SOURCES/bz1932863-VirtualDomain-fix-pid-status.patch @@ -0,0 +1,31 @@ +From 500de79739cd39808fb48fa556c9b9b9fe2e8acd Mon Sep 17 00:00:00 2001 +From: Matthias Hensler +Date: Thu, 18 Feb 2021 12:49:49 +0100 +Subject: [PATCH] fix pid_status() for VirtualDomain on EL8 + +see https://github.com/ClusterLabs/resource-agents/issues/1613 +--- + heartbeat/VirtualDomain | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/heartbeat/VirtualDomain b/heartbeat/VirtualDomain +index eb41e3e22..f9cd21fc7 100755 +--- a/heartbeat/VirtualDomain ++++ b/heartbeat/VirtualDomain +@@ -421,14 +421,14 @@ pid_status() + case "$emulator" in + qemu-kvm|qemu-dm|qemu-system-*) + rc=$OCF_NOT_RUNNING +- ps awx | grep -E 
"[q]emu-(kvm|dm|system).*-name $DOMAIN_NAME " > /dev/null 2>&1 ++ ps awx | grep -E "[q]emu-(kvm|dm|system).*-name ($DOMAIN_NAME|[^ ]*guest=$DOMAIN_NAME(,[^ ]*)?) " > /dev/null 2>&1 + if [ $? -eq 0 ]; then + rc=$OCF_SUCCESS + fi + ;; + libvirt_lxc) + rc=$OCF_NOT_RUNNING +- ps awx | grep -E "[l]ibvirt_lxc.*-name $DOMAIN_NAME " > /dev/null 2>&1 ++ ps awx | grep -E "[l]ibvirt_lxc.*-name ($DOMAIN_NAME|[^ ]*guest=$DOMAIN_NAME(,[^ ]*)?) " > /dev/null 2>&1 + if [ $? -eq 0 ]; then + rc=$OCF_SUCCESS + fi diff --git a/SOURCES/bz1934651-db2-add-PRIMARY-REMOTE_CATCHUP_PENDING-CONNECTED.patch b/SOURCES/bz1934651-db2-add-PRIMARY-REMOTE_CATCHUP_PENDING-CONNECTED.patch new file mode 100644 index 0000000..59fb0ef --- /dev/null +++ b/SOURCES/bz1934651-db2-add-PRIMARY-REMOTE_CATCHUP_PENDING-CONNECTED.patch @@ -0,0 +1,23 @@ +From dd5394180267c652d0928db8c5508d9977893fe5 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Thu, 18 Mar 2021 16:23:10 +0100 +Subject: [PATCH] db2: add PRIMARY/REMOTE_CATCHUP_PENDING/CONNECTED status to + promote-check + +--- + heartbeat/db2 | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/heartbeat/db2 b/heartbeat/db2 +index a57fd2bb6..459136cbd 100755 +--- a/heartbeat/db2 ++++ b/heartbeat/db2 +@@ -767,7 +767,7 @@ db2_promote() { + return $OCF_SUCCESS + ;; + +- PRIMARY/PEER/*|PRIMARY/REMOTE_CATCHUP/*|Primary/Peer) ++ PRIMARY/PEER/*|PRIMARY/REMOTE_CATCHUP/*|PRIMARY/REMOTE_CATCHUP_PENDING/CONNECTED|Primary/Peer) + # nothing to do, only update pacemaker's view + echo MASTER > $STATE_FILE + return $OCF_SUCCESS diff --git a/SOURCES/bz1935422-python-pygments-fix-CVE-2021-20270.patch b/SOURCES/bz1935422-python-pygments-fix-CVE-2021-20270.patch new file mode 100644 index 0000000..b9ed544 --- /dev/null +++ b/SOURCES/bz1935422-python-pygments-fix-CVE-2021-20270.patch @@ -0,0 +1,52 @@ +From f91804ff4772e3ab41f46e28d370f57898700333 Mon Sep 17 00:00:00 2001 +From: Georg Brandl +Date: Thu, 10 Dec 2020 08:19:21 +0100 +Subject: [PATCH] fixes #1625: 
infinite loop in SML lexer + +Reason was a lookahead-only pattern which was included in the state +where the lookahead was transitioning to. +--- + pygments/lexers/ml.py | 12 ++++++------ + 2 files changed, 14 insertions(+), 6 deletions(-) + +diff --git a/pygments/lexers/ml.py b/pygments/lexers/ml.py +index 8ca8ce3eb..f2ac367c5 100644 +--- a/pygments/lexers/ml.py ++++ b/pygments/lexers/ml.py +@@ -142,7 +142,7 @@ def id_callback(self, match): + (r'#\s+(%s)' % symbolicid_re, Name.Label), + # Some reserved words trigger a special, local lexer state change + (r'\b(datatype|abstype)\b(?!\')', Keyword.Reserved, 'dname'), +- (r'(?=\b(exception)\b(?!\'))', Text, ('ename')), ++ (r'\b(exception)\b(?!\')', Keyword.Reserved, 'ename'), + (r'\b(functor|include|open|signature|structure)\b(?!\')', + Keyword.Reserved, 'sname'), + (r'\b(type|eqtype)\b(?!\')', Keyword.Reserved, 'tname'), +@@ -315,15 +315,14 @@ def id_callback(self, match): + 'ename': [ + include('whitespace'), + +- (r'(exception|and)\b(\s+)(%s)' % alphanumid_re, ++ (r'(and\b)(\s+)(%s)' % alphanumid_re, + bygroups(Keyword.Reserved, Text, Name.Class)), +- (r'(exception|and)\b(\s*)(%s)' % symbolicid_re, ++ (r'(and\b)(\s*)(%s)' % symbolicid_re, + bygroups(Keyword.Reserved, Text, Name.Class)), + (r'\b(of)\b(?!\')', Keyword.Reserved), ++ (r'(%s)|(%s)' % (alphanumid_re, symbolicid_re), Name.Class), + +- include('breakout'), +- include('core'), +- (r'\S+', Error), ++ default('#pop'), + ], + + 'datcon': [ +@@ -445,6 +444,7 @@ class OcamlLexer(RegexLexer): + ], + } + ++ + class OpaLexer(RegexLexer): + """ + Lexer for the Opa language (http://opalang.org). 
diff --git a/SOURCES/bz1939281-aws-vpc-move-ip-add-ENI-lookup.patch b/SOURCES/bz1939281-aws-vpc-move-ip-add-ENI-lookup.patch new file mode 100644 index 0000000..94d4d95 --- /dev/null +++ b/SOURCES/bz1939281-aws-vpc-move-ip-add-ENI-lookup.patch @@ -0,0 +1,141 @@ +From b727fe4e2a0f4c88fca0ed9f90f57e570253c961 Mon Sep 17 00:00:00 2001 +From: Costas Tyfoxylos +Date: Wed, 26 Aug 2020 15:18:00 +0300 +Subject: [PATCH 1/2] aws-vpc-move-ip: Implemented optional eni lookup instead + of the default instance id. + +In a shared network pattern where the cluster resides in shared subnets the instance ids of the nodes are not retrievable but the eni ids are and this optional feature gives transparent support in that situation. +--- + heartbeat/aws-vpc-move-ip | 41 +++++++++++++++++++++++++++++++-------- + 1 file changed, 33 insertions(+), 8 deletions(-) + +diff --git a/heartbeat/aws-vpc-move-ip b/heartbeat/aws-vpc-move-ip +index 1b540caec..bc82428e5 100755 +--- a/heartbeat/aws-vpc-move-ip ++++ b/heartbeat/aws-vpc-move-ip +@@ -44,6 +44,7 @@ OCF_RESKEY_routing_table_default="" + OCF_RESKEY_routing_table_role_default="" + OCF_RESKEY_interface_default="eth0" + OCF_RESKEY_monapi_default="false" ++OCF_RESKEY_lookup_type_default="InstanceId" + + : ${OCF_RESKEY_awscli=${OCF_RESKEY_awscli_default}} + : ${OCF_RESKEY_profile=${OCF_RESKEY_profile_default}} +@@ -54,6 +55,7 @@ OCF_RESKEY_monapi_default="false" + : ${OCF_RESKEY_routing_table_role=${OCF_RESKEY_routing_table_role_default}} + : ${OCF_RESKEY_interface=${OCF_RESKEY_interface_default}} + : ${OCF_RESKEY_monapi=${OCF_RESKEY_monapi_default}} ++: ${OCF_RESKEY_lookup_type=${OCF_RESKEY_lookup_type_default}} + + [ -n "$OCF_RESKEY_region" ] && region_opt="--region $OCF_RESKEY_region" + ####################################################################### +@@ -154,6 +156,17 @@ Enable enhanced monitoring using AWS API calls to check route table entry + Enhanced Monitoring + + ++ ++ ++ ++Name of resource type to lookup in route table. 
++"InstanceId" : EC2 instance ID. (default) ++"NetworkInterfaceId" : ENI ID. (useful in shared VPC setups). ++ ++lookup type for route table resource ++ ++ ++ + + + +@@ -187,7 +200,7 @@ execute_cmd_as_role(){ + + ec2ip_set_address_param_compat(){ + # Include backward compatibility for the deprecated address parameter +- if [ -z "$OCF_RESKEY_ip" ] && [ -n "$OCF_RESKEY_address" ]; then ++ if [ -z "$OCF_RESKEY_ip" ] && [ -n "$OCF_RESKEY_address" ]; then + OCF_RESKEY_ip="$OCF_RESKEY_address" + fi + } +@@ -213,16 +226,24 @@ ec2ip_validate() { + } + + ec2ip_monitor() { +- MON_RES="" ++ MON_RES="" ++ if [ "${OCF_RESKEY_lookup_type}" = "NetworkInterfaceId" ]; then ++ EC2_ID="$(ec2ip_get_instance_eni)" ++ RESOURCE_TYPE="interface" ++ else ++ EC2_ID="$EC2_INSTANCE_ID" ++ RESOURCE_TYPE="instance" ++ fi ++ + if ocf_is_true ${OCF_RESKEY_monapi} || [ "$__OCF_ACTION" = "start" ] || ocf_is_probe; then + for rtb in $(echo $OCF_RESKEY_routing_table | sed -e 's/,/ /g'); do + ocf_log info "monitor: check routing table (API call) - $rtb" + if [[ -z "${OCF_RESKEY_routing_table_role}" ]]; then +- cmd="$OCF_RESKEY_awscli --profile $OCF_RESKEY_profile $region_opt --output text ec2 describe-route-tables --route-table-ids $rtb --query RouteTables[*].Routes[?DestinationCidrBlock=='$OCF_RESKEY_ip/32'].InstanceId" ++ cmd="$OCF_RESKEY_awscli --profile $OCF_RESKEY_profile $region_opt --output text ec2 describe-route-tables --route-table-ids $rtb --query RouteTables[*].Routes[?DestinationCidrBlock=='$OCF_RESKEY_ip/32'].$OCF_RESKEY_lookup_type" + ocf_log debug "executing command: $cmd" + ROUTE_TO_INSTANCE="$($cmd)" + else +- cmd="$OCF_RESKEY_awscli $region_opt --output text ec2 describe-route-tables --route-table-ids $rtb --query RouteTables[*].Routes[?DestinationCidrBlock=='$OCF_RESKEY_ip/32'].InstanceId" ++ cmd="$OCF_RESKEY_awscli $region_opt --output text ec2 describe-route-tables --route-table-ids $rtb --query 
RouteTables[*].Routes[?DestinationCidrBlock=='$OCF_RESKEY_ip/32'].$OCF_RESKEY_lookup_type" + ROUTE_TO_INSTANCE="$(execute_cmd_as_role "$cmd" $OCF_RESKEY_routing_table_role)" + fi + ocf_log debug "Overlay IP is currently routed to ${ROUTE_TO_INSTANCE}" +@@ -230,8 +251,8 @@ ec2ip_monitor() { + ROUTE_TO_INSTANCE="" + fi + +- if [ "$EC2_INSTANCE_ID" != "$ROUTE_TO_INSTANCE" ]; then +- ocf_log warn "not routed to this instance ($EC2_INSTANCE_ID) but to instance $ROUTE_TO_INSTANCE on $rtb" ++ if [ "$EC2_ID" != "$ROUTE_TO_INSTANCE" ]; then ++ ocf_log warn "not routed to this $RESOURCE_TYPE ($EC2_ID) but to $RESOURCE_TYPE $ROUTE_TO_INSTANCE on $rtb" + MON_RES="$MON_RES $rtb" + fi + sleep 1 +@@ -275,7 +296,7 @@ ec2ip_drop() { + return $OCF_SUCCESS + } + +-ec2ip_get_and_configure() { ++ec2ip_get_instance_eni() { + MAC_FILE="/sys/class/net/${OCF_RESKEY_interface}/address" + if [ -f $MAC_FILE ]; then + cmd="cat ${MAC_FILE}" +@@ -300,7 +321,11 @@ ec2ip_get_and_configure() { + return $OCF_ERR_GENERIC + fi + ocf_log debug "network interface id associated MAC address ${MAC_ADDR}: ${EC2_NETWORK_INTERFACE_ID}" ++ echo $EC2_NETWORK_INTERFACE_ID ++} + ++ec2ip_get_and_configure() { ++ EC2_NETWORK_INTERFACE_ID="$(ec2ip_get_instance_eni)" + for rtb in $(echo $OCF_RESKEY_routing_table | sed -e 's/,/ /g'); do + if [ -z "${OCF_RESKEY_routing_table_role}" ]; then + cmd="$OCF_RESKEY_awscli --profile $OCF_RESKEY_profile $region_opt --output text ec2 replace-route --route-table-id $rtb --destination-cidr-block ${OCF_RESKEY_ip}/32 --network-interface-id $EC2_NETWORK_INTERFACE_ID" + +From f4c8daae098dd33bdd5136ca4846eb505110e006 Mon Sep 17 00:00:00 2001 +From: Sander Botman +Date: Fri, 28 Aug 2020 22:01:03 +0200 +Subject: [PATCH 2/2] aws-vpc-move-ip: Fix the region option + +--- + heartbeat/aws-vpc-move-ip | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/heartbeat/aws-vpc-move-ip b/heartbeat/aws-vpc-move-ip +index bc82428e5..a5b28ad92 100755 +--- 
a/heartbeat/aws-vpc-move-ip ++++ b/heartbeat/aws-vpc-move-ip +@@ -243,7 +243,7 @@ ec2ip_monitor() { + ocf_log debug "executing command: $cmd" + ROUTE_TO_INSTANCE="$($cmd)" + else +- cmd="$OCF_RESKEY_awscli $region_opt --output text ec2 describe-route-tables --route-table-ids $rtb --query RouteTables[*].Routes[?DestinationCidrBlock=='$OCF_RESKEY_ip/32'].$OCF_RESKEY_lookup_type" ++ cmd="$OCF_RESKEY_awscli $region_opt --output text ec2 describe-route-tables --route-table-ids $rtb --query RouteTables[*].Routes[?DestinationCidrBlock=='$OCF_RESKEY_ip/32'].$OCF_RESKEY_lookup_type" + ROUTE_TO_INSTANCE="$(execute_cmd_as_role "$cmd" $OCF_RESKEY_routing_table_role)" + fi + ocf_log debug "Overlay IP is currently routed to ${ROUTE_TO_INSTANCE}" diff --git a/SOURCES/bz1939992-awsvip-dont-partially-match-IPs.patch b/SOURCES/bz1939992-awsvip-dont-partially-match-IPs.patch new file mode 100644 index 0000000..1a0e86e --- /dev/null +++ b/SOURCES/bz1939992-awsvip-dont-partially-match-IPs.patch @@ -0,0 +1,23 @@ +From 3491a6ad30830a8545defa5a417a7db46b093904 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Wed, 17 Mar 2021 12:39:10 +0100 +Subject: [PATCH] awsvip: dont partially match similar IPs during + monitor-action + +--- + heartbeat/awsvip | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/heartbeat/awsvip b/heartbeat/awsvip +index 7d0bf35b6..044d049c6 100755 +--- a/heartbeat/awsvip ++++ b/heartbeat/awsvip +@@ -172,7 +172,7 @@ awsvip_monitor() { + --instance-id "${INSTANCE_ID}" \ + --query 'Reservations[].Instances[].NetworkInterfaces[].PrivateIpAddresses[].PrivateIpAddress[]' \ + --output text | \ +- grep -q "${SECONDARY_PRIVATE_IP}" ++ grep -qE "(^|\s)${SECONDARY_PRIVATE_IP}(\s|$)" + RET=$? 
+ + if [ $RET -ne 0 ]; then diff --git a/SOURCES/bz1940094-aws-agents-dont-spam-logs.patch b/SOURCES/bz1940094-aws-agents-dont-spam-logs.patch new file mode 100644 index 0000000..97ff44e --- /dev/null +++ b/SOURCES/bz1940094-aws-agents-dont-spam-logs.patch @@ -0,0 +1,64 @@ +From 59b0840d262900d0eaa8b19df3ede55eea5250d2 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Wed, 17 Mar 2021 12:10:59 +0100 +Subject: [PATCH] AWS agents: dont spam log files when getting token + +--- + heartbeat/aws-vpc-move-ip | 2 +- + heartbeat/aws-vpc-route53.in | 2 +- + heartbeat/awseip | 2 +- + heartbeat/awsvip | 2 +- + 4 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/heartbeat/aws-vpc-move-ip b/heartbeat/aws-vpc-move-ip +index cbb629b00..3ca3d6bd6 100755 +--- a/heartbeat/aws-vpc-move-ip ++++ b/heartbeat/aws-vpc-move-ip +@@ -215,7 +215,7 @@ ec2ip_validate() { + return $OCF_ERR_CONFIGURED + fi + +- TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") ++ TOKEN=$(curl -sX PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") + EC2_INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id -H "X-aws-ec2-metadata-token: $TOKEN") + + if [ -z "${EC2_INSTANCE_ID}" ]; then +diff --git a/heartbeat/aws-vpc-route53.in b/heartbeat/aws-vpc-route53.in +index 4fb17019b..21948eaca 100644 +--- a/heartbeat/aws-vpc-route53.in ++++ b/heartbeat/aws-vpc-route53.in +@@ -347,7 +347,7 @@ r53_monitor() { + _get_ip() { + case $OCF_RESKEY_ip in + local|public) +- TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") ++ TOKEN=$(curl -sX PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") + IPADDRESS=$(curl -s http://169.254.169.254/latest/meta-data/${OCF_RESKEY_ip}-ipv4 -H "X-aws-ec2-metadata-token: $TOKEN");; + *.*.*.*) + IPADDRESS="${OCF_RESKEY_ip}";; +diff --git 
a/heartbeat/awseip b/heartbeat/awseip +index de1967774..12ffffaa3 100755 +--- a/heartbeat/awseip ++++ b/heartbeat/awseip +@@ -244,7 +244,7 @@ AWSCLI="${OCF_RESKEY_awscli}" + ELASTIC_IP="${OCF_RESKEY_elastic_ip}" + ALLOCATION_ID="${OCF_RESKEY_allocation_id}" + PRIVATE_IP_ADDRESS="${OCF_RESKEY_private_ip_address}" +-TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") ++TOKEN=$(curl -sX PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") + INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id -H "X-aws-ec2-metadata-token: $TOKEN") + + case $__OCF_ACTION in +diff --git a/heartbeat/awsvip b/heartbeat/awsvip +index 8050107e8..7d0bf35b6 100755 +--- a/heartbeat/awsvip ++++ b/heartbeat/awsvip +@@ -206,7 +206,7 @@ esac + + AWSCLI="${OCF_RESKEY_awscli}" + SECONDARY_PRIVATE_IP="${OCF_RESKEY_secondary_private_ip}" +-TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") ++TOKEN=$(curl -sX PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") + INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id -H "X-aws-ec2-metadata-token: $TOKEN") + MAC_ADDRESS=$(curl -s http://169.254.169.254/latest/meta-data/mac -H "X-aws-ec2-metadata-token: $TOKEN") + NETWORK_ID=$(curl -s http://169.254.169.254/latest/meta-data/network/interfaces/macs/${MAC_ADDRESS}/interface-id -H "X-aws-ec2-metadata-token: $TOKEN") diff --git a/SOURCES/bz1943093-aws-vpc-move-ip-add-ENI-lookup.patch b/SOURCES/bz1943093-aws-vpc-move-ip-add-ENI-lookup.patch deleted file mode 100644 index 94d4d95..0000000 --- a/SOURCES/bz1943093-aws-vpc-move-ip-add-ENI-lookup.patch +++ /dev/null @@ -1,141 +0,0 @@ -From b727fe4e2a0f4c88fca0ed9f90f57e570253c961 Mon Sep 17 00:00:00 2001 -From: Costas Tyfoxylos -Date: Wed, 26 Aug 2020 15:18:00 +0300 -Subject: [PATCH 1/2] aws-vpc-move-ip: Implemented 
optional eni lookup instead - of the default instance id. - -In a shared network pattern where the cluster resides in shared subnets the instance ids of the nodes are not retrievable but the eni ids are and this optional feature gives transparent support in that situation. ---- - heartbeat/aws-vpc-move-ip | 41 +++++++++++++++++++++++++++++++-------- - 1 file changed, 33 insertions(+), 8 deletions(-) - -diff --git a/heartbeat/aws-vpc-move-ip b/heartbeat/aws-vpc-move-ip -index 1b540caec..bc82428e5 100755 ---- a/heartbeat/aws-vpc-move-ip -+++ b/heartbeat/aws-vpc-move-ip -@@ -44,6 +44,7 @@ OCF_RESKEY_routing_table_default="" - OCF_RESKEY_routing_table_role_default="" - OCF_RESKEY_interface_default="eth0" - OCF_RESKEY_monapi_default="false" -+OCF_RESKEY_lookup_type_default="InstanceId" - - : ${OCF_RESKEY_awscli=${OCF_RESKEY_awscli_default}} - : ${OCF_RESKEY_profile=${OCF_RESKEY_profile_default}} -@@ -54,6 +55,7 @@ OCF_RESKEY_monapi_default="false" - : ${OCF_RESKEY_routing_table_role=${OCF_RESKEY_routing_table_role_default}} - : ${OCF_RESKEY_interface=${OCF_RESKEY_interface_default}} - : ${OCF_RESKEY_monapi=${OCF_RESKEY_monapi_default}} -+: ${OCF_RESKEY_lookup_type=${OCF_RESKEY_lookup_type_default}} - - [ -n "$OCF_RESKEY_region" ] && region_opt="--region $OCF_RESKEY_region" - ####################################################################### -@@ -154,6 +156,17 @@ Enable enhanced monitoring using AWS API calls to check route table entry - Enhanced Monitoring - - -+ -+ -+ -+Name of resource type to lookup in route table. -+"InstanceId" : EC2 instance ID. (default) -+"NetworkInterfaceId" : ENI ID. (useful in shared VPC setups). 
-+ -+lookup type for route table resource -+ -+ -+ - - - -@@ -187,7 +200,7 @@ execute_cmd_as_role(){ - - ec2ip_set_address_param_compat(){ - # Include backward compatibility for the deprecated address parameter -- if [ -z "$OCF_RESKEY_ip" ] && [ -n "$OCF_RESKEY_address" ]; then -+ if [ -z "$OCF_RESKEY_ip" ] && [ -n "$OCF_RESKEY_address" ]; then - OCF_RESKEY_ip="$OCF_RESKEY_address" - fi - } -@@ -213,16 +226,24 @@ ec2ip_validate() { - } - - ec2ip_monitor() { -- MON_RES="" -+ MON_RES="" -+ if [ "${OCF_RESKEY_lookup_type}" = "NetworkInterfaceId" ]; then -+ EC2_ID="$(ec2ip_get_instance_eni)" -+ RESOURCE_TYPE="interface" -+ else -+ EC2_ID="$EC2_INSTANCE_ID" -+ RESOURCE_TYPE="instance" -+ fi -+ - if ocf_is_true ${OCF_RESKEY_monapi} || [ "$__OCF_ACTION" = "start" ] || ocf_is_probe; then - for rtb in $(echo $OCF_RESKEY_routing_table | sed -e 's/,/ /g'); do - ocf_log info "monitor: check routing table (API call) - $rtb" - if [[ -z "${OCF_RESKEY_routing_table_role}" ]]; then -- cmd="$OCF_RESKEY_awscli --profile $OCF_RESKEY_profile $region_opt --output text ec2 describe-route-tables --route-table-ids $rtb --query RouteTables[*].Routes[?DestinationCidrBlock=='$OCF_RESKEY_ip/32'].InstanceId" -+ cmd="$OCF_RESKEY_awscli --profile $OCF_RESKEY_profile $region_opt --output text ec2 describe-route-tables --route-table-ids $rtb --query RouteTables[*].Routes[?DestinationCidrBlock=='$OCF_RESKEY_ip/32'].$OCF_RESKEY_lookup_type" - ocf_log debug "executing command: $cmd" - ROUTE_TO_INSTANCE="$($cmd)" - else -- cmd="$OCF_RESKEY_awscli $region_opt --output text ec2 describe-route-tables --route-table-ids $rtb --query RouteTables[*].Routes[?DestinationCidrBlock=='$OCF_RESKEY_ip/32'].InstanceId" -+ cmd="$OCF_RESKEY_awscli $region_opt --output text ec2 describe-route-tables --route-table-ids $rtb --query RouteTables[*].Routes[?DestinationCidrBlock=='$OCF_RESKEY_ip/32'].$OCF_RESKEY_lookup_type" - ROUTE_TO_INSTANCE="$(execute_cmd_as_role "$cmd" $OCF_RESKEY_routing_table_role)" - fi - ocf_log 
debug "Overlay IP is currently routed to ${ROUTE_TO_INSTANCE}" -@@ -230,8 +251,8 @@ ec2ip_monitor() { - ROUTE_TO_INSTANCE="" - fi - -- if [ "$EC2_INSTANCE_ID" != "$ROUTE_TO_INSTANCE" ]; then -- ocf_log warn "not routed to this instance ($EC2_INSTANCE_ID) but to instance $ROUTE_TO_INSTANCE on $rtb" -+ if [ "$EC2_ID" != "$ROUTE_TO_INSTANCE" ]; then -+ ocf_log warn "not routed to this $RESOURCE_TYPE ($EC2_ID) but to $RESOURCE_TYPE $ROUTE_TO_INSTANCE on $rtb" - MON_RES="$MON_RES $rtb" - fi - sleep 1 -@@ -275,7 +296,7 @@ ec2ip_drop() { - return $OCF_SUCCESS - } - --ec2ip_get_and_configure() { -+ec2ip_get_instance_eni() { - MAC_FILE="/sys/class/net/${OCF_RESKEY_interface}/address" - if [ -f $MAC_FILE ]; then - cmd="cat ${MAC_FILE}" -@@ -300,7 +321,11 @@ ec2ip_get_and_configure() { - return $OCF_ERR_GENERIC - fi - ocf_log debug "network interface id associated MAC address ${MAC_ADDR}: ${EC2_NETWORK_INTERFACE_ID}" -+ echo $EC2_NETWORK_INTERFACE_ID -+} - -+ec2ip_get_and_configure() { -+ EC2_NETWORK_INTERFACE_ID="$(ec2ip_get_instance_eni)" - for rtb in $(echo $OCF_RESKEY_routing_table | sed -e 's/,/ /g'); do - if [ -z "${OCF_RESKEY_routing_table_role}" ]; then - cmd="$OCF_RESKEY_awscli --profile $OCF_RESKEY_profile $region_opt --output text ec2 replace-route --route-table-id $rtb --destination-cidr-block ${OCF_RESKEY_ip}/32 --network-interface-id $EC2_NETWORK_INTERFACE_ID" - -From f4c8daae098dd33bdd5136ca4846eb505110e006 Mon Sep 17 00:00:00 2001 -From: Sander Botman -Date: Fri, 28 Aug 2020 22:01:03 +0200 -Subject: [PATCH 2/2] aws-vpc-move-ip: Fix the region option - ---- - heartbeat/aws-vpc-move-ip | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/heartbeat/aws-vpc-move-ip b/heartbeat/aws-vpc-move-ip -index bc82428e5..a5b28ad92 100755 ---- a/heartbeat/aws-vpc-move-ip -+++ b/heartbeat/aws-vpc-move-ip -@@ -243,7 +243,7 @@ ec2ip_monitor() { - ocf_log debug "executing command: $cmd" - ROUTE_TO_INSTANCE="$($cmd)" - else -- cmd="$OCF_RESKEY_awscli 
$region_opt --output text ec2 describe-route-tables --route-table-ids $rtb --query RouteTables[*].Routes[?DestinationCidrBlock=='$OCF_RESKEY_ip/32'].$OCF_RESKEY_lookup_type" -+ cmd="$OCF_RESKEY_awscli $region_opt --output text ec2 describe-route-tables --route-table-ids $rtb --query RouteTables[*].Routes[?DestinationCidrBlock=='$OCF_RESKEY_ip/32'].$OCF_RESKEY_lookup_type" - ROUTE_TO_INSTANCE="$(execute_cmd_as_role "$cmd" $OCF_RESKEY_routing_table_role)" - fi - ocf_log debug "Overlay IP is currently routed to ${ROUTE_TO_INSTANCE}" diff --git a/SOURCES/bz1943464-python-pygments-fix-CVE-2021-27291.patch b/SOURCES/bz1943464-python-pygments-fix-CVE-2021-27291.patch new file mode 100644 index 0000000..d28028c --- /dev/null +++ b/SOURCES/bz1943464-python-pygments-fix-CVE-2021-27291.patch @@ -0,0 +1,138 @@ +From 2e7e8c4a7b318f4032493773732754e418279a14 Mon Sep 17 00:00:00 2001 +From: Georg Brandl +Date: Mon, 11 Jan 2021 09:46:34 +0100 +Subject: [PATCH] Fix several exponential/cubic complexity regexes found by Ben + Caller/Doyensec + +--- + pygments/lexers/archetype.py | 2 +- + pygments/lexers/factor.py | 4 ++-- + pygments/lexers/jvm.py | 1 - + pygments/lexers/matlab.py | 6 +++--- + pygments/lexers/objective.py | 4 ++-- + pygments/lexers/templates.py | 2 +- + pygments/lexers/varnish.py | 2 +- + 8 files changed, 14 insertions(+), 12 deletions(-) + +diff --git a/pygments/lexers/archetype.py b/pygments/lexers/archetype.py +index 65046613d..26f5ea8c9 100644 +--- a/pygments/lexers/archetype.py ++++ b/pygments/lexers/archetype.py +@@ -58,7 +58,7 @@ class AtomsLexer(RegexLexer): + (r'P((\d*(\.\d+)?[YyMmWwDd]){1,3}(T(\d*(\.\d+)?[HhMmSs]){,3})?|' + r'T(\d*(\.\d+)?[HhMmSs]){,3})', Literal.Date), + (r'[+-]?(\d+\.\d*|\.\d+|\d+)[eE][+-]?\d+', Number.Float), +- (r'[+-]?(\d+)*\.\d+%?', Number.Float), ++ (r'[+-]?\d*\.\d+%?', Number.Float), + (r'0x[0-9a-fA-F]+', Number.Hex), + (r'[+-]?\d+%?', Number.Integer), + ], +diff --git a/pygments/lexers/factor.py b/pygments/lexers/factor.py +index 
be7b30dff..9200547f9 100644 +--- a/pygments/lexers/factor.py ++++ b/pygments/lexers/factor.py +@@ -265,7 +265,7 @@ class FactorLexer(RegexLexer): + (r'(?:)\s', Keyword.Namespace), + + # strings +- (r'"""\s+(?:.|\n)*?\s+"""', String), ++ (r'"""\s(?:.|\n)*?\s"""', String), + (r'"(?:\\\\|\\"|[^"])*"', String), + (r'\S+"\s+(?:\\\\|\\"|[^"])*"', String), + (r'CHAR:\s+(?:\\[\\abfnrstv]|[^\\]\S*)\s', String.Char), +@@ -322,7 +322,7 @@ class FactorLexer(RegexLexer): + 'slots': [ + (r'\s+', Text), + (r';\s', Keyword, '#pop'), +- (r'(\{\s+)(\S+)(\s+[^}]+\s+\}\s)', ++ (r'(\{\s+)(\S+)(\s[^}]+\s\}\s)', + bygroups(Text, Name.Variable, Text)), + (r'\S+', Name.Variable), + ], +diff --git a/pygments/lexers/jvm.py b/pygments/lexers/jvm.py +index 62dfd45e5..9a9397c2d 100644 +--- a/pygments/lexers/jvm.py ++++ b/pygments/lexers/jvm.py +@@ -981,7 +981,6 @@ class CeylonLexer(RegexLexer): + (r'(import)(\s+)', bygroups(Keyword.Namespace, Text), 'import'), + (r'"(\\\\|\\[^\\]|[^"\\])*"', String), + (r"'\\.'|'[^\\]'|'\\\{#[0-9a-fA-F]{4}\}'", String.Char), +- (r'".*``.*``.*"', String.Interpol), + (r'(\.)([a-z_]\w*)', + bygroups(Operator, Name.Attribute)), + (r'[a-zA-Z_]\w*:', Name.Label), +diff --git a/pygments/lexers/matlab.py b/pygments/lexers/matlab.py +index 4823c6a7e..578848623 100644 +--- a/pygments/lexers/matlab.py ++++ b/pygments/lexers/matlab.py +@@ -137,7 +137,7 @@ class MatlabLexer(RegexLexer): + (r'.', Comment.Multiline), + ], + 'deffunc': [ +- (r'(\s*)(?:(.+)(\s*)(=)(\s*))?(.+)(\()(.*)(\))(\s*)', ++ (r'(\s*)(?:(\S+)(\s*)(=)(\s*))?(.+)(\()(.*)(\))(\s*)', + bygroups(Whitespace, Text, Whitespace, Punctuation, + Whitespace, Name.Function, Punctuation, Text, + Punctuation, Whitespace), '#pop'), +@@ -638,7 +638,7 @@ class OctaveLexer(RegexLexer): + (r"[^']*'", String, '#pop'), + ], + 'deffunc': [ +- (r'(\s*)(?:(.+)(\s*)(=)(\s*))?(.+)(\()(.*)(\))(\s*)', ++ (r'(\s*)(?:(\S+)(\s*)(=)(\s*))?(.+)(\()(.*)(\))(\s*)', + bygroups(Whitespace, Text, Whitespace, Punctuation, + Whitespace, 
Name.Function, Punctuation, Text, + Punctuation, Whitespace), '#pop'), +@@ -710,7 +710,7 @@ class ScilabLexer(RegexLexer): + (r'.', String, '#pop'), + ], + 'deffunc': [ +- (r'(\s*)(?:(.+)(\s*)(=)(\s*))?(.+)(\()(.*)(\))(\s*)', ++ (r'(\s*)(?:(\S+)(\s*)(=)(\s*))?(.+)(\()(.*)(\))(\s*)', + bygroups(Whitespace, Text, Whitespace, Punctuation, + Whitespace, Name.Function, Punctuation, Text, + Punctuation, Whitespace), '#pop'), +diff --git a/pygments/lexers/objective.py b/pygments/lexers/objective.py +index 34e4062f6..38ac9bb05 100644 +--- a/pygments/lexers/objective.py ++++ b/pygments/lexers/objective.py +@@ -261,11 +261,11 @@ class LogosLexer(ObjectiveCppLexer): + 'logos_classname'), + (r'(%hook|%group)(\s+)([a-zA-Z$_][\w$]+)', + bygroups(Keyword, Text, Name.Class)), +- (r'(%config)(\s*\(\s*)(\w+)(\s*=\s*)(.*?)(\s*\)\s*)', ++ (r'(%config)(\s*\(\s*)(\w+)(\s*=)(.*?)(\)\s*)', + bygroups(Keyword, Text, Name.Variable, Text, String, Text)), + (r'(%ctor)(\s*)(\{)', bygroups(Keyword, Text, Punctuation), + 'function'), +- (r'(%new)(\s*)(\()(\s*.*?\s*)(\))', ++ (r'(%new)(\s*)(\()(.*?)(\))', + bygroups(Keyword, Text, Keyword, String, Keyword)), + (r'(\s*)(%end)(\s*)', bygroups(Text, Keyword, Text)), + inherit, +diff --git a/pygments/lexers/templates.py b/pygments/lexers/templates.py +index 33c06c4c4..5c3346b4c 100644 +--- a/pygments/lexers/templates.py ++++ b/pygments/lexers/templates.py +@@ -1405,7 +1405,7 @@ class EvoqueLexer(RegexLexer): + # see doc for handling first name arg: /directives/evoque/ + # + minor inconsistency: the "name" in e.g. $overlay{name=site_base} + # should be using(PythonLexer), not passed out as String +- (r'(\$)(evoque|overlay)(\{(%)?)(\s*[#\w\-"\'.]+[^=,%}]+?)?' ++ (r'(\$)(evoque|overlay)(\{(%)?)(\s*[#\w\-"\'.]+)?' 
+ r'(.*?)((?(4)%)\})', + bygroups(Punctuation, Name.Builtin, Punctuation, None, + String, using(PythonLexer), Punctuation)), +diff --git a/pygments/lexers/varnish.py b/pygments/lexers/varnish.py +index 23653f7a1..9d358bd7c 100644 +--- a/pygments/lexers/varnish.py ++++ b/pygments/lexers/varnish.py +@@ -61,7 +61,7 @@ def analyse_text(text): + bygroups(Name.Attribute, Operator, Name.Variable.Global, Punctuation)), + (r'(\.probe)(\s*=\s*)(\{)', + bygroups(Name.Attribute, Operator, Punctuation), 'probe'), +- (r'(\.\w+\b)(\s*=\s*)([^;]*)(\s*;)', ++ (r'(\.\w+\b)(\s*=\s*)([^;\s]*)(\s*;)', + bygroups(Name.Attribute, Operator, using(this), Punctuation)), + (r'\{', Punctuation, '#push'), + (r'\}', Punctuation, '#pop'), diff --git a/SOURCES/bz1957765-gcp-vpc-move-vip-retry.patch b/SOURCES/bz1957765-gcp-vpc-move-vip-retry.patch new file mode 100644 index 0000000..2350f1a --- /dev/null +++ b/SOURCES/bz1957765-gcp-vpc-move-vip-retry.patch @@ -0,0 +1,102 @@ +From 3ae6d8f0a34d099945d9bf005ed45dbfe9452202 Mon Sep 17 00:00:00 2001 +From: kj1724 <78624900+kj1724@users.noreply.github.com> +Date: Wed, 28 Apr 2021 10:22:38 -0400 +Subject: [PATCH] gcp-vpc-move-vip.in: Adds retries + +If the cluster fails a monitoring event, it will try to restart the resource. If the resource agent makes an API/metadata call that fails at that time, the resource will be considered "failed", but in certain case also "unconfigured", which prevents further operations. + +These changes can help the agent recover on certain intermittent failures. 
+--- + heartbeat/gcp-vpc-move-vip.in | 62 ++++++++++++++++++++--------------- + 1 file changed, 35 insertions(+), 27 deletions(-) + +diff --git a/heartbeat/gcp-vpc-move-vip.in b/heartbeat/gcp-vpc-move-vip.in +index bbbd87b7a9..c411555110 100755 +--- a/heartbeat/gcp-vpc-move-vip.in ++++ b/heartbeat/gcp-vpc-move-vip.in +@@ -50,6 +50,8 @@ REMOVE = 1 + CONN = None + THIS_VM = None + ALIAS = None ++MAX_RETRIES = 3 ++RETRY_BACKOFF_SECS = 1 + METADATA_SERVER = 'http://metadata.google.internal/computeMetadata/v1/' + METADATA_HEADERS = {'Metadata-Flavor': 'Google'} + METADATA = \ +@@ -111,18 +113,37 @@ def get_metadata(metadata_key, params=None, timeout=None): + + Returns: + HTTP response from the GET request. +- +- Raises: +- urlerror.HTTPError: raises when the GET request fails. + """ +- timeout = timeout or 60 +- metadata_url = os.path.join(METADATA_SERVER, metadata_key) +- params = urlparse.urlencode(params or {}) +- url = '%s?%s' % (metadata_url, params) +- request = urlrequest.Request(url, headers=METADATA_HEADERS) +- request_opener = urlrequest.build_opener(urlrequest.ProxyHandler({})) +- return request_opener.open( +- request, timeout=timeout * 1.1).read().decode("utf-8") ++ for i in range(MAX_RETRIES): ++ try: ++ timeout = timeout or 60 ++ metadata_url = os.path.join(METADATA_SERVER, metadata_key) ++ params = urlparse.urlencode(params or {}) ++ url = '%s?%s' % (metadata_url, params) ++ request = urlrequest.Request(url, headers=METADATA_HEADERS) ++ request_opener = urlrequest.build_opener(urlrequest.ProxyHandler({})) ++ return request_opener.open( ++ request, timeout=timeout * 1.1).read().decode("utf-8") ++ except Exception as e: ++ logger.error('Couldn\'t get instance name, is this running inside GCE?: ' ++ + str(e)) ++ time.sleep(RETRY_BACKOFF_SECS * (i + 1)) ++ ++ # If the retries are exhausted we exit with a generic error. 
++ sys.exit(OCF_ERR_GENERIC) ++ ++ ++def create_api_connection(): ++ for i in range(MAX_RETRIES): ++ try: ++ return googleapiclient.discovery.build('compute', 'v1', ++ cache_discovery=False) ++ except Exception as e: ++ logger.error('Couldn\'t connect with google api: ' + str(e)) ++ time.sleep(RETRY_BACKOFF_SECS * (i + 1)) ++ ++ # If the retries are exhausted we exit with a generic error. ++ sys.exit(OCF_ERR_GENERIC) + + + def get_instance(project, zone, instance): +@@ -358,24 +379,11 @@ def gcp_alias_status(alias): + + def validate(): + global ALIAS +- global CONN + global THIS_VM ++ global CONN + +- # Populate global vars +- try: +- CONN = googleapiclient.discovery.build('compute', 'v1', +- cache_discovery=False) +- except Exception as e: +- logger.error('Couldn\'t connect with google api: ' + str(e)) +- sys.exit(OCF_ERR_CONFIGURED) +- +- try: +- THIS_VM = get_metadata('instance/name') +- except Exception as e: +- logger.error('Couldn\'t get instance name, is this running inside GCE?: ' +- + str(e)) +- sys.exit(OCF_ERR_CONFIGURED) +- ++ CONN = create_api_connection() ++ THIS_VM = get_metadata('instance/name') + ALIAS = os.environ.get('OCF_RESKEY_alias_ip') + if not ALIAS: + logger.error('Missing alias_ip parameter') diff --git a/SOURCES/bz1969968-lvmlockd-remove-with_cmirrord.patch b/SOURCES/bz1969968-lvmlockd-remove-with_cmirrord.patch new file mode 100644 index 0000000..f73d677 --- /dev/null +++ b/SOURCES/bz1969968-lvmlockd-remove-with_cmirrord.patch @@ -0,0 +1,82 @@ +diff --color -uNr a/heartbeat/lvmlockd b/heartbeat/lvmlockd +--- a/heartbeat/lvmlockd 2021-06-11 16:08:37.725598299 +0200 ++++ b/heartbeat/lvmlockd 2021-06-11 16:10:38.690910781 +0200 +@@ -59,14 +59,6 @@ + This agent manages the lvmlockd daemon + + +- +- +-Start with cmirrord (cluster mirror log daemon). 
+- +-activate cmirrord +- +- +- + + pid file + pid file +@@ -110,7 +102,6 @@ + : ${OCF_RESKEY_pidfile:="/run/lvmlockd.pid"} + + LOCKD="lvmlockd" +-CMIRRORD="cmirrord" + # 0.5s sleep each count + TIMEOUT_COUNT=20 + +@@ -150,12 +141,6 @@ + rc=$? + mirror_rc=$rc + +- if ocf_is_true $OCF_RESKEY_with_cmirrord; then +- pid=$(pgrep $CMIRRORD | head -n1) +- daemon_is_running "$pid" +- mirror_rc=$? +- fi +- + # If these ever don't match, return error to force recovery + if [ $mirror_rc -ne $rc ]; then + return $OCF_ERR_GENERIC +@@ -235,16 +220,6 @@ + return $OCF_SUCCESS + fi + +- if ocf_is_true $OCF_RESKEY_with_cmirrord; then +- ocf_log info "starting ${CMIRRORD}..." +- $CMIRRORD +- rc=$? +- if [ $rc -ne $OCF_SUCCESS ] ; then +- ocf_exit_reason "Failed to start ${CMIRRORD}, exit code: $rc" +- return $OCF_ERR_GENERIC +- fi +- fi +- + if [ ! -z "$OCF_RESKEY_socket_path" ] ; then + extras="$extras -s ${OCF_RESKEY_socket_path}" + fi +@@ -341,13 +316,8 @@ + pid=$(get_pid) + kill_stop $LOCKD $pid + +- if ocf_is_true $OCF_RESKEY_with_cmirrord; then +- pid=$(pgrep $CMIRRORD) +- kill_stop $CMIRRORD $pid +- fi +- + if silent_status ; then +- ocf_exit_reason "Failed to stop, ${LOCKD} or ${CMIRRORD} still running." ++ ocf_exit_reason "Failed to stop, ${LOCKD} still running." 
+ return $OCF_ERR_GENERIC + fi + +@@ -370,10 +340,6 @@ + check_binary pgrep + check_binary lvmlockctl + +- if ocf_is_true $OCF_RESKEY_with_cmirrord; then +- check_binary $CMIRRORD +- fi +- + return $OCF_SUCCESS + } + diff --git a/SOURCES/bz1972035-LVM-activate-fix-drop-in.patch b/SOURCES/bz1972035-LVM-activate-fix-drop-in.patch new file mode 100644 index 0000000..272b16a --- /dev/null +++ b/SOURCES/bz1972035-LVM-activate-fix-drop-in.patch @@ -0,0 +1,39 @@ +From 5729c79c6ab06f3dacf1fe8dafab9403e5560e34 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Thu, 20 May 2021 10:14:49 +0200 +Subject: [PATCH] LVM-activate: fix drop-in check to avoid re-creating drop-in + file when it already exists + +--- + heartbeat/LVM-activate | 17 +++++++---------- + 1 file changed, 7 insertions(+), 10 deletions(-) + +diff --git a/heartbeat/LVM-activate b/heartbeat/LVM-activate +index a8e40dce4..53223367e 100755 +--- a/heartbeat/LVM-activate ++++ b/heartbeat/LVM-activate +@@ -820,17 +820,14 @@ lvm_start() { + if systemd_is_running ; then + # Create drop-in to deactivate VG before stopping + # storage services during shutdown/reboot. +- after=$(systemctl show resource-agents-deps.target.d \ +- --property=After | cut -d'=' -f2) +- +- case "$after" in +- *" blk-availability.service "*) +- ;; +- *) +- systemd_drop_in "99-LVM-activate" "After" \ ++ systemctl show resource-agents-deps.target \ ++ --property=After | cut -d'=' -f2 | \ ++ grep -qE "(^|\s)blk-availability.service(\s|$)" ++ ++ if [ "$?" -ne 0 ]; then ++ systemd_drop_in "99-LVM-activate" "After" \ + "blk-availability.service" +- ;; +- esac ++ fi + + # If blk-availability isn't started, the "After=" + # directive has no effect. 
diff --git a/SOURCES/bz1972236-LVM-activate-fix-drop-in.patch b/SOURCES/bz1972236-LVM-activate-fix-drop-in.patch deleted file mode 100644 index 272b16a..0000000 --- a/SOURCES/bz1972236-LVM-activate-fix-drop-in.patch +++ /dev/null @@ -1,39 +0,0 @@ -From 5729c79c6ab06f3dacf1fe8dafab9403e5560e34 Mon Sep 17 00:00:00 2001 -From: Oyvind Albrigtsen -Date: Thu, 20 May 2021 10:14:49 +0200 -Subject: [PATCH] LVM-activate: fix drop-in check to avoid re-creating drop-in - file when it already exists - ---- - heartbeat/LVM-activate | 17 +++++++---------- - 1 file changed, 7 insertions(+), 10 deletions(-) - -diff --git a/heartbeat/LVM-activate b/heartbeat/LVM-activate -index a8e40dce4..53223367e 100755 ---- a/heartbeat/LVM-activate -+++ b/heartbeat/LVM-activate -@@ -820,17 +820,14 @@ lvm_start() { - if systemd_is_running ; then - # Create drop-in to deactivate VG before stopping - # storage services during shutdown/reboot. -- after=$(systemctl show resource-agents-deps.target.d \ -- --property=After | cut -d'=' -f2) -- -- case "$after" in -- *" blk-availability.service "*) -- ;; -- *) -- systemd_drop_in "99-LVM-activate" "After" \ -+ systemctl show resource-agents-deps.target \ -+ --property=After | cut -d'=' -f2 | \ -+ grep -qE "(^|\s)blk-availability.service(\s|$)" -+ -+ if [ "$?" -ne 0 ]; then -+ systemd_drop_in "99-LVM-activate" "After" \ - "blk-availability.service" -- ;; -- esac -+ fi - - # If blk-availability isn't started, the "After=" - # directive has no effect. 
diff --git a/SOURCES/bz1972743-podman-fix-container-creation-race.patch b/SOURCES/bz1972743-podman-fix-container-creation-race.patch new file mode 100644 index 0000000..561e0a2 --- /dev/null +++ b/SOURCES/bz1972743-podman-fix-container-creation-race.patch @@ -0,0 +1,74 @@ +From 7850aea1600389beb16c7aad40bba1b76ae694c4 Mon Sep 17 00:00:00 2001 +From: Damien Ciabrini +Date: Tue, 15 Jun 2021 20:03:20 +0200 +Subject: [PATCH] podman: workaround race during container creation + +podman and OCI runtime have a race that sometimes causes +a container to fail to be created and run [1] if the +cgroup to be used is not available yet. When that happens, +try to recreate it until it succeeds or the start +timeout is reached. + +[1] https://bugzilla.redhat.com/show_bug.cgi?id=1972209 +--- + heartbeat/podman | 32 ++++++++++++++++++++++++++++++-- + 1 file changed, 30 insertions(+), 2 deletions(-) + +diff --git a/heartbeat/podman b/heartbeat/podman +index 5b707f3f5..034dfff76 100755 +--- a/heartbeat/podman ++++ b/heartbeat/podman +@@ -358,8 +358,18 @@ run_new_container() + local rc + + ocf_log info "running container $CONTAINER for the first time" +- ocf_run podman run $opts $image $cmd ++ out=$(podman run $opts $image $cmd 2>&1) + rc=$? ++ ++ if [ -n "$out" ]; then ++ out="$(echo "$out" | tr -s ' \t\r\n' ' ')" ++ if [ $rc -eq 0 ]; then ++ ocf_log info "$out" ++ else ++ ocf_log err "$out" ++ fi ++ fi ++ + if [ $rc -eq 125 ]; then + # If an internal podman error occurred, it might be because + # the internal storage layer still references an old container +@@ -370,6 +380,24 @@ run_new_container() + ocf_run podman rm --storage $CONTAINER + ocf_run podman run $opts $image $cmd + rc=$? ++ elif [ $rc -eq 127 ]; then ++ # rhbz#1972209: podman 3.0.x seems to be hit by a race ++ # where the cgroup is not yet set up properly when the OCI ++ # runtime configures the container. 
If that happens, recreate ++ # the container as long as we get the same error code or ++ # until start timeout preempts us. ++ while [ $rc -eq 127 ] && (echo "$out" | grep -q "cgroup.*scope not found") ; do ++ ocf_log warn "Internal podman error while assigning cgroup. Retrying." ++ # Arbitrary sleep to prevent consuming all CPU while looping ++ sleep 1 ++ podman rm -f "$CONTAINER" ++ out=$(podman run $opts $image $cmd 2>&1) ++ rc=$? ++ done ++ # Log the created container ID if it succeeded ++ if [ $rc -eq 0 ]; then ++ ocf_log info "$out" ++ fi + fi + + return $rc +@@ -422,7 +450,7 @@ podman_start() + fi + + if [ $rc -ne 0 ]; then +- ocf_exit_reason "podman failed to launch container" ++ ocf_exit_reason "podman failed to launch container (rc: $rc)" + return $OCF_ERR_GENERIC + fi + diff --git a/SOURCES/bz1973035-podman-fix-container-creation-race.patch b/SOURCES/bz1973035-podman-fix-container-creation-race.patch deleted file mode 100644 index 561e0a2..0000000 --- a/SOURCES/bz1973035-podman-fix-container-creation-race.patch +++ /dev/null @@ -1,74 +0,0 @@ -From 7850aea1600389beb16c7aad40bba1b76ae694c4 Mon Sep 17 00:00:00 2001 -From: Damien Ciabrini -Date: Tue, 15 Jun 2021 20:03:20 +0200 -Subject: [PATCH] podman: workaround race during container creation - -podman and OCI runtime have a race that sometimes causes -a container to fail to be created and run [1] if the -cgroup to be used is not available yet. When that happens, -try to recreate it until it succeeds or the start -timeout is reached. 
- -[1] https://bugzilla.redhat.com/show_bug.cgi?id=1972209 ---- - heartbeat/podman | 32 ++++++++++++++++++++++++++++++-- - 1 file changed, 30 insertions(+), 2 deletions(-) - -diff --git a/heartbeat/podman b/heartbeat/podman -index 5b707f3f5..034dfff76 100755 ---- a/heartbeat/podman -+++ b/heartbeat/podman -@@ -358,8 +358,18 @@ run_new_container() - local rc - - ocf_log info "running container $CONTAINER for the first time" -- ocf_run podman run $opts $image $cmd -+ out=$(podman run $opts $image $cmd 2>&1) - rc=$? -+ -+ if [ -n "$out" ]; then -+ out="$(echo "$out" | tr -s ' \t\r\n' ' ')" -+ if [ $rc -eq 0 ]; then -+ ocf_log info "$out" -+ else -+ ocf_log err "$out" -+ fi -+ fi -+ - if [ $rc -eq 125 ]; then - # If an internal podman error occurred, it might be because - # the internal storage layer still references an old container -@@ -370,6 +380,24 @@ run_new_container() - ocf_run podman rm --storage $CONTAINER - ocf_run podman run $opts $image $cmd - rc=$? -+ elif [ $rc -eq 127 ]; then -+ # rhbz#1972209: podman 3.0.x seems to be hit by a race -+ # where the cgroup is not yet set up properly when the OCI -+ # runtime configures the container. If that happens, recreate -+ # the container as long as we get the same error code or -+ # until start timeout preempts us. -+ while [ $rc -eq 127 ] && (echo "$out" | grep -q "cgroup.*scope not found") ; do -+ ocf_log warn "Internal podman error while assigning cgroup. Retrying." -+ # Arbitrary sleep to prevent consuming all CPU while looping -+ sleep 1 -+ podman rm -f "$CONTAINER" -+ out=$(podman run $opts $image $cmd 2>&1) -+ rc=$? 
-+ done -+ # Log the created container ID if it succeeded -+ if [ $rc -eq 0 ]; then -+ ocf_log info "$out" -+ fi - fi - - return $rc -@@ -422,7 +450,7 @@ podman_start() - fi - - if [ $rc -ne 0 ]; then -- ocf_exit_reason "podman failed to launch container" -+ ocf_exit_reason "podman failed to launch container (rc: $rc)" - return $OCF_ERR_GENERIC - fi - diff --git a/SOURCES/bz1986868-podman-return-not-running-probe.patch b/SOURCES/bz1986868-podman-return-not-running-probe.patch deleted file mode 100644 index b8420f5..0000000 --- a/SOURCES/bz1986868-podman-return-not-running-probe.patch +++ /dev/null @@ -1,42 +0,0 @@ -From 6877b20a83cb691884996bf77385259388fdebb2 Mon Sep 17 00:00:00 2001 -From: Oyvind Albrigtsen -Date: Wed, 3 Mar 2021 17:06:12 +0100 -Subject: [PATCH] podman: return OCF_NOT_RUNNING when monitor cmd fails (not - running) - ---- - heartbeat/podman | 21 +++++++++++++-------- - 1 file changed, 13 insertions(+), 8 deletions(-) - -diff --git a/heartbeat/podman b/heartbeat/podman -index 82ea14624..5b707f3f5 100755 ---- a/heartbeat/podman -+++ b/heartbeat/podman -@@ -204,14 +204,19 @@ monitor_cmd_exec() - # 125: no container with name or ID ${CONTAINER} found - # 126: container state improper (not running) - # 127: any other error -- if [ $rc -eq 125 ] || [ $rc -eq 126 ]; then -- rc=$OCF_NOT_RUNNING -- elif [ $rc -ne 0 ]; then -- ocf_exit_reason "monitor cmd failed (rc=$rc), output: $out" -- rc=$OCF_ERR_GENERIC -- else -- ocf_log debug "monitor cmd passed: exit code = $rc" -- fi -+ # 255: podman 2+: container not running -+ case "$rc" in -+ 125|126|255) -+ rc=$OCF_NOT_RUNNING -+ ;; -+ 0) -+ ocf_log debug "monitor cmd passed: exit code = $rc" -+ ;; -+ *) -+ ocf_exit_reason "monitor cmd failed (rc=$rc), output: $out" -+ rc=$OCF_ERR_GENERIC -+ ;; -+ esac - - return $rc - } diff --git a/SPECS/resource-agents.spec b/SPECS/resource-agents.spec index 3ee8b95..1b9d9c6 100644 --- a/SPECS/resource-agents.spec +++ b/SPECS/resource-agents.spec @@ -70,7 +70,7 @@ Name: 
resource-agents Summary: Open Source HA Reusable Cluster Resource Scripts Version: 4.1.1 -Release: 90%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}.7 +Release: 98%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} License: GPLv2+ and LGPLv2+ URL: https://github.com/ClusterLabs/resource-agents %if 0%{?fedora} || 0%{?centos_version} || 0%{?rhel} @@ -268,10 +268,21 @@ Patch176: bz1913932-3-gcp-vpc-move-route-make-vpc_network-optional.patch Patch177: bz1937142-azure-lb-redirect-to-avoid-nc-dying-EPIPE-error.patch Patch178: bz1940363-1-galera-redis-use-output-as.patch Patch179: bz1940363-2-bundle-disable-validate-with.patch -Patch180: bz1943093-aws-vpc-move-ip-add-ENI-lookup.patch -Patch181: bz1973035-podman-fix-container-creation-race.patch -Patch182: bz1986868-podman-return-not-running-probe.patch -Patch183: bz1972236-LVM-activate-fix-drop-in.patch +Patch180: bz1891883-ethmonitor-vlan-fix.patch +Patch181: bz1902045-iface-vlan-vlan-not-unique.patch +Patch182: bz1924363-nfsserver-error-check-unmount.patch +Patch183: bz1932863-VirtualDomain-fix-pid-status.patch +Patch184: bz1920698-podman-return-not-running-probe.patch +Patch185: bz1939992-awsvip-dont-partially-match-IPs.patch +Patch186: bz1940094-aws-agents-dont-spam-logs.patch +Patch187: bz1939281-aws-vpc-move-ip-add-ENI-lookup.patch +Patch188: bz1934651-db2-add-PRIMARY-REMOTE_CATCHUP_PENDING-CONNECTED.patch +Patch189: bz1872754-pgsqlms-new-ra.patch +Patch190: bz1957765-gcp-vpc-move-vip-retry.patch +Patch191: bz1969968-lvmlockd-remove-with_cmirrord.patch +Patch192: bz1972035-LVM-activate-fix-drop-in.patch +Patch193: bz1972743-podman-fix-container-creation-race.patch +Patch194: bz1509319-storage-mon-new-ra.patch # bundle patches Patch1000: 7-gcp-bundled.patch @@ -282,6 +293,8 @@ Patch1004: bz1691456-gcloud-dont-detect-python2.patch Patch1005: aliyun-vpc-move-ip-4-bundled.patch Patch1006: python3-syntax-fixes.patch Patch1007: 
aliyuncli-python3-fixes.patch +Patch1008: bz1935422-python-pygments-fix-CVE-2021-20270.patch +Patch1009: bz1943464-python-pygments-fix-CVE-2021-27291.patch Obsoletes: heartbeat-resources <= %{version} Provides: heartbeat-resources = %{version} @@ -421,6 +434,21 @@ The Google Cloud Platform resource agents allows Google Cloud Platform instances to be managed in a cluster environment. %endif +%package paf +License: PostgreSQL +Summary: PostgreSQL Automatic Failover (PAF) resource agent +%if 0%{?fedora} || 0%{?centos_version} || 0%{?rhel} +Group: System Environment/Base +%else +Group: Productivity/Clustering/HA +%endif +Requires: %{name} = %{version}-%{release} +Requires: perl-interpreter + +%description paf +PostgreSQL Automatic Failover (PAF) resource agents allows PostgreSQL +databases to be managed in a cluster environment. + %prep %if 0%{?suse_version} == 0 && 0%{?fedora} == 0 && 0%{?centos_version} == 0 && 0%{?rhel} == 0 %{error:Unable to determine the distribution/version. This is generally caused by missing /etc/rpm/macros.dist. 
Please install the correct build packages or define the required macros manually.} @@ -607,13 +635,25 @@ exit 1 %patch177 -p1 %patch178 -p1 %patch179 -p1 -%patch180 -p1 -F2 +%patch180 -p1 %patch181 -p1 %patch182 -p1 %patch183 -p1 +%patch184 -p1 +%patch185 -p1 +%patch186 -p1 +%patch187 -p1 -F2 +%patch188 -p1 +%patch189 -p1 +%patch190 -p1 +%patch191 -p1 +%patch192 -p1 +%patch193 -p1 +%patch194 -p1 -F2 chmod 755 heartbeat/nova-compute-wait chmod 755 heartbeat/NovaEvacuate +chmod 755 heartbeat/pgsqlms # bundles mkdir -p %{bundled_lib_dir}/gcp @@ -752,6 +792,12 @@ cp %{aliyuncli_dir}/LICENSE %{aliyuncli}_LICENSE # aliyun Python 3 fixes %patch1006 -p1 %patch1007 -p1 + +# fix CVE's in python-pygments +pushd %{googlecloudsdk_dir}/lib/third_party +%patch1008 -p1 -F2 +%patch1009 -p1 -F2 +popd %endif %build @@ -981,6 +1027,9 @@ rm -rf %{buildroot} %exclude /usr/lib/ocf/resource.d/heartbeat/gcp* %exclude %{_mandir}/man7/*gcp* %exclude /usr/lib/%{name}/%{bundled_lib_dir} +%exclude /usr/lib/ocf/resource.d/heartbeat/pgsqlms +%exclude %{_mandir}/man7/*pgsqlms* +%exclude %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm ### # Moved to separate packages @@ -1173,37 +1222,66 @@ ccs_update_schema > /dev/null 2>&1 ||: /usr/lib/%{name}/%{bundled_lib_dir}/gcp %endif +%files paf +%doc paf_README.md +%license paf_LICENSE +%defattr(-,root,root) +%{_usr}/lib/ocf/resource.d/heartbeat/pgsqlms +%{_mandir}/man7/*pgsqlms* +%{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm + %changelog -* Tue Aug 3 2021 Oyvind Albrigtsen - 4.1.1-90.7 +* Mon Aug 30 2021 Oyvind Albrigtsen - 4.1.1-98 +- storage-mon: new resource agent + + Resolves: rhbz#1509319 + +* Thu Jun 17 2021 Oyvind Albrigtsen - 4.1.1-97 +- podman: fix possible race during container creation + + Resolves: rhbz#1972743 + +* Tue Jun 15 2021 Oyvind Albrigtsen - 4.1.1-96 - LVM-activate: fix drop-in check to avoid re-creating drop-in - Resolves: rhbz#1972236 + Resolves: rhbz#1972035 -* Wed Jul 28 2021 Oyvind Albrigtsen - 4.1.1-90.6 -- podman: return NOT_RUNNING when 
monitor cmd fails +* Fri Jun 11 2021 Oyvind Albrigtsen - 4.1.1-95 +- lvmlockd: remove cmirrord support, as cmirrord is incompatible w/lvmlockd - Resolves: rhbz#1986868 + Resolves: rhbz#1969968 -* Thu Jun 17 2021 Oyvind Albrigtsen - 4.1.1-90.5 -- podman: fix possible race during container creation +* Wed May 12 2021 Oyvind Albrigtsen - 4.1.1-94 +- gcp-vpc-move-vip: add retry logic + + Resolves: rhbz#1957765 + +* Wed Apr 28 2021 Oyvind Albrigtsen - 4.1.1-93 +- db2: add PRIMARY/REMOTE_CATCHUP_PENDING/CONNECTED status to promote-check +- pgsqlms: new resource agent +- python-pygments: fix CVE-2021-27291 and CVE-2021-20270 - Resolves: rhbz#1973035 + Resolves: rhbz#1872754, rhbz#1934651, rhbz#1935422, rhbz#1943464 -* Wed Apr 14 2021 Oyvind Albrigtsen - 4.1.1-90.2 +* Thu Apr 8 2021 Oyvind Albrigtsen - 4.1.1-91 +- ethmonitor: fix vlan regex +- iface-vlan: make vlan parameter not unique +- nfsserver: error-check unmount +- VirtualDomain: fix pid status regex +- podman: return NOT_RUNNING when monitor cmd fails +- awsvip: dont partially match similar IPs during +- aws agents: dont spam log files - aws-vpc-move-ip: add ENI lookup - Resolves: rhbz#1943093 + Resolves: rhbz#1891883, rhbz#1902045, rhbz#1924363, rhbz#1932863 + Resolves: rhbz#1920698, rhbz#1939992, rhbz#1940094, rhbz#1939281 * Mon Mar 22 2021 Oyvind Albrigtsen - 4.1.1-90 - galera/rabbitmq-cluster/redis: run crm_mon without validation when - running in bundle - - Resolves: rhbz#1940363 + running in bundle (1940363) * Thu Mar 11 2021 Oyvind Albrigtsen - 4.1.1-89 -- azure-lb: redirect to avoid nc dying with EPIPE error - - Resolves: rhbz#1937142 +- azure-lb: redirect to avoid nc dying with EPIPE error (1937142) * Thu Feb 25 2021 Oyvind Albrigtsen - 4.1.1-87 - gcp-vpc-move-route, gcp-vpc-move-vip: add project parameter and