diff --git a/SOURCES/bz1059988-db2-support.patch b/SOURCES/bz1059988-db2-support.patch new file mode 100644 index 0000000..f6561dc --- /dev/null +++ b/SOURCES/bz1059988-db2-support.patch @@ -0,0 +1,154 @@ +From c954c6470fe61c73396b45ca75310d146997f81b Mon Sep 17 00:00:00 2001 +From: David Vossel +Date: Wed, 29 Apr 2015 11:16:18 -0500 +Subject: [PATCH 5/6] db2 support + +--- + heartbeat/db2 | 60 +++++++++++++++++++++++++++++++++++++++++++++-------------- + 1 file changed, 46 insertions(+), 14 deletions(-) + +diff --git a/heartbeat/db2 b/heartbeat/db2 +index f9db2f8..fed2d86 100755 +--- a/heartbeat/db2 ++++ b/heartbeat/db2 +@@ -132,6 +132,9 @@ END + db2_validate() { + local db2home db2sql db2instance + ++ # db2 uses korn shell ++ check_binary "ksh" ++ + # check required instance vars + if [ -z "$OCF_RESKEY_instance" ] + then +@@ -208,6 +211,14 @@ db2_validate() { + return $OCF_SUCCESS + } + ++master_score() ++{ ++ if ! have_binary "crm_master"; then ++ return ++ fi ++ ++ crm_master $* ++} + + # + # Run the given command as db2 instance user +@@ -380,8 +391,17 @@ db2_check_config_compatibility() { + # + db2_start() { + local output start_cmd db ++ local start_opts="dbpartitionnum $db2node" ++ ++ # If we detect that db partitions are not in use, and no ++ # partition is explicitly specified, activate without ++ # partition information. This allows db2 instances without ++ # partition support to be managed. ++ if [ -z "$OCF_RESKEY_dbpartitionnum" ] && ! [ -a "$db2sql/db2nodes.cfg" ]; then ++ start_opts="" ++ fi + +- if output=$(runasdb2 db2start dbpartitionnum $db2node) ++ if output=$(runasdb2 db2start $start_opts) + then + ocf_log info "DB2 instance $instance($db2node) started: $output" + else +@@ -473,10 +493,15 @@ db2_start() { + # + db2_stop_bg() { + local rc output ++ local stop_opts="dbpartitionnum $db2node" + + rc=$OCF_SUCCESS + +- if output=$(runasdb2 db2stop force dbpartitionnum $db2node) ++ if [ -z "$OCF_RESKEY_dbpartitionnum" ] && ! 
[ -a "$db2sql/db2nodes.cfg" ]; then ++ stop_opts="" ++ fi ++ ++ if output=$(runasdb2 db2stop force $stop_opts) + then + ocf_log info "DB2 instance $instance($db2node) stopped: $output" + else +@@ -502,13 +527,13 @@ db2_stop() { + local stop_timeout grace_timeout stop_bg_pid i must_kill + + # remove master score +- crm_master -D -l reboot ++ master_score -D -l reboot + + # be very early here in order to avoid stale data + rm -f $STATE_FILE + +- if ! db2_instance_status +- then ++ db2_instance_status ++ if [ $? -eq $OCF_NOT_RUNNING ]; then + ocf_log info "DB2 instance $instance already stopped" + return $OCF_SUCCESS + fi +@@ -585,7 +610,12 @@ db2_instance_status() { + local pscount + + pscount=$(runasdb2 $db2bin/db2nps $db2node | cut -c9- | grep ' db2[^ ]' | wc -l) +- test $pscount -ge 4 ++ if [ $pscount -ge 4 ]; then ++ return $OCF_SUCCESS; ++ elif [ $pscount -ge 1 ]; then ++ return $OCF_ERR_GENERIC ++ fi ++ return $OCF_NOT_RUNNING + } + + # +@@ -626,12 +656,14 @@ db2_hadr_status() { + # + db2_monitor() { + local CMD output hadr db ++ local rc + +- if ! db2_instance_status +- then ++ db2_instance_status ++ rc=$? 
++ if [ $rc -ne $OCF_SUCCESS ]; then + # instance is dead remove master score +- crm_master -D -l reboot +- exit $OCF_NOT_RUNNING ++ master_score -D -l reboot ++ exit $rc + fi + + [ $db2node = 0 ] || return 0 +@@ -667,22 +699,22 @@ db2_monitor() { + ocf_log err "DB2 message: $output" + + # dead primary, remove master score +- crm_master -D -l reboot ++ master_score -D -l reboot + return $OCF_ERR_GENERIC + esac + fi + + ocf_log debug "DB2 database $instance($db2node)/$db appears to be working" +- ocf_is_ms && crm_master -v 10000 -l reboot ++ ocf_is_ms && master_score -v 10000 -l reboot + ;; + + Standby/*Peer) +- crm_master -v 8000 -l reboot ++ master_score -v 8000 -l reboot + ;; + + Standby/*) + ocf_log warn "DB2 database $instance($db2node)/$db in status $hadr can never be promoted" +- crm_master -D -l reboot ++ master_score -D -l reboot + ;; + + *) +-- +1.8.4.2 + diff --git a/SOURCES/bz1077888-ctdb-updates.patch b/SOURCES/bz1077888-ctdb-updates.patch new file mode 100644 index 0000000..13bdcc3 --- /dev/null +++ b/SOURCES/bz1077888-ctdb-updates.patch @@ -0,0 +1,159 @@ +From f681e6798d3a5ead5a0e077d6e73343b266ef56f Mon Sep 17 00:00:00 2001 +From: David Vossel +Date: Wed, 29 Apr 2015 11:18:25 -0500 +Subject: [PATCH 6/6] CTDB fixes + +--- + heartbeat/CTDB | 61 +++++++++++++++++++++++++++++++++++++++++++++------------- + 1 file changed, 48 insertions(+), 13 deletions(-) + +diff --git a/heartbeat/CTDB b/heartbeat/CTDB +index d1e8d03..1cf9d8c 100755 +--- a/heartbeat/CTDB ++++ b/heartbeat/CTDB +@@ -72,6 +72,19 @@ + ####################################################################### + # Default parameter values: + ++# Some distro's ctdb package stores the persistent db in /var/lib/ctdb, ++# others store in /var/ctdb. This attempts to detect the correct default ++# directory. ++var_prefix="/var/lib/ctdb" ++if [ ! -d "$var_prefix" ] && [ -d "/var/ctdb" ]; then ++ var_prefix="/var/ctdb" ++fi ++ ++run_prefix="/run" ++if [ ! 
-d "$run_prefix" ] && [ -d "/var/run" ]; then ++ run_prefix="/var/run" ++fi ++ + : ${OCF_RESKEY_ctdb_manages_samba:=no} + : ${OCF_RESKEY_ctdb_manages_winbind:=no} + : ${OCF_RESKEY_ctdb_service_smb:=""} +@@ -84,9 +97,10 @@ + : ${OCF_RESKEY_ctdb_config_dir:=/etc/ctdb} + : ${OCF_RESKEY_ctdb_binary:=/usr/bin/ctdb} + : ${OCF_RESKEY_ctdbd_binary:=/usr/sbin/ctdbd} +-: ${OCF_RESKEY_ctdb_socket:=/var/lib/ctdb/ctdb.socket} +-: ${OCF_RESKEY_ctdb_dbdir:=/var/lib/ctdb} ++: ${OCF_RESKEY_ctdb_dbdir:=${var_prefix}} + : ${OCF_RESKEY_ctdb_logfile:=/var/log/ctdb/log.ctdb} ++: ${OCF_RESKEY_ctdb_rundir:=${run_prefix}/ctdb} ++: ${OCF_RESKEY_ctdb_socket:=${OCF_RESKEY_ctdb_rundir}/ctdbd.socket} + : ${OCF_RESKEY_ctdb_debuglevel:=2} + + : ${OCF_RESKEY_smb_conf:=/etc/samba/smb.conf} +@@ -104,12 +118,13 @@ meta_data() { + + + This resource agent manages CTDB, allowing one to use Clustered Samba in a +-Linux-HA/Pacemaker cluster. You need a shared filesystem (e.g. OCFS2) on ++Linux-HA/Pacemaker cluster. You need a shared filesystem (e.g. OCFS2 or GFS2) on + which the CTDB lock will be stored. Create /etc/ctdb/nodes containing a list + of private IP addresses of each node in the cluster, then configure this RA +-as a clone. To have CTDB manage Samba, set ctdb_manages_samba="yes". +-Note that this option will be deprecated in future, in favour of configuring +-a separate Samba resource. ++as a clone. This agent expects the samba and windbind resources ++to be managed outside of CTDB's control as a separate set of resources controlled ++by the cluster manager. The optional support for enabling CTDB management of these ++daemons will be depreciated. + + For more information see http://linux-ha.org/wiki/CTDB_(resource_agent) + +@@ -235,7 +250,7 @@ Full path to the domain socket that ctdbd will create, used for + local clients to attach and communicate with the ctdb daemon. + + CTDB socket location +- ++ + + + +@@ -244,7 +259,7 @@ The directory to put the local CTDB database files in. 
+ Persistent database files will be put in ctdb_dbdir/persistent. + + CTDB database directory +- ++ + + + +@@ -256,6 +271,15 @@ value "syslog". + + + ++ ++ ++Full path to ctdb runtime directory, used for storage of socket ++lock state. ++ ++CTDB runtime directory location ++ ++ ++ + + + What debug level to run at (0-10). Higher means more verbose. +@@ -538,7 +562,16 @@ ctdb_start() { + + # Use logfile by default, or syslog if asked for + local log_option="--logfile=$OCF_RESKEY_ctdb_logfile" +- [ "$OCF_RESKEY_ctdb_logfile" = "syslog" ] && log_option="--syslog" ++ if [ "$OCF_RESKEY_ctdb_logfile" = "syslog" ]; then ++ log_option="--syslog" ++ elif [ ! -d "$(dirname $OCF_RESKEY_ctdb_logfile)" ]; then ++ # ensure the logfile's directory exists, otherwise ctdb will fail to start ++ mkdir -p $(dirname $OCF_RESKEY_ctdb_logfile) ++ fi ++ ++ # ensure ctdb's rundir exists, otherwise it will fail to start ++ mkdir -p $OCF_RESKEY_ctdb_rundir 2>/dev/null ++ + # public addresses file (should not be present, but need to set for correctness if it is) + local pub_addr_option="" + [ -f "${OCF_RESKEY_ctdb_config_dir}/public_addresses" ] && \ +@@ -562,7 +595,7 @@ ctdb_start() { + if [ $? -ne 0 ]; then + # cleanup smb.conf + cleanup_smb_conf +- ++ + ocf_exit_reason "Failed to execute $OCF_RESKEY_ctdbd_binary." + return $OCF_ERR_GENERIC + else +@@ -589,10 +622,10 @@ ctdb_start() { + fi + done + fi +- ++ + # ctdbd will (or can) actually still be running at this point, so kill it + ctdb_stop +- ++ + ocf_exit_reason "Timeout waiting for CTDB to stabilize" + return $OCF_ERR_GENERIC + } +@@ -601,7 +634,7 @@ ctdb_start() { + ctdb_stop() { + # Do nothing if already stopped + pkill -0 -f $OCF_RESKEY_ctdbd_binary || return $OCF_SUCCESS +- ++ + # Tell it to die nicely + invoke_ctdb shutdown >/dev/null 2>&1 + rv=$? +@@ -645,6 +678,8 @@ ctdb_monitor() { + if [ $? 
-ne 0 ]; then + if echo $status | grep -qs 'Connection refused'; then + return $OCF_NOT_RUNNING ++ elif echo $status | grep -qs 'No such file or directory'; then ++ return $OCF_NOT_RUNNING + else + ocf_exit_reason "CTDB status call failed: $status" + return $OCF_ERR_GENERIC +-- +1.8.4.2 + diff --git a/SOURCES/bz1160365-iface-vlan.patch.patch b/SOURCES/bz1160365-iface-vlan.patch.patch new file mode 100644 index 0000000..f82d06c --- /dev/null +++ b/SOURCES/bz1160365-iface-vlan.patch.patch @@ -0,0 +1,520 @@ +From 0305c97abc49d0f7a93b3602a745805f7e8776d3 Mon Sep 17 00:00:00 2001 +From: David Vossel +Date: Thu, 25 Jun 2015 16:23:45 -0500 +Subject: [PATCH 1/3] bz1160365-iface-vlan.patch + +--- + doc/man/Makefile.am | 1 + + heartbeat/Makefile.am | 1 + + heartbeat/iface-vlan | 475 ++++++++++++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 477 insertions(+) + create mode 100755 heartbeat/iface-vlan + +diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am +index 653e818..091ec24 100644 +--- a/doc/man/Makefile.am ++++ b/doc/man/Makefile.am +@@ -107,6 +107,7 @@ man_MANS = ocf_heartbeat_AoEtarget.7 \ + ocf_heartbeat_iSCSILogicalUnit.7 \ + ocf_heartbeat_iSCSITarget.7 \ + ocf_heartbeat_ids.7 \ ++ ocf_heartbeat_iface-vlan.7 \ + ocf_heartbeat_iscsi.7 \ + ocf_heartbeat_jboss.7 \ + ocf_heartbeat_lxc.7 \ +diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am +index e4ed4fd..6df4080 100644 +--- a/heartbeat/Makefile.am ++++ b/heartbeat/Makefile.am +@@ -76,6 +76,7 @@ ocf_SCRIPTS = ClusterMon \ + fio \ + galera \ + ids \ ++ iface-vlan \ + iscsi \ + ICP \ + IPsrcaddr \ +diff --git a/heartbeat/iface-vlan b/heartbeat/iface-vlan +new file mode 100755 +index 0000000..bc8583c +--- /dev/null ++++ b/heartbeat/iface-vlan +@@ -0,0 +1,475 @@ ++#!/bin/sh ++# ++# OCF Resource Agent compliant iface-vlan script. ++# ++# Implements network VLAN interface management ++# ++# Copyright (C) 2013 Red Hat, Inc. All rights reserved. ++# Author: Fabio M. 
Di Nitto ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of version 2 of the GNU General Public License as ++# published by the Free Software Foundation. ++# ++# This program is distributed in the hope that it would be useful, but ++# WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. ++# ++# Further, this software is distributed without any warranty that it is ++# free of the rightful claim of any third person regarding infringement ++# or the like. Any license provided herein, whether implied or ++# otherwise, applies only to this software file. Patent licenses, if ++# any, provided herein do not apply to combinations of this program with ++# other software, or any other product whatsoever. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program; if not, write the Free Software Foundation, ++# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. ++# ++# ++ ++# TODO: ++# ++# OCF parameters are as below ++# OCF_RESKEY_vlan_interface ++# OCF_RESKEY_vlan_id ++# OCF_RESKEY_vlan_name ++# OCF_RESKEY_vlan_reorder_hdr ++# OCF_RESKEY_vlan_gvrp ++# OCF_RESKEY_vlan_mvrp ++# OCF_RESKEY_vlan_loose_binding ++# ++ ++####################################################################### ++# Initialization: ++ ++: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} ++. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ++ ++# Defaults ++OCF_RESKEY_vlan_reorder_hdr_default=1 ++OCF_RESKEY_vlan_gvrp_default=0 ++OCF_RESKEY_vlan_mvrp_default=0 ++OCF_RESKEY_vlan_loose_binding_default=0 ++OCF_RESKEY_vlan_name_default=${OCF_RESKEY_vlan_interface}.${OCF_RESKEY_vlan_id} ++ ++: ${OCF_RESKEY_vlan_name=${OCF_RESKEY_vlan_name_default}} ++: ${OCF_RESKEY_vlan_reorder_hdr=${OCF_RESKEY_vlan_reorder_hdr_default}} ++: ${OCF_RESKEY_vlan_gvrp=${OCF_RESKEY_vlan_gvrp_default}} ++ ++# don't set defaults for mvrp or loose binding since both ++# are rather new kernel features and they might not be supported ++#: ${OCF_RESKEY_vlan_mvrp=${OCF_RESKEY_vlan_mvrp_default}} ++#: ${OCF_RESKEY_vlan_loose_binding=${OCF_RESKEY_vlan_loose_binding_default}} ++ ++####################################################################### ++ ++vlan_usage() { ++ cat < ++ ++ ++ 1.0 ++ ++ ++ This resource manages VLAN network interfaces. ++ It can add, remove, configure VLANs. ++ ++ ++ ++ Manages VLAN network interfaces. ++ ++ ++ ++ ++ ++ Define the interface where VLAN should be attached. ++ ++ ++ Network interface. ++ ++ ++ ++ ++ ++ ++ Define the VLAN ID. It has to be a value between 0 and 4094. ++ ++ ++ Define the VLAN ID. ++ ++ ++ ++ ++ ++ ++ Define the name of the VLAN interface (max 15 charaters). ++ ++ ++ Name of the VLAN. ++ ++ ++ ++ ++ ++ ++ Enable or disable header reordering. ++ ++ ++ Enable or disable header reordering. ++ ++ ++ ++ ++ ++ ++ Enable or disable GARP VLAN registration protocol. ++ ++ ++ Enable or disable gvrp. ++ ++ ++ ++ ++ ++ ++ Enable or disable Multiple VLAN Registration Protocol. ++ Please note that most distributions do not ship a version of iproute2 ++ that supports mvrp yet, even if the kernel has support for it. ++ Check output of $IPADDR2 link add type vlan --help in the FLAG ++ section to verify if mvrp support is available. ++ ++ ++ Enable or disable mvrp. ++ ++ ++ ++ ++ ++ ++ Enable or disable VLAN loose bind. 
By default the VLAN interface ++ admin status (UP/DOWN) follows the underneath inteface status. ++ Enabling loose bind allows the VLAN to disconnect from the ++ interface status. Be very careful that enabling loose binding ++ could invalidate this agent monitor operations. ++ Please note that most distributions do not ship a version of iproute2 ++ that supports loose_binding yet, even if the kernel has support for it. ++ Check output of $IPADDR2 link add type vlan --help in the FLAG ++ section to verify if loose_binding support is available. ++ ++ ++ Enable or disable loose binding. ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++END ++} ++ ++# check if the interface is admin up/down ++ ++iface_is_up() { ++ if ! $IP2UTIL -o link show $1 | \ ++ sed -e 's#.*<##g' -e 's#>.*##' -e 's#LOWER_UP##g' | \ ++ grep -q UP; then ++ return 1 ++ fi ++ return 0 ++} ++ ++# check if the slaves have link layer up/down ++# see kernel network documentation on meaning of LOWER_UP flag ++# for more in depth explanation on how it works ++# NOTE: this check is not reliable in virt environment ++# since interfaces are always LOWER_UP. There is no way ++# from the guest to know if the host has disconnected somehow ++ ++iface_lower_is_up() { ++ if ! $IP2UTIL -o link show $1 | \ ++ grep -q LOWER_UP; then ++ return 1 ++ fi ++ return 0 ++} ++ ++vlan_validate() { ++ check_binary $IP2UTIL ++ ++ if [ -z "$OCF_RESKEY_vlan_interface" ]; then ++ ocf_log err "Invalid OCF_RESKEY_vlan_interface: value cannot be empty" ++ return 1 ++ fi ++ ++ # the echo .. is the equivalent of strlen in bash ++ # ++ # /usr/include/linux/if.h:#define IFNAMSIZ 16 ++ # needs to include 0 byte end string ++ ++ if [ "${#OCF_RESKEY_vlan_interface}" -gt 15 ]; then ++ ocf_log err "Invalid OCF_RESKEY_vlan_interface: name is too long" ++ return 1 ++ fi ++ ++ if [ ! -d "/sys/class/net" ]; then ++ ocf_log err "Unable to find sysfs network class in /sys" ++ return 1 ++ fi ++ ++ if [ ! 
-e "/sys/class/net/$OCF_RESKEY_vlan_interface" ]; then ++ ocf_log err "Invalid OCF_RESKEY_vlan_interface: $OCF_RESKEY_vlan_interface does not exists" ++ return 1 ++ fi ++ ++ if [ -z "$OCF_RESKEY_vlan_id" ]; then ++ ocf_log err "Invalid OCF_RESKEY_vlan_id: value cannot be empty" ++ return 1 ++ fi ++ if ! ocf_is_decimal "$OCF_RESKEY_vlan_id" || \ ++ [ "$OCF_RESKEY_vlan_id" -gt "4094" ]; then ++ ocf_log err "Invalid OCF_RESKEY_vlan_id: must be a decimal value (0 to 4094 included)" ++ return 1 ++ fi ++ ++ if [ "${#OCF_RESKEY_vlan_name}" -gt 15 ]; then ++ ocf_log err "Invalid OCF_RESKEY_vlan_name: name is too long" ++ return 1 ++ fi ++ ++ return 0 ++} ++ ++vlan_check() { ++ if [ -e "/sys/class/net/$OCF_RESKEY_vlan_name" ]; then ++ if [ ! -e "$HA_RSCTMP/iface-vlan.$OCF_RESKEY_vlan_name" ]; then ++ return $OCF_ERR_GENERIC ++ fi ++ else ++ if [ -e "$HA_RSCTMP/iface-vlan.$OCF_RESKEY_vlan_name" ]; then ++ error="$(rm -f "$HA_RSCTMP/iface-vlan.$OCF_RESKEY_vlan_name" 2>&1)" ++ if [ "$?" != "0" ]; then ++ ocf_log err "Unable to remove stale lock file for vlan $OCF_RESKEY_vlan_name: $error" ++ return $OCF_ERR_GENERIC ++ fi ++ fi ++ return $OCF_NOT_RUNNING ++ fi ++ ++ if ! iface_is_up $OCF_RESKEY_vlan_interface; then ++ if ocf_is_true "$OCF_RESKEY_vlan_loose_binding"; then ++ ocf_log warn "Interface $OCF_RESKEY_vlan_interface is administratively down" ++ else ++ ocf_log err "Interface $OCF_RESKEY_vlan_interface is administratively down" ++ return $OCF_ERR_GENERIC ++ fi ++ fi ++ ++ if ! iface_is_up $OCF_RESKEY_vlan_name; then ++ ocf_log err "VLAN $OCF_RESKEY_vlan_name is administratively down" ++ return $OCF_ERR_GENERIC ++ fi ++ ++ if ! 
iface_lower_is_up $OCF_RESKEY_vlan_name; then ++ ocf_log err "VLAN $OCF_RESKEY_vlan_name has no active link-layer" ++ return $OCF_ERR_GENERIC ++ fi ++ ++ return $OCF_SUCCESS ++} ++ ++# we need a simpler stop version to clean after us if start fails ++# without involving any error checking ++# rolling back in case of failure is otherwise complex ++ ++vlan_force_stop() { ++ $IP2UTIL link delete "$OCF_RESKEY_vlan_name" >/dev/null 2>&1 ++ rm -f "$HA_RSCTMP/iface-vlan.$OCF_RESKEY_vlan_name" 2>&1 ++} ++ ++vlan_start() { ++ # check if the vlan already exists ++ vlan_check ++ ret=$? ++ if [ "$ret" != "$OCF_NOT_RUNNING" ]; then ++ return $ret ++ fi ++ ++ # make sure kernel module is loaded ++ if [ ! -e /proc/net/vlan ]; then ++ error="$(modprobe 8021q 2>&1)" ++ if [ "$?" != "0" ]; then ++ ocf_log err "Unable to load kernel 8021q driver: $error" ++ return $OCF_ERR_GENERIC ++ fi ++ fi ++ ++ # generate options ++ VLANOPTS="" ++ ++ if [ -n "$OCF_RESKEY_vlan_reorder_hdr" ]; then ++ if ocf_is_true "$OCF_RESKEY_vlan_reorder_hdr"; then ++ VLANOPTS="reorder_hdr on" ++ else ++ VLANOPTS="reorder_hdr off" ++ fi ++ fi ++ ++ if [ -n "$OCF_RESKEY_vlan_gvrp" ]; then ++ if ocf_is_true "$OCF_RESKEY_vlan_gvrp"; then ++ VLANOPTS="$VLANOPTS gvrp on" ++ else ++ VLANOPTS="$VLANOPTS gvrp off" ++ fi ++ fi ++ ++ if [ -n "$OCF_RESKEY_vlan_mvrp" ]; then ++ if ocf_is_true "$OCF_RESKEY_vlan_mvrp"; then ++ VLANOPTS="$VLANOPTS mvrp on" ++ else ++ VLANOPTS="$VLANOPTS mvrp off" ++ fi ++ fi ++ ++ if [ -n "$OCF_RESKEY_vlan_loose_binding" ]; then ++ if ocf_is_true "$OCF_RESKEY_vlan_loose_binding"; then ++ VLANOPTS="$VLANOPTS loose_binding on" ++ else ++ VLANOPTS="$VLANOPTS loose_binding off" ++ fi ++ fi ++ ++ # create the VLAN ++ error="$($IP2UTIL link add link "$OCF_RESKEY_vlan_interface" name "$OCF_RESKEY_vlan_name" type vlan id "$OCF_RESKEY_vlan_id" $VLANOPTS 2>&1)" ++ if [ "$?" 
!= "0" ]; then ++ ocf_log err "Unable to create VLAN $OCF_RESKEY_vlan_name: $error" ++ return $OCF_ERR_GENERIC ++ fi ++ ++ # set the interface up ++ error="$($IP2UTIL link set dev "$OCF_RESKEY_vlan_interface" up 2>&1)" ++ if [ "$?" != "0" ]; then ++ ocf_log err "Unable to set VLAN $OCF_RESKEY_vlan_interface up: $error" ++ return $OCF_ERR_GENERIC ++ fi ++ ++ # set the vlan up ++ error="$($IP2UTIL link set dev "$OCF_RESKEY_vlan_name" up 2>&1)" ++ if [ "$?" != "0" ]; then ++ ocf_log err "Unable to set VLAN $OCF_RESKEY_vlan_name up: $error" ++ return $OCF_ERR_GENERIC ++ fi ++ ++ error="$(touch "$HA_RSCTMP/iface-vlan.$OCF_RESKEY_vlan_name" 2>&1)" ++ if [ "$?" != "0" ]; then ++ ocf_log err "Unable to create lock file for VLAN $OCF_RESKEY_vlan_name: $error" ++ return $OCF_ERR_GENERIC ++ fi ++ ++ return $OCF_SUCCESS ++} ++ ++vlan_stop() { ++ vlan_check ++ ret=$? ++ if [ "$ret" = "$OCF_NOT_RUNNING" ]; then ++ return $OCF_SUCCESS ++ fi ++ if [ "$ret" != "$OCF_SUCCESS" ]; then ++ return $ret ++ fi ++ ++ # set vlan down ++ error="$($IP2UTIL link set dev "$OCF_RESKEY_vlan_name" down 2>&1)" ++ if [ "$?" != "0" ]; then ++ ocf_log err "Unable to set VLAN $OCF_RESKEY_vlan_name down: $error" ++ return $OCF_ERR_GENERIC ++ fi ++ ++ # delete vlan ++ error="$($IP2UTIL link delete "$OCF_RESKEY_vlan_name" 2>&1)" ++ if [ "$?" != "0" ]; then ++ ocf_log err "Unable to delete VLAN $OCF_RESKEY_vlan_name: $error" ++ return $OCF_ERR_GENERIC ++ fi ++ ++ error="$(rm -f "$HA_RSCTMP/iface-vlan.$OCF_RESKEY_vlan_name" 2>&1)" ++ if [ "$?" != "0" ]; then ++ ocf_log err "Unable to remove lock file for VLAN $OCF_RESKEY_vlan_name: $error" ++ return $OCF_ERR_GENERIC ++ fi ++ ++ return $OCF_SUCCESS ++} ++ ++case $__OCF_ACTION in ++ meta-data) ++ vlan_meta_data ++ exit $OCF_SUCCESS ++ ;; ++ usage|help) ++ vlan_usage ++ exit $OCF_SUCCESS ++ ;; ++esac ++ ++if [ ! 
-d "$HA_RSCTMP" ]; then ++ ocf_log debug "$HA_RSCTMP not found, we are probably being executed manually" ++ mkdir -p "$HA_RSCTMP" ++fi ++ ++if [ -n "$__OCF_ACTION" ] && ! vlan_validate; then ++ exit $OCF_ERR_CONFIGURED ++fi ++ ++case $__OCF_ACTION in ++ start|stop) ++ if ! ocf_is_root; then ++ ocf_log err "You must be root for $__OCF_ACTION operation." ++ exit $OCF_ERR_PERM ++ fi ++ ;; ++esac ++ ++case $__OCF_ACTION in ++ start) ++ vlan_start ++ ret=$? ++ if [ "$ret" != "$OCF_SUCCESS" ]; then ++ vlan_force_stop ++ fi ++ exit $ret ++ ;; ++ stop) ++ vlan_stop ++ exit $? ++ ;; ++ status|monitor) ++ vlan_check ++ exit $? ++ ;; ++ validate-all) ++ # vlan_validate above does the trick ++ ;; ++ *) ++ vlan_usage ++ exit $OCF_ERR_UNIMPLEMENTED ++ ;; ++esac ++# vi:sw=4:ts=8: +-- +1.8.4.2 + diff --git a/SOURCES/bz1168251-SAPHana-agents-update.patch b/SOURCES/bz1168251-SAPHana-agents-update.patch new file mode 100644 index 0000000..871dbf5 --- /dev/null +++ b/SOURCES/bz1168251-SAPHana-agents-update.patch @@ -0,0 +1,97 @@ +diff --git a/heartbeat/SAPHana b/heartbeat/SAPHana +index f4db17a..412152b 100644 +--- a/heartbeat/SAPHana ++++ b/heartbeat/SAPHana +@@ -137,7 +137,7 @@ function saphana_meta_data() { + Manages two SAP HANA instances in system replication (SR). + + The SAPHanaSR resource agent manages two SAP Hana instances (databases) which are configured +-in system replication. This first version is limitted to the scale-up scenario. Scale-Up is ++in system replication. This first version is limitted to the scale-up scenario. Scale-Out is + not supported in this version. + + Managing the two SAP HANA instances means that the resource agent controls the start/stop of the +@@ -231,7 +231,9 @@ The resource agent uses the following four interfaces provided by SAP: + + Define SAPHana resource agent messages to be printed + Define SAPHana resource agent messages to be printed. +- This parameter should only be set of been requested by SUSE support. 
The default is sufficient for normal operation. ++ This parameter should only be set if requested by support. The default is sufficient for normal operation. ++ Values: ra-act-lpa-dec-flow ++ You could specify any combination of the above values like "ra-act-flow" + + + +@@ -480,7 +482,7 @@ function get_crm_master() + # globals: sr_name(w), remoteHost(w), otherNodes(w) + # globals: ATTR_NAME_HANA_SYNC_STATUS(w), ATTR_NAME_HANA_CLONE_STATE(w) + # globals: DIR_EXECUTABLE(w), SAPSTARTSRV(w), SAPCONTROL(w), DIR_PROFILE(w), SAPSTARTPROFILE(w), LD_LIBRARY_PATH(w), PATH(w) +-# globals: LPA_DIRECTORY(w), SIDInstanceName(w), remoteNode(w) ++# globals: LPA_DIRECTORY(w), SIDInstanceName(w), remoteNode(w), hdbSrQueryTimeout(w) + # saphana_init : Define global variables with default values, if optional parameters are not set + # + function saphana_init() { +@@ -497,6 +499,8 @@ function saphana_init() { + super_ocf_log debug "DBG: Used new method to get SID ($SID) and InstanceNr ($InstanceNr)" + sid=$(echo "$SID" | tr [:upper:] [:lower:]) + sidadm="${sid}adm" ++ # TODO PRIO3: Do we need a parameter for the RA to be able to adjust hdbSrQueryTimeout? ++ hdbSrQueryTimeout=180 + # DONE: PRIO4: SAPVIRHOST might be different to NODENAME + # DONE: PRIO1: ASK: Is the output format of ListInstances fix? Could we take that as an API? Answer: Yes + # try to catch: Inst Info : LNX - 42 - lv9041 - 740, patch 36, changelist 1444691 +@@ -827,7 +831,7 @@ function analyze_hana_sync_status() + super_ocf_log err "ACT: Secure store users are missing (see best practice manual how to setup the users)" + rc=$OCF_ERR_CONFIGURED + fi +- hana_sync_status=$(timeout 60 $DIR_EXECUTABLE/hdbsql -a -x -U $secUser $query_state); sqlrc=$? ++ hana_sync_status=$(timeout $hdbSrQueryTimeout $DIR_EXECUTABLE/hdbsql -a -x -U $secUser $query_state); sqlrc=$? 
+ hana_sync_status=$(echo $hana_sync_status | dequote) + super_ocf_log debug "DBG: hdbsql rc=$sqlrc hana_sync_status=\"$hana_sync_status\"" + if [ "$sqlrc" -eq 0 -a "$hana_sync_status" != "" ]; then +@@ -846,10 +850,10 @@ function analyze_hana_sync_status() + # TODO: PRIO9: for first we assume there is only ONE secondary site (like ROT) + # TODO: PRIO3: should we loop over all cluster nodes fetching their roles-attribute? To minimize sql-queries? + # +- all_secondary_hosts=$(timeout 60 hdbsql -a -x -U $secUser $query_secondaries ); sqlrc=$? ++ all_secondary_hosts=$(timeout $hdbSrQueryTimeout hdbsql -a -x -U $secUser $query_secondaries ); sqlrc=$? + all_secondary_hosts=$(echo $all_secondary_hosts | dequote); + if [ "$sqlrc" -eq 0 ]; then +- all_broken_secondary_hosts=$(timeout 60 hdbsql -a -x -U $secUser $query_failed_secondaries); sqlrc=$? ++ all_broken_secondary_hosts=$(timeout $hdbSrQueryTimeout hdbsql -a -x -U $secUser $query_failed_secondaries); sqlrc=$? + all_broken_secondary_hosts=$(echo $all_broken_secondary_hosts | dequote); + if [ "$sqlrc" -eq 0 ]; then + if [ -n "$all_broken_secondary_hosts" ]; then +@@ -869,9 +873,9 @@ function analyze_hana_sync_status() + fi + fi + else +- # return codes 19: license error -> set SFAIL! + case "$sqlrc" in + 19 ) ++ # return codes 19: license error -> set SFAIL! + # DONE: PRIO1: We should NOT set SFAIL, if HDB is exactly broken now + # When HDB breaks during monitor this could prevent a prositive remote failover + super_ocf_log warn "ACT: Was not able to fetch HANA SYNC STATUS - set sync status to SFAIL for ALL OTHER cluster hosts" +diff --git a/heartbeat/SAPHanaTopology b/heartbeat/SAPHanaTopology +index 19fbbb4..082ad29 100644 +--- a/heartbeat/SAPHanaTopology ++++ b/heartbeat/SAPHanaTopology +@@ -123,7 +123,7 @@ function sht_meta_data() { + + + +- 0.149.3 ++ 0.149.4 + Analyzes SAP HANA System Replication Topology. 
+ This RA analyzes the SAP HANA topology and "sends" all findings via the node status attributes to + all nodes in the cluster. These attributes are taken by the SAPHana RA to control the SAP Hana Databases. +@@ -172,7 +172,7 @@ SAPHanaTopology scans the output table of landscapeHostConfiguration.py to ident + Define type of SAPHanaTopology RA messages to be printed + Define type of SAPHanaTopology RA messages to be printed. + Define SAPHana resource agent messages to be printed. +- This parameter should only be set of been requested by SUSE support. The default is sufficient for normal operation. ++ This parameter should only be set if requested by support. The default is sufficient for normal operation. + Values: ra-act-lpa-dec-flow + You could specify any combination of the above values like "ra-act-flow" + diff --git a/SOURCES/bz1168251-SAPHana-agents-update2.patch b/SOURCES/bz1168251-SAPHana-agents-update2.patch new file mode 100644 index 0000000..50808f8 --- /dev/null +++ b/SOURCES/bz1168251-SAPHana-agents-update2.patch @@ -0,0 +1,37 @@ +diff --git a/heartbeat/SAPHana b/heartbeat/SAPHana +index 412152b..1ff6a7d 100644 +--- a/heartbeat/SAPHana ++++ b/heartbeat/SAPHana +@@ -356,7 +356,8 @@ function get_hana_attribute() + local attr_node=$1 + local attr_name=$2 + local attr_store=${3:-reboot} # DONE: PRIO5 get this (optional) from parameter +- crm_attribute -N ${attr_node} -G -n "$attr_name" -l $attr_store -q; rc=$? ++ local attr_default=${4:-} ++ crm_attribute -N ${attr_node} -G -n "$attr_name" -l $attr_store -q -d "$attr_default"; rc=$? + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc + } +@@ -373,9 +374,10 @@ function set_hana_attribute() + local attr_value=$2 + local attr_name=$3 + local attr_store=${4:-reboot} # DONE: PRIO5 get this (optional) from parameter ++ local attr_default=${5:-} + local rc=1 + local attr_old="" +- attr_old=$(get_hana_attribute $attr_node $attr_name $attr_store); get_rc=$? 
++ attr_old=$(get_hana_attribute $attr_node $attr_name $attr_store $attr_default); get_rc=$? + if [ "$attr_old" != "$attr_value" ]; then + super_ocf_log debug "DBG: SET attribute $attr_name for node ${attr_node} to ${attr_value} former ($attr_old) get_rc=$get_rc " + crm_attribute -N $attr_node -v $attr_value -n "$attr_name" -l $attr_store; rc=$? +@@ -578,8 +580,8 @@ function saphana_init() { + remoteHost=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_REMOTEHOST[@]}); + if [ -z "$remoteHost" ]; then + if [ ${#otherNodes[@]} -eq 1 ]; then # we are a 2 node cluster, lets assume the other is the remote-host +- remoteHost=${otherNodes[0]} +- remoteNode=$remoteHost ++ remoteNode=${otherNodes[0]} ++ remoteHost=$(get_hana_attribute $remoteNode ${ATTR_NAME_HANA_VHOST[@]} "$remoteNode"); + super_ocf_log debug "DBG: auto-guess remoteHost=$remoteHost" + else + super_ocf_log debug "DBG: Could not auto-guess remoteHost out of list (${otherNodes[@]})" diff --git a/SOURCES/bz1168251-SAPHana-agents-update3.patch b/SOURCES/bz1168251-SAPHana-agents-update3.patch new file mode 100644 index 0000000..35fc51b --- /dev/null +++ b/SOURCES/bz1168251-SAPHana-agents-update3.patch @@ -0,0 +1,13 @@ +--- a/heartbeat/SAPHana 2015-05-07 07:47:41.654914103 -0500 ++++ b/heartbeat/SAPHana 2015-05-07 07:47:06.164755744 -0500 +@@ -1733,8 +1733,8 @@ + analyze_hana_sync_status + ;; + esac +- rem_role=$(get_hana_attribute ${remoteHost} ${ATTR_NAME_HANA_ROLES[@]}) +- rem_clone_status=$(get_hana_attribute ${remoteHost} ${ATTR_NAME_HANA_CLONE_STATE[@]}) ++ rem_role=$(get_hana_attribute ${remoteNode} ${ATTR_NAME_HANA_ROLES[@]}) ++ rem_clone_status=$(get_hana_attribute ${remoteNode} ${ATTR_NAME_HANA_CLONE_STATE[@]}) + if [ "$promote_attr" = "DEMOTED" -a "$rem_clone_status" = "PROMOTED" ]; then + case "$rem_role" in + [234]:P:* ) # dual primary, but other instance marked as PROMOTED by the cluster diff --git a/SOURCES/bz1168251-SAPHana-agents.patch b/SOURCES/bz1168251-SAPHana-agents.patch new file mode 
100644 index 0000000..caf09fe --- /dev/null +++ b/SOURCES/bz1168251-SAPHana-agents.patch @@ -0,0 +1,3129 @@ +From ef36b33da922b2b8501e80ca840bfb7accc65ff0 Mon Sep 17 00:00:00 2001 +From: David Vossel +Date: Thu, 26 Feb 2015 14:21:20 -0600 +Subject: [PATCH] bz1168251-SAPHana-agents + +--- + doc/man/Makefile.am | 2 + + heartbeat/Makefile.am | 2 + + heartbeat/SAPHana | 2106 +++++++++++++++++++++++++++++++++++++++ + heartbeat/SAPHanaTopology | 813 +++++++++++++++ + tools/Makefile.am | 2 +- + tools/show_SAPHanaSR_attributes | 133 +++ + 6 files changed, 3057 insertions(+), 1 deletion(-) + create mode 100755 heartbeat/SAPHana + create mode 100755 heartbeat/SAPHanaTopology + create mode 100755 tools/show_SAPHanaSR_attributes + +diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am +index 5a1ad4d..31fc1f5 100644 +--- a/doc/man/Makefile.am ++++ b/doc/man/Makefile.am +@@ -78,6 +78,8 @@ man_MANS = ocf_heartbeat_AoEtarget.7 \ + ocf_heartbeat_Route.7 \ + ocf_heartbeat_SAPDatabase.7 \ + ocf_heartbeat_SAPInstance.7 \ ++ ocf_heartbeat_SAPHana.7 \ ++ ocf_heartbeat_SAPHanaTopology.7 \ + ocf_heartbeat_SendArp.7 \ + ocf_heartbeat_ServeRAID.7 \ + ocf_heartbeat_SphinxSearchDaemon.7 \ +diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am +index f08dad4..dd5b0a9 100644 +--- a/heartbeat/Makefile.am ++++ b/heartbeat/Makefile.am +@@ -105,6 +105,8 @@ ocf_SCRIPTS = ClusterMon \ + rsyslog \ + SAPDatabase \ + SAPInstance \ ++ SAPHana \ ++ SAPHanaTopology \ + SendArp \ + ServeRAID \ + slapd \ +diff --git a/heartbeat/SAPHana b/heartbeat/SAPHana +new file mode 100755 +index 0000000..f4db17a +--- /dev/null ++++ b/heartbeat/SAPHana +@@ -0,0 +1,2106 @@ ++#!/bin/bash ++# ++# SAPHana ++# ++# Description: Manages two single SAP HANA Instance in System Replication ++# Planned: do also manage scale-up scenarios ++# currently the SAPHana is dependent of the analysis of ++# SAPHanaTopology ++# For supported scenarios please read the README file provided ++# in the same software package (rpm) ++# 
++############################################################################## ++# ++# SAPHana ++# Author: Fabian Herschel, November 2013 ++# Support: linux@sap.com ++# License: GNU General Public License (GPL) ++# Copyright: (c) 2013,2014 SUSE Linux Products GmbH ++# ++# An example usage: ++# See usage() function below for more details... ++# ++# OCF instance parameters: ++# OCF_RESKEY_SID ++# OCF_RESKEY_InstanceNumber ++# OCF_RESKEY_DIR_EXECUTABLE (optional, well known directories will be searched by default) ++# OCF_RESKEY_DIR_PROFILE (optional, well known directories will be searched by default) ++# OCF_RESKEY_INSTANCE_PROFILE (optional, well known directories will be searched by default) ++# OCF_RESKEY_PREFER_SITE_TAKEOVER (optional, default is no) ++# OCF_RESKEY_DUPLICATE_PRIMARY_TIMEOUT (optional, time difference needed between two last-primary-tiemstampe (lpt)) ++# OCF_RESKEY_SAPHanaFilter (optional, should only be set if been told by support or for debugging purposes) ++# ++# ++####################################################################### ++# ++# Initialization: ++timeB=$(date '+%s') ++ ++: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} ++. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ++ ++# ++####################################################################### ++# ++ ++HANA_STATE_PRIMARY=0 ++HANA_STATE_SECONDARY=1 ++HANA_STATE_STANDALONE=2 ++HANA_STATE_DEFECT=3 ++ ++SH=/bin/sh ++ ++# ++# function: super_ocf_log - wrapper function for ocf log in order catch usual logging into super log ++# params: LOG_MESSAGE ++# globals: SAPHanaFilter ++function super_ocf_log() { ++ local level="$1" ++ local message="$2" ++ local skip=1 ++ local mtype="" ++ local search=0 ++ local shf="${SAPHanaFilter:-all}" ++ # message levels: (dbg)|info|warn|err|error ++ # message types: (ACT|RA|FLOW|DBG|LPA|DEC|DBG2... 
++ case "$level" in ++ debug | dbg | warn | err | error ) skip=0 ++ ;; ++ info ) ++ case "$shf" in ++ all) skip=0 ++ ;; ++ none ) ++ skip=1 ++ ;; ++ * ) mtype=${message%% *} ++ mtype=${mtype%:} ++ mtype=${mtype#fh} ++ echo "$shf"| grep -iq ${mtype}; search=$? ++ if [ $search -eq 0 ]; then ++ skip=0 ++ else ++ skip=1 ++ fi ++ ;; ++ esac ++ ;; ++ esac ++ if [ $skip -eq 0 ]; then ++ ocf_log "$level" "$message" ++ fi ++} ++ ++# ++# function: saphana_usage - short usage info ++# params: - ++# globals: $0(r) ++# ++function saphana_usage() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=0 ++ methods=$(saphana_methods) ++ methods=$(echo $methods | tr ' ' '|') ++ cat <<-! ++ usage: $0 ($methods) ++ ++ $0 manages a SAP HANA Instance as an HA resource. ++ ++ The 'start' operation starts the HANA instance or bring the "clone instance" to a WAITING status ++ The 'stop' operation stops the HANA instance ++ The 'status' operation reports whether the HANA instance is running ++ The 'monitor' operation reports whether the HANA instance seems to be working in master/slave it also needs to check the system replication status ++ The 'promote' operation either runs a takeover for a secondary or a just-nothing for a primary ++ The 'demote' operation neary does nothing and just mark the instance as demoted ++ The 'notify' operation always returns SUCCESS ++ The 'validate-all' operation reports whether the parameters are valid ++ The 'methods' operation reports on the methods $0 supports ++ ++ ! ++ return $rc ++} ++ ++# ++# function: saphana_meta_data - print resource agent meta-data for cluster ++# params: - ++# globals: - ++# ++function saphana_meta_data() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=0 ++ cat < ++ ++ ++0.149.4 ++ ++Manages two SAP HANA instances in system replication (SR). ++ ++The SAPHanaSR resource agent manages two SAP Hana instances (databases) which are configured ++in system replication. 
This first version is limitted to the scale-up scenario. Scale-Up is ++not supported in this version. ++ ++Managing the two SAP HANA instances means that the resource agent controls the start/stop of the ++instances. In addition the resource agent is able to monitor the SAP HANA databases to check their ++availability on landscape host configuration level. For this monitoring the resource agent relies on interfaces ++provided by SAP. A third task of the resource agent is to also check the synchronisation status ++of the two SAP HANA databases. If the synchronisation is not "SOK", than the cluster avoids to ++failover to the secondary side, if the primary fails. This is to improve the data consistency. ++ ++The resource agent uses the following four interfaces provided by SAP: ++ ++1. sapcontrol/sapstartsrv ++ The interface sapcontrol/sapstartsrv is used to start/stop a HANA database instance/system ++ ++2. landscapeHostConfiguration ++ The interface is used to monitor a HANA system. The python script is named landscapeHostConfiguration.py. ++ landscapeHostConfiguration.py has some detailed output about HANA system status ++ and node roles. For our monitor the overall status is relevant. This overall ++ status is reported by the returncode of the script: ++ 0: Internal Fatal, 1: ERROR, 2: WARNING, 3: INFO, 4: OK ++ The SAPHana resource agent will interpret returncodes 0 as FATAL, 1 as not-running or ERROR and and returncodes 2+3+4 as RUNNING. ++ ++3. hdbnsutil ++ The interface hdbnsutil is used to check the "topology" of the system replication as well as the current configuration ++ (primary/secondary) of a SAP HANA database instance. A second task of the interface is the posibility to run a ++ system replication takeover (sr_takeover) or to register a former primary to a newer one (sr_register). ++ ++4. hdbsql / systemReplicationStatus ++ Interface is SQL query into HANA (system replication table). 
The hdbsql query will be replaced by a python script ++ "systemReplicationStatus.py" in SAP HANA SPS8 or 9. ++ As long as we need to use hdbsql you need to setup secure store users for linux user root to be able to ++ access the SAP HANA database. You need to configure a secure store user key "SAPHANA${SID}SR" which can connect the SAP ++ HANA database: ++ ++5. saphostctrl ++ The interface saphostctrl uses the function ListInstances to figure out the virtual host name of the ++ SAP HANA instance. This is the hostname used during the HANA installation. ++ ++ ++ ++ ++ SAP System Identifier (SID) like "SLE" or "HAE" ++ SAP System Identifier (SID) ++ ++ ++ ++ SAP instance number like "00" or "07" ++ SAP instance number ++ ++ ++ ++ Should cluster/RA prefer to switchover to slave instance instead of restarting master locally? Default="yes" ++ no: Do prefer restart locally ++ yes: Do prefer takever to remote site ++ ++ Local or site recover preferred? ++ ++ ++ ++ Define, if a former primary should automatically be registered. ++ The parameter AUTOMATED_REGISTER defines, wether a former primary instance should ++ be registered automatically by the resource agent during cluster/resource start, if the DUPLICATE_PRIMARY_TIMEOUT is expired... TDB ++ ++ ++ ++ ++ Time difference needed between to primary time stamps, if a dual-primary situation occurs ++ Time difference needed between to primary time stamps, ++ if a dual-primary situation occurs. If the time difference is ++ less than the time gap, than the cluster hold one or both instances in a "WAITING" status. This is to give a admin ++ a chance to react on a failover. A failed former primary will be registered after the time difference is passed. After ++ this registration to the new primary all data will be overwritten by the system replication. ++ ++ ++ ++ ++ The full qualified path where to find sapstartsrv and sapcontrol. 
Specify this parameter, if you have changed the SAP kernel directory location after the default SAP installation. ++ Path of sapstartsrv and sapcontrol ++ ++ ++ ++ The full qualified path where to find the SAP START profile. Specify this parameter, if you have changed the SAP profile directory location after the default SAP installation. ++ Path of start profile ++ ++ ++ ++ The name of the SAP HANA instance profile. Specify this parameter, if you have changed the name of the SAP HANA instance profile after the default SAP installation. Normally you do not need to set this parameter. ++ HANA instance profile name ++ ++ ++ ++ Define SAPHana resource agent messages to be printed ++ Define SAPHana resource agent messages to be printed. ++ This parameter should only be set of been requested by SUSE support. The default is sufficient for normal operation. ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++END ++return $rc ++} ++ ++# ++# function: saphana_methods - report supported cluster methods ++# params: - ++# globals: - ++# methods: What methods/operations do we support? 
++# ++function saphana_methods() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=0 m ++ for m in start stop status monitor promote demote notify validate-all methods meta-data usage; do ++ echo "$m" ++ done ++ return $rc ++} ++ ++# ++# function: dequote - filter: remove quotes (") from stdin ++# params: - ++# globals: - ++function dequote() ++{ ++ local rc=0; tr -d '"'; return $rc ++} ++ ++# ++# function: remoteHost2remoteNode - convert a SAP remoteHost to the cluster node name ++# params: remoteHost ++# globals: ATTR_NAME_HANA_VHOST[*] ++# ++function remoteHost2remoteNode() ++{ ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local -a clusterNodes=() ++ local cl="" ++ local vHost="" ++ local remoteHost="$1" ++ local remoteNode="" ++ local rc=1 ++ for cl in ${otherNodes[@]}; do ++ vHost=$(get_hana_attribute $cl ${ATTR_NAME_HANA_VHOST[@]}) ++ if [ "$vHost" = "$remoteHost" ]; then # we found the correct node ++ remoteNode=$cl ++ rc=0 ++ fi ++ done ++ if [ -n "$remoteNode" ]; then ++ echo "$remoteNode" ++ fi ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: is_clone - report, if resource is configured as a clone (also master/slave) ++# params: - ++# globals: OCF_*(r) ++# descript: is_clone : find out if we are configured to run in a Master/Slave configuration ++# rc: 0: it is a clone, 1: it is not a clone ++# ++# DONE: PRIO2: For the first shippment (scale-out) we need to limit the clones to 2 ++# ++function is_clone() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=0 ++ # ++ # is a clone config? 
++ # ++ if [ -n "$OCF_RESKEY_CRM_meta_clone_max" ] \ ++ && [ "$OCF_RESKEY_CRM_meta_clone_max" -gt 0 ]; then ++ # ++ # yes it is a clone config - check, if its configured well ++ # ++ if [ "$OCF_RESKEY_CRM_meta_clone_node_max" -ne 1 ] || \ ++ [ "$OCF_RESKEY_CRM_meta_clone_max" -ne 2 ] || \ ++ [ "$OCF_RESKEY_CRM_meta_master_node_max" -ne 1 ] || \ ++ [ "$OCF_RESKEY_CRM_meta_master_max" -ne 1 ]; then ++ super_ocf_log err "ACT: Clone options misconfigured. (expect: clone_max=2,clone_node_max=1,master_node_max=1,master_max=1)" ++ exit $OCF_ERR_CONFIGURED ++ fi ++ rc=0; ++ else ++ rc=1; ++ fi ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: get_hana_attribute ++# params: NODE ATTR [STORE] ++# globals: - ++# ++function get_hana_attribute() ++{ ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=0 ++ local attr_node=$1 ++ local attr_name=$2 ++ local attr_store=${3:-reboot} # DONE: PRIO5 get this (optional) from parameter ++ crm_attribute -N ${attr_node} -G -n "$attr_name" -l $attr_store -q; rc=$? ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: set_hana_attribute - set the multi-state status of a node ++# params: NODE VALUE ATTR [STORE] ++# globals: - ++# ++function set_hana_attribute() ++{ ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local attr_node=$1 ++ local attr_value=$2 ++ local attr_name=$3 ++ local attr_store=${4:-reboot} # DONE: PRIO5 get this (optional) from parameter ++ local rc=1 ++ local attr_old="" ++ attr_old=$(get_hana_attribute $attr_node $attr_name $attr_store); get_rc=$? ++ if [ "$attr_old" != "$attr_value" ]; then ++ super_ocf_log debug "DBG: SET attribute $attr_name for node ${attr_node} to ${attr_value} former ($attr_old) get_rc=$get_rc " ++ crm_attribute -N $attr_node -v $attr_value -n "$attr_name" -l $attr_store; rc=$? 
++ else ++ super_ocf_log debug "DBG: LET attribute $attr_name for node ${attr_node} still be ${attr_value}" ++ rc=0 ++ fi ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: assert - quickly go out of here with minimal error/return code handling and log ++# params: MESSAGE ++# globals: OCF_*(r) ++# ++function assert() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local err_msg=$1 local default_rc=$OCF_NOT_RUNNING ++ # DONE: Check, if we need to destinguish between probe and others ++ if ocf_is_probe; then ++ default_exit=$OCF_NOT_RUNNING ++ else ++ default_exit=$OCF_ERR_CONFIGURED ++ fi ++ if [ "$ACTION" = "stop" ]; then ++ cleanup_instance ++ exit $OCF_SUCCESS ++ fi ++ super_ocf_log err "ACT: $err_msg" ++ exit $OCF_NOT_RUNNING ++} ++ ++# ++# function: set_crm_master - set the crm master score of the local node ++# params: SCORE ++# globals: HA_SBIN_DIR(r), OCF_RESOURCE_INSTANCE(r) ++# ++function set_crm_master() ++{ ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=0 ++ local score=0 ++ if [ -n "$1" ]; then ++ score=$1 ++ fi ++ # DONE: PRIO2: Only adjust master if value is really different (try to check that) ++ oldscore=$(${HA_SBIN_DIR}/crm_master -G -q -l reboot) ++ if [ "$oldscore" != "$score" ]; then ++ super_ocf_log debug "DBG: SET crm master: $score (old: $oldscore)" ++ ${HA_SBIN_DIR}/crm_master -v $score -l reboot; rc=$? 
++ else ++ super_ocf_log debug "DBG: LET crm master: $score" ++ rc=0 ++ fi ++ #logger -t fhLOG "crm_master with: $OCF_RESOURCE_INSTANCE -v $score -l reboot" ++ return $rc ++} ++ ++# ++# function: scoring_crm_master - score instance due to role ans sync match (table SCORING_TABLE_PREFERRED_SITE_TAKEOVER) ++# params: NODE_ROLES NODE_SYNC_STATUS ++# globals: SCORING_TABLE_PREFERRED_SITE_TAKEOVER[@], ++# ++scoring_crm_master() ++{ ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local roles="$1" ++ local sync="$2" ++ local skip=0 ++ local myScore=-1 ++ for scan in "${SCORING_TABLE_PREFERRED_SITE_TAKEOVER[@]}"; do ++ if [ $skip -eq 0 ]; then ++ read rolePatt syncPatt score <<< $scan ++ if grep "$rolePatt" <<< "$roles"; then ++ if grep "$syncPatt" <<< "$sync"; then ++ skip=1 ++ myScore=$score ++ fi ++ fi ++ fi ++ done ++ super_ocf_log debug "DBG: scoring_crm_master adjust score $myScore" ++ set_crm_master $myScore ++} ++ ++# ++# function: get_crm_master - get the crm master score of the local node ++# params: - ++# globals: HA_SBIN_DIR(r) ++# ++function get_crm_master() ++{ ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=0 ++ ${HA_SBIN_DIR}/crm_master -G -q -l reboot; rc=$? 
++ return $rc ++} ++ ++# ++# function: saphana_init - initialize variables for the resource agent ++# params: InstanceName ++# globals: OCF_*(r), SID(w), sid(rw), sidadm(w), InstanceName(w), InstanceNr(w), SAPVIRHOST(w), PreferSiteTakeover(w), ++# globals: sr_name(w), remoteHost(w), otherNodes(w) ++# globals: ATTR_NAME_HANA_SYNC_STATUS(w), ATTR_NAME_HANA_CLONE_STATE(w) ++# globals: DIR_EXECUTABLE(w), SAPSTARTSRV(w), SAPCONTROL(w), DIR_PROFILE(w), SAPSTARTPROFILE(w), LD_LIBRARY_PATH(w), PATH(w) ++# globals: LPA_DIRECTORY(w), SIDInstanceName(w), remoteNode(w) ++# saphana_init : Define global variables with default values, if optional parameters are not set ++# ++function saphana_init() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=$OCF_SUCCESS ++ local vName ++ # two parameter models (for transition only) ++ # OLD: InstanceName ++ # NEW: SID InstanceNumber ++ SID=$OCF_RESKEY_SID ++ InstanceNr=$OCF_RESKEY_InstanceNumber ++ SIDInstanceName="${SID}_HDB${InstanceNr}" ++ InstanceName="HDB${InstanceNr}" ++ super_ocf_log debug "DBG: Used new method to get SID ($SID) and InstanceNr ($InstanceNr)" ++ sid=$(echo "$SID" | tr [:upper:] [:lower:]) ++ sidadm="${sid}adm" ++ # DONE: PRIO4: SAPVIRHOST might be different to NODENAME ++ # DONE: PRIO1: ASK: Is the output format of ListInstances fix? Could we take that as an API? 
Answer: Yes ++ # try to catch: Inst Info : LNX - 42 - lv9041 - 740, patch 36, changelist 1444691 ++ # We rely on the following format: SID is word#4, NR is word#6, vHost is word#8 (awk variable is named NO, because NR is awk's built-in record counter) ++ vName=$(/usr/sap/hostctrl/exe/saphostctrl -function ListInstances \ ++ | awk '$4 == SID && $6 == NO { print $8 }' SID=$SID NO=$InstanceNr) ++ if [ -z "$vName" ]; then ++ # ++ # if saphostctrl does not know the answer, try to fallback to attribute provided by SAPHanaTopology ++ # ++ vName=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_VHOST[@]}); ++ fi ++ SAPVIRHOST=${vName} ++ PreferSiteTakeover="$OCF_RESKEY_PREFER_SITE_TAKEOVER" ++ SAPHanaFilter="${OCF_RESKEY_SAPHanaFilter:-ra-act-dec-lpa}" ++ AUTOMATED_REGISTER="${OCF_RESKEY_AUTOMATED_REGISTER:-false}" ++ LPA_DIRECTORY=/var/lib/SAPHanaRA ++ LPA_ATTR=("lpa_${sid}_lpt" "forever") ++ super_ocf_log debug "DBG: SID=$SID, sid=$sid, SIDInstanceName=$SIDInstanceName, InstanceName=$InstanceName, InstanceNr=$InstanceNr, SAPVIRHOST=$SAPVIRHOST" ++ ocf_env=$(env | grep 'OCF_RESKEY_CRM') ++ super_ocf_log debug "DBG: OCF: $ocf_env" ++ # ++ ATTR_NAME_HANA_SYNC_STATUS=("hana_${sid}_sync_state" "reboot") # SOK, SFAIL, UNKNOWN? ++ ATTR_NAME_HANA_PRIMARY_AT=("hana_${sid}_primary_at" "reboot") # Not used so far ++ ATTR_NAME_HANA_CLONE_STATE=("hana_${sid}_clone_state" "reboot") # UKNOWN?, DEMOTED, PROMOTED ++ ATTR_NAME_HANA_REMOTEHOST=("hana_${sid}_remoteHost" "forever") ++ ATTR_NAME_HANA_SITE=("hana_${sid}_site" "forever") ++ ATTR_NAME_HANA_ROLES=("hana_${sid}_roles" "reboot") ++ ATTR_NAME_HANA_SRMODE=("hana_${sid}_srmode" "forever") ++ ATTR_NAME_HANA_VHOST=("hana_${sid}_vhost" "forever") ++ ATTR_NAME_HANA_STATUS=("hana_${sid}_status" "reboot") ++ # ++ # TODO: PRIO4: Table for non-preferred-site-takeover ++ # ++ SCORING_TABLE_PREFERRED_SITE_TAKEOVER=( ++ "[234]*:P:[^:]*:master .* 150" ++ "[015-9]*:P:[^:]*:master .* 90" ++ "[0-9]*:P:[^:]*:slave .* 60" ++ "[0-9]*:P:[^:]*:\?
.* 0" ++ "[0-9]*:P:[^:]*:- .* 0" ++ "[234]*:S:[^:]*:master SOK 100" ++ "[015-9]*:S:[^:]*:master SOK 80" ++ "[0-9]*:S:[^:]*:master SFAIL -INFINITY" ++ "[0-9]*:S:[^:]*:slave SOK 10" ++ "[0-9]*:S:[^:]*:slave SFAIL -INFINITY" ++ "[0-9]*:S:[^:]*:\? .* 0" ++ "[0-9]*:S:[^:]*:- .* 0" ++ ".* .* -1" ++ ) ++ SCORING_TABLE_PREFERRED_LOCAL_RESTART=( ++ "[0-9]*:P:[^:]*:master .* 150" ++ "[0-9]*:P:[^:]*:slave .* 140" ++ "[0-9]*:P:[^:]*:\? .* 0" ++ "[0-9]*:P:[^:]*:- .* 0" ++ "[0-9]*:S:[^:]*:master SOK 100" ++ "[0-9]*:S:[^:]*:master SFAIL -INFINITY" ++ "[0-9]*:S:[^:]*:slave SOK 10" ++ "[0-9]*:S:[^:]*:slave SFAIL -INFINITY" ++ "[0-9]*:S:[^:]*:\? .* 0" ++ "[0-9]*:S:[^:]*:- .* 0" ++ ".* .* -1" ++ ) ++ # ++ DUPLICATE_PRIMARY_TIMEOUT="${OCF_RESKEY_DUPLICATE_PRIMARY_TIMEOUT:-7200}" ++ super_ocf_log debug "DBG: DUPLICATE_PRIMARY_TIMEOUT=$DUPLICATE_PRIMARY_TIMEOUT" ++ # ++ # Determine list of other cluster nodes and store in otherNodes variable ++ otherNodes=() ++ case $(crm_attribute --type crm_config --name cluster-infrastructure -q) in ++ *corosync* ) otherNodes=($(crm_node -l | awk '{ if ($2 != me) { print $2 }}' me=${NODENAME}));; ++ *openais* ) otherNodes=($(crm_node -l | awk '$3 == "member" { if ($2 != me) { print $2 }}' me=${NODENAME}));; ++ *cman* ) otherNodes=($(crm_node -l | awk '{for (i=1; i<=NF; i++) { if ($i != me) { print $i }}}' me=${NODENAME}));; ++ esac ++ ++ remoteHost=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_REMOTEHOST[@]}); ++ if [ -z "$remoteHost" ]; then ++ if [ ${#otherNodes[@]} -eq 1 ]; then # we are a 2 node cluster, lets assume the other is the remote-host ++ remoteHost=${otherNodes[0]} ++ remoteNode=$remoteHost ++ super_ocf_log debug "DBG: auto-guess remoteHost=$remoteHost" ++ else ++ super_ocf_log debug "DBG: Could not auto-guess remoteHost out of list (${otherNodes[@]})" ++ fi ++ else ++ # ++ # search cluster node which vhost is equal remoteHost ++ # ++ remoteNode=$(remoteHost2remoteNode $remoteHost) ++ # TODO: PRIO5: catch rc!=0 ++ fi ++ # 
ATTR_NAME_HANA_SITE ++ sr_name=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_SITE[@]}); ++ sr_mode=$(get_hana_attribute "${NODENAME}" ${ATTR_NAME_HANA_SRMODE[@]}) ++ if [ -z "$sr_mode" ]; then ++ sr_mode="sync" ++ fi ++ super_ocf_log debug "DBG: sr_name=$sr_name, remoteHost=$remoteHost, remoteNode=$remoteNode, sr_mode=$sr_mode" ++ # optional OCF parameters, we try to guess which directories are correct ++ if [ -z "$OCF_RESKEY_DIR_EXECUTABLE" ] ++ then ++ if have_binary /usr/sap/$SID/$InstanceName/exe/sapstartsrv && have_binary /usr/sap/$SID/$InstanceName/exe/sapcontrol ++ then ++ DIR_EXECUTABLE="/usr/sap/$SID/$InstanceName/exe" ++ fi ++ else ++ if have_binary "$OCF_RESKEY_DIR_EXECUTABLE/sapstartsrv" && have_binary "$OCF_RESKEY_DIR_EXECUTABLE/sapcontrol" ++ then ++ DIR_EXECUTABLE="$OCF_RESKEY_DIR_EXECUTABLE" ++ fi ++ fi ++ SAPSTARTSRV="$DIR_EXECUTABLE/sapstartsrv" ++ SAPCONTROL="$DIR_EXECUTABLE/sapcontrol" ++ ++ [ -z "$DIR_EXECUTABLE" ] && assert "Cannot find sapstartsrv and sapcontrol executable, please set DIR_EXECUTABLE parameter!" 
++ DIR_PROFILE="${OCF_RESKEY_DIR_PROFILE:-/usr/sap/$SID/SYS/profile}" ++ # check, if the following fall-back is ok, or if there could be multiple profiles matching this pattern ++ if [ -n "${SAPVIRHOST}" ]; then ++ SAPSTARTPROFILE="$DIR_PROFILE/${OCF_RESKEY_INSTANCE_PROFILE:-${SID}_${InstanceName}_${SAPVIRHOST}}" ++ else ++ # check, if the following fall-back is ok, or if there could be multiple profiles matching this pattern ++ # also take profile versions into account - they might break this fall-back ++ # TODO: PRIO4: Check, if it makes sense to implement an additional last fall-back: get the SAPSTARTPROFILE from /usr/sap/sapservices ++ # ++ SAPSTARTPROFILE="$(ls -1 $DIR_PROFILE/${OCF_RESKEY_INSTANCE_PROFILE:-${SID}_${InstanceName}_*})" ++ fi ++ # as root user we need the library path to the SAP kernel to be able to call sapcontrol ++ # check, if we already added DIR_EXECUTABLE at the beginning of LD_LIBRARY_PATH (%%:* keeps only the first path element) ++ if [ "${LD_LIBRARY_PATH%%:*}" != "$DIR_EXECUTABLE" ] ++ then ++ LD_LIBRARY_PATH=$DIR_EXECUTABLE${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH ++ export LD_LIBRARY_PATH ++ fi ++ PATH=${PATH}:${DIR_EXECUTABLE}; export PATH ++ super_ocf_log info "FLOW $FUNCNAME rc=$OCF_SUCCESS" ++ ############################# ++ # TODO: PRIO9: To be able to call landscapeHostConfig.py without su (so as root) ++ # TODO: PRIO9: Research for environment script .htacces or something like that ++ #export SAPSYSTEMNAME=ZLF ++ #export DIR_INSTANCE=/usr/sap/ZLF/HDB02 ++ #export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$DIR_INSTANCE/exe:$DIR_INSTANCE/exe/Python/lib ++ #export PYTHONPATH=$DIR_INSTANCE/$HOST:$DIR_INSTANCE/exe/python_support:$DIR_INSTANCE/exe ++ #export PYTHONHOME=$DIR_INSTANCE/exe/Python ++ #export SAP_RETRIEVAL_PATH=$DIR_INSTANCE/$HOST ++ #export DIR_EXECUTABLE=$DIR_INSTANCE/exe ++ ############################# ++ return $OCF_SUCCESS ++} ++ ++# function: check_secstore_users ++# params: USER ++# globals: DIR_EXECUTABLE(r) ++# ++# TODO: PRIO5: Might be dropped, if we get a script
for fetching the sync status ++function check_secstore_users() ++{ ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local user="" ++ local rc=1 ++ while [ $# -gt 0 ]; do ++ user="$1" ++ $DIR_EXECUTABLE/hdbuserstore list | grep -q "KEY $user" && echo "$user" && rc=0 && break ++ shift ++ done ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: check_sapstartsrv - check for sapstartsrv - optional start ++# params: - ++# globals: DIR_PROFILE(w), SAPSTARTPROFILE(r), SAPCONTROL(r), SID(r), InstanceName(r), InstanceNr(r), OCF_*(r) ++# check_sapstartsrv : Before using sapcontrol we make sure that the sapstartsrv is running. ++# ++function check_sapstartsrv() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local restart=0 ++ local runninginst="" ++ local rc=$OCF_SUCCESS ++ local output="" ++ if [ ! -S /tmp/.sapstream5${InstanceNr}13 ]; then ++ super_ocf_log warn "ACT: sapstartsrv is not running for instance $SID-$InstanceName (no UDS), it will be started now" ++ restart=1 ++ else ++ output=$($SAPCONTROL -nr $InstanceNr -function ParameterValue INSTANCE_NAME -format script) ++ if [ $? -eq 0 ] ++ then ++ runninginst=$(echo "$output" | grep '^0 : ' | cut -d' ' -f3) ++ if [ "$runninginst" != "$InstanceName" ] ++ then ++ super_ocf_log warn "ACT: sapstartsrv is running for instance $runninginst, that service will be killed" ++ restart=1 ++ else ++ output=$($SAPCONTROL -nr $InstanceNr -function AccessCheck Start) ++ if [ $? 
-ne 0 ]; then ++ super_ocf_log warn "ACT: FAILED - sapcontrol -nr $InstanceNr -function AccessCheck Start ($(ls -ld1 /tmp/.sapstream5${InstanceNr}13))" ++ super_ocf_log warn "ACT: sapstartsrv will be restarted to try to solve this situation, otherwise please check sapstsartsrv setup (SAP Note 927637)" ++ restart=1 ++ fi ++ fi ++ else ++ super_ocf_log warn "ACT: sapstartsrv is not running for instance $SID-$InstanceName, it will be started now" ++ restart=1 ++ fi ++ fi ++ if [ -z "$runninginst" ]; then runninginst=$InstanceName; fi ++ if [ $restart -eq 1 ] ++ then ++ if [ -d /usr/sap/$SID/SYS/profile/ ] ++ then ++ DIR_PROFILE="/usr/sap/$SID/SYS/profile" ++ else ++ assert "Expected /usr/sap/$SID/SYS/profile/ to be a directory, please set DIR_PROFILE parameter!" ++ fi ++ [ ! -r $SAPSTARTPROFILE ] && assert "Expected $SAPSTARTPROFILE to be the instance START profile, please set INSTANCE_PROFILE parameter!" ++ pkill -9 -f "sapstartsrv.*$runninginst" ++ # removing the unix domain socket files as they might have wrong permissions ++ # or ownership - they will be recreated by sapstartsrv during next start ++ rm -f /tmp/.sapstream5${InstanceNr}13 ++ rm -f /tmp/.sapstream5${InstanceNr}14 ++ $SAPSTARTSRV pf=$SAPSTARTPROFILE -D -u $sidadm ++ # now make sure the daemon has been started and is able to respond ++ local srvrc=1 ++ while [ $srvrc -eq 1 -a $(pgrep -f "sapstartsrv.*$runninginst" | wc -l) -gt 0 ] ++ do ++ sleep 1 ++ $SAPCONTROL -nr $InstanceNr -function GetProcessList > /dev/null 2>&1 ++ srvrc=$? ++ done ++ if [ $srvrc -ne 1 ] ++ then ++ super_ocf_log info "ACT: sapstartsrv for instance $SID-$InstanceName was restarted!" ++ rc=$OCF_SUCCESS ++ else ++ super_ocf_log error "ACT: sapstartsrv for instance $SID-$InstanceName could not be started!" 
++ rc=$OCF_ERR_GENERIC ++ ocf_is_probe && rc=$OCF_NOT_RUNNING ++ fi ++ fi ++ return $rc ++} ++ ++# ++# function: cleanup_instance - remove resources from a crashed instance ++# params: - ++# globals: - ++# ++function cleanup_instance() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=0 ++ # TODO: PRIO5: Check, if we need HANA cleanup procedure (processes, ipc obj, pid files); Currently not needed ++ super_ocf_log debug "DBG: cleanup_instance currently not implemented" ++ rc=0 ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++} ++ ++# ++# function: check_for_primary - check if local SAP HANA is configured as primary ++# params: - ++# globals: HANA_STATE_PRIMARY(r), HANA_STATE_SECONDARY(r), HANA_STATE_DEFECT(r) ++# ++function check_for_primary() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=$HANA_STATE_DEFECT ++ node_full_status=$(su - ${sidadm} -c "hdbnsutil -sr_state" 2>/dev/null ) ++ node_status=$(echo "$node_full_status" | awk '$1=="mode:" {print $2}') ++ super_ocf_log debug "DBG: check_for_primary: node_status=$node_status" ++ for i in 1 2 3 4 5 6 7 8 9; do ++ case "$node_status" in ++ primary ) ++ super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_PRIMARY" ++ return $HANA_STATE_PRIMARY;; ++ syncmem | sync | async ) ++ super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_SECONDARY" ++ return $HANA_STATE_SECONDARY;; ++ none ) # have seen that mode on second side BEFEORE we registered it as replica ++ super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_STANDALONE" ++ return $HANA_STATE_STANDALONE;; ++ * ) ++ super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: <$node_status>" ++ dump=$( echo $node_status | hexdump -C ); ++ super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: DUMP <$dump>" ++ node_full_status=$(su - ${sidadm} -c "hdbnsutil -sr_state" 2>/dev/null ) ++ node_status=$(echo "$node_full_status" | awk '$1=="mode:" {print $2}') ++ super_ocf_log debug "DEC: check_for_primary: loop=$i: 
node_status=$node_status" ++ # TODO: PRIO1: Maybe we need to keep the old value for P/S/N, if hdbnsutil just crashes ++ esac; ++ done ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: analyze_hana_sync_status - query and check hana system replication status ++# params: - ++# globals: DIR_EXECUTABLE(r), remoteHost(r) ++# get the HANA sync status ++# ++function analyze_hana_sync_status() ++{ ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local -a clusterNodes=() ++ local cl="" ++ local vHost="" ++ local n="" ++ local hana_sync_status="" what_does_the_chamelion_say="" ++ local secUser="SLEHALOC" ++ local chkusr; ++ local rc=0 ++ local sqlrc=0 ++# local query_state='select distinct REPLICATION_STATUS from SYS.M_SERVICE_REPLICATION' ++# select distinct REPLICATION_STATUS from SYS.M_SERVICE_REPLICATION where SITE_NAME='"SITE1"'" ++ local query_state="select distinct REPLICATION_STATUS from SYS.M_SERVICE_REPLICATION where SITE_NAME='"${sr_name}"'" ++ local query_secondaries='select distinct SECONDARY_HOST from SYS.M_SERVICE_REPLICATION' ++ local query_failed_secondaries="select distinct SECONDARY_HOST from SYS.M_SERVICE_REPLICATION where SECONDARY_SITE_NAME = (select distinct SECONDARY_SITE_NAME from SYS.M_SERVICE_REPLICATION WHERE REPLICATION_STATUS != 'ACTIVE')" ++ local all_cluster_hosts all_secondary_hosts all_broken_secondaries ++# ++##################################################################################################### ++# ++# select distinct SITE_NAME, HOST, REPLICATION_STATUS, SECONDARY_SITE_NAME, SECONDARY_HOST from SYS.M_SERVICE_REPLICATION ++# ++# ===> "Walldorf", "sap-app-8" "ACTIVE", "Rot", "sap-app-5" ++# "Rot", "sap-app-5", "ACTIVE", "oslo", "sap-app-7" ++# ++##################################################################################################### ++# ++ secUser=$(check_secstore_users SAPHANA${SID}SR SLEHALOC RHELHALOC) ; chkusr=$? 
++ if [ $chkusr -ne 0 ]; then ++ super_ocf_log err "ACT: Secure store users are missing (see best practice manual how to setup the users)" ++ rc=$OCF_ERR_CONFIGURED ++ fi ++ hana_sync_status=$(timeout 60 $DIR_EXECUTABLE/hdbsql -a -x -U $secUser $query_state); sqlrc=$? ++ hana_sync_status=$(echo $hana_sync_status | dequote) ++ super_ocf_log debug "DBG: hdbsql rc=$sqlrc hana_sync_status=\"$hana_sync_status\"" ++ if [ "$sqlrc" -eq 0 -a "$hana_sync_status" != "" ]; then ++ # ++ # UNKNOWN, ACTIVE, ERROR, INITIALIZING ++ # ++ if [ "${hana_sync_status}" == "ACTIVE" ]; then ++ # TODO PRIO1: REMOVE remoteNode dependency - set SOK ++ set_hana_attribute "$remoteNode" "SOK" ${ATTR_NAME_HANA_SYNC_STATUS[@]} ++ else ++ super_ocf_log warn "ACT: HANA SYNC STATUS is: ${hana_sync_status}" ++ # TODO PRIO1: REMOVE remoteNode dependency - set SFAIL ++ set_hana_attribute "$remoteNode" "SFAIL" ${ATTR_NAME_HANA_SYNC_STATUS[@]} ++ fi ++ # first get a list of all secondary hosts, than a list of all secondary hosts, if the is ANY failure at this site ++ # TODO: PRIO9: for first we assume there is only ONE secondary site (like ROT) ++ # TODO: PRIO3: should we loop over all cluster nodes fetching their roles-attribute? To minimize sql-queries? ++ # ++ all_secondary_hosts=$(timeout 60 hdbsql -a -x -U $secUser $query_secondaries ); sqlrc=$? ++ all_secondary_hosts=$(echo $all_secondary_hosts | dequote); ++ if [ "$sqlrc" -eq 0 ]; then ++ all_broken_secondary_hosts=$(timeout 60 hdbsql -a -x -U $secUser $query_failed_secondaries); sqlrc=$? 
++ all_broken_secondary_hosts=$(echo $all_broken_secondary_hosts | dequote); ++ if [ "$sqlrc" -eq 0 ]; then ++ if [ -n "$all_broken_secondary_hosts" ]; then ++ # ++ # we have a broken secondary site - set all hosts to "SFAIL" ++ # ++ # Note: since HANA hostname can be different from nodename we need to check all vhost attributes ++ for n in $all_broken_secondary_hosts; do ++ for cl in ${otherNodes[@]}; do ++ vHost=$(get_hana_attribute $cl ${ATTR_NAME_HANA_VHOST[@]}) ++ if [ "$vHost" = "$n" ]; then # we found the correct node ++ set_hana_attribute $cl "SFAIL" ${ATTR_NAME_HANA_SYNC_STATUS[@]} ++ fi ++ done ++ done ++ fi ++ fi ++ fi ++ else ++ # return codes 19: license error -> set SFAIL! ++ case "$sqlrc" in ++ 19 ) ++ # DONE: PRIO1: We should NOT set SFAIL, if HDB is exactly broken now ++ # When HDB breaks during monitor this could prevent a prositive remote failover ++ super_ocf_log warn "ACT: Was not able to fetch HANA SYNC STATUS - set sync status to SFAIL for ALL OTHER cluster hosts" ++ for n in $otherNodes; do ++ set_hana_attribute "$n" "SFAIL" ${ATTR_NAME_HANA_SYNC_STATUS[@]} ++ done ++ ;; ++ esac ++ fi ++ return $rc ++} ++ ++# ++# function: get_hana_landscape_status - figure out hana ladscape status ++# params: - ++# globals: sidadm(r), DIR_EXECUTABLE(r) ++# ++function get_hana_landscape_status() ++{ ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=0 ++ # ++ su - $sidadm -c "python $DIR_EXECUTABLE/python_support/landscapeHostConfiguration.py" 1>/dev/null 2>/dev/null; rc=$? 
++ return $rc; ++} ++ ++# ++# function: register_hana_secondary - register local hana as secondary to the other site ++# params: - ++# globals: sidadm(r), remoteHost(r), InstanceNr(r), sr_mode(r), sr_name(r) ++# register_hana_secondary ++# ++function register_hana_secondary() ++{ ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=2; ++ local remoteInstance=""; ++ remoteInstance=$InstanceNr ++ if ocf_is_true ${AUTOMATED_REGISTER}; then ++ super_ocf_log info "ACT: REGISTER: hdbnsutil -sr_register --remoteHost=$remoteHost --remoteInstance=$remoteInstance --mode=$sr_mode --name=$sr_name" ++ su - $sidadm -c "hdbnsutil -sr_register --remoteHost=$remoteHost --remoteInstance=$remoteInstance --mode=$sr_mode --name=$sr_name"; rc=$? ++ else ++ super_ocf_log info "ACT: IGNORE REGISTER because AUTOMATED_REGISTER is set to FALSE" ++ rc=1 ++ fi ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc; ++} ++ ++# ++# function: saphana_status - pure status check ++# params: - ++# globals: SIDInstanceName, OCF_*, ++function saphana_status() { ++ local binDeam="hdb.sap${SIDInstanceName}" rc=0 ++ binDeam=${binDeam:0:15} # Process name is limited to the first 15 characters ++ if pgrep $binDeam 1>/dev/null; then rc=$OCF_SUCCESS; else rc=$OCF_NOT_RUNNING; fi ++ return $rc ++} ++ ++# ++# function: saphana_start - start a hana instance ++# params: - ++# globals: OCF_*, SAPCONTROL, InstanceNr, SID, InstanceName, ++# ++function saphana_start() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=$OCF_NOT_RUNNING ++ local output="" ++ local loopcount=0 ++ check_sapstartsrv ++ rc=$? ++ # ++ # TODO: ASK: PRIO5: For SCALE-OUT - do we need to use an other call like StartSystem? Or better to use the HDB command? ++ # ++ if [ $rc -eq $OCF_SUCCESS ]; then ++ output=$($SAPCONTROL -nr $InstanceNr -function Start) ++ rc=$? 
++ super_ocf_log info "ACT: Starting SAPHANA Instance $SID-$InstanceName: $output" ++ fi ++ if [ $rc -eq 0 ] ++ then ++ # TODO: PRIO9: something more dynamic than 3600 seconds in WaitforStarted ++ output=$($SAPCONTROL -nr $InstanceNr -function WaitforStarted 3600 1) ++ if [ $? -eq 0 ] ++ then ++ super_ocf_log info "ACT: SAPHANA Instance $SID-$InstanceName started: $output" ++ rc=$OCF_SUCCESS ++ else ++ super_ocf_log err "ACT: SAPHANA Instance $SID-$InstanceName start failed: $output" ++ rc=$OCF_ERR_GENERIC ++ fi ++ else ++ super_ocf_log err "ACT: SAPHANA Instance $SID-$InstanceName start failed: $output" ++ rc=$OCF_ERR_GENERIC ++ fi ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: saphana_stop - stop a hana instance ++# params: - ++# globals: OCF_*(r), SAPCONTROL(r), SID(r), InstanceName(r) ++# saphana_stop: Stop the SAP instance ++# ++function saphana_stop() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local output="" ++ local rc=0 ++ check_sapstartsrv; rc=$? ++ if [ $rc -eq $OCF_SUCCESS ]; then ++ output=$($SAPCONTROL -nr $InstanceNr -function Stop) ++ rc=$? ++ super_ocf_log info "ACT: Stopping SAP Instance $SID-$InstanceName: $output" ++ fi ++ if [ $rc -eq 0 ] ++ then ++ output=$($SAPCONTROL -nr $InstanceNr -function WaitforStopped 3600 1) ++ if [ $? 
-eq 0 ] ++ then ++ super_ocf_log info "ACT: SAP Instance $SID-$InstanceName stopped: $output" ++ rc=$OCF_SUCCESS ++ else ++ super_ocf_log err "ACT: SAP Instance $SID-$InstanceName stop failed: $output" ++ rc=$OCF_ERR_GENERIC ++ fi ++ else ++ super_ocf_log err "ACT: SAP Instance $SID-$InstanceName stop failed: $output" ++ rc=$OCF_ERR_GENERIC ++ fi ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: saphana_validate - validation of (some) variables/parameters ++# params: - ++# globals: OCF_*(r), SID(r), InstanceName(r), InstanceNr(r), SAPVIRHOST(r) ++# saphana_validate: Check the symantic of the input parameters ++# ++function saphana_validate() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=$OCF_SUCCESS ++ # ++ # SID is Alpha-AlphaNumeric-Alphanumeric? ++ # ++ if [ $(echo "$SID" | grep -c '^[A-Z][A-Z0-9][A-Z0-9]$') -ne 1 ] ++ then ++ super_ocf_log err "ACT: Parsing instance profile name: '$SID' is not a valid SID!" ++ rc=$OCF_ERR_ARGS ++ fi ++ # ++ # InstanceNr is a two-Digit? ++ # ++ if [ $(echo "$InstanceNr" | grep -c '^[0-9][0-9]$') -ne 1 ] ++ then ++ super_ocf_log err "ACT: Parsing instance profile name: '$InstanceNr' is not a valid instance number!" ++ rc=$OCF_ERR_ARGS ++ fi ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: saphana_start_primary - handle startup of PRIMARY in M/S ++# params: ++# globals: OCF_*(r), NODENAME, ATTR_NAME_*, HANA_STATE_*, ++# ++function saphana_start_primary() ++{ ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local primary_status sync_attr score_master rc=$OCF_NOT_RUNNING ++ local lss sqlrc; ++ local rc=0 ++ local lpa_dec=4 ++ local lpa_advice="" ++ # ++ # we will be a master (PRIMARY) so checking, if the is an OTHER master ++ # ++ super_ocf_log debug "DBG: saphana_primary - check_for_primary reports HANA_STATE_PRIMARY" ++ # ++ lpa_init_lpt $HANA_STATE_PRIMARY ++ lpa_check_lpt_status; lpa_dec=$? ++ get_hana_landscape_status; lss=$? 
++ my_role=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_ROLES[@]}) ++ my_sync=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_SYNC_STATUS[@]}) ++ case "$lpa_dec" in ++ 0 ) # LPA says start-up ++ lpa_advice="start" ++ ;; ++ 1) # LPA says register! ++ lpa_advice="register" ++ ;; ++ 2) # LPA says wait for second LPT ++ lpa_advice="wait" ++ ;; ++ 3 | 4 ) # LPA says something is completely wrong - FAIL resource ++ lpa_advice="fail" ++ ;; ++ * ) # LPA failed with an unkonown status - FAIL resource ++ lpa_advice="fail" ++ ;; ++ esac ++ ++ # DONE: PRIO2: Do we need to differ 0 and 1 here? While 0 is a fatal SAP error, 1 for down/error ++ if [ $lss -eq 0 ]; then ++ super_ocf_log err "ACT: get_hana_landscape_status reports FATAL" ++ # DONE: PRIO1: what to do for lss=0? ++ # TODO: PRIO3: Check, if OCF_ERR_GENERIC is best reaction ++ lpa_advice="skip" ++ rc=$OCF_ERR_GENERIC ++ fi ++ case "$lpa_advice" in ++ start ) # process a normal START ++ case "$lss" in ++ 2 | 3 | 4 ) # as landcape says we are up - just set the scores and return code ++ super_ocf_log info "LPA: landcape: UP, LPA: start ==> keep running" ++ LPTloc=$(date '+%s') ++ lpa_set_lpt $LPTloc ++ rc=$OCF_SUCCSESS ++ ;; ++ 1 ) # landcape says we are down, lets start and adjust scores and return code ++ super_ocf_log info "LPA: landcape: DOWN, LPA: start ==> start instance" ++ saphana_start ++ rc=$? ++ LPTloc=$(date '+%s') ++ lpa_set_lpt $LPTloc ++ ;; ++ esac ++ scoring_crm_master "$my_role" "$my_sync" ++ ;; ++ register ) # process a REGISTER ++ case "$lss" in ++ 2 | 3 | 4 ) # upps we are up - but shoudn't? 
- we could not register with started HDB ++ # DONE: PRIO3: check if this reaction is correct - tell cluster about failed start ++ super_ocf_log info "LPA: landcape: UP, LPA: register ==> take down" ++ set_crm_master -inf ++ rc=$OCF_NOT_RUNNING ++ ;; ++ 1 ) # lets try to register ++ # DONE: PRIO2: Like Action in start_secondary ++ super_ocf_log info "LPA: landcape: DOWN, LPA: register ==> try to register" ++ super_ocf_log info "DEC: AN OTHER HANA IS AVAILABLE ==> LETS REGISTER" ++ set_crm_master 0 ++ if wait_for_primary_master 1; then ++ register_hana_secondary ++ check_for_primary; primary_status=$? ++ if [ $primary_status -eq $HANA_STATE_SECONDARY ]; then ++ super_ocf_log info "ACT: Register successful" ++ lpa_push_lpt 10 ++ lpa_set_lpt 10 ++ set_crm_master 0 ++ saphana_start_secondary ++ rc=$? ++ lpa_set_lpt 30 ++ else ++ super_ocf_log err "ACT: Register failed" ++ rc=$OCF_NOT_RUNNING ++ fi ++ else ++ # lets check next monitor, if we can register ++ rc=$OCF_SUCCESS ++ fi ++ ;; ++ esac ++ ;; ++ wait ) # process a WAIT ++ case "$lss" in ++ 2 | 3 | 4 ) # as we ARE up we just keep it up ++ # TODO: PRIO3: I now change from "just keep it up to take that down" ++ # TODO: PRIO3: OCF_SUCCSESS, OCF_NOT_RUNNING or OCF_ERR_xxxx ? 
++ set_crm_master -9000 ++ #scoring_crm_master "$my_role" "$my_sync" ++ rc=$OCF_ERR_GENERIC ++ ;; ++ 1 ) # we are down, so we should wait --> followup in next monitor ++ super_ocf_log info "LPA: landcape: DOWN, LPA: wait ==> keep waiting" ++ # TODO: PRIO3: Check, if WAITING is correct here ++ set_hana_attribute ${NODENAME} "WAITING" ${ATTR_NAME_HANA_CLONE_STATE[@]} ++ set_crm_master -9000 ++ rc=$OCF_SUCCESS ++ ;; ++ esac ++ ;; ++ fail ) # process a lpa FAIL ++ super_ocf_log info "LPA: LPA reports FAIL" ++ set_crm_master -inf ++ rc=$OCF_NOT_RUNNING ++ ;; ++ esac ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: check_for_primary_master - rc=0 if a node in otherNodes reports a running primary master (roles field 1: 3/4, 2: P, 4: master), else rc=1 ++# params: - ++# globals: ATTR_NAME_HANA_ROLES[@](r), otherNodes(r) ++# ++check_for_primary_master() ++{ ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=1 ++ local ch ch_role ++ # ++ # get actual list of cluster members ++ # ++ if [ -n "$otherNodes" ]; then ++ for ch in ${otherNodes[@]}; do ++ if [ $rc -eq 1 ]; then ++ ch_role=$(get_hana_attribute ${ch} ${ATTR_NAME_HANA_ROLES[@]}) ++# TODO: PRIO3: check if [0-9], [234] or [34] is correct ++# TODO: PRIO4: Do we need different checks like "any-primary-master" or "running-primary-master" ? ++# grep '[0-9]*:P:[^:]*:master:' <<< $ch_role && rc=0 ++# grep '[34]:P:[^:]*:master:' <<< $ch_role && rc=0 ++# Match "Running+Available Primary" Master -> Match field 1: 3/4, 2: P, 4: master (== compares; a single = would assign and always be true) ++ awk -F: 'BEGIN { rc=1 } ++ $1 ~ "[34]" && $2 == "P" && $4 == "master" { rc=0 } ++ END { exit rc }' <<< $ch_role ; rc=$?
++ fi ++ done ++ fi ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# wait_for_primary_master: wait some time till a running primary master is shown in attributes ++# params: optional: loop count (10s sleep per loop); 0 or unset keeps waiting until a primary master is found ++# globals: - ++# rc: rc=0: primary master found, rc=1: gave up after the loop count was exceeded ++wait_for_primary_master() ++{ ++ local wait=1 ++ local rc=1 ++ local loops=${1:-0} ++ local count=0 ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ # ++ # hana_ndb_roles=primary:master1:master:worker:master ++ # ++ while [ "$wait" -eq 1 ]; do ++ if check_for_primary_master; then ++ wait=0 ++ rc=0 ++ else ++ if [ $loops -gt 0 ]; then ++ (( count++ )) ++ if [ $count -gt $loops ]; then ++ wait=0 ++ rc=1 ++ fi ++ fi ++ if [ "$wait" -eq 1 ]; then sleep 10; fi # only sleep when another check follows; do not delay the give-up path ++ fi ++ done ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: saphana_start_secondary - handle startup of SECONDARY in M/S ++# params: ++# globals: OCF_*(r), NODENAME, ATTR_NAME_*, ++# ++function saphana_start_secondary() ++{ ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local primary_status sync_attr score_master rc=$OCF_NOT_RUNNING ++ local sqlrc; ++ set_crm_master 0 ++ # ++ ####### LPA - begin ++ # ++ lpa_push_lpt 10 ++ lpa_set_lpt 10 ++ # ++ ####### LPA - end ++ # ++ # ++ # we would be slave (secondary) ++ # we first need to check, if there are Master Nodes, because the Secondary only starts ++ # successfully, if the Primary is available. Therefore we mark the Secondary as "WAITING" ++ # DONE: PRIO3: wait_for_primary_master 10 is just a test value: 10 loops x10 seconds then go to WAITING ++ # DONE: PRIO3: rename 'wait_for_primary_master' to match better the use case ("wait_some_time") ++ # ++ super_ocf_log debug "DBG: wait for promoted side" ++ # TODO: PRIO3: Check if setting SFAIL during secondary start is ok ++ set_hana_attribute "${NODENAME}" "SFAIL" ${ATTR_NAME_HANA_SYNC_STATUS[@]} ++ if wait_for_primary_master 10; then ++ saphana_start; rc=$? ++ if [ $rc -ne $OCF_SUCCESS ]; then ++ if !
wait_for_primary_master 1; then ++ # It seams the stating secondary could not start because of stopping primary ++ # so this is a WAITING situation ++ super_ocf_log info "ACT: PRIMARY seams to be down now ==> WAITING" ++ set_hana_attribute ${NODENAME} "WAITING" ${ATTR_NAME_HANA_CLONE_STATE[@]} ++ set_crm_master -INFINITY ++ rc=$OCF_SUCCSESS ++ fi ++ else ++ lpa_set_lpt 30 ++ fi ++ else ++ super_ocf_log info "ACT: wait_for_primary_master ==> WAITING" ++ set_hana_attribute ${NODENAME} "WAITING" ${ATTR_NAME_HANA_CLONE_STATE[@]} ++ set_crm_master -INFINITY ++ rc=$OCF_SUCCSESS ++ fi ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: lpa_get_lpt - get lpt from cluster ++# params: NODE ++# output: LPT ++# rc: rc=0: OK, rc=1: InternalERROR, rc=2: ERROR ++# globals: LPA_ATTR_*, ++# ++function lpa_get_lpt() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=1 ++ local node=$1 ++ local lpt="" ++ lpt=$(get_hana_attribute ${node} ${LPA_ATTR[@]}) ++ if [ -n "$lpt" ]; then ++ rc=0 ++ echo $lpt ++ else ++ rc=2 ++ fi ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: lpa_set_lpt - set lpt in cluster ++# params: LPT [node] ++# globals: LPA_ATTR(r), NODENAME(r), ++# rc: rc=0: OK, rc=1: InternalERROR, rc=2: ERROR ++# ++function lpa_set_lpt() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=1 ++ local crm_rc=1 ++ local lpt=$1 ++ local clpt=-1 ++ local node=${2:-${NODENAME}} ++ set_hana_attribute ${node} "$lpt" ${LPA_ATTR[@]}; crm_rc=$? 
++ clpt=$(lpa_get_lpt ${node}) # read back from the node that was just written, to verify the update ++ if [ "$lpt" != "$clpt" ]; then ++ rc=2 ++ else ++ rc=0 ++ fi ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: lpa_pull_lpt - fetch lpt from file ++# params: - ++# globals: LPA_DIRECTORY(r), sid, NODENAME ++# output: LPT ++# rc: rc=0: OK, rc=1: InternalERROR, rc=2: ERROR ++# ++function lpa_pull_lpt() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=1 ++ local lpt="" ++ local readrest=0 ++ local lpa_file=$LPA_DIRECTORY/lpa_${sid}_${NODENAME} ++ if [ -f $lpa_file ]; then ++ read lpt readrest <<<$(cat $lpa_file) # exactly load first word from file to lpt ++ fi ++ if [ -n "$lpt" ]; then ++ rc=0 ++ echo $lpt ++ else ++ rc=2 ++ fi ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: lpa_push_lpt - put lpt to file ++# params: LPT ++# globals: LPA_DIRECTORY(r), sid, NODENAME ++# output: -- ++# rc: rc=0: OK, rc=1: InternalERROR, rc=2: ERROR ++# ++function lpa_push_lpt() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local lpt=$1 ++ local clpt=-1 ++ local rc=1 ++ local lpa_file=$LPA_DIRECTORY/lpa_${sid}_${NODENAME} ++ # ++ mkdir -p $LPA_DIRECTORY ++ echo "$lpt" > $lpa_file ++ clpt=$(lpa_pull_lpt); lpt_rc=$?
++ if [ "$clpt" != "$lpt" -o "$lpt_rc" -ne 0 ]; then ++ rc=2 ++ else ++ rc=0 ++ fi ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: lpa_init_lpt - initialize local lpt, if needed ++# params: HANA_STATE ++# globals: HANA_STATE_*(r), LPA_DIRECTORY(r), sid(r), NODENAME(r), ++# lpa_init_lpt ++# ++# Returncodes: ++# rc=0: OK, rc=1 InternalERROR, rc=2: ERROR ++# ++# Initializing (if NO local LPT-file): ++# SECONDARY sets to 0 ++# PRIMARY sets to 1 ++# ++function lpa_init_lpt() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=1 ++ local LPTloc=-1 ++ local LPTrem=-1 ++ local hana_state=$1 ++ local lpa_file=$LPA_DIRECTORY/lpa_${sid}_${NODENAME} ++ mkdir -p $LPA_DIRECTORY ++ LPTloc=$(lpa_get_lpt ${NODENAME}) || LPTloc=$(lpa_pull_lpt) || \ ++ if [ "$hana_state" -eq "$HANA_STATE_PRIMARY" ]; then # Initialize for Primary ++ # init primary ++ LPTloc=20 ++ lpa_push_lpt "20"; rc=$? ++ elif [ "$hana_state" -eq "$HANA_STATE_SECONDARY" ]; then # Initialize for Secondary ++ # init secondary ++ LPTloc=10 ++ lpa_push_lpt "10"; rc=$? 
++ else ++ rc=2 ++ fi ++ lpa_set_lpt $LPTloc ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: lpa_check_lpt_status - start a hana clone instance ++# params: - ++# globals: DUPLICATE_PRIMARY_TIMEOUT, NODENAME, remoteNode ++# lpa_check_lpt_status ++# ++# Returncodes: ++# ++# Initializing (if NO local LPT-file): ++# SECONDARY sets to 10 ++# PRIMARY sets to 20 ++# ++# LPRlocal OR LPTremore ARE real lpt (>1000) ++# THEN: ++# Bigger LPR wins, if delta-gab is OK ++# LPTlocal >> LPTremore ===> rc=0 (start) ++# LPTRemote >> LPTlocal ===> rc=1 (register) ++# Stalemate in all other cases ==> STALEMATE-HANDLING ===> rc=2 (wait) ++# LPRlocal AND LPTremore ARE NOT real lpt (<=1000) ++# THEN: ++# Bigger LPT wins ++# LPTlocal > LPTremore ===> rc=0 (start) ++# LPTRemote > LPTlocal ===> rc=1 (register) ++# Stalemate in all other cases ==> STALEMATE-HANDLING ===> rc=2 (wait) ++# LPTRemote is not initialized (0) ++# THEN: ++# WAIT ==> like STALEMATE-HANDLING ===> rc=2 (wait) ++# ++function lpa_check_lpt_status() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=0 ++ local LPTloc=-1 ++ local LPTrem=-1 ++ local LPTMark=1000 ++ local delta=0 ++ # ++ # First GET LPT from ATTR-FILE-DEFAULT ++ # ++ LPTloc=$(lpa_get_lpt $NODENAME); lparc=$? # ATTR ++ if [ "$lparc" -ne 0 ]; then ++ # as a fallback try to fetch the value from external status file ++ LPTloc=$(lpa_pull_lpt); # FILE ++ lparc=$? ++ if [ -z "$LPTloc" -o "$LPTloc" -eq -1 -o "$lparc" -ne 0 ]; then ++ # last option - try to initialize as PRIMARY ++ lpa_push_lpt 20 ++ lpa_set_lpt 20 ++ LPTloc=20 # DEFAULT ++ fi ++ fi ++ # TODO PRIO1: REMOVE remoteNode dependency - lpa_get_lpt ++ LPTrem=$(lpa_get_lpt $remoteNode); lparc=$? 
++ if [ $lparc -ne 0 ]; then ++ # LPT of the other node could not be evaluated - LPA says WAIT ++ super_ocf_log debug "DBG: LPA: LPTloc=$LPTloc, LPTrem undefined ==> WAIT" ++ rc=2 ++ else ++ super_ocf_log debug "DBG: LPA: LPTloc ($LPTloc) LPTrem ($LPTrem) delta ($delta)" ++ if [ $LPTloc -lt $LPTMark -a $LPTrem -lt $LPTMark ]; then ++ delta=0 # both lpts are not a real timestamp so just take the greater one ++ else ++ delta=$DUPLICATE_PRIMARY_TIMEOUT # at least one of the lpts is a real timestamp so include delta-gap ++ fi ++ if (( delta < LPTloc - LPTrem )); then ++ # We are the winner - LPA says STARTUP ++ super_ocf_log debug "DBG: LPA: LPTloc wins $LPTloc > $LPTrem + $delta ==> START" ++ rc=0 ++ elif (( delta < LPTrem - LPTloc )); then ++ if ocf_is_true "$AUTOMATED_REGISTER" ; then ++ # The other one has won - LPA says REGISTER ++ super_ocf_log debug "DBG: LPA: LPTrem wins $LPTrem > $LPTloc + $delta ==> REGISTER" ++ rc=1 ++ else ++ super_ocf_log debug "DBG: LPA: LPTrem wins $LPTrem > $LPTloc + $delta BUT AUTOMATED_REGISTER='false' ==> WAIT" ++ rc=2 ++ fi ++ ++ else ++ super_ocf_log debug "DBG: LPA: Difference between LPTloc and LPTrem is less than delta ($delta) ==> WAIT" ++ # TODO: PRIO3: ADD STALEMATE-HANDLING HERE; currently admin should set one of the lpa to 20 ++ rc=2 ++ fi ++ fi ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: saphana_start_clone - start a hana clone instance ++# params: - ++# globals: OCF_*, ATTR_NAME_*, HANA_STATE_*, NODENAME ++# saphana_start_clone ++# ++function saphana_start_clone() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local primary_status sync_attr score_master rc=$OCF_NOT_RUNNING ++ local sqlrc; ++ local chkusr; ++ # TODO: PRIO4: remove check_secstore_users later ++ secUser=$(check_secstore_users SAPHANA${SID}SR SLEHALOC RHELHALOC) ; chkusr=$? 
++ if [ $chkusr -ne 0 ]; then ++ super_ocf_log err "ACT: Secure store users are missing (see best practice manual how to setup the users)" ++ rc=$OCF_ERR_CONFIGURED ++ else ++ set_hana_attribute ${NODENAME} "DEMOTED" ${ATTR_NAME_HANA_CLONE_STATE[@]} ++ check_for_primary; primary_status=$? ++ if [ $primary_status -eq $HANA_STATE_PRIMARY ]; then ++ saphana_start_primary; rc=$? ++ else ++ saphana_start_secondary; rc=$? ++ lpa_set_lpt 30 ++ fi ++ fi ++ return $rc ++} ++ ++# ++# function: saphana_stop_clone - stop a hana clone instance ++# params: - ++# globals: NODENAME(r), HANA_STATE_*(r) ++# saphana_stop_clone ++# ++function saphana_stop_clone() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=0 ++ local primary_status="x" ++ set_hana_attribute ${NODENAME} "UNDEFINED" ${ATTR_NAME_HANA_CLONE_STATE[@]} ++ check_for_primary; primary_status=$? ++ if [ $primary_status -eq $HANA_STATE_SECONDARY ]; then ++ lpa_set_lpt 10 ++ fi ++ saphana_stop; rc=$? ++ return $rc ++} ++ ++# ++# function: saphana_monitor_primary - monitor a hana clone instance ++# params: - ++# globals: HANA_STATE_*(r), remoteHost, NODENAME, ATTR_NAME_*, OCF_*, PreferSiteTakeover ++# ++function saphana_monitor_primary() ++{ ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=$OCF_ERR_GENERIC ++ local promoted=0 ++ local init_attribute=0 ++ local LPTloc=-1 ++ local lparc=4 ++ local lss ++ local remoreSync="" ++ local my_role="" ++ # ++ # OK, we are running/are configured as HANA PRIMARY ++ # ++ super_ocf_log debug "DBG: saphana_monitor_clone: HANA_STATE_PRIMARY" ++ # ++ ##### CHECK, IF WE ARE DEMOTED (CLUSTER NODE ATTRIBUTE) ++ # ++ promote_attr=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_CLONE_STATE[@]}) ++ super_ocf_log debug "DBG: saphana_monitor_clone: $ATTR_NAME_HANA_CLONE_STATE=$promote_attr" ++ if [ -z "$promote_attr" ]; then ++ init_attribute=1 ++ promoted=0; ++ else ++ case "$promote_attr" in ++ PROMOTED ) ++ promoted=1; ++ ;; ++ DEMOTED ) ++ promoted=0; ++ ;; ++ WAITING ) ++ # 
DONE: lpa_check_lpt_status to come out of here :) ++ # DONE: PRIO2: CHECK IF THE FIX FOR COMING OUT OF WAITING IS CORRECT ++ get_hana_landscape_status; lss=$? ++ if [ $lss -ge 2 ]; then ++ # seems admin already decided that for us? -> we are running - set DEMOTED ++ promoted=0; ++ LPTloc=$(date '+%s') ++ lpa_set_lpt $LPTloc ++ fi ++ lpa_check_lpt_status; lparc=$? ++ if [ $lparc -ne 2 ]; then ++ # lpa - no need to wait any longer - lets try a new start ++ saphana_start_clone ++ rc=$? ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++ else ++ lpa_init_lpt $HANA_STATE_PRIMARY ++ # still waiting for second site to report lpa-lpt ++ if ocf_is_true "$AUTOMATED_REGISTER" ; then ++ super_ocf_log info "LPA: Still waiting for remote site to report LPA status" ++ else ++ super_ocf_log info "LPA: Dual primary detected and AUTOMATED_REGISTER='false' ==> WAITING" ++ fi ++ ++ return $OCF_SUCCESS ++ fi ++ promoted=0; ++ ;; ++ UNDEFINED ) ++ if ocf_is_probe; then ++ promoted=0; ++ else ++ set_hana_attribute ${NODENAME} "DEMOTED" ${ATTR_NAME_HANA_CLONE_STATE[@]} ++ promoted=0; ++ fi ++ ;; ++ * ) ++ promoted=0; ++ ;; ++ esac ++ fi ++ get_hana_landscape_status; lss=$? ++ super_ocf_log debug "DBG: saphana_monitor_clone: get_hana_landscape_status=$lss" ++ case "$lss" in ++ 0 ) # FATAL or ERROR ++ rc=$OCF_ERR_GENERIC ++ ;; ++ 1 ) # DOWN or ERROR ++ # DONE: PRIO2: Maybe we need to differ between 0 and 1. While 0 is a fatal sap error, 1 is down/error ++ if ocf_is_probe; then ++ # ++ # leave master score untouched, only set return code ++ # ++ rc=$OCF_NOT_RUNNING ++ else ++ if [ "$promoted" -eq 1 ]; then ++ # INSTANCE IS FAILED PRIMARY IN PROMOTED STATE ++ # DONE: PRIO2: Adjust with set_crm_master? ++ # For Migration it would be good to decrease master score ++ # For Reload locally we should NOT adjust the master score ++ # ===> Should we rely on the migration threshold? 
++ # set_crm_master ++ if ocf_is_true "${PreferSiteTakeover}" ; then ++ # ++ # DONE: PRIO1: first check, if remote site is already (and still) in sync ++ # TODO: PRIO4: Decide if penality (-9000) or weak (5) is better here to cover situations where other clone is gone ++ # ++ # TODO PRIO1: REMOVE remoteNode dependency - get_sync_status ++ remoteSync=$(get_hana_attribute $remoteNode ${ATTR_NAME_HANA_SYNC_STATUS[@]}) ++ case "$remoteSync" in ++ SOK ) ++ super_ocf_log info "DEC: PreferSiteTakeover selected so decrease promotion score here (and reset lpa)" ++ set_crm_master 5 ++ if check_for_primary_master; then ++ lpa_set_lpt 20 ++ fi ++ ;; ++ SFAIL ) ++ super_ocf_log info "DEC: PreferSiteTakeover selected BUT remoteHost is not in sync (SFAIL) ==> local restart preferred" ++ ;; ++ * ) ++ super_ocf_log info "DEC: PreferSiteTakeover selected BUT remoteHost is not in sync ($remoteSync) ==> local restart preferred" ++ ;; ++ esac ++ else ++ # TODO: PRIO5: SCALE-OUT ONLY? Implement for local restart ++ # It maybe that for the local restart we only need to decrease the secondaries promotion score ++ #super_ocf_log info "DEC: PreferSiteTakeover selected so decrease promotion score here" ++ my_role=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_ROLES[@]}) ++ my_sync=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_SYNC_STATUS[@]}) ++ scoring_crm_master "$my_role" "$my_sync" ++ rc=$OCF_FAILED_MASTER ++ fi ++ rc=$OCF_FAILED_MASTER ++ else ++ # INSTANCE IS FAILED PRIMARY IN DEMOTED STATE ++ # TODO: PRIO3: Adjust with set_crm_master? 
++ # Current decission: Do NOT adjust master score now as other ++ # steps should already have done that ++ # ++ rc=$OCF_NOT_RUNNING ++ fi ++ fi ++ ;; ++ 2 | 3 | 4 ) # WARN, INFO or OK ++ if ocf_is_probe; then ++ rc=$OCF_SUCCESS ++ else ++ LPTloc=$(date '+%s') ++ lpa_set_lpt $LPTloc ++ lpa_push_lpt $LPTloc ++ if [ "$promoted" -eq 1 ]; then ++ set_hana_attribute "$NODENAME" "PRIM" ${ATTR_NAME_HANA_SYNC_STATUS[@]} ++ rc=$OCF_RUNNING_MASTER ++ else ++ if [ "$init_attribute" -eq 1 ]; then ++ set_hana_attribute ${NODENAME} "DEMOTED" ${ATTR_NAME_HANA_CLONE_STATE[@]} ++ rc=$OCF_RUNNING_MASTER ++ else ++ rc=$OCF_SUCCESS ++ fi ++ fi ++ my_sync=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_SYNC_STATUS[@]}) ++ my_role=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_ROLES[@]}) ++ case "$my_role" in ++ [12]:P:*:master:* ) # primary is down or may not anser hdbsql query so drop analyze_hana_sync_status ++ ;; ++ [34]:P:*:master:* ) # primary is up and should now be able to anser hdbsql query ++ analyze_hana_sync_status ++ ;; ++ esac ++ rem_role=$(get_hana_attribute ${remoteHost} ${ATTR_NAME_HANA_ROLES[@]}) ++ rem_clone_status=$(get_hana_attribute ${remoteHost} ${ATTR_NAME_HANA_CLONE_STATE[@]}) ++ if [ "$promote_attr" = "DEMOTED" -a "$rem_clone_status" = "PROMOTED" ]; then ++ case "$rem_role" in ++ [234]:P:* ) # dual primary, but other instance marked as PROMOTED by the cluster ++ lpa_check_lpt_status; again_lpa_rc=$? 
++ if [ $again_lpa_rc -eq 2 ]; then ++ super_ocf_log info "DEC: Dual primary detected, other instance is PROMOTED and lpa stalemate ==> local restart" ++ lpa_set_lpt 10 ++ lpa_push_lpt 10 ++ rc=$OCF_NOT_RUNNING ++ fi ++ ;; ++ esac ++ fi ++ scoring_crm_master "$my_role" "$my_sync" ++ fi ++ ;; ++ * ) # UNDEFINED STATUS ++ if ocf_is_probe; then ++ rc=$OCF_NOT_RUNNING ++ else ++ if [ "$promoted" -eq 1 ]; then ++ rc=$OCF_FAILED_MASTER ++ else ++ rc=$OCF_NOT_RUNNING ++ fi ++ fi ++ ;; ++ esac ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: saphana_monitor_secondary - monitor a hana clone instance ++# params: - ++# globals: OCF_*, ATTR_NAME_*, NODENAME ++# saphana_monitor_secondary ++# ++function saphana_monitor_secondary() ++{ ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=$OCF_ERR_GENERIC ++ local promoted=0 ++ local init_attribute=0 ++ local lss ++ # ++ # OK, we are running as HANA SECONDARY ++ # ++ if ! lpa_get_lpt ${NODENAME}; then ++ lpa_set_lpt 10 ++ lpa_push_lpt 10 ++ fi ++ promote_attr=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_CLONE_STATE[@]}) ++ super_ocf_log debug "DBG: saphana_monitor_clone: $ATTR_NAME_HANA_CLONE_STATE=$promote_attr" ++ if [ -z "$promote_attr" ]; then ++ init_attribute=1 ++ # DONE: PRIO3: do we need to inizialize also the DEMOTED attribute value? ++ set_hana_attribute ${NODENAME} "DEMOTED" ${ATTR_NAME_HANA_CLONE_STATE[@]} ++ promoted=0; ++ else ++ case "$promote_attr" in ++ PROMOTED ) # However - PROMOTED should never happen for a SECONDARY ++ promoted=1; ++ ;; ++ DEMOTED ) # This is the status we expect ++ promoted=0; ++ ;; ++ WAITING* ) # We are WAITING for PRIMARY so not testing the HANA engine now but check for a new start ++ if check_for_primary_master; then ++ super_ocf_log info "ACT: SECONDARY still in status WAITING - Primary now available - try a new start" ++ saphana_start_clone ++ rc=$? 
++ else ++ super_ocf_log info "ACT: saphana_monitor_clone: SECONDARY still in status WAITING - Primary is still missing" ++ return $OCF_SUCCESS ++ fi ++ promoted=0; ++ ;; ++ UNDEFINED | * ) ++ if ocf_is_probe; then ++ promoted=0; ++ else ++ set_hana_attribute ${NODENAME} "DEMOTED" ${ATTR_NAME_HANA_CLONE_STATE[@]} ++ promoted=0; ++ fi ++ ;; ++ esac ++ fi ++ # ++ super_ocf_log debug "DBG: saphana_monitor_clone: HANA_STATE_SECONDARY" ++ # ++ # old method was: saphana_monitor - new method is get_hana_landscape_status ++ get_hana_landscape_status; lss=$? ++ super_ocf_log debug "DBG: saphana_monitor_clone: get_hana_landscape_status=$lss" ++ case "$lss" in ++ 0 ) # FATAL ++ # DONE: PRIO1: Maybe we need to differ between 0 and 1. While 0 is a fatal sap error, 1 is down/error ++ # TODO: PRIO3: is OCF_ERR_GENERIC best option? ++ lpa_set_lpt 10 ++ rc=$OCF_ERR_GENERIC ++ ;; ++ 1 ) # ERROR ++ lpa_set_lpt 10 ++ rc=$OCF_NOT_RUNNING ++ ;; ++ 2 | 3 | 4 ) # WARN INFO OK ++ rc=$OCF_SUCCESS ++ lpa_set_lpt 30 ++ sync_attr=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_SYNC_STATUS[@]}) ++ super_ocf_log debug "DBG: sync_attr=$sync_attr" ++ case "$sync_attr" in ++ "SOK" ) # This is a possible node to promote, when primary is missing ++ super_ocf_log info "DEC: secondary with sync status SOK ==> possible takeover node" ++ my_role=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_ROLES[@]}) ++ my_sync=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_SYNC_STATUS[@]}) ++ scoring_crm_master "$my_role" "$my_sync" ++ ;; ++ "SFAIL" ) # This is currently NOT a possible node to promote ++ super_ocf_log info "DEC: secondary with sync status FAILED ==> EXCLUDE as posible takeover node" ++ set_crm_master -INFINITY ++ ;; ++ "*" ) # Unknown sync status ++ super_ocf_log info "DEC: secondary with sync status UKNOWN/UNDEFINED ==> EXCLUDE as posible takeover node" ++ set_crm_master -INFINITY ++ ;; ++ esac ++ ;; ++ * ) # UNDEFINED STATUS ++ rc=$OCF_NOT_RUNNING ++ ;; ++ esac ++ super_ocf_log info "FLOW 
$FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: saphana_monitor_clone - monitor a hana clone instance ++# params: - ++# globals: OCF_*, ATTR_NAME_*, HOSTNANE, HANA_STATE_* ++# saphana_monitor_clone ++# ++function saphana_monitor_clone() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ # ++ # TODO: PRIO3: For the secondary, which is missing the primary (so in status WAITING) what is better: ++ # a) returning 7 here and force cluster a restart of the slave ++ # b) starting the instance here inside the monitor -> may result in longer runtime, timeouts ++ # ++ # first check with the status function (OS tools) if there could be something like a SAP instance running ++ # as we do not know here, if we are in master or slave state we do not want to start our monitoring ++ # agents (sapstartsrv) on the wrong host ++ local rc=$OCF_ERR_GENERIC ++ local promoted=0 ++ local init_attribute=0 ++ ++ my_role=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_ROLES[@]}) ++ my_sync=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_SYNC_STATUS[@]}) ++ lpa_check_lpt_status # TODO: PRIO3 : remove that line later - its only to call lpa_check_lpt_status much more often for checking ++ ++ if ocf_is_probe; then ++ super_ocf_log debug "DBG: PROBE ONLY" ++ else ++ super_ocf_log debug "DBG: REGULAR MONITOR" ++ fi ++ # ++ # First check, if we are PRIMARY or SECONDARY ++ # ++ check_for_primary; primary_status=$? ++ if [ $primary_status -eq $HANA_STATE_PRIMARY ]; then ++ saphana_monitor_primary; rc=$? ++ else ++ if [ $primary_status -eq $HANA_STATE_SECONDARY ]; then ++ saphana_monitor_secondary; rc=$? ++ else ++ # ++ # OK, we are neither HANA PRIMARY nor HANA SECONDARY ++ # ++ super_ocf_log warn "ACT: saphana_monitor_clone: HANA_STATE_DEFECT" ++ # TODO: PRIO2: Or only set_crm_master -INFINITY ? 
++ rc=$OCF_ERR_GENERIC ++ fi ++ fi ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: saphana_promote_clone - promote a hana clone ++# params: - ++# globals: OCF_*(r), NODENAME(r), HANA_STATE_*, SID(r), InstanceName(r), ++# saphana_promote_clone: ++# In a Master/Slave configuration get Master being the primary OR by running hana takeover ++# ++function saphana_promote_clone() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=$OCF_ERR_GENERIC; ++ local hana_sync; ++ local primary_status; ++ # ++ # first check, if we WILL be PRIMARY (checking HANA status) ++ # ++ set_hana_attribute ${NODENAME} "PROMOTED" ${ATTR_NAME_HANA_CLONE_STATE[@]} ++ check_for_primary; primary_status=$? ++ # ++ if [ $primary_status -eq $HANA_STATE_PRIMARY ]; then ++ # ++ # as we are already planned to be PRIMARY we only mark the node as PROMOTED ++ # ++ super_ocf_log info "ACT: Promoted $SID-$InstanceName as master (no hdbnsutil action needed)." ++ rc=$OCF_SUCCESS; ++ else ++ if [ $primary_status -eq $HANA_STATE_SECONDARY ]; then ++ # ++ # we are SECONDARY/SLAVE and need to takepover ... ++ # promote on the replica side... ++ # ++ hana_sync=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_SYNC_STATUS[@]}) ++ case "$hana_sync" in ++ SOK ) ++ super_ocf_log info "ACT: !!!!!!! Promote REPLICA $SID-$InstanceName to be primary. !!!!!!" 
++ LPTloc=$(date '+%s') ++ # lpa_set_lpt 20 $remoteNode ++ lpa_set_lpt $LPTloc ++ lpa_push_lpt $LPTloc ++ su - $sidadm -c "hdbnsutil -sr_takeover" ++ # ++ # now gain check, if we are primary NOW ++ # ++ # TODO: PRIO3: check, if we need to destinguish between HANA_STATE_PRIMARY, HANA_STATE_SECONDARY, HANA_STATE_DEFECT ++ # ++ if check_for_primary; then ++ rc=$OCF_SUCCESS; ++ else ++ rc=$OCF_FAILED_MASTER ++ fi ++ ;; ++ * ) ++ super_ocf_log err "ACT: HANA SYNC STATUS IS NOT 'SOK' SO THIS HANA SITE COULD NOT BE PROMOTED" ++ rc=$OCF_ERR_GENERIC ++ ;; ++ esac ++ else ++ # ++ # neither MASTER nor SLAVE - This clone instance seams to be broken!! ++ # ++ rc=$OCF_ERR_GENERIC ++ fi ++ fi ++ # do not reassign rc from $? here: it would always be 0 and mask the failure codes set above ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: saphana_demote_clone - demote a hana clone instance ++# params: - ++# globals: OCF_*(r), NODENAME(r), ++# saphana_demote_clone ++# the HANA System Replication (SR) runs in a Master/Slave ++# While we could not change a HANA instance to be really demoted, we only mark the status for ++# correct monitor return codes ++# ++function saphana_demote_clone() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=$OCF_ERR_GENERIC; ++ set_hana_attribute ${NODENAME} "DEMOTED" ${ATTR_NAME_HANA_CLONE_STATE[@]} ++ rc=$OCF_SUCCESS; ++ super_ocf_log info "ACT: Demoted $SID-$InstanceName."
++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: main - main function to operate ++# params: ACTION ++# globals: OCF_*(r), SID(w), sidadm(w), InstanceName(w), SAPVIRHOST(w), DIR_EXECUTABLE(w), ++# globals: SAPSTARTSRV(w), SAPCONTROL(w), DIR_PROFILE(w), SAPSTARTPROFILE(w), ACTION(w), CLACT(w), ra_rc(rw), $0(r), %ENV(r) ++# ++ ++## GLOBALS ++SID="" ++sidadm="" ++InstanceName="" ++InstanceNr="" ++SAPVIRHOST="" ++DIR_EXECUTABLE="" ++SAPSTARTSRV="" ++SAPCONTROL="" ++DIR_PROFILE="" ++SAPSTARTPROFILE="" ++SAPHanaFilter="${OCF_RESKEY_SAPHanaFilter:-ra-act-dec-lpa}" ++ ++NODENAME=$(crm_node -n) ++ ++ ++if [ $# -ne 1 ] ++then ++ saphana_usage ++ exit $OCF_ERR_ARGS ++fi ++ ++ACTION=$1 ++if [ "$ACTION" = "status" ]; then ++ ACTION=monitor ++fi ++ ++# These operations don't require OCF parameters to be set ++# TODO: PRIO5: check, if notify is still not needing OCF parameters ++case "$ACTION" in ++ usage|methods) saphana_$ACTION ++ exit $OCF_SUCCESS;; ++ meta-data) saphana_meta_data ++ exit $OCF_SUCCESS;; ++ notify) #saphana_notify ++ exit $OCF_SUCCESS;; ++ *);; ++esac ++saphana_init ++ ++if ! ocf_is_root ++then ++ super_ocf_log err "ACT: $0 must be run as root" ++ exit $OCF_ERR_PERM ++fi ++ ++# parameter check ++if [ -z "$OCF_RESKEY_SID" ] ++then ++ super_ocf_log err "ACT: Please set parameter SID!" ++ exit $OCF_ERR_ARGS ++fi ++ ++if [ -z "$OCF_RESKEY_InstanceNumber" ] ++then ++ super_ocf_log err "ACT: Please set parameter InstanceNumber!" ++ exit $OCF_ERR_ARGS ++fi ++ ++if is_clone ++then ++ CLACT=_clone ++else ++ if [ "$ACTION" = "promote" -o "$ACTION" = "demote" ] ++ then ++ super_ocf_log err "ACT: $ACTION called in a non master/slave environment" ++ exit $OCF_ERR_ARGS ++ fi ++fi ++ ++# What kind of method was invoked? ++THE_VERSION=$(saphana_meta_data | grep ' ++ ++ ++ 0.149.3 ++ Analyzes SAP HANA System Replication Topology. 
++ This RA analyzes the SAP HANA topology and "sends" all findings via the node status attributes to ++ all nodes in the cluster. These attributes are taken by the SAPHana RA to control the SAP Hana Databases. ++ In addition it starts and monitors the local saphostagent. ++ ++1. Interface to monitor a HANA system: landscapeHostConfiguration.py ++landscapeHostConfiguration.py has some detailed output about HANA system status ++and node roles. For our monitor the overall status is relevant. This overall ++status is reported by the returncode of the script: ++0: Internal Fatal ++1: ERROR ++2: WARNING ++3: INFO (maybe a switch of the resource running) ++4: OK ++The SAPHanaTopology resource agent will interpret returncodes 1 as NOT-RUNNING (or 1 failure) and returncodes 2+3+4 as RUNNING. ++SAPHanaTopology scans the output table of landscapeHostConfiguration.py to identify the roles of the cluster node. Roles means configured and current role of the nameserver as well as the indexserver. ++ ++2. Interface is hdbnsutil ++ The interface hdbnsutil is used to check the "topology" of the system replication as well as the current configuration ++ (primary/secondary) of a SAP HANA database instance. A second task of the interface is the posibility to run a ++ system replication takeover (sr_takeover) or to register a former primary to a newer one (sr_register). ++ ++3. saphostctrl ++ The interface saphostctrl uses the function ListInstances to figure out the virtual host name of the ++ SAP HANA instance. This is the hostname used during the HANA installation. ++ ++ ++ ++ The SAP System Identifier (SID) ++ The SAP System Identifier (SID) ++ ++ ++ ++ The SAP Instance Number ++ The SAP Instance Number ++ ++ ++ ++ Path to the SAP Hana Instance executable directory. If not set the RA tries /usr/sap/\$SID/\$InstanceName/exe. ++ While InstanceName is the string of "HDB" and \$InstanceNumber for SAP Hana databases. ++ ++ Path to the SAP Hana Instance executable directory. 
++ ++ ++ ++ Define type of SAPHanaTopology RA messages to be printed ++ Define type of SAPHanaTopology RA messages to be printed. ++Define SAPHana resource agent messages to be printed. ++ This parameter should only be set of been requested by SUSE support. The default is sufficient for normal operation. ++ Values: ra-act-lpa-dec-flow ++ You could specify any combination of the above values like "ra-act-flow" ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++END ++return $rc ++} ++ ++# ++# function: get_hana_attribute ++# params: NODE ATTR [STORE] ++# globals: - ++# ++function get_hana_attribute() ++{ ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=0 ++ local attr_node=$1 ++ local attr_name=$2 ++ local attr_store=${3:-reboot} # DONE: PRIO5 get this (optional) from parameter ++ crm_attribute -N ${attr_node} -G -n "$attr_name" -l $attr_store -q; rc=$? ++ if [ $rc -ne 0 ]; then ++ super_ocf_log debug "DBG: ATTRIBUTE-FAILURE: crm_attribute -N $attr_node -G -n "$attr_name" -l $attr_store -q" ++ fi ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: set_hana_attribute - set the multi-state status of a node ++# params: NODE VALUE ATTR [STORE] ++# globals: - ++# ++function set_hana_attribute() ++{ ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local attr_node=$1 ++ local attr_value=$2 ++ local attr_name=$3 ++ local attr_store=${4:-reboot} # DONE: PRIO5 get this (optional) from parameter ++ local rc=1 ++ local attr_old ++ attr_old=$(get_hana_attribute $attr_node $attr_name $attr_store); get_rc=$? ++ if [ "$attr_old" != "$attr_value" ]; then ++ super_ocf_log debug "DBG: SET attribute $attr_name for node ${attr_node} to ${attr_value} former ($attr_old) get_rc=$get_rc " ++ crm_attribute -N $attr_node -v "$attr_value" -n "$attr_name" -l $attr_store; rc=$? 
++ if [ $rc -ne 0 ]; then ++ super_ocf_log debug "DBG: ATTRIBUTE-FAILURE: crm_attribute -N $attr_node -v $attr_value -n "$attr_name" -l $attr_store" ++ fi ++ else ++ super_ocf_log debug "DBG: LET attribute $attr_name for node ${attr_node} still be ${attr_value}" ++ rc=0 ++ fi ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: sht_methods - report supported cluster methods ++# params: - ++# globals: - ++# methods: What methods/operations do we support? ++# ++function sht_methods() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=0 ++ cat <<-! ++ start ++ stop ++ status ++ monitor ++ notify ++ validate-all ++ methods ++ meta-data ++ usage ++ admin-setup ++ ! ++ return $rc ++} ++ ++# ++# function: is_clone - report, if resource is configured as a clone (also master/slave) ++# params: - ++# globals: OCF_*(r) ++# descript: is_clone : find out if we are configured to run in a Master/Slave configuration ++# rc: 0: it is a clone ++# 1: it is not a clone ++# Special EXIT of RA, if clone is missconfigured ++# ++function is_clone() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=0 ++ # ++ # is a clone config? ++ # ++ if [ -n "$OCF_RESKEY_CRM_meta_clone_max" ] \ ++ && [ "$OCF_RESKEY_CRM_meta_clone_max" -gt 0 ]; then ++ # ++ # yes it is a clone config - check, if its configured well ++ # ++ if [ "$OCF_RESKEY_CRM_meta_clone_node_max" -ne 1 ] ; then ++ super_ocf_log err "ACT: Clone options misconfigured. 
(expect: clone_node_max=1)" ++ exit $OCF_ERR_CONFIGURED ++ fi ++ rc=0; ++ else ++ rc=1; ++ fi ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: sht_init - initialize variables for the resource agent ++# params: - ++# globals: OCF_*(r), SID(w), sid(rw), sidadm(w), InstanceName(w), InstanceNr(w), ++# globals: meta_notify_master_uname(w), HANA_SR_TOLOPOGY(w), sr_name(w), remoteHost(w) ++# globals: ATTR_NAME_HANA_SYNC_STATUS(w), ATTR_NAME_HANA_PRIMARY_AT(w), ATTR_NAME_HANA_CLONE_STATE(w) ++# globals: DIR_EXECUTABLE(w), SAPSTARTSRV(w), SAPCONTROL(w), DIR_PROFILE(w), SAPSTARTPROFILE(w), LD_LIBRARY_PATH(w), PATH(w), nodelist(w) ++# sht_init : Define global variables with default values, if optional parameters are not set ++# ++# ++ ++function sht_init() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local myInstanceName="" ++ local rc=$OCF_SUCCESS ++ local hdbANSWER="" ++ HOSTEXECNAME=saphostexec ++ USRSAP=/usr/sap ++ SAPSERVICE_PATH=${USRSAP}/sapservices ++ SAPHOSTCTRL_PATH=${USRSAP}/hostctrl/exe ++ HOSTEXEC_PATH=${SAPHOSTCTRL_PATH}/${HOSTEXECNAME} ++ HOSTEXEC_PROFILE_PATH=${SAPHOSTCTRL_PATH}/host_profile ++ SID=$OCF_RESKEY_SID ++ InstanceNr=$OCF_RESKEY_InstanceNumber ++ myInstanceName="${SID}_HDB${InstanceNr}" ++ InstanceName="HDB${InstanceNr}" ++ super_ocf_log debug "DBG2: Used new method to get SID ($SID) and InstanceNr ($InstanceNr)" ++ sid=$(echo "$SID" | tr [:upper:] [:lower:]) ++ sidadm="${sid}adm" ++ SAPHanaFilter="${OCF_RESKEY_SAPHanaFilter:-ra-act-dec-lpa}" ++ ocf_env=$(env | grep 'OCF_RESKEY_CRM') ++ super_ocf_log debug "DBG3: OCF: $ocf_env" ++ ATTR_NAME_HANA_SYNC_STATUS=("hana_${sid}_sync_state" "reboot") # SOK, SFAIL, UNKNOWN? 
++ ATTR_NAME_HANA_PRIMARY_AT=("hana_${sid}_primary_at" "reboot") # Not really used ++ ATTR_NAME_HANA_CLONE_STATE=("hana_${sid}_clone_state" "reboot") # UKNOWN?, DEMOTED, PROMOTED ++ ATTR_NAME_HANA_REMOTEHOST=("hana_${sid}_remoteHost" "forever") ++ ATTR_NAME_HANA_SITE=("hana_${sid}_site" "forever") ++ ATTR_NAME_HANA_ROLES=("hana_${sid}_roles" "reboot") ++ ATTR_NAME_HANA_SRMODE=("hana_${sid}_srmode" "forever") ++ ATTR_NAME_HANA_VHOST=("hana_${sid}_vhost" "forever") ++ ATTR_NAME_HANA_STATUS=("hana_${sid}_status" "reboot") ++ ++ # optional OCF parameters, we try to guess which directories are correct ++ if [ -z "$OCF_RESKEY_DIR_EXECUTABLE" ] ++ then ++ DIR_EXECUTABLE="/usr/sap/$SID/$InstanceName/exe" ++ else ++ DIR_EXECUTABLE="$OCF_RESKEY_DIR_EXECUTABLE" ++ fi ++ ++ if [ -z "$DIR_EXECUTABLE" ]; then ++ super_ocf_log err "DEC: Can not determine DIR_EXECUTABLE. Please set this parameter. -> OCF_ERR_CONFIGURED" ++ rc=$OCF_ERR_CONFIGURED ++ fi ++ ++ if [ -z "$OCF_RESKEY_DIR_PROFILE" ] ++ then ++ DIR_PROFILE="/usr/sap/$SID/SYS/profile" ++ else ++ DIR_PROFILE="$OCF_RESKEY_DIR_PROFILE" ++ fi ++ ++ # as root user we need the library path to the SAP kernel to be able to call sapcontrol ++ # check, if we already added DIR_EXECUTABLE at the beginning of LD_LIBRARY_PATH ++ if [ "${LD_LIBRARY_PATH%%*:}" != "$DIR_EXECUTABLE" ] ++ then ++ LD_LIBRARY_PATH=$DIR_EXECUTABLE${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH ++ export LD_LIBRARY_PATH ++ fi ++ ++ PATH=${PATH}:${DIR_EXECUTABLE} ++ # ++ # figure-out all needed values from system replication status with ONE call ++ # we need: mode=primary|sync|syncmem|...; site name=; mapping/=/ (multiple lines) ++ case $(crm_attribute --type crm_config --name cluster-infrastructure -q) in ++ *corosync* ) nodelist=$(crm_node -l | awk '{ print $2 }');; ++ *openais* ) nodelist=$(crm_node -l | awk '/member/ {print $2}');; ++ *cman* ) nodelist=$(crm_node -l);; ++ esac ++ hdbANSWER=$(su - ${sidadm} -c "hdbnsutil -sr_state --sapcontrol=1" 2>/dev/null) ++ 
super_ocf_log debug "DBG2: hdbANSWER=\$\(su - ${sidadm} -c \"hdbnsutil -sr_state --sapcontrol=1\"\)" ++ site=$(echo "$hdbANSWER" | awk -F= '/site name/ {print $2}') ++ srmode=$(echo "$hdbANSWER" | awk -F= '/mode/ {print $2}') ++ MAPPING=$(echo "$hdbANSWER" | awk -F[=/] '$1 ~ "mapping" && $3 !~ site { print $4 }' site=$site) ++ super_ocf_log debug "DBG: site=$site, mode=$srmode, MAPPING=$MAPPING" ++ # ++ # filter all non-cluster mappings ++ # ++ hanaRemoteHost=$(for n1 in $nodelist; do for n2 in $MAPPING; do if [ "$n1" == "$n2" ]; then echo $n1; fi; done; done ) ++ super_ocf_log info "DEC: site=$site, mode=$srmode, MAPPING=$MAPPING, hanaRemoteHost=$hanaRemoteHost" ++ super_ocf_log debug "DBG: site=$site, mode=$srmode, MAPPING=$MAPPING, hanaRemoteHost=$hanaRemoteHost" ++ super_ocf_log info "FLOW $FUNCNAME rc=$OCF_SUCCESS" ++ return $OCF_SUCCESS ++} ++ ++# ++# function: check_for_primary - check if local SAP HANA is configured as primary ++# params: - ++# globals: HANA_STATE_PRIMARY(r), HANA_STATE_SECONDARY(r), HANA_STATE_DEFECT(r), HANA_STATE_STANDALONE(r) ++# ++function check_for_primary() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=0 ++ # DONE: Change stderr location!! 
++ #sidadm=lnxadm ++ #node_status=$(check_for_primary_single) ++ node_status=$srmode ++ super_ocf_log debug "DBG2: check_for_primary: node_status=$node_status" ++ super_ocf_log debug "DBG: check_for_primary: node_status=$node_status" ++ for i in 1 2 3 4 5 6 7 8 9; do ++ case "$node_status" in ++ primary ) ++ super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_PRIMARY" ++ return $HANA_STATE_PRIMARY;; ++ syncmem | sync | async ) ++ super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_SECONDARY" ++ return $HANA_STATE_SECONDARY;; ++ none ) # have seen that mode on second side BEFEORE we registered it as replica ++ super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_STANDALONE" ++ return $HANA_STATE_STANDALONE;; ++ * ) ++ super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: <$node_status>" ++ dump=$( echo $node_status | hexdump -C ); ++ super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: DUMP <$dump>" ++ node_full_status=$(su - ${sidadm} -c "hdbnsutil -sr_state" 2>/dev/null ) ++ node_status=$(echo "$node_full_status" | awk '$1=="mode:" {print $2}') ++ super_ocf_log info "DEC: check_for_primary: loop=$i: node_status=$node_status" ++ # TODO: PRIO1: Maybe we need to keep the old value for P/S/N, if hdbnsutil just crashes ++ esac; ++ done ++ super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_DEFECT" ++ return $HANA_STATE_DEFECT ++} ++ ++ ++# ++# function: start_saphostagent ++# params: - ++# globals: ++# ++function start_saphostagent() ++{ ++ if [ -x "${HOSTEXEC_PATH}" ]; then ++ ${HOSTEXEC_PATH} pf=${HOSTEXEC_PROFILE_PATH} ++ fi ++ return 0 ++} ++ ++# ++# function: stop_saphostagent ++# params: - ++# globals: ++# ++function stop_saphostagent() ++{ ++ if [ -x "${HOSTEXEC_PATH}" ]; then ++ ${HOSTEXEC_PATH} -stop ++ fi ++} ++ ++# ++# function: check_saphostagent ++# params: - ++# globals: ++# ++function check_saphostagent() ++{ ++ local rc=1 ++ pgrep -f /usr/sap/hostctrl/exe/saphostexec; rc=$? 
++ return $rc ++} ++ ++# ++############################################################################# ++# ++# function: sht_start - start a hana instance ++# params: - ++# globals: OCF_* ++# sht_start : Start the SAP HANA instance ++# ++function sht_start() { ++ ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ ++ local rc=$OCF_NOT_RUNNING ++ local output="" ++ local loopcount=0 ++ ++ mkdir -p /var/lib/SAPHana ++ touch /var/lib/SAPHana/SAPTopologyON ++ if ! check_saphostagent; then ++ start_saphostagent ++ fi ++ ++ rc=$OCF_SUCCESS ++ ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: sht_stop - stop a hana instance ++# params: - ++# globals: OCF_*(r), SAPCONTROL(r), SID(r), InstanceName(r) ++# sht_stop: Stop the SAP instance ++# ++function sht_stop() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local output="" ++ local rc=0 ++ ++ rm /var/lib/SAPHana/SAPTopologyON ++ rc=$OCF_SUCCESS ++ ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++ ++# ++# function: sht_monitor - monitor a hana topology instance ++# params: -- ++# globals: OCF_*(r), SAPCONTROL(r), InstanveNr(r) ++# sht_monitor: Can the given SAP instance do anything useful? ++# ++function sht_monitor() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=0 ++ ++ if [ -f /var/lib/SAPHana/SAPTopologyON ]; then ++ rc=$OCF_SUCCESS ++ else ++ rc=$OCF_NOT_RUNNING ++ fi ++ ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++ ++# ++# function: sht_status - get status of a hana instance (os tools only) ++# params: - ++# globals: SID(r), InstanceName(r), OCF_*(r), sidarm(r) ++# sht_status: Lightweight check of SAP instance only with OS tools ++# ++function sht_status() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=0 ++ ++ sht_monitor; rc=$? 
++ return $rc ++} ++ ++ ++# ++# function: sht_validate - validation of (some) variables/parameters ++# params: - ++# globals: OCF_*(r), SID(r), InstanceName(r), InstanceNr(r), ++# sht_validate: Check the symantic of the input parameters ++# ++function sht_validate() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=$OCF_SUCCESS ++ if [ $(echo "$SID" | grep -c '^[A-Z][A-Z0-9][A-Z0-9]$') -ne 1 ] ++ then ++ super_ocf_log err "ACT: Parsing instance profile name: '$SID' is not a valid SID!" ++ rc=$OCF_ERR_ARGS ++ fi ++ ++ if [ $(echo "$InstanceNr" | grep -c '^[0-9][0-9]$') -ne 1 ] ++ then ++ super_ocf_log err "ACT: Parsing instance profile name: '$InstanceNr' is not a valid instance number!" ++ rc=$OCF_ERR_ARGS ++ fi ++ ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# ++# function: sht_start_clone - start a hana clone instance ++# params: - ++# globals: OCF_*(r), ++# sht_start_clone ++# ++function sht_start_clone() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=$OCF_NOT_RUNNING ++ sht_start; rc=$? ++ return $rc ++} ++ ++# ++# function: sht_stop_clone - stop a hana clone instance ++# params: - ++# globals: NODENAME(r), HANA_STATE_*, ATTR_NAME_* ++# sht_stop_clone ++# ++function sht_stop_clone() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=0 ++ check_for_primary; primary_status=$? ++ if [ $primary_status -eq $HANA_STATE_PRIMARY ]; then ++ hanaPrim="P" ++ elif [ $primary_status -eq $HANA_STATE_SECONDARY ]; then ++ hanaPrim="S" ++ elif [ $primary_status -eq $HANA_STATE_STANDALONE ]; then ++ hanaPrim="N" ++ else ++ hanaPrim="-" ++ fi ++ set_hana_attribute "${NODENAME}" "1:$hanaPrim:-:-:-:-" ${ATTR_NAME_HANA_ROLES[@]} ++ sht_stop; rc=$? 
++ return $rc ++} ++ ++# ++# function: sht_monitor_clone - monitor a hana clone instance ++# params: - ++# globals: OCF_*, SID, InstanceNr, InstanceName, MAPPING(r) ++# sht_monitor_clone ++# ++function sht_monitor_clone() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ # ++ local rc=$OCF_ERR_GENERIC ++ local promoted=0 ++ local init_attribute=0 ++ ++ ++ if ocf_is_probe; then ++ super_ocf_log debug "DBG2: PROBE ONLY" ++ else ++ super_ocf_log debug "DBG2: REGULAR MONITOR" ++ if ! check_saphostagent; then ++ start_saphostagent ++ fi ++ fi ++ # ++ # First check, if we are PRIMARY or SECONDARY ++ # ++ super_ocf_log debug "DBG2: HANA SID $SID" ++ super_ocf_log debug "DBG2: HANA InstanceName $InstanceName" ++ super_ocf_log debug "DBG2: HANA InstanceNr $InstanceNr" ++ check_for_primary; primary_status=$? ++ if [ $primary_status -eq $HANA_STATE_PRIMARY ]; then ++ hanaPrim="P" ++ super_ocf_log debug "DBG2: HANA IS PRIMARY" ++ sht_monitor; rc=$? ++ else ++ if [ $primary_status -eq $HANA_STATE_SECONDARY ]; then ++ hanaPrim="S" ++ super_ocf_log debug "DBG2: HANA IS SECONDARY" ++ sht_monitor; rc=$? ++ elif [ $primary_status -eq $HANA_STATE_STANDALONE ]; then ++ hanaPrim="N" ++ super_ocf_log debug "DBG2: HANA IS STANDALONE" ++ sht_monitor; rc=$? ++ else ++ hanaPrim="-" ++ super_ocf_log warn "ACT: sht_monitor_clone: HANA_STATE_DEFECT" ++ rc=$OCF_ERR_CONFIGURED ++ fi ++ fi ++ # DONE: PRIO1: ASK: Is the output format of ListInstances fix? Could we take that as an API? 
++ # try to catch: Inst Info : LNX - 42 - lv9041 - 740, patch 36, changelist 1444691 ++ # We rely on the following format: SID is word#4, NR is word#6, vHost is word#8 ++ vName=$(/usr/sap/hostctrl/exe/saphostctrl -function ListInstances \ ++ | awk '$4 == SID && $6 == NO { print $8 }' SID=$SID NO=$InstanceNr 2>/dev/null ) ++ super_ocf_log debug "DBG: ListInstances: $(/usr/sap/hostctrl/exe/saphostctrl -function ListInstances)" ++ if [ -n "$vName" ]; then ++ set_hana_attribute ${NODENAME} "$vName" ${ATTR_NAME_HANA_VHOST[@]} ++ else ++ vName=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_VHOST[@]}) ++ fi ++ #site=$(get_site_name) ++ hanaANSWER=$(su - $sidadm -c "python exe/python_support/landscapeHostConfiguration.py" 2>/dev/null); hanalrc="$?" ++ hanarole=$(echo "$hanaANSWER" | tr -d ' ' | awk -F'|' '$2 == host { printf "%s:%s:%s:%s\n",$10,$11,$12,$13 } ' host=${vName}) ++ #if [ -z "$MAPPING" ]; then ++ # super_ocf_log info "ACT: Did not find remote Host at this moment" ++ #fi ++ # FH TODO PRIO1: TRY TO GET RID OF "ATTR_NAME_HANA_REMOTEHOST" ++ if [ -n "$hanaRemoteHost" ]; then ++ set_hana_attribute ${NODENAME} "$hanaRemoteHost" ${ATTR_NAME_HANA_REMOTEHOST[@]} ++ fi ++ set_hana_attribute ${NODENAME} "$hanalrc:$hanaPrim:$hanarole" ${ATTR_NAME_HANA_ROLES[@]} ++ set_hana_attribute ${NODENAME} "$site" ${ATTR_NAME_HANA_SITE[@]} ++ set_hana_attribute ${NODENAME} "$vName" ${ATTR_NAME_HANA_VHOST[@]} ++ case "$hanaPrim" in ++ P ) ;; ++ S ) # only secondary may propargate its sync status ++ case $(crm_attribute --type crm_config --name cluster-infrastructure -q) in ++ *corosync* ) nodelist=$(crm_node -l | awk '{ print $2 }');; ++ *openais* ) nodelist=$(crm_node -l | awk '/member/ {print $2}');; ++ *cman* ) nodelist=$(crm_node -l);; ++ esac ++ ++ for n in ${nodelist}; do ++ set_hana_attribute ${n} "$srmode" ${ATTR_NAME_HANA_SRMODE[@]} ++ done ++ ;; ++ esac ++ #ATTR_NAME_HANA_STATUS # TODO: PRIO5: For SCALE-OUT: Fill that attribute later ++ super_ocf_log info "FLOW $FUNCNAME
rc=$rc" ++ return $rc ++} ++ ++# ++# function: sht_notify - notify action ++# params: - ++# globals: OCF_*(r), ACTION(r), CLACT(r), NODENAME(r) ++# sht_notify: Handle master scoring - to make sure a slave gets the next master ++# ++function sht_notify() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=0 ++ super_ocf_log info "RA ==== end action $ACTION$CLACT (${n_type}/${n_op})====" ++ return $rc ++} ++ ++# ++# function: main - main function to operate ++# params: ACTION ++# globals: OCF_*(r), SID(w), sidadm(w), InstanceName(w), DIR_EXECUTABLE(w), ACTION(w), CLACT(w), ra_rc(rw), $0(r), %ENV(r) ++# ++ ++## GLOBALS ++SID="" ++sidadm="" ++InstanceName="" ++InstanceNr="" ++DIR_EXECUTABLE="" ++SAPHanaFilter="${OCF_RESKEY_SAPHanaFilter:-ra-act-dec-lpa}" ++NODENAME=$(crm_node -n) ++ ++if [ $# -ne 1 ] ++then ++ sht_usage ++ exit $OCF_ERR_ARGS ++fi ++ ++ACTION=$1 ++if [ "$ACTION" = "status" ]; then ++ ACTION=monitor ++fi ++ ++# These operations don't require OCF parameters to be set ++case "$ACTION" in ++ usage|methods) sht_$ACTION ++ exit $OCF_SUCCESS;; ++ meta-data) sht_meta_data ++ exit $OCF_SUCCESS;; ++ notify) sht_notify ++ exit $OCF_SUCCESS;; ++ admin-setup) admin-setup ++ exit $OCF_SUCCESS;; ++ *);; ++esac ++sht_init ++ ++if ! ocf_is_root ++then ++ super_ocf_log err "ACT: $0 must be run as root" ++ exit $OCF_ERR_PERM ++fi ++ ++# parameter check ++if [ -z "$OCF_RESKEY_SID" ] ++then ++ super_ocf_log err "ACT: Please set parameter SID!" ++ exit $OCF_ERR_ARGS ++fi ++ ++if [ -z "$OCF_RESKEY_InstanceNumber" ] ++then ++ super_ocf_log err "ACT: Please set parameter InstanceNumber!" ++ exit $OCF_ERR_ARGS ++fi ++ ++ ++if is_clone ++then ++ CLACT=_clone ++else ++ if [ "$ACTION" = "promote" -o "$ACTION" = "demote" ] ++ then ++ super_ocf_log err "ACT: $ACTION called in a non clone environment" ++ exit $OCF_ERR_ARGS ++ fi ++fi ++ ++THE_VERSION=$(sht_meta_data | grep ' $b ? 
$a : $b; ++} ++ ++sub print_attr_host() ++{ ++ my ($HKey, $AKey); ++ printf "%-22s", "Attribute \\ Host"; ++ foreach $HKey (sort keys %Host) { ++ printf "%-16s ", $HKey; ++ } ++ printf "\n"; ++ ++ printf "%s\n", "-" x 120 ; ++ ++ foreach $AKey (sort keys %Name) { ++ printf "%-22s", $AKey; ++ foreach $HKey (sort keys %Host) { ++ printf "%-16.16s ", $Host{$HKey} -> {$AKey}; ++ } ++ ++ printf "\n"; ++ } ++ return 0; ++} ++ ++sub print_host_attr() ++{ ++ my ($AKey, $HKey, $len, $line_len, $hclen); ++ $hclen=$Name{_hosts}->{_length}; ++ $line_len=$hclen+1; ++ printf "%-$hclen.${hclen}s ", "$table_title"; ++ foreach $AKey (sort keys %Name) { ++ if ($AKey ne "_hosts") { ++ $len = $Name{$AKey}->{_length}; ++ $line_len=$line_len+$len+1; ++ printf "%-$len.${len}s ", $Name{$AKey}->{_title}; ++ } ++ } ++ printf "\n"; ++ printf "%s\n", "-" x $line_len ; ++ foreach $HKey (sort keys %Host) { ++ printf "%-$hclen.${hclen}s ", $HKey; ++ foreach $AKey (sort keys %Name) { ++ if ($AKey ne "_hosts") { ++ $len = $Name{$AKey}->{_length}; ++ printf "%-$len.${len}s ", $Host{$HKey} -> {$AKey}; ++ } ++ } ++ printf "\n"; ++ } ++ return 0; ++} ++ ++open ListInstances, "/usr/sap/hostctrl/exe/saphostctrl -function ListInstances|"; ++while (<ListInstances>) { ++ # try to catch: Inst Info : LNX - 42 - lv9041 - 740, patch 36, changelist 1444691 ++ chomp; ++ if ( $_ =~ /:\s+([A-Z][A-Z0-9][A-Z0-9])\s+-/ ) { ++ $sid=tolower("$1"); ++ } ++} ++close ListInstances; ++ ++ ++open CIB, "cibadmin -Ql |"; ++while (<CIB>) { ++ chomp; ++ my ($host, $name, $value); ++ my $found=0; ++ if ( $_ =~ /nvpair.*name="(\w+_${sid}_\w+)"/ ) { ++ $name=$1; ++ # find attribute in forever and reboot store :) ++ if ( $_ =~ /id="(status|nodes)-([a-zA-Z0-9\_\-]+)-/ ) { ++ $host=$2; ++ } ++ if ( $_ =~ /value="([^"]+)"/ ) { ++ $value=$1; ++ $found=1; ++ } ++ } ++ if ( $found == 1 ) { ++ # ++ # handle the hosts name and table-title ++ # ++ $Host{$host}->{$name}=${value}; ++ if ( defined ($Name{_hosts}->{_length})) { ++ $Name{_hosts}->{_length} =
max($Name{_hosts}->{_length}, length($host )); ++ } else { ++ $Name{_hosts}->{_length} = length($host ); ++ } ++ $Name{_hosts}->{_length} = max($Name{_hosts}->{_length}, length( $table_title)); ++ # ++ # now handle the attributes name and value ++ # ++ $Name{$name}->{$host}=${value}; ++ if ( defined ($Name{$name}->{_length})) { ++ $Name{$name}->{_length} = max($Name{$name}->{_length}, length($value )); ++ } else { ++ $Name{$name}->{_length} = length($value ); ++ } ++ if ( $name =~ /hana_${sid}_(.*)/ ) { ++ $Name{$name}->{_title} = $1; ++ } else { ++ $Name{$name}->{_title} = $name; ++ } ++ $Name{$name}->{_length} = max($Name{$name}->{_length}, length( $Name{$name}->{_title})); ++ # printf "%-8s %-20s %-30s\n", $1, $2, $3; ++ } ++} ++close CIB; ++ ++#print_attr_host; ++print_host_attr; +-- +1.8.4.2 + diff --git a/SOURCES/bz1168251-SAPHana-agents_update4.patch b/SOURCES/bz1168251-SAPHana-agents_update4.patch new file mode 100644 index 0000000..9cf860a --- /dev/null +++ b/SOURCES/bz1168251-SAPHana-agents_update4.patch @@ -0,0 +1,441 @@ +diff --git a/heartbeat/SAPHana b/heartbeat/SAPHana +index 1913dc3..ed0443b 100644 +--- a/heartbeat/SAPHana ++++ b/heartbeat/SAPHana +@@ -48,6 +48,8 @@ HANA_STATE_SECONDARY=1 + HANA_STATE_STANDALONE=2 + HANA_STATE_DEFECT=3 + ++debug_attributes=0 ++ + SH=/bin/sh + + # +@@ -132,19 +134,19 @@ function saphana_meta_data() { + + + +-0.149.4 ++0.149.7 + + Manages two SAP HANA instances in system replication (SR). + + The SAPHanaSR resource agent manages two SAP Hana instances (databases) which are configured +-in system replication. This first version is limitted to the scale-up scenario. Scale-Out is ++in system replication. This first version is limited to the scale-up scenario. Scale-Out is + not supported in this version. + + Managing the two SAP HANA instances means that the resource agent controls the start/stop of the + instances. 
In addition the resource agent is able to monitor the SAP HANA databases to check their + availability on landscape host configuration level. For this monitoring the resource agent relies on interfaces + provided by SAP. A third task of the resource agent is to also check the synchronisation status +-of the two SAP HANA databases. If the synchronisation is not "SOK", than the cluster avoids to ++of the two SAP HANA databases. If the synchronisation is not "SOK", then the cluster avoids to + failover to the secondary side, if the primary fails. This is to improve the data consistency. + + The resource agent uses the following four interfaces provided by SAP: +@@ -162,7 +164,7 @@ The resource agent uses the following four interfaces provided by SAP: + + 3. hdbnsutil + The interface hdbnsutil is used to check the "topology" of the system replication as well as the current configuration +- (primary/secondary) of a SAP HANA database instance. A second task of the interface is the posibility to run a ++ (primary/secondary) of a SAP HANA database instance. A second task of the interface is the possibility to run a + system replication takeover (sr_takeover) or to register a former primary to a newer one (sr_register). + + 4. hdbsql / systemReplicationStatus +@@ -198,7 +200,7 @@ The resource agent uses the following four interfaces provided by SAP: + + + Define, if a former primary should automatically be registered. +- The parameter AUTOMATED_REGISTER defines, wether a former primary instance should ++ The parameter AUTOMATED_REGISTER defines, whether a former primary instance should + be registered automatically by the resource agent during cluster/resource start, if the DUPLICATE_PRIMARY_TIMEOUT is expired... 
TDB + + +@@ -207,7 +209,7 @@ The resource agent uses the following four interfaces provided by SAP: + Time difference needed between to primary time stamps, if a dual-primary situation occurs + Time difference needed between to primary time stamps, + if a dual-primary situation occurs. If the time difference is +- less than the time gap, than the cluster hold one or both instances in a "WAITING" status. This is to give a admin ++ less than the time gap, then the cluster hold one or both instances in a "WAITING" status. This is to give an admin + a chance to react on a failover. A failed former primary will be registered after the time difference is passed. After + this registration to the new primary all data will be overwritten by the system replication. + +@@ -316,7 +318,7 @@ function remoteHost2remoteNode() + # descript: is_clone : find out if we are configured to run in a Master/Slave configuration + # rc: 0: it is a clone, 1: it is not a clone + # +-# DONE: PRIO2: For the first shippment (scale-out) we need to limit the clones to 2 ++# DONE: PRIO2: For the first shipment (scale-out) we need to limit the clones to 2 + # + function is_clone() { + super_ocf_log info "FLOW $FUNCNAME ($*)" +@@ -356,8 +358,14 @@ function get_hana_attribute() + local attr_node=$1 + local attr_name=$2 + local attr_store=${3:-reboot} # DONE: PRIO5 get this (optional) from parameter +- local attr_default=${4:-} +- crm_attribute -N ${attr_node} -G -n "$attr_name" -l $attr_store -q -d "$attr_default"; rc=$? ++ local attr_default=${5:-} ++ local attr_val="" ++ attr_val=$(crm_attribute -N ${attr_node} -G -n "$attr_name" -l $attr_store -q -d "$attr_default"); rc=$? 
++ if [ $debug_attributes -eq 1 ]; then ++ dstr=$(date) ++ echo "$dstr: SAPHana: crm_attribute -N ${attr_node} -G -n \"$attr_name\" -l $attr_store -q --> $attr_val" >> /var/log/fhATTRIBUTE ++ fi ++ echo "$attr_val" + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc + } +@@ -381,6 +389,10 @@ function set_hana_attribute() + if [ "$attr_old" != "$attr_value" ]; then + super_ocf_log debug "DBG: SET attribute $attr_name for node ${attr_node} to ${attr_value} former ($attr_old) get_rc=$get_rc " + crm_attribute -N $attr_node -v $attr_value -n "$attr_name" -l $attr_store; rc=$? ++ if [ $debug_attributes -eq 1 ]; then ++ dstr=$(date) ++ echo "$dstr: SAPHana: crm_attribute -N $attr_node -v $attr_value -n \"$attr_name\" -l $attr_store" >> /var/log/fhATTRIBUTE ++ fi + else + super_ocf_log debug "DBG: LET attribute $attr_name for node ${attr_node} still be ${attr_value}" + rc=0 +@@ -448,7 +460,7 @@ scoring_crm_master() + local roles="$1" + local sync="$2" + local skip=0 +- local myScore=-1 ++ local myScore="" + for scan in "${SCORING_TABLE_PREFERRED_SITE_TAKEOVER[@]}"; do + if [ $skip -eq 0 ]; then + read rolePatt syncPatt score <<< $scan +@@ -461,7 +473,10 @@ scoring_crm_master() + fi + done + super_ocf_log debug "DBG: scoring_crm_master adjust score $myScore" +- set_crm_master $myScore ++ # TODO: PRIO1: DO Not Score, If we did not found our role/sync at this moment - bsc#919925 ++ if [ -n "$myScore" ]; then ++ set_crm_master $myScore ++ fi + } + + # +@@ -1068,6 +1083,27 @@ function saphana_start_primary() + case "$lpa_dec" in + 0 ) # LPA says start-up + lpa_advice="start" ++ # TODO: PRIO1: We need to do a special handling for remote being a 234-Secondary in SR Status SOK ++ # if ( remote_role like [234]:S ) && ( remote_sync_status is SOK|PRIM ) && ( PreferSiteTakeover ) ++ # then lpa_advice="wait" ++ remoteRole=$(get_hana_attribute $remoteNode ${ATTR_NAME_HANA_ROLES[@]}) ++ remoteSync=$(get_hana_attribute $remoteNode ${ATTR_NAME_HANA_SYNC_STATUS[@]}) ++ super_ocf_log 
info "DEC: saphana_primary - checking remoteStatus" ++ if ocf_is_true "${PreferSiteTakeover}"; then ++ remoteStatus="$remoteRole:$remoteSync" ++ case "$remoteStatus" in ++ [234]:S:*:SOK | [234]:S:*:PRIM ) ++ lpa_advice="wait" ++ # TODO: PRIO3: Split WAIT into WAIT4TAKEOVER ++ super_ocf_log info "DEC: saphana_primary - waiting for secondary to takeover (SOK, PreferSiteTakover)" ++ ;; ++ * ) ++ super_ocf_log info "DEC: saphana_primary - remoteStatus is: $remoteStatus" ++ ;; ++ esac ++ else ++ super_ocf_log info "DEC: saphana_primary - PreferSiteTakeover set to false" ++ fi + ;; + 1) # LPA says register! + lpa_advice="register" +@@ -1075,7 +1111,7 @@ function saphana_start_primary() + 2) # LPA says wait for second LPT + lpa_advice="wait" + ;; +- 3 | 4 ) # LPA says something is completely wrong - FAIL resource ++ 3 | 4 ) # LPA says something is completely wrong - FAIL resource # TODO: PRIO1: RC3 for waiting remote side to report lss + lpa_advice="fail" + ;; + * ) # LPA failed with an unkonown status - FAIL resource +@@ -1098,7 +1134,7 @@ function saphana_start_primary() + super_ocf_log info "LPA: landcape: UP, LPA: start ==> keep running" + LPTloc=$(date '+%s') + lpa_set_lpt $LPTloc +- rc=$OCF_SUCCSESS ++ rc=$OCF_SUCCESS + ;; + 1 ) # landcape says we are down, lets start and adjust scores and return code + super_ocf_log info "LPA: landcape: DOWN, LPA: start ==> start instance" +@@ -1149,7 +1185,7 @@ function saphana_start_primary() + case "$lss" in + 2 | 3 | 4 ) # as we ARE up we just keep it up + # TODO: PRIO3: I now change from "just keep it up to take that down" +- # TODO: PRIO3: OCF_SUCCSESS, OCF_NOT_RUNNING or OCF_ERR_xxxx ? ++ # TODO: PRIO3: OCF_SUCCESS, OCF_NOT_RUNNING or OCF_ERR_xxxx ? 
+ set_crm_master -9000 + #scoring_crm_master "$my_role" "$my_sync" + rc=$OCF_ERR_GENERIC +@@ -1159,7 +1195,7 @@ function saphana_start_primary() + # TODO: PRIO3: Check, if WAITING is correct here + set_hana_attribute ${NODENAME} "WAITING" ${ATTR_NAME_HANA_CLONE_STATE[@]} + set_crm_master -9000 +- rc=$OCF_SUCCSESS ++ rc=$OCF_SUCCESS + ;; + esac + ;; +@@ -1277,7 +1313,7 @@ function saphana_start_secondary() + super_ocf_log info "ACT: PRIMARY seams to be down now ==> WAITING" + set_hana_attribute ${NODENAME} "WAITING" ${ATTR_NAME_HANA_CLONE_STATE[@]} + set_crm_master -INFINITY +- rc=$OCF_SUCCSESS ++ rc=$OCF_SUCCESS + fi + else + lpa_set_lpt 30 +@@ -1286,7 +1322,7 @@ function saphana_start_secondary() + super_ocf_log info "ACT: wait_for_primary_master ==> WAITING" + set_hana_attribute ${NODENAME} "WAITING" ${ATTR_NAME_HANA_CLONE_STATE[@]} + set_crm_master -INFINITY +- rc=$OCF_SUCCSESS ++ rc=$OCF_SUCCESS + fi + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc +@@ -1453,7 +1489,8 @@ function lpa_init_lpt() { + # LPTlocal > LPTremore ===> rc=0 (start) + # LPTRemote > LPTlocal ===> rc=1 (register) + # Stalemate in all other cases ==> STALEMATE-HANDLING ===> rc=2 (wait) +-# LPTRemote is not initialized (0) ++# LPTRemote is not initialized or node not kown in cluster (crm_mon -l) (0) ++# TODO: PRIO1: Need to introduce a return-code 3 for remote sides lpa not ready + # THEN: + # WAIT ==> like STALEMATE-HANDLING ===> rc=2 (wait) + # +@@ -1625,7 +1662,6 @@ function saphana_monitor_primary() + else + super_ocf_log info "LPA: Dual primary detected and AUTOMATED_REGISTER='false' ==> WAITING" + fi +- + return $OCF_SUCCESS + fi + promoted=0; +@@ -1853,11 +1889,11 @@ function saphana_monitor_secondary() + scoring_crm_master "$my_role" "$my_sync" + ;; + "SFAIL" ) # This is currently NOT a possible node to promote +- super_ocf_log info "DEC: secondary with sync status FAILED ==> EXCLUDE as posible takeover node" ++ super_ocf_log info "DEC: secondary with sync status FAILED ==> 
EXCLUDE as possible takeover node" + set_crm_master -INFINITY + ;; + "*" ) # Unknown sync status +- super_ocf_log info "DEC: secondary with sync status UKNOWN/UNDEFINED ==> EXCLUDE as posible takeover node" ++ super_ocf_log info "DEC: secondary with sync status UKNOWN/UNDEFINED ==> EXCLUDE as possible takeover node" + set_crm_master -INFINITY + ;; + esac +@@ -1889,10 +1925,12 @@ function saphana_monitor_clone() { + local rc=$OCF_ERR_GENERIC + local promoted=0 + local init_attribute=0 ++ local lpaRc=0 ++ local mRc=0 ++ local myMaster=-1 + + my_role=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_ROLES[@]}) + my_sync=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_SYNC_STATUS[@]}) +- lpa_check_lpt_status # TODO: PRIO3 : remove that line later - its only to call lpa_check_lpt_status much more often for checking + + if ocf_is_probe; then + super_ocf_log debug "DBG: PROBE ONLY" +@@ -1904,6 +1942,16 @@ function saphana_monitor_clone() { + # + check_for_primary; primary_status=$? + if [ $primary_status -eq $HANA_STATE_PRIMARY ]; then ++ # FIX: bsc#919925 Leaving Node Maintenance stops HANA Resource Agent ++ # TODO: PRIO1: Maybe we need a lpa-check here to ++ if ocf_is_probe; then ++ myMaster=$(get_crm_master); mRc=$? ++ if [ $mRc -ne 0 ]; then ++ set_crm_master 5 ++ elif [ $myMaster -eq -1 ]; then ++ set_crm_master 5 ++ fi ++ fi + saphana_monitor_primary; rc=$? + else + if [ $primary_status -eq $HANA_STATE_SECONDARY ]; then +diff --git a/heartbeat/SAPHanaTopology b/heartbeat/SAPHanaTopology +index 082ad29..1d4887f 100644 +--- a/heartbeat/SAPHanaTopology ++++ b/heartbeat/SAPHanaTopology +@@ -14,6 +14,7 @@ + # Support: linux@sap.com + # License: GNU General Public License (GPL) + # Copyright: (c) 2014 SUSE Linux Products GmbH ++# (c) 2015 SUSE Linux GmbH + # + # An example usage: + # See usage() function below for more details... 
+@@ -39,6 +40,8 @@ HANA_STATE_SECONDARY=1 + HANA_STATE_STANDALONE=2 + HANA_STATE_DEFECT=3 + ++debug_attributes=0 ++ + SH=/bin/sh + + # +@@ -123,7 +126,7 @@ function sht_meta_data() { + + + +- 0.149.4 ++ 0.149.6 + Analyzes SAP HANA System Replication Topology. + This RA analyzes the SAP HANA topology and "sends" all findings via the node status attributes to + all nodes in the cluster. These attributes are taken by the SAPHana RA to control the SAP Hana Databases. +@@ -205,7 +208,13 @@ function get_hana_attribute() + local attr_node=$1 + local attr_name=$2 + local attr_store=${3:-reboot} # DONE: PRIO5 get this (optional) from parameter +- crm_attribute -N ${attr_node} -G -n "$attr_name" -l $attr_store -q; rc=$? ++ local attr_val="" ++ attr_val=$(crm_attribute -N ${attr_node} -G -n "$attr_name" -l $attr_store -q); rc=$? ++ if [ $debug_attributes -eq 1 ]; then ++ dstr=$(date) ++ echo "$dstr: SAPHanaTopology: crm_attribute -N ${attr_node} -G -n \"$attr_name\" -l $attr_store -q --> $attr_val" >> /var/log/fhATTRIBUTE ++ fi ++ echo "$attr_val" + if [ $rc -ne 0 ]; then + super_ocf_log debug "DBG: ATTRIBUTE-FAILURE: crm_attribute -N $attr_node -G -n "$attr_name" -l $attr_store -q" + fi +@@ -230,6 +239,10 @@ function set_hana_attribute() + attr_old=$(get_hana_attribute $attr_node $attr_name $attr_store); get_rc=$? + if [ "$attr_old" != "$attr_value" ]; then + super_ocf_log debug "DBG: SET attribute $attr_name for node ${attr_node} to ${attr_value} former ($attr_old) get_rc=$get_rc " ++ if [ $debug_attributes -eq 1 ]; then ++ dstr=$(date) ++ echo "$dstr: SAPHanaTopology: crm_attribute -N $attr_node -v $attr_value -n \"$attr_name\" -l $attr_store" >> /var/log/fhATTRIBUTE ++ fi + crm_attribute -N $attr_node -v "$attr_value" -n "$attr_name" -l $attr_store; rc=$? 
+ if [ $rc -ne 0 ]; then + super_ocf_log debug "DBG: ATTRIBUTE-FAILURE: crm_attribute -N $attr_node -v $attr_value -n "$attr_name" -l $attr_store" +@@ -377,18 +390,32 @@ function sht_init() { + *openais* ) nodelist=$(crm_node -l | awk '/member/ {print $2}');; + *cman* ) nodelist=$(crm_node -l);; + esac ++ #### SAP-CALL + hdbANSWER=$(su - ${sidadm} -c "hdbnsutil -sr_state --sapcontrol=1" 2>/dev/null) + super_ocf_log debug "DBG2: hdbANSWER=\$\(su - ${sidadm} -c \"hdbnsutil -sr_state --sapcontrol=1\"\)" + site=$(echo "$hdbANSWER" | awk -F= '/site name/ {print $2}') + srmode=$(echo "$hdbANSWER" | awk -F= '/mode/ {print $2}') +- MAPPING=$(echo "$hdbANSWER" | awk -F[=/] '$1 ~ "mapping" && $3 !~ site { print $4 }' site=$site) ++ if [ $debug_attributes -eq 1 ]; then ++ dstr=$(date) ++ echo "$dstr: SAPHanaTopology: srmode=$srmode" >> /var/log/fhATTRIBUTE ++ fi ++ MAPPING=$(echo "$hdbANSWER" | awk -F[=/] '$1 == "mapping" && $3 != site { print $4 }' site=$site) + super_ocf_log debug "DBG: site=$site, mode=$srmode, MAPPING=$MAPPING" + # + # filter all non-cluster mappings + # +- hanaRemoteHost=$(for n1 in $nodelist; do for n2 in $MAPPING; do if [ "$n1" == "$n2" ]; then echo $n1; fi; done; done ) +- super_ocf_log info "DEC: site=$site, mode=$srmode, MAPPING=$MAPPING, hanaRemoteHost=$hanaRemoteHost" +- super_ocf_log debug "DBG: site=$site, mode=$srmode, MAPPING=$MAPPING, hanaRemoteHost=$hanaRemoteHost" ++ # DONE: PRIO2: Need mapping between HANA HOSTS not cluster NODES ++ local hanaVHost ++ hanaRemoteHost=$(for n1 in $nodelist; do ++ hanaVHost=$(get_hana_attribute ${n1} ${ATTR_NAME_HANA_VHOST[@]}) ++ for n2 in $MAPPING; do ++ if [ "$hanaVHost" == "$n2" ]; then ++ echo $hanaVHost; ++ fi; ++ done; ++ done ) ++ super_ocf_log info "DEC: site=$site, mode=$srmode, MAPPING=$MAPPING, hanaRemoteHost=$hanaRemoteHost" ++ super_ocf_log debug "DBG: site=$site, mode=$srmode, MAPPING=$MAPPING, hanaRemoteHost=$hanaRemoteHost" + super_ocf_log info "FLOW $FUNCNAME rc=$OCF_SUCCESS" + return 
$OCF_SUCCESS + } +@@ -422,6 +449,7 @@ function check_for_primary() { + super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: <$node_status>" + dump=$( echo $node_status | hexdump -C ); + super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: DUMP <$dump>" ++ #### SAP-CALL + node_full_status=$(su - ${sidadm} -c "hdbnsutil -sr_state" 2>/dev/null ) + node_status=$(echo "$node_full_status" | awk '$1=="mode:" {print $2}') + super_ocf_log info "DEC: check_for_primary: loop=$i: node_status=$node_status" +@@ -440,6 +468,7 @@ function check_for_primary() { + # + function start_saphostagent() + { ++ ### SAP-CALL + if [ -x "${HOSTEXEC_PATH}" ]; then + ${HOSTEXEC_PATH} pf=${HOSTEXEC_PROFILE_PATH} + fi +@@ -453,9 +482,10 @@ function start_saphostagent() + # + function stop_saphostagent() + { +- if [ -x "${HOSTEXEC_PATH}" ]; then +- ${HOSTEXEC_PATH} -stop +- fi ++ ### SAP-CALL ++ if [ -x "${HOSTEXEC_PATH}" ]; then ++ ${HOSTEXEC_PATH} -stop ++ fi + } + + # +@@ -586,7 +616,7 @@ function sht_validate() { + # + function sht_start_clone() { + super_ocf_log info "FLOW $FUNCNAME ($*)" +- local rc=$OCF_NOT_RUNNING ++ local rc=$OCF_NOT_RUNNING + sht_start; rc=$? + return $rc + } +@@ -666,27 +696,30 @@ function sht_monitor_clone() { + # DONE: PRIO1: ASK: Is the output format of ListInstances fix? Could we take that as an API? 
+ # try to catch: Inst Info : LNX - 42 - lv9041 - 740, patch 36, changelist 1444691 + # We rely on the following format: SID is word#4, NR is work#6, vHost is word#8 ++ #### SAP-CALL + vName=$(/usr/sap/hostctrl/exe/saphostctrl -function ListInstances \ + | awk '$4 == SID && $6=NR { print $8 }' SID=$SID NR=$InstanceNr 2>/dev/null ) +- super_ocf_log debug "DBG: ListInstances: $(/usr/sap/hostctrl/exe/saphostctrl -function ListInstances)" ++ # super_ocf_log debug "DBG: ListInstances: $(/usr/sap/hostctrl/exe/saphostctrl -function ListInstances)" + if [ -n "$vName" ]; then + set_hana_attribute ${NODENAME} "$vName" ${ATTR_NAME_HANA_VHOST[@]} + else + vName=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_VHOST[@]}) + fi + #site=$(get_site_name) ++ #### SAP-CALL + hanaANSWER=$(su - $sidadm -c "python exe/python_support/landscapeHostConfiguration.py" 2>/dev/null); hanalrc="$?" + hanarole=$(echo "$hanaANSWER" | tr -d ' ' | awk -F'|' '$2 == host { printf "%s:%s:%s:%s\n",$10,$11,$12,$13 } ' host=${vName}) + #if [ -z "$MAPPING" ]; then + # super_ocf_log info "ACT: Did not find remote Host at this moment" + #fi +- # FH TODO PRIO1: TRY TO GET RID OF "ATTR_NAME_HANA_REMOTEHOST" ++ # FH TODO PRIO3: TRY TO GET RID OF "ATTR_NAME_HANA_REMOTEHOST" + if [ -n "$hanaRemoteHost" ]; then + set_hana_attribute ${NODENAME} "$hanaRemoteHost" ${ATTR_NAME_HANA_REMOTEHOST[@]} + fi + set_hana_attribute ${NODENAME} "$hanalrc:$hanaPrim:$hanarole" ${ATTR_NAME_HANA_ROLES[@]} +- set_hana_attribute ${NODENAME} "$site" ${ATTR_NAME_HANA_SITE[@]} +- set_hana_attribute ${NODENAME} "$vName" ${ATTR_NAME_HANA_VHOST[@]} ++ if [ -n "$site" ]; then ++ set_hana_attribute ${NODENAME} "$site" ${ATTR_NAME_HANA_SITE[@]} ++ fi + case "$hanaPrim" in + P ) ;; + S ) # only secondary may propargate its sync status +@@ -701,7 +734,6 @@ function sht_monitor_clone() { + done + ;; + esac +- #ATTR_NAME_HANA_STATUS # TODO: PRIO5: For SCALE-OUT: Fill that attribute later + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc + 
} diff --git a/SOURCES/bz1171162-clvmd-opt-fix.patch b/SOURCES/bz1171162-clvmd-opt-fix.patch new file mode 100644 index 0000000..2a46add --- /dev/null +++ b/SOURCES/bz1171162-clvmd-opt-fix.patch @@ -0,0 +1,25 @@ +From e0f3e2190cfef76b9d7383a0009b678ed2ef4b17 Mon Sep 17 00:00:00 2001 +From: David Vossel +Date: Wed, 29 Apr 2015 11:08:55 -0500 +Subject: [PATCH 1/6] bz1171162-clvmd-opt-fix + +--- + heartbeat/clvm | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/heartbeat/clvm b/heartbeat/clvm +index dcefcca..a1e2bc4 100755 +--- a/heartbeat/clvm ++++ b/heartbeat/clvm +@@ -370,7 +370,7 @@ clvmd_start() + if ocf_is_true $OCF_RESKEY_with_cmirrord; then + start_process $CMIRROR_PATH + fi +- start_process $DAEMON_PATH $CLVMDOPTS ++ start_process $DAEMON_PATH "$CLVMDOPTS" + + # Refresh local cache. + # +-- +1.8.4.2 + diff --git a/SOURCES/bz1183136-nginx-support.patch b/SOURCES/bz1183136-nginx-support.patch new file mode 100644 index 0000000..b85c948 --- /dev/null +++ b/SOURCES/bz1183136-nginx-support.patch @@ -0,0 +1,113 @@ +From d828c825c58f2da4b4edd6548c5fd254842a0add Mon Sep 17 00:00:00 2001 +From: David Vossel +Date: Wed, 29 Apr 2015 11:15:18 -0500 +Subject: [PATCH 4/6] nginx agent support + +--- + heartbeat/nginx | 27 ++++++++++++--------------- + 1 file changed, 12 insertions(+), 15 deletions(-) + +diff --git a/heartbeat/nginx b/heartbeat/nginx +index 65fd8f2..fadc545 100755 +--- a/heartbeat/nginx ++++ b/heartbeat/nginx +@@ -31,7 +31,7 @@ + # OCF_RESKEY_status10regex + # OCF_RESKEY_status10url + # OCF_RESKEY_client +-# OCF_RESKEY_testurl ++# OCF_RESKEY_test20url + # OCF_RESKEY_test20regex + # OCF_RESKEY_test20conffile + # OCF_RESKEY_test20name +@@ -416,7 +416,7 @@ start_nginx() { + return $OCF_SUCCESS + fi + if +- ocf_run $NGINXD -t -c $CONFIGFILE ++ ocf_run $NGINXD $OPTIONS -t -c $CONFIGFILE + then + : Configuration file $CONFIGFILE looks OK + else +@@ -442,7 +442,7 @@ start_nginx() { + [ $ec -eq $OCF_NOT_RUNNING ] + then + tries=`expr $tries + 1` 
+- ocf_log info "Waiting for $NGINXD -c $CONFIGFILE to come up (try $tries)" ++ ocf_log info "Waiting for $NGINXD $OPTIONS -c $CONFIGFILE to come up (try $tries)" + true + else + false +@@ -727,25 +727,25 @@ For example, you can set this paramter to "wget" if you prefer that to curl. + + + +- ++ + + URL to test. If it does not start with "http", then it's + considered to be relative to the document root address. + +-Level 10 monitor url ++Level 20 monitor url + + + + + +-Regular expression to match in the output of testurl. ++Regular expression to match in the output of test20url. + Case insensitive. + + Level 20 monitor regular expression + + + +- ++ + + A file which contains a more complex test configuration. Could be useful if + you have to check more than one web application or in case sensitive +@@ -785,14 +785,11 @@ Extra options to apply when starting nginx. + + + +- ++ + + + +- +- +- +- ++ + + + +@@ -838,11 +835,11 @@ validate_all_nginx() { + exit $OCF_ERR_CONFIGURED + fi + if +- ocf_run $NGINXD -t -c $CONFIGFILE ++ ocf_run $NGINXD $OPTIONS -t -c $CONFIGFILE + then + : Cool $NGINXD likes $CONFIGFILE + else +- ocf_log err "$NGINXD -t -c $CONFIGFILE reported a configuration error." ++ ocf_log err "$NGINXD $OPTIONS -t -c $CONFIGFILE reported a configuration error." 
+ return $OCF_ERR_CONFIGURED + fi + return $OCF_SUCCESS +@@ -859,7 +856,7 @@ then + OPTIONS="$OCF_RESKEY_options" + CLIENT=${OCF_RESKEY_client} + TESTREGEX=${OCF_RESKEY_status10regex:-'Reading: [0-9]+ Writing: [0-9]+ Waiting: [0-9]+'} +- TESTURL="$OCF_RESKEY_status10url" ++ TESTURL="$OCF_RESKEY_test20url" + TESTREGEX20=${OCF_RESKEY_test20regex} + TESTCONFFILE="$OCF_RESKEY_test20conffile" + TESTNAME="$OCF_RESKEY_test20name" +-- +1.8.4.2 + diff --git a/SOURCES/bz1198681-clvm-activate-vgs-option.patch b/SOURCES/bz1198681-clvm-activate-vgs-option.patch new file mode 100644 index 0000000..c3f8455 --- /dev/null +++ b/SOURCES/bz1198681-clvm-activate-vgs-option.patch @@ -0,0 +1,56 @@ +From b5ac7d0e49bb3b967c3865438067a95606db959a Mon Sep 17 00:00:00 2001 +From: David Vossel +Date: Mon, 27 Apr 2015 16:35:03 -0400 +Subject: [PATCH] High: clvm: activate_vgs option for enable/disable of + automatic vg activation + +--- + heartbeat/clvm | 18 ++++++++++++++++++ + 1 file changed, 18 insertions(+) + +diff --git a/heartbeat/clvm b/heartbeat/clvm +index 9d312cc..23e6f9f 100755 +--- a/heartbeat/clvm ++++ b/heartbeat/clvm +@@ -60,6 +60,18 @@ Options to clvmd. Refer to clvmd.8 for detailed descriptions. + Daemon Options + + ++ ++ ++ ++Whether or not to activate all cluster volume groups after starting ++the clvmd or not. Note that clustered volume groups will always be ++deactivated before the clvmd stops regardless of what this option ++is set to. ++ ++Activate volume groups ++ ++ ++ + + + +@@ -77,6 +89,7 @@ END + ####################################################################### + + : ${OCF_RESKEY_daemon_options:="-d0"} ++: ${OCF_RESKEY_activate_vgs:="true"} + + sbindir=$HA_SBIN_DIR + if [ -z $sbindir ]; then +@@ -322,6 +335,11 @@ start_process() + + clvmd_activate_all() + { ++ ++ if ! 
ocf_is_true "$OCF_RESKEY_activate_vgs"; then ++ ocf_log info "skipping vg activation, activate_vgs is set to $OCF_RESKEY_activate_vgs" ++ return $OCF_SUCCESS ++ fi + # Activate all volume groups by leaving the + # "volume group name" parameter empty + ocf_run ${LVM_VGCHANGE} -aay +-- +1.8.4.2 + diff --git a/SOURCES/bz1200756-ipsrcaddr-misconfig.patch b/SOURCES/bz1200756-ipsrcaddr-misconfig.patch new file mode 100644 index 0000000..d69d1e2 --- /dev/null +++ b/SOURCES/bz1200756-ipsrcaddr-misconfig.patch @@ -0,0 +1,92 @@ +From 3c383f3dbb3b5351b25d33aa6e516ab8fc04a26a Mon Sep 17 00:00:00 2001 +From: David Vossel +Date: Tue, 28 Apr 2015 11:47:21 -0500 +Subject: [PATCH] High: IPsrcaddr: return correct error code during stop when + misconfigured + +--- + heartbeat/IPsrcaddr | 45 +++++++++++++++++++++++++++++++-------------- + 1 file changed, 31 insertions(+), 14 deletions(-) + +diff --git a/heartbeat/IPsrcaddr b/heartbeat/IPsrcaddr +index 8163c0c..33c5be6 100755 +--- a/heartbeat/IPsrcaddr ++++ b/heartbeat/IPsrcaddr +@@ -387,15 +387,27 @@ ip_status() { + + srca_validate_all() { + +- check_binary $AWK +- check_binary $IFCONFIG ++ if [ -z "$OCF_RESKEY_ipaddress" ]; then ++ # usage ++ ocf_exit_reason "Please set OCF_RESKEY_ipaddress to the preferred source IP address!" ++ return $OCF_ERR_CONFIGURED ++ fi ++ ++ ++ if ! [ "x$SYSTYPE" = "xLinux" ]; then ++ # checks after this point are only relevant for linux. 
++ return $OCF_SUCCESS ++ fi ++ ++ check_binary $AWK ++ check_binary $IFCONFIG + + # The IP address should be in good shape + if CheckIP "$ipaddress"; then + : + else + ocf_exit_reason "Invalid IP address [$ipaddress]" +- exit $OCF_ERR_CONFIGURED ++ return $OCF_ERR_CONFIGURED + fi + + if ocf_is_probe; then +@@ -407,8 +419,9 @@ srca_validate_all() { + : + else + ocf_exit_reason "We are not serving [$ipaddress], hence can not make it a preferred source address" +- exit $OCF_ERR_INSTALLED ++ return $OCF_ERR_INSTALLED + fi ++ return $OCF_SUCCESS + } + + if +@@ -430,18 +443,22 @@ case $1 in + ;; + esac + +-if +- [ -z "$OCF_RESKEY_ipaddress" ] +-then +-# usage +- ocf_exit_reason "Please set OCF_RESKEY_ipaddress to the preferred source IP address!" +- exit $OCF_ERR_CONFIGURED +-fi +- + ipaddress="$OCF_RESKEY_ipaddress" + +-if [ "x$SYSTYPE" = "xLinux" ]; then +- srca_validate_all ++srca_validate_all ++rc=$? ++if [ $rc -ne $OCF_SUCCESS ]; then ++ case $1 in ++ # if we can't validate the configuration during a stop, that ++ # means the resources isn't configured correctly. There's no way ++ # to actually stop the resource in this situation because there's ++ # no way it could have even started. Return success here ++ # to indicate that the resource is not running, otherwise the ++ # stop action will fail causing the node to be fenced just because ++ # of a mis configuration. 
++ stop) exit $OCF_SUCCESS;; ++ *) exit $rc;; ++ esac + fi + + findif_out=`$FINDIF -C` +-- +1.8.4.2 + diff --git a/SOURCES/bz1213971-ethmon-opt.patch b/SOURCES/bz1213971-ethmon-opt.patch new file mode 100644 index 0000000..5a1f346 --- /dev/null +++ b/SOURCES/bz1213971-ethmon-opt.patch @@ -0,0 +1,43 @@ +From 3e969507468bea12e1d126b31b222ad248780a80 Mon Sep 17 00:00:00 2001 +From: David Vossel +Date: Wed, 29 Apr 2015 11:13:26 -0500 +Subject: [PATCH 3/6] ethmonitor link_statys_only option + +--- + heartbeat/ethmonitor | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +diff --git a/heartbeat/ethmonitor b/heartbeat/ethmonitor +index a447391..d0ec4ef 100755 +--- a/heartbeat/ethmonitor ++++ b/heartbeat/ethmonitor +@@ -176,6 +176,14 @@ For infiniband devices, this is the port to monitor. + + + ++ ++ ++Only report success based on link status. Do not perform RX counter or arping related connectivity tests. ++ ++link status check only ++ ++ ++ + + + +@@ -378,6 +386,11 @@ if_check () { + return $OCF_NOT_RUNNING + fi + ++ # if using link_status_only, skip RX count and arping related tests ++ if ocf_is_true "$OCF_RESKEY_link_status_only"; then ++ return $OCF_SUCCESS ++ fi ++ + # watch for packet counter changes + ocf_log debug "watch for packet counter changes" + watch_pkt_counter +-- +1.8.4.2 + diff --git a/SOURCES/bz1214360-NovaCompute-update1.patch b/SOURCES/bz1214360-NovaCompute-update1.patch deleted file mode 100644 index 2dabe0b..0000000 --- a/SOURCES/bz1214360-NovaCompute-update1.patch +++ /dev/null @@ -1,494 +0,0 @@ -From 8c92227bce9cc4fe177eea5b2f7c9016e96434f9 Mon Sep 17 00:00:00 2001 -From: David Vossel -Date: Mon, 29 Jun 2015 13:03:17 -0500 -Subject: [PATCH 1/3] bz1214360-NovaCompute-update1.patch - ---- - doc/man/Makefile.am | 1 + - heartbeat/Makefile.am | 3 +- - heartbeat/NovaCompute | 73 ++++++------ - heartbeat/NovaEvacuate | 311 +++++++++++++++++++++++++++++++++++++++++++++++++ - 4 files changed, 352 insertions(+), 36 deletions(-) - create mode 100755 
heartbeat/NovaEvacuate - -diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am -index 42a57fe..d32426b 100644 ---- a/doc/man/Makefile.am -+++ b/doc/man/Makefile.am -@@ -74,6 +74,7 @@ man_MANS = ocf_heartbeat_AoEtarget.7 \ - ocf_heartbeat_ManageRAID.7 \ - ocf_heartbeat_ManageVE.7 \ - ocf_heartbeat_NovaCompute.7 \ -+ ocf_heartbeat_NovaEvacuate.7 \ - ocf_heartbeat_Pure-FTPd.7 \ - ocf_heartbeat_Raid1.7 \ - ocf_heartbeat_Route.7 \ -diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am -index 0bebf97..1034632 100644 ---- a/heartbeat/Makefile.am -+++ b/heartbeat/Makefile.am -@@ -52,7 +52,8 @@ send_ua_SOURCES = send_ua.c IPv6addr_utils.c - IPv6addr_LDADD = -lplumb $(LIBNETLIBS) - send_ua_LDADD = $(LIBNETLIBS) - --osp_SCRIPTS = NovaCompute -+osp_SCRIPTS = NovaCompute \ -+ NovaEvacuate - - ocf_SCRIPTS = ClusterMon \ - CTDB \ -diff --git a/heartbeat/NovaCompute b/heartbeat/NovaCompute -index f71abeb..09eee38 100644 ---- a/heartbeat/NovaCompute -+++ b/heartbeat/NovaCompute -@@ -107,15 +107,26 @@ Disable shared storage recovery for instances. Use at your own risk! - - - -+ -+ -+How long to wait for nova to finish evacuating instances elsewhere -+before starting nova-compute. Only used when the agent detects -+evacuations might be in progress. -+ -+You may need to increase the start timeout when increasing this value. -+ -+Delay to allow evacuations time to complete -+ -+ -+ - - - -- -+ - - - - -- - - - END -@@ -132,7 +143,7 @@ sigterm_handler() { - - nova_usage() { - cat < -+ -+ -+1.0 -+ -+ -+Facility for tacking a list of compute nodes and reliably evacuating the ones that fence_evacuate has flagged. 
-+ -+Evacuator for OpenStack Nova Compute Server -+ -+ -+ -+ -+ -+Authorization URL for connecting to keystone in admin context -+ -+Authorization URL -+ -+ -+ -+ -+ -+Username for connecting to keystone in admin context -+ -+Username -+ -+ -+ -+ -+Password for connecting to keystone in admin context -+ -+Password -+ -+ -+ -+ -+ -+Tenant name for connecting to keystone in admin context. -+Note that with Keystone V3 tenant names are only unique within a domain. -+ -+Tenant name -+ -+ -+ -+ -+ -+Nova API location (internal, public or admin URL) -+ -+Nova API location (internal, public or admin URL) -+ -+ -+ -+ -+ -+Disable shared storage recovery for instances. Use at your own risk! -+ -+Disable shared storage recovery for instances -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+END -+} -+ -+####################################################################### -+ -+# don't exit on TERM, to test that lrmd makes sure that we do exit -+trap sigterm_handler TERM -+sigterm_handler() { -+ ocf_log info "They use TERM to bring us down. No such luck." 
-+ return -+} -+ -+evacuate_usage() { -+ cat < +Date: Mon, 29 Jun 2015 13:03:17 -0500 +Subject: [PATCH 1/3] bz1214360-NovaCompute-update1.patch + +--- + doc/man/Makefile.am | 1 + + heartbeat/Makefile.am | 3 +- + heartbeat/NovaCompute | 73 ++++++------ + heartbeat/NovaEvacuate | 311 +++++++++++++++++++++++++++++++++++++++++++++++++ + 4 files changed, 352 insertions(+), 36 deletions(-) + create mode 100755 heartbeat/NovaEvacuate + +diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am +index 42a57fe..d32426b 100644 +--- a/doc/man/Makefile.am ++++ b/doc/man/Makefile.am +@@ -74,6 +74,7 @@ man_MANS = ocf_heartbeat_AoEtarget.7 \ + ocf_heartbeat_ManageRAID.7 \ + ocf_heartbeat_ManageVE.7 \ + ocf_heartbeat_NovaCompute.7 \ ++ ocf_heartbeat_NovaEvacuate.7 \ + ocf_heartbeat_Pure-FTPd.7 \ + ocf_heartbeat_Raid1.7 \ + ocf_heartbeat_Route.7 \ +diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am +index 0bebf97..1034632 100644 +--- a/heartbeat/Makefile.am ++++ b/heartbeat/Makefile.am +@@ -52,7 +52,8 @@ send_ua_SOURCES = send_ua.c IPv6addr_utils.c + IPv6addr_LDADD = -lplumb $(LIBNETLIBS) + send_ua_LDADD = $(LIBNETLIBS) + +-osp_SCRIPTS = NovaCompute ++osp_SCRIPTS = NovaCompute \ ++ NovaEvacuate + + ocf_SCRIPTS = ClusterMon \ + CTDB \ +diff --git a/heartbeat/NovaCompute b/heartbeat/NovaCompute +index f71abeb..09eee38 100644 +--- a/heartbeat/NovaCompute ++++ b/heartbeat/NovaCompute +@@ -107,15 +107,26 @@ Disable shared storage recovery for instances. Use at your own risk! + + + ++ ++ ++How long to wait for nova to finish evacuating instances elsewhere ++before starting nova-compute. Only used when the agent detects ++evacuations might be in progress. ++ ++You may need to increase the start timeout when increasing this value. 
++ ++Delay to allow evacuations time to complete ++ ++ ++ + + + +- ++ + + + + +- + + + END +@@ -132,7 +143,7 @@ sigterm_handler() { + + nova_usage() { + cat < ++ ++ ++1.0 ++ ++ ++Facility for tacking a list of compute nodes and reliably evacuating the ones that fence_evacuate has flagged. ++ ++Evacuator for OpenStack Nova Compute Server ++ ++ ++ ++ ++ ++Authorization URL for connecting to keystone in admin context ++ ++Authorization URL ++ ++ ++ ++ ++ ++Username for connecting to keystone in admin context ++ ++Username ++ ++ ++ ++ ++Password for connecting to keystone in admin context ++ ++Password ++ ++ ++ ++ ++ ++Tenant name for connecting to keystone in admin context. ++Note that with Keystone V3 tenant names are only unique within a domain. ++ ++Tenant name ++ ++ ++ ++ ++ ++Nova API location (internal, public or admin URL) ++ ++Nova API location (internal, public or admin URL) ++ ++ ++ ++ ++ ++Disable shared storage recovery for instances. Use at your own risk! ++ ++Disable shared storage recovery for instances ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++END ++} ++ ++####################################################################### ++ ++# don't exit on TERM, to test that lrmd makes sure that we do exit ++trap sigterm_handler TERM ++sigterm_handler() { ++ ocf_log info "They use TERM to bring us down. No such luck." ++ return ++} ++ ++evacuate_usage() { ++ cat < +Date: Thu, 25 Jun 2015 16:27:47 -0500 +Subject: [PATCH 2/3] bz1214781-lvm-partial-activation-fix.patch + +--- + heartbeat/LVM | 26 ++++++++++++++++++++++++-- + 1 file changed, 24 insertions(+), 2 deletions(-) + +diff --git a/heartbeat/LVM b/heartbeat/LVM +index 58cbe83..4b9c167 100755 +--- a/heartbeat/LVM ++++ b/heartbeat/LVM +@@ -568,8 +568,30 @@ LVM_validate_all() { + ## + VGOUT=`vgck ${VOLUME} 2>&1` + if [ $? -ne 0 ]; then +- ocf_exit_reason "Volume group [$VOLUME] does not exist or contains error! 
${VGOUT}" +- exit $OCF_ERR_GENERIC ++ # Inconsistency might be due to missing physical volumes, which doesn't ++ # automatically mean we should fail. If partial_activation=true then ++ # we should let start try to handle it, or if no PVs are listed as ++ # "unknown device" then another node may have marked a device missing ++ # where we have access to all of them and can start without issue. ++ if vgs -o pv_attr --noheadings $OCF_RESKEY_volgrpname 2>/dev/null | grep 'm' > /dev/null 2>&1; then ++ if vgs -o pv_name --noheadings $OCF_RESKEY_volgrpname 2>/dev/null | grep 'unknown device' > /dev/null 2>&1; then ++ if ! ocf_is_true "$OCF_RESKEY_partial_activation" ; then ++ # We are missing devices and cannot activate partially ++ ocf_exit_reason "Volume group [$VOLUME] has devices missing. Consider partial_activation=true to attempt to activate partially" ++ exit $OCF_ERR_GENERIC ++ else ++ # We are missing devices but are allowed to activate partially. ++ # Assume that caused the vgck failure and carry on ++ ocf_log warn "Volume group inconsistency detected with missing device(s) and partial_activation enabled. Proceeding with requested action." ++ fi ++ fi ++ # else the vg is partial but all devices are accounted for, so another ++ # node must have marked the device missing. Proceed. ++ else ++ # vgck failure was for something other than missing devices ++ ocf_exit_reason "Volume group [$VOLUME] does not exist or contains error! 
${VGOUT}" ++ exit $OCF_ERR_GENERIC ++ fi + fi + + ## +-- +1.8.4.2 + diff --git a/SOURCES/bz1223615-apache-includes-fix.patch.patch b/SOURCES/bz1223615-apache-includes-fix.patch.patch new file mode 100644 index 0000000..22105ee --- /dev/null +++ b/SOURCES/bz1223615-apache-includes-fix.patch.patch @@ -0,0 +1,27 @@ +From 72482ca1e117f426378a700a8b1e01443e0fb597 Mon Sep 17 00:00:00 2001 +From: David Vossel +Date: Thu, 25 Jun 2015 16:30:20 -0500 +Subject: [PATCH 3/3] bz1223615-apache-includes-fix.patch + +--- + heartbeat/apache-conf.sh | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/heartbeat/apache-conf.sh b/heartbeat/apache-conf.sh +index dc3426f..a3c8930 100644 +--- a/heartbeat/apache-conf.sh ++++ b/heartbeat/apache-conf.sh +@@ -24,7 +24,9 @@ apachecat() { + function procline() { + split($0,a); + if( a[1]~/^[Ii]nclude$/ ) { +- procinclude(a[2]); ++ includedir=a[2]; ++ gsub("\"","",includedir); ++ procinclude(includedir); + } else { + if( a[1]=="ServerRoot" ) { + rootdir=a[2]; +-- +1.8.4.2 + diff --git a/SOURCES/bz1227293-dhcpd-chroot-fix.patch.patch b/SOURCES/bz1227293-dhcpd-chroot-fix.patch.patch new file mode 100644 index 0000000..7435dd2 --- /dev/null +++ b/SOURCES/bz1227293-dhcpd-chroot-fix.patch.patch @@ -0,0 +1,49 @@ +From 6f8a0aa5c0f6c1e4965e4ce10d62ba83ae9f834e Mon Sep 17 00:00:00 2001 +From: David Vossel +Date: Mon, 29 Jun 2015 13:10:42 -0500 +Subject: [PATCH 3/3] bz1227293-dhcpd-chroot-fix.patch + +--- + heartbeat/dhcpd | 12 ++++++++++-- + 1 file changed, 10 insertions(+), 2 deletions(-) + +diff --git a/heartbeat/dhcpd b/heartbeat/dhcpd +index 67b529e..89a9578 100755 +--- a/heartbeat/dhcpd ++++ b/heartbeat/dhcpd +@@ -38,6 +38,14 @@ OCF_RESKEY_leases_default="/db/dhcpd.leases" + OCF_RESKEY_interface_default="" + OCF_RESKEY_includes_default="" + ++# On some systems, the chrooted default is slightly different. ++# Lets do our best to support both by default. ++if [ ! 
-d "$OCF_RESKEY_chrooted_path_default" ]; then ++ if [ -d "/var/lib/dhcpd" ]; then ++ OCF_RESKEY_chrooted_path_default="/var/lib/dhcpd" ++ fi ++fi ++ + : ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}} + : ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}} + : ${OCF_RESKEY_user=${OCF_RESKEY_user_default}} +@@ -302,7 +310,7 @@ dhcpd_initialize_chroot() { + { ocf_exit_reason "could not copy $i to chroot jail"; return $OCF_ERR_GENERIC; } + done + +- libdir=$(basename $(echo /var/lib/dhcp/lib*)) ++ libdir=$(basename $(echo ${OCF_RESKEY_chrooted_path}/lib*)) + if test -x /usr/bin/ldd ; then + get_ldd_deps() + { +@@ -327,7 +335,7 @@ dhcpd_initialize_chroot() { + done | sort -u` + for i in $cplibs ; do + if [ -s "$i" ]; then +- cp -pL "$i" "/var/lib/dhcp/$libdir/" || ++ cp -pL "$i" "${OCF_RESKEY_chrooted_path}/$libdir/" || + { ocf_exit_reason "could not copy $i to chroot jail"; return $OCF_ERR_GENERIC; } + fi + done +-- +1.8.4.2 + diff --git a/SOURCES/bz1231032-redis-update.patch b/SOURCES/bz1231032-redis-update.patch deleted file mode 100644 index 03ddf4b..0000000 --- a/SOURCES/bz1231032-redis-update.patch +++ /dev/null @@ -1,121 +0,0 @@ -From c982683ac8c2de64f69c5f47727242c65e00df90 Mon Sep 17 00:00:00 2001 -From: David Vossel -Date: Mon, 29 Jun 2015 13:07:14 -0500 -Subject: [PATCH 2/3] bz1231032-redis-update.patch - ---- - heartbeat/redis | 51 ++++++++++++++++++++++++++++++++++++++++++++++----- - 1 file changed, 46 insertions(+), 5 deletions(-) - -diff --git a/heartbeat/redis b/heartbeat/redis -index 6b479b2..b63a2b9 100644 ---- a/heartbeat/redis -+++ b/heartbeat/redis -@@ -20,6 +20,7 @@ fi - - CHECK_SLAVE_STATE=0 - -+REDIS_CHECK_DUMP="/usr/bin/redis-check-dump" - REDIS_SERVER="$OCF_RESKEY_bin" - REDIS_CLIENT="$OCF_RESKEY_client_bin" - REDIS_CONFIG="$OCF_RESKEY_config" -@@ -29,6 +30,17 @@ REDIS_PIDFILE="$OCF_RESKEY_rundir/$OCF_RESKEY_pidfile_name" - REDIS_SOCKET="$OCF_RESKEY_rundir/$OCF_RESKEY_socket_name" - REDIS_REPLICATION_PORT="$OCF_RESKEY_port" - -+if ! 
[ -f $REDIS_CHECK_DUMP ]; then -+ REDIS_CHECK_DUMP="$(which redis-check-dump 2>/dev/null)" -+fi -+ -+if [ -f "$REDIS_CONFIG" ]; then -+ REDIS_DUMP_DIR="$(cat $REDIS_CONFIG | grep "^\s*dir\s" | awk '{ print $2 }' 2>/dev/null)" -+ REDIS_DUMP_FILE="$(cat $REDIS_CONFIG | grep "^\s*dbfilename\s" | awk '{ print $2 }' 2>/dev/null)" -+fi -+: ${REDIS_DUMP_DIR:=/var/lib/redis/} -+: ${REDIS_DUMP_FILE:=dump.rdb} -+ - function meta_data() { - cat < -@@ -289,6 +301,14 @@ function monitor() { - return $OCF_SUCCESS - } - -+function check_dump_file() -+{ -+ if ! have_binary "$REDIS_CHECK_DUMP"; then -+ return 0 -+ fi -+ $REDIS_CHECK_DUMP ${REDIS_DUMP_DIR}/${REDIS_DUMP_FILE} 2>&1 -+} -+ - function start() { - monitor - status=$? -@@ -301,6 +321,16 @@ function start() { - [[ ! -d "$REDIS_RUNDIR" ]] && mkdir -p "$REDIS_RUNDIR" - chown -R "$REDIS_USER" "$REDIS_RUNDIR" - -+ # check for 0 byte database dump file. This is an unrecoverable start -+ # condition that we can avoid by deleting the 0 byte database file. -+ if [ -f "${REDIS_DUMP_DIR}/${REDIS_DUMP_FILE}" ]; then -+ local size="$(stat --format "%s" ${REDIS_DUMP_DIR}/${REDIS_DUMP_FILE})" -+ if [ "$?" -eq "0" ] && [ "$size" -eq "0" ]; then -+ ocf_log notice "Detected 0 byte ${REDIS_DUMP_FILE}, deleting zero length file to avoid start failure." -+ rm -f ${REDIS_DUMP_DIR}/${REDIS_DUMP_FILE} -+ fi -+ fi -+ - ocf_log info "start: $REDIS_SERVER --daemonize yes --unixsocket '$REDIS_SOCKET' --pidfile '$REDIS_PIDFILE'" - output="$(su "$REDIS_USER" -s /bin/sh -c "cd '$REDIS_RUNDIR'; exec '$REDIS_SERVER' '$REDIS_CONFIG' --daemonize yes --unixsocket '$REDIS_SOCKET' --pidfile '$REDIS_PIDFILE'" 2>&1)" - -@@ -325,7 +355,8 @@ function start() { - # It's possible that the `pidof` will pick up a different redis, but in that case, the start operation will just time out - sleep 1 - else -- ocf_log err "start: Unknown error waiting for redis to start" -+ check_output="$(check_dump_file)" -+ ocf_log err "start: Unknown error waiting for redis to start. 
redis-check-dump output=${check_output//$'\n'/; }" - return $OCF_ERR_GENERIC - fi - done -@@ -338,7 +369,8 @@ function start() { - return $OCF_SUCCESS - fi - -- ocf_log err "start: Unknown error starting redis. output=${output//$'\n'/; }" -+ check_output="$(check_dump_file)" -+ ocf_log err "start: Unknown error starting redis. redis-server output=${output//$'\n'/; } redis-check-dump output=${check_output//$'\n'/; }" - return $status - } - -@@ -427,14 +459,23 @@ function demote() { - - redis_client slaveof "$master_host" "$master_port" - -- # wait briefly for the slave to connect to the master -- for (( c=1; c <= 20; c++ )) -- do -+ # Wait forever for the slave to connect to the master and finish the -+ # sync. Timeout is controlled by Pacemaker "op start timeout=XX". -+ # -+ # hint: redis master_link_status will only come "up" when -+ # the SYNC with the master has completed. -+ # This can take an arbitraty time (data) and should -+ # only be parametrized by the start operation timeout -+ # by the administrator, not by this resource agent code -+ while true; do -+ # Wait infinite if replication is syncing -+ # Then start/demote operation timeout determines timeout - monitor - status=$? 
- if (( status == OCF_SUCCESS )); then - return $OCF_SUCCESS - fi -+ - sleep 1 - done - --- -1.8.4.2 - diff --git a/SOURCES/bz1231032-redis-update.patch.patch b/SOURCES/bz1231032-redis-update.patch.patch new file mode 100644 index 0000000..03ddf4b --- /dev/null +++ b/SOURCES/bz1231032-redis-update.patch.patch @@ -0,0 +1,121 @@ +From c982683ac8c2de64f69c5f47727242c65e00df90 Mon Sep 17 00:00:00 2001 +From: David Vossel +Date: Mon, 29 Jun 2015 13:07:14 -0500 +Subject: [PATCH 2/3] bz1231032-redis-update.patch + +--- + heartbeat/redis | 51 ++++++++++++++++++++++++++++++++++++++++++++++----- + 1 file changed, 46 insertions(+), 5 deletions(-) + +diff --git a/heartbeat/redis b/heartbeat/redis +index 6b479b2..b63a2b9 100644 +--- a/heartbeat/redis ++++ b/heartbeat/redis +@@ -20,6 +20,7 @@ fi + + CHECK_SLAVE_STATE=0 + ++REDIS_CHECK_DUMP="/usr/bin/redis-check-dump" + REDIS_SERVER="$OCF_RESKEY_bin" + REDIS_CLIENT="$OCF_RESKEY_client_bin" + REDIS_CONFIG="$OCF_RESKEY_config" +@@ -29,6 +30,17 @@ REDIS_PIDFILE="$OCF_RESKEY_rundir/$OCF_RESKEY_pidfile_name" + REDIS_SOCKET="$OCF_RESKEY_rundir/$OCF_RESKEY_socket_name" + REDIS_REPLICATION_PORT="$OCF_RESKEY_port" + ++if ! [ -f $REDIS_CHECK_DUMP ]; then ++ REDIS_CHECK_DUMP="$(which redis-check-dump 2>/dev/null)" ++fi ++ ++if [ -f "$REDIS_CONFIG" ]; then ++ REDIS_DUMP_DIR="$(cat $REDIS_CONFIG | grep "^\s*dir\s" | awk '{ print $2 }' 2>/dev/null)" ++ REDIS_DUMP_FILE="$(cat $REDIS_CONFIG | grep "^\s*dbfilename\s" | awk '{ print $2 }' 2>/dev/null)" ++fi ++: ${REDIS_DUMP_DIR:=/var/lib/redis/} ++: ${REDIS_DUMP_FILE:=dump.rdb} ++ + function meta_data() { + cat < +@@ -289,6 +301,14 @@ function monitor() { + return $OCF_SUCCESS + } + ++function check_dump_file() ++{ ++ if ! have_binary "$REDIS_CHECK_DUMP"; then ++ return 0 ++ fi ++ $REDIS_CHECK_DUMP ${REDIS_DUMP_DIR}/${REDIS_DUMP_FILE} 2>&1 ++} ++ + function start() { + monitor + status=$? +@@ -301,6 +321,16 @@ function start() { + [[ ! 
-d "$REDIS_RUNDIR" ]] && mkdir -p "$REDIS_RUNDIR" + chown -R "$REDIS_USER" "$REDIS_RUNDIR" + ++ # check for 0 byte database dump file. This is an unrecoverable start ++ # condition that we can avoid by deleting the 0 byte database file. ++ if [ -f "${REDIS_DUMP_DIR}/${REDIS_DUMP_FILE}" ]; then ++ local size="$(stat --format "%s" ${REDIS_DUMP_DIR}/${REDIS_DUMP_FILE})" ++ if [ "$?" -eq "0" ] && [ "$size" -eq "0" ]; then ++ ocf_log notice "Detected 0 byte ${REDIS_DUMP_FILE}, deleting zero length file to avoid start failure." ++ rm -f ${REDIS_DUMP_DIR}/${REDIS_DUMP_FILE} ++ fi ++ fi ++ + ocf_log info "start: $REDIS_SERVER --daemonize yes --unixsocket '$REDIS_SOCKET' --pidfile '$REDIS_PIDFILE'" + output="$(su "$REDIS_USER" -s /bin/sh -c "cd '$REDIS_RUNDIR'; exec '$REDIS_SERVER' '$REDIS_CONFIG' --daemonize yes --unixsocket '$REDIS_SOCKET' --pidfile '$REDIS_PIDFILE'" 2>&1)" + +@@ -325,7 +355,8 @@ function start() { + # It's possible that the `pidof` will pick up a different redis, but in that case, the start operation will just time out + sleep 1 + else +- ocf_log err "start: Unknown error waiting for redis to start" ++ check_output="$(check_dump_file)" ++ ocf_log err "start: Unknown error waiting for redis to start. redis-check-dump output=${check_output//$'\n'/; }" + return $OCF_ERR_GENERIC + fi + done +@@ -338,7 +369,8 @@ function start() { + return $OCF_SUCCESS + fi + +- ocf_log err "start: Unknown error starting redis. output=${output//$'\n'/; }" ++ check_output="$(check_dump_file)" ++ ocf_log err "start: Unknown error starting redis. redis-server output=${output//$'\n'/; } redis-check-dump output=${check_output//$'\n'/; }" + return $status + } + +@@ -427,14 +459,23 @@ function demote() { + + redis_client slaveof "$master_host" "$master_port" + +- # wait briefly for the slave to connect to the master +- for (( c=1; c <= 20; c++ )) +- do ++ # Wait forever for the slave to connect to the master and finish the ++ # sync. 
Timeout is controlled by Pacemaker "op start timeout=XX". ++ # ++ # hint: redis master_link_status will only come "up" when ++ # the SYNC with the master has completed. ++ # This can take an arbitraty time (data) and should ++ # only be parametrized by the start operation timeout ++ # by the administrator, not by this resource agent code ++ while true; do ++ # Wait infinite if replication is syncing ++ # Then start/demote operation timeout determines timeout + monitor + status=$? + if (( status == OCF_SUCCESS )); then + return $OCF_SUCCESS + fi ++ + sleep 1 + done + +-- +1.8.4.2 + diff --git a/SOURCES/bz1232376-oracle-agent-update.diff b/SOURCES/bz1232376-oracle-agent-update.diff new file mode 100644 index 0000000..3a8eb12 --- /dev/null +++ b/SOURCES/bz1232376-oracle-agent-update.diff @@ -0,0 +1,246 @@ +diff --git a/heartbeat/oracle b/heartbeat/oracle +index 5ecc2f3..c629eb6 100755 +--- a/heartbeat/oracle ++++ b/heartbeat/oracle +@@ -27,6 +27,9 @@ + # OCF_RESKEY_ipcrm (optional; defaults to "instance") + # OCF_RESKEY_clear_backupmode (optional; default to "false") + # OCF_RESKEY_shutdown_method (optional; default to "checkpoint/abort") ++# OCF_RESKEY_monuser (optional; defaults to "OCFMON") ++# OCF_RESKEY_monpassword (optional; defaults to "OCFMON") ++# OCF_RESKEY_monprofile (optional; defaults to "OCFMONPROFILE") + # + # Initialization: + +@@ -56,6 +59,11 @@ oracle_usage() { + ! + } + ++# Defaults ++OCF_RESKEY_monuser_default="OCFMON" ++OCF_RESKEY_monpassword_default="OCFMON" ++OCF_RESKEY_monprofile_default="OCFMONPROFILE" ++ + oracle_meta_data() { + cat < +@@ -100,6 +108,39 @@ If this does not work for you, just set it explicitely. + + + ++ ++ ++Monitoring user name. Every connection as ++sysdba is logged in an audit log. This can ++result in a large number of new files created. ++A new user is created (if it doesn't exist) in ++the start action and subsequently used in monitor. ++It should have very limited rights. 
Make sure ++that the password for this user does not expire. ++ ++monuser ++ ++ ++ ++ ++ ++Password for the monitoring user. Make sure ++that the password for this user does not expire. ++ ++monpassword ++ ++ ++ ++ ++ ++Profile used by the monitoring user. If the ++profile does not exist, it will be created ++with a non-expiring password. ++ ++monprofile ++ ++ ++ + + + Sometimes IPC objects (shared memory segments and semaphores) +@@ -216,7 +257,7 @@ execsql() { + if [ "$US" = "$ORACLE_OWNER" ]; then + sqlplus -S /nolog + else +- su - $ORACLE_OWNER -c ". $ORA_ENVF; sqlplus -S /nolog" ++ su - $ORACLE_OWNER -s /bin/sh -c ". $ORA_ENVF; sqlplus -S /nolog" + fi + } + +@@ -250,7 +291,7 @@ dbasql() { + runsql "connect / as sysdba" $* + } + monsql() { +- runsql "connect $MONUSR/$MONUSR" $* ++ runsql "connect $MONUSR/\"$MONPWD\"" $* + } + # use dbasql_one if the query should result in a single line output + # at times people stuff commands in oracle .profile +@@ -325,22 +366,73 @@ getipc() { + echo "oradebug tracefile_name" + echo "oradebug ipc" + } ++show_mon_profile() { ++ echo "select PROFILE from dba_profiles where PROFILE='$MONPROFILE';" ++} ++mk_mon_profile() { ++ cat</dev/null && ++ output=`dbasql show_mon_profile` ++ if echo "$output" | grep -iw "^$MONPROFILE" >/dev/null; then + return 0 ++ fi ++ output=`dbasql mk_mon_profile show_mon_profile` ++ if echo "$output" | grep -iw "^$MONPROFILE" >/dev/null; then ++ return 0 ++ else ++ ocf_log err "could not create $MONPROFILE oracle profile" ++ ocf_log err "sqlplus output: $output" ++ return 1 ++ fi ++} ++check_mon_user() { ++ local output ++ local output2 ++ ++ output=`dbasql show_mon_user` ++ if echo "$output" | grep -iw "^$MONUSR" >/dev/null; then ++ if echo "$output" | grep -w "EXPIRED" >/dev/null; then ++ dbasql reset_mon_user_password ++ fi ++ output=`dbasql show_mon_user_profile` ++ if echo "$output" | grep -iw "^$MONPROFILE" >/dev/null; then ++ return 0 ++ else ++ output=`dbasql set_mon_user_profile` ++ 
output2=`dbasql show_mon_user_profile` ++ if echo "$output2" | grep -iw "^$MONPROFILE" >/dev/null; then ++ return 0 ++ fi ++ ocf_log err "could not set profile for $MONUSR oracle user" ++ ocf_log err "sqlplus output: $output( $output2 )" ++ return 1 ++ fi ++ fi + output=`dbasql mk_mon_user show_mon_user` +- if echo "$output" | grep -w "^$MONUSR" >/dev/null; then ++ if echo "$output" | grep -iw "^$MONUSR" >/dev/null; then + return 0 + else + ocf_log err "could not create $MONUSR oracle user" +@@ -417,7 +509,7 @@ ipcdesc() { + } + rmipc() { + local what=$1 id=$2 +- ipcs -$what | filteroraipc | grep -w $id >/dev/null 2>&1 || ++ ipcs -$what | filteroraipc | grep -iw $id >/dev/null 2>&1 || + return + ocf_log info "Removing `ipcdesc $what` $id." + ipcrm -$what $id +@@ -447,6 +539,8 @@ is_proc_running() { + # instance in OPEN state? + instance_live() { + local status=`monsql_one dbstat` ++ [ "$status" = OPEN ] && return 0 ++ status=`dbasql_one dbstat` + if [ "$status" = OPEN ]; then + return 0 + else +@@ -473,7 +567,7 @@ ora_cleanup() { + } + + oracle_getconfig() { +- ora_common_getconfig "$OCF_RESKEY_sid" "$OCF_RESKEY_home" "$OCF_RESKEY_user" "$OCF_RESKEY_tns_admin" ++ ora_common_getconfig "$OCF_RESKEY_sid" "$OCF_RESKEY_home" "$OCF_RESKEY_user" + + clear_backupmode=${OCF_RESKEY_clear_backupmode:-"false"} + shutdown_method=${OCF_RESKEY_shutdown_method:-"checkpoint/abort"} +@@ -493,7 +587,7 @@ oracle_getconfig() { + oracle_start() { + local status output + if is_proc_running; then +- status="`monsql_one dbstat`" ++ status="`dbasql_one dbstat`" + case "$status" in + "OPEN") + : nothing to be done, we can leave right now +@@ -541,6 +635,11 @@ oracle_start() { + fi + output=`dbasql dbopen` + ++ # check/create the monitor profile ++ if ! check_mon_profile; then ++ return $OCF_ERR_GENERIC ++ fi ++ + # check/create the monitor user + if ! 
check_mon_user; then + return $OCF_ERR_GENERIC +@@ -650,7 +749,12 @@ show_procs() { + proc_pids() { show_procs | awk '{print $1}'; } + PROCS_CLEANUP_TIME="30" + +-MONUSR="OCFMON" ++MONUSR=${OCF_RESKEY_monuser:-$OCF_RESKEY_monuser_default} ++MONPWD=${OCF_RESKEY_monpassword:-$OCF_RESKEY_monpassword_default} ++MONPROFILE=${OCF_RESKEY_monprofile_default:-$OCF_RESKEY_monprofile_default} ++ ++MONUSR=$(echo $MONUSR | awk '{print toupper($0)}') ++MONPROFILE=$(echo $MONPROFILE | awk '{print toupper($0)}') + OCF_REQUIRED_PARAMS="sid" + OCF_REQUIRED_BINARIES="sqlplus" + ocf_rarun $* +diff --git a/heartbeat/oralsnr b/heartbeat/oralsnr +index 2409017..a91eeab 100755 +--- a/heartbeat/oralsnr ++++ b/heartbeat/oralsnr +@@ -158,7 +158,7 @@ runasdba() { + ( + echo ". $ORA_ENVF" + cat +- ) | su - $ORACLE_OWNER ++ ) | su -s $SH - $ORACLE_OWNER + fi + } + +@@ -268,7 +268,7 @@ oralsnr_validate_all() { + # used in ora-common.sh + show_procs() { + ps -e -o pid,user,args | +- grep '[t]nslsnr' | grep -w "$listener" | grep -w "$ORACLE_OWNER" ++ grep '[t]nslsnr' | grep -i -w "$listener" | grep -w "$ORACLE_OWNER" + } + proc_pids() { show_procs | awk '{print $1}'; } + PROCS_CLEANUP_TIME="10" diff --git a/SOURCES/bz1251484-redis-client-passwd-support.patch b/SOURCES/bz1251484-redis-client-passwd-support.patch new file mode 100644 index 0000000..f450683 --- /dev/null +++ b/SOURCES/bz1251484-redis-client-passwd-support.patch @@ -0,0 +1,33 @@ +commit fe53056f225fadae184a0ab79f1f96430854812f +Author: David Vossel +Date: Thu Aug 13 14:11:30 2015 -0400 + + High: redis: use required client pass word when set + +diff --git a/heartbeat/redis b/heartbeat/redis +index e1d0795..65abb2a 100755 +--- a/heartbeat/redis ++++ b/heartbeat/redis +@@ -218,7 +218,11 @@ function set_score() + + function redis_client() { + ocf_log debug "redis_client: '$REDIS_CLIENT' -s '$REDIS_SOCKET' $@" +- "$REDIS_CLIENT" -s "$REDIS_SOCKET" "$@" | sed 's/\r//' ++ if [ -n "$clientpasswd" ]; then ++ "$REDIS_CLIENT" -s "$REDIS_SOCKET" 
-a "$clientpasswd" "$@" | sed 's/\r//' ++ else ++ "$REDIS_CLIENT" -s "$REDIS_SOCKET" "$@" | sed 's/\r//' ++ fi + } + + function simple_status() { +@@ -487,6 +491,9 @@ function validate() { + } + + NODENAME=$(ocf_local_nodename) ++if [ -f "$REDIS_CONFIG" ]; then ++ clientpasswd="$(cat $REDIS_CONFIG | sed -n -e 's/^\s*requirepass\s*\(.*\)\s*$/\1/p' | tail -n 1)" ++fi + + ocf_log debug "action=${1:-$__OCF_ACTION} notify_type=${OCF_RESKEY_CRM_meta_notify_type} notify_operation=${OCF_RESKEY_CRM_meta_notify_operation} master_host=${OCF_RESKEY_CRM_meta_notify_master_uname} slave_host=${OCF_RESKEY_CRM_meta_notify_slave_uname} promote_host=${OCF_RESKEY_CRM_meta_notify_promote_uname} demote_host=${OCF_RESKEY_CRM_meta_notify_demote_uname}; params: bin=${OCF_RESKEY_bin} client_bin=${OCF_RESKEY_client_bin} config=${OCF_RESKEY_config} user=${OCF_RESKEY_user} rundir=${OCF_RESKEY_rundir} port=${OCF_RESKEY_port}" + diff --git a/SOURCES/bz1259595-redis-client-passwd-support.patch b/SOURCES/bz1259595-redis-client-passwd-support.patch deleted file mode 100644 index f450683..0000000 --- a/SOURCES/bz1259595-redis-client-passwd-support.patch +++ /dev/null @@ -1,33 +0,0 @@ -commit fe53056f225fadae184a0ab79f1f96430854812f -Author: David Vossel -Date: Thu Aug 13 14:11:30 2015 -0400 - - High: redis: use required client pass word when set - -diff --git a/heartbeat/redis b/heartbeat/redis -index e1d0795..65abb2a 100755 ---- a/heartbeat/redis -+++ b/heartbeat/redis -@@ -218,7 +218,11 @@ function set_score() - - function redis_client() { - ocf_log debug "redis_client: '$REDIS_CLIENT' -s '$REDIS_SOCKET' $@" -- "$REDIS_CLIENT" -s "$REDIS_SOCKET" "$@" | sed 's/\r//' -+ if [ -n "$clientpasswd" ]; then -+ "$REDIS_CLIENT" -s "$REDIS_SOCKET" -a "$clientpasswd" "$@" | sed 's/\r//' -+ else -+ "$REDIS_CLIENT" -s "$REDIS_SOCKET" "$@" | sed 's/\r//' -+ fi - } - - function simple_status() { -@@ -487,6 +491,9 @@ function validate() { - } - - NODENAME=$(ocf_local_nodename) -+if [ -f "$REDIS_CONFIG" ]; then -+ 
clientpasswd="$(cat $REDIS_CONFIG | sed -n -e 's/^\s*requirepass\s*\(.*\)\s*$/\1/p' | tail -n 1)" -+fi - - ocf_log debug "action=${1:-$__OCF_ACTION} notify_type=${OCF_RESKEY_CRM_meta_notify_type} notify_operation=${OCF_RESKEY_CRM_meta_notify_operation} master_host=${OCF_RESKEY_CRM_meta_notify_master_uname} slave_host=${OCF_RESKEY_CRM_meta_notify_slave_uname} promote_host=${OCF_RESKEY_CRM_meta_notify_promote_uname} demote_host=${OCF_RESKEY_CRM_meta_notify_demote_uname}; params: bin=${OCF_RESKEY_bin} client_bin=${OCF_RESKEY_client_bin} config=${OCF_RESKEY_config} user=${OCF_RESKEY_user} rundir=${OCF_RESKEY_rundir} port=${OCF_RESKEY_port}" - diff --git a/SOURCES/bz773399-netmast-error.patch b/SOURCES/bz773399-netmast-error.patch new file mode 100644 index 0000000..42ea233 --- /dev/null +++ b/SOURCES/bz773399-netmast-error.patch @@ -0,0 +1,105 @@ +diff --git a/heartbeat/IPaddr2 b/heartbeat/IPaddr2 +index 2791ea0..74bdef1 100755 +--- a/heartbeat/IPaddr2 ++++ b/heartbeat/IPaddr2 +@@ -446,7 +446,7 @@ ip_init() { + ocf_log warn "[$FINDIF] failed" + exit $OCF_SUCCESS + else +- ocf_exit_reason "[$FINDIF] failed" ++ ocf_log err "[$FINDIF] failed" + exit $rc + fi + fi +diff --git a/heartbeat/findif.sh b/heartbeat/findif.sh +index 98649bf..6250a03 100644 +--- a/heartbeat/findif.sh ++++ b/heartbeat/findif.sh +@@ -75,26 +75,26 @@ findif_check_params() + if [ "$family" = "inet6" ] ; then + ipcheck_ipv6 $match + if [ $? = 1 ] ; then +- ocf_log err "IP address [$match] not valid." ++ ocf_exit_reason "IP address [$match] not valid." + return $OCF_ERR_CONFIGURED + fi + if [ -n "$nic" ] ; then + ifcheck_ipv6 $nic + if [ $? = 1 ] ; then +- ocf_log err "Unknown interface [$nic] No such device." ++ ocf_exit_reason "Unknown interface [$nic] No such device." + return $OCF_ERR_CONFIGURED + fi + else + echo $match | grep -qis '^fe80::' + if [ $? = 0 ] ; then +- ocf_log err "'nic' parameter is mandatory for a link local address [$match]." 
++ ocf_exit_reason "'nic' parameter is mandatory for a link local address [$match]." + return $OCF_ERR_CONFIGURED + fi + fi + if [ -n "$netmask" ] ; then + prefixcheck $netmask 128 + if [ $? = 1 ] ; then +- ocf_log err "Invalid netmask specification [$netmask]." ++ ocf_exit_reason "Invalid netmask specification [$netmask]." + return $OCF_ERR_CONFIGURED + fi + fi +@@ -102,27 +102,27 @@ findif_check_params() + # family = inet + ipcheck_ipv4 $match + if [ $? = 1 ] ; then +- ocf_log err "IP address [$match] not valid." ++ ocf_exit_reason "IP address [$match] not valid." + return $OCF_ERR_CONFIGURED + fi + if [ -n "$nic" ] ; then + ifcheck_ipv4 $nic + if [ $? = 1 ] ; then +- ocf_log err "Unknown interface [$nic] No such device." ++ ocf_exit_reason "Unknown interface [$nic] No such device." + return $OCF_ERR_CONFIGURED + fi + fi + if [ -n "$netmask" ] ; then + prefixcheck $netmask 32 + if [ $? = 1 ] ; then +- ocf_log err "Invalid netmask specification [$netmask]." ++ ocf_exit_reason "Invalid netmask specification [$netmask]." + return $OCF_ERR_CONFIGURED + fi + fi + if [ -n "$brdcast" ] ; then + ipcheck_ipv4 $brdcast + if [ $? = 1 ] ; then +- ocf_log err "Invalid broadcast address [$brdcast]." ++ ocf_exit_reason "Invalid broadcast address [$brdcast]." + return $OCF_ERR_CONFIGURED + fi + fi +@@ -166,13 +166,13 @@ findif() + fi + if [ -z "$nic" -o -z "$netmask" ] ; then + if [ $# = 0 ] ; then +- ocf_log err "Unable to find nic or netmask." ++ ocf_exit_reason "Unable to find nic or netmask." + return $OCF_ERR_GENERIC + fi + case $1 in + */*) : OK ;; + *) +- ocf_log err "Unable to find cidr_netmask." ++ ocf_exit_reason "Unable to find cidr_netmask." + return $OCF_ERR_GENERIC ;; + esac + fi +@@ -187,7 +187,7 @@ findif() + fi + else + if [ -z "$OCF_RESKEY_nic" -a "$netmask" != "${1#*/}" ] ; then +- ocf_log err "Unable to find nic, or netmask mismatch." ++ ocf_exit_reason "Unable to find nic, or netmask mismatch." 
+ return $OCF_ERR_GENERIC + fi + fi diff --git a/SOURCES/nfs-fixes-update.patch b/SOURCES/nfs-fixes-update.patch new file mode 100644 index 0000000..aeafff1 --- /dev/null +++ b/SOURCES/nfs-fixes-update.patch @@ -0,0 +1,143 @@ +From 6900fcb7b014bd0177c44f20447caca4658b45c6 Mon Sep 17 00:00:00 2001 +From: David Vossel +Date: Wed, 29 Apr 2015 11:12:23 -0500 +Subject: [PATCH 2/6] nfsserver updates + +--- + heartbeat/nfsserver | 51 ++++++++++++++++++++++++++++++++++----------------- + 1 file changed, 34 insertions(+), 17 deletions(-) + +diff --git a/heartbeat/nfsserver b/heartbeat/nfsserver +index de1a802..33cb607 100755 +--- a/heartbeat/nfsserver ++++ b/heartbeat/nfsserver +@@ -208,9 +208,9 @@ The mount point for the sunrpc file system. + + + +- +- +- ++ ++ ++ + + + +@@ -391,7 +391,12 @@ set_arg() + # only write to the tmp /etc/sysconfig/nfs if sysconfig exists. + # otherwise this distro does not support setting these options. + if [ -d "/etc/sysconfig" ]; then +- echo "${key}=\"${value}\"" >> $file ++ # replace if the value exists, append otherwise ++ if grep "^\s*${key}=" $file ; then ++ sed -i "s/\s*${key}=.*$/${key}=\"${value}\"/" $file ++ else ++ echo "${key}=\"${value}\"" >> $file ++ fi + elif [ "$requires_sysconfig" = "true" ]; then + ocf_log warn "/etc/sysconfig/nfs not found, unable to set port and nfsd args." + fi +@@ -404,6 +409,11 @@ set_env_args() + local tmpconfig=$(mktemp ${HA_RSCTMP}/nfsserver-tmp-XXXXX) + local statd_args + ++ if [ -f "$NFS_SYSCONFIG" ]; then ++ ## Take the $NFS_SYSCONFIG file as our skeleton ++ cp $NFS_SYSCONFIG $tmpconfig ++ fi ++ + # nfsd args + set_arg "RPCNFSDARGS" "$OCF_RESKEY_nfsd_args" "$tmpconfig" "true" + +@@ -434,14 +444,20 @@ set_env_args() + + # override local nfs config. preserve previous local config though. + if [ -s $tmpconfig ]; then +- cat $NFS_SYSCONFIG | grep -e "$NFS_SYSCONFIG_AUTOGEN_TAG" ++ cat $NFS_SYSCONFIG | grep -q -e "$NFS_SYSCONFIG_AUTOGEN_TAG" > /dev/null 2>&1 + if [ $? 
-ne 0 ]; then + # backup local nfs config if it doesn't have our HA autogen tag in it. + mv -f $NFS_SYSCONFIG $NFS_SYSCONFIG_LOCAL_BACKUP + fi +- echo "# $NFS_SYSCONFIG_AUTOGEN_TAG" > $NFS_SYSCONFIG +- echo "# local config backup stored here, '$NFS_SYSCONFIG_LOCAL_BACKUP'" >> $NFS_SYSCONFIG +- cat $tmpconfig >> $NFS_SYSCONFIG ++ ++ cat $tmpconfig | grep -q -e "$NFS_SYSCONFIG_AUTOGEN_TAG" > /dev/null 2>&1 ++ if [ $? -ne 0 ]; then ++ echo "# $NFS_SYSCONFIG_AUTOGEN_TAG" > $NFS_SYSCONFIG ++ echo "# local config backup stored here, '$NFS_SYSCONFIG_LOCAL_BACKUP'" >> $NFS_SYSCONFIG ++ cat $tmpconfig >> $NFS_SYSCONFIG ++ else ++ cat $tmpconfig > $NFS_SYSCONFIG ++ fi + fi + rm -f $tmpconfig + } +@@ -460,13 +476,14 @@ prepare_directory () + [ -d "$fp/$STATD_DIR/sm" ] || mkdir -p "$fp/$STATD_DIR/sm" + [ -d "$fp/$STATD_DIR/sm.ha" ] || mkdir -p "$fp/$STATD_DIR/sm.ha" + [ -d "$fp/$STATD_DIR/sm.bak" ] || mkdir -p "$fp/$STATD_DIR/sm.bak" +- [ -n "`id -u rpcuser`" -a "`id -g rpcuser`" ] && chown -R rpcuser.rpcuser "$fp/$STATD_DIR" ++ [ -n "`id -u rpcuser 2>/dev/null`" -a "`id -g rpcuser 2>/dev/null`" ] && ++ chown -R rpcuser.rpcuser "$fp/$STATD_DIR" + + [ -f "$fp/etab" ] || touch "$fp/etab" + [ -f "$fp/xtab" ] || touch "$fp/xtab" + [ -f "$fp/rmtab" ] || touch "$fp/rmtab" + +- dd if=/dev/urandom of=$fp/$STATD_DIR/state bs=1 count=4 &> /dev/null ++ dd if=/dev/urandom of=$fp/$STATD_DIR/state bs=1 count=4 >/dev/null 2>&1 + [ -n "`id -u rpcuser`" -a "`id -g rpcuser`" ] && chown rpcuser.rpcuser "$fp/$STATD_DIR/state" + [ $SELINUX_ENABLED -eq 0 ] && chcon -R "$SELINUX_LABEL" "$fp" + } +@@ -546,15 +563,15 @@ locking_start() + + terminate() + { +- declare pids +- declare i=0 ++ local pids ++ local i=0 + + while : ; do + pids=$(binary_status $1) + [ -z "$pids" ] && return 0 + kill $pids + sleep 1 +- ((i++)) ++ i=$((i + 1)) + [ $i -gt 3 ] && return 1 + done + } +@@ -562,22 +579,22 @@ terminate() + + killkill() + { +- declare pids +- declare i=0 ++ local pids ++ local i=0 + + while : ; do + 
pids=$(binary_status $1) + [ -z "$pids" ] && return 0 + kill -9 $pids + sleep 1 +- ((i++)) ++ i=$((i + 1)) + [ $i -gt 3 ] && return 1 + done + } + + stop_process() + { +- declare process=$1 ++ local process=$1 + + ocf_log info "Stopping $process" + if terminate $process; then +-- +1.8.4.2 + diff --git a/SPECS/resource-agents.spec b/SPECS/resource-agents.spec index 0533a42..9ebfb2b 100644 --- a/SPECS/resource-agents.spec +++ b/SPECS/resource-agents.spec @@ -32,7 +32,7 @@ Name: resource-agents Summary: Open Source HA Reusable Cluster Resource Scripts Version: 3.9.5 -Release: 40%{?dist}.9 +Release: 54%{?dist} License: GPLv2+ and LGPLv2+ URL: https://github.com/ClusterLabs/resource-agents %if 0%{?fedora} || 0%{?centos_version} || 0%{?rhel} @@ -101,11 +101,30 @@ Patch56: bz1135026-docker-handle-invalid-monitor-cmd.patch Patch57: bz1118029-iscsi-remove-write-back.patch Patch58: rabbitmq-cluster.patch Patch59: bz1189187-redis-agent.patch -Patch60: NovaCompute.patch -Patch61: bz1214360-NovaCompute-update1.patch -Patch62: bz1170376-galera-no-readonly.patch -Patch63: bz1231032-redis-update.patch -Patch64: bz1259595-redis-client-passwd-support.patch +Patch60: bz1170376-galera-no-readonly.patch +Patch61: bz1198681-clvm-activate-vgs-option.patch +Patch62: bz1200756-ipsrcaddr-misconfig.patch +Patch63: bz773399-netmast-error.patch +Patch64: bz1059988-db2-support.patch +Patch65: bz1077888-ctdb-updates.patch +Patch66: bz1171162-clvmd-opt-fix.patch +Patch67: bz1183136-nginx-support.patch +Patch68: bz1213971-ethmon-opt.patch +Patch69: nfs-fixes-update.patch +Patch70: bz1160365-iface-vlan.patch.patch +Patch71: bz1214781-lvm-partial-activation-fix.patch.patch +Patch72: bz1223615-apache-includes-fix.patch.patch +Patch73: NovaCompute.patch +Patch74: bz1214360-NovaCompute-update1.patch.patch +Patch75: bz1227293-dhcpd-chroot-fix.patch.patch +Patch76: bz1231032-redis-update.patch.patch +Patch77: bz1232376-oracle-agent-update.diff +Patch78: bz1168251-SAPHana-agents.patch +Patch79: 
bz1168251-SAPHana-agents-update.patch +Patch80: bz1168251-SAPHana-agents-update2.patch +Patch81: bz1168251-SAPHana-agents-update3.patch +Patch82: bz1168251-SAPHana-agents_update4.patch +Patch83: bz1251484-redis-client-passwd-support.patch Obsoletes: heartbeat-resources <= %{version} Provides: heartbeat-resources = %{version} @@ -187,6 +206,23 @@ The SAP resource agents and connector script interface with Pacemaker to allow SAP instances to be managed in a cluster environment. +%ifarch x86_64 +%package sap-hana +License: GPLv2+ +Summary: SAP HANA cluster resource agents +%if 0%{?fedora} || 0%{?centos_version} || 0%{?rhel} +Group: System Environment/Base +%else +Group: Productivity/Clustering/HA +%endif +Requires: %{name} = %{version}-%{release} +Requires: perl + +%description sap-hana +The SAP HANA resource agents interface with Pacemaker to allow +SAP instances to be managed in a cluster environment. +%endif + %prep %if 0%{?suse_version} == 0 && 0%{?fedora} == 0 && 0%{?centos_version} == 0 && 0%{?rhel} == 0 %{error:Unable to determine the distribution/version. This is generally caused by missing /etc/rpm/macros.dist. Please install the correct build packages or define the required macros manually.} @@ -258,6 +294,25 @@ exit 1 %patch62 -p1 %patch63 -p1 %patch64 -p1 +%patch65 -p1 +%patch66 -p1 +%patch67 -p1 +%patch68 -p1 +%patch69 -p1 +%patch70 -p1 +%patch71 -p1 +%patch72 -p1 +%patch73 -p1 +%patch74 -p1 +%patch75 -p1 +%patch76 -p1 +%patch77 -p1 +%patch78 -p1 +%patch79 -p1 +%patch80 -p1 +%patch81 -p1 +%patch82 -p1 +%patch83 -p1 %build if [ ! 
-f configure ]; then @@ -270,8 +325,11 @@ chmod 755 heartbeat/nfsnotify chmod 755 heartbeat/docker chmod 755 heartbeat/rabbitmq-cluster chmod 755 heartbeat/redis +chmod 755 heartbeat/iface-vlan chmod 755 heartbeat/NovaCompute chmod 755 heartbeat/NovaEvacuate +chmod 755 heartbeat/SAPHana +chmod 755 heartbeat/SAPHanaTopology %if 0%{?fedora} >= 11 || 0%{?centos_version} > 5 || 0%{?rhel} > 5 CFLAGS="$(echo '%{optflags}')" @@ -374,6 +432,7 @@ rm -rf %{buildroot} # Supported, but in another sub package ### %exclude %{_sbindir}/sap_redhat_cluster_connector +%exclude %{_sbindir}/show_SAPHanaSR_attributes %exclude /usr/lib/ocf/resource.d/heartbeat/SAP* %exclude /usr/lib/ocf/lib/heartbeat/sap* %exclude %{_mandir}/man7/*SAP* @@ -403,7 +462,6 @@ rm -rf %{buildroot} %exclude /usr/lib/ocf/resource.d/heartbeat/Xen %exclude /usr/lib/ocf/resource.d/heartbeat/anything %exclude /usr/lib/ocf/resource.d/heartbeat/asterisk -%exclude /usr/lib/ocf/resource.d/heartbeat/db2 %exclude /usr/lib/ocf/resource.d/heartbeat/eDir88 %exclude /usr/lib/ocf/resource.d/heartbeat/fio %exclude /usr/lib/ocf/resource.d/heartbeat/ids @@ -411,8 +469,6 @@ rm -rf %{buildroot} %exclude /usr/lib/ocf/resource.d/heartbeat/jboss %exclude /usr/lib/ocf/resource.d/heartbeat/ldirectord %exclude /usr/lib/ocf/resource.d/heartbeat/lxc -%exclude /usr/lib/ocf/resource.d/heartbeat/oracle -%exclude /usr/lib/ocf/resource.d/heartbeat/oralsnr %exclude /usr/lib/ocf/resource.d/heartbeat/pingd %exclude /usr/lib/ocf/resource.d/heartbeat/portblock %exclude /usr/lib/ocf/resource.d/heartbeat/pound @@ -424,7 +480,6 @@ rm -rf %{buildroot} %exclude /usr/lib/ocf/resource.d/heartbeat/vmware %exclude /usr/lib/ocf/resource.d/heartbeat/zabbixserver %exclude /usr/lib/ocf/resource.d/heartbeat/mysql-proxy -%exclude /usr/lib/ocf/resource.d/heartbeat/nginx %exclude /usr/lib/ocf/resource.d/heartbeat/rsyslog %exclude %{_mandir}/man7/ocf_heartbeat_AoEtarget.7.gz %exclude %{_mandir}/man7/ocf_heartbeat_AudibleAlarm.7.gz @@ -449,15 +504,12 @@ rm -rf 
%{buildroot} %exclude %{_mandir}/man7/ocf_heartbeat_Xen.7.gz %exclude %{_mandir}/man7/ocf_heartbeat_anything.7.gz %exclude %{_mandir}/man7/ocf_heartbeat_asterisk.7.gz -%exclude %{_mandir}/man7/ocf_heartbeat_db2.7.gz %exclude %{_mandir}/man7/ocf_heartbeat_eDir88.7.gz %exclude %{_mandir}/man7/ocf_heartbeat_fio.7.gz %exclude %{_mandir}/man7/ocf_heartbeat_ids.7.gz %exclude %{_mandir}/man7/ocf_heartbeat_iscsi.7.gz %exclude %{_mandir}/man7/ocf_heartbeat_jboss.7.gz %exclude %{_mandir}/man7/ocf_heartbeat_lxc.7.gz -%exclude %{_mandir}/man7/ocf_heartbeat_oracle.7.gz -%exclude %{_mandir}/man7/ocf_heartbeat_oralsnr.7.gz %exclude %{_mandir}/man7/ocf_heartbeat_pingd.7.gz %exclude %{_mandir}/man7/ocf_heartbeat_portblock.7.gz %exclude %{_mandir}/man7/ocf_heartbeat_pound.7.gz @@ -469,7 +521,6 @@ rm -rf %{buildroot} %exclude %{_mandir}/man7/ocf_heartbeat_vmware.7.gz %exclude %{_mandir}/man7/ocf_heartbeat_zabbixserver.7.gz %exclude %{_mandir}/man7/ocf_heartbeat_mysql-proxy.7.gz -%exclude %{_mandir}/man7/ocf_heartbeat_nginx.7.gz %exclude %{_mandir}/man7/ocf_heartbeat_rsyslog.7.gz ### @@ -503,43 +554,111 @@ ccs_update_schema > /dev/null 2>&1 ||: /usr/lib/ocf/resource.d/heartbeat/SAP* /usr/lib/ocf/lib/heartbeat/sap* %{_mandir}/man7/*SAP* +%exclude %{_mandir}/man7/*SAPHana* +%exclude /usr/lib/ocf/resource.d/heartbeat/SAPHana* + +%ifarch x86_64 +%files sap-hana +%defattr(-,root,root) +/usr/lib/ocf/resource.d/heartbeat/SAPHana* +%{_mandir}/man7/*SAPHana* +%endif %changelog -* Mon Sep 7 2015 Fabio M. Di Nitto - 3.9.5-40.9 +* Mon Sep 7 2015 Fabio M. Di Nitto - 3.9.5-54 - Fix redis client password regexp + Resolves: rhbz#1251484 - Resolves: rhbz#1259595 - -* Thu Sep 3 2015 Fabio M. Di Nitto - 3.9.5-40.8 +* Thu Sep 3 2015 Fabio M. 
Di Nitto - 3.9.5-53 - Add support redis client password authentication + Resolves: rhbz#1251484 - Resolves: rhbz#1259595 +* Thu Jul 23 2015 David Vossel - 3.9.5-52 +- Only build SAP hana packages for x86_64 -* Tue Aug 04 2015 David Vossel - 3.9.5-40.7 -- Fix redis failure to start when db is 0 bytes. + Resolves: rhbz#1244827 - Resolves: rhbz#1250073 +* Thu Jul 23 2015 David Vossel - 3.9.5-51 +- Properly include SAP hana packages in correct subpackage. -* Mon Jul 13 2015 David Vossel - 3.9.5-40.6 -- Improve galera resource-agent to not require use of read-only - mode to retrieve last known write sequence number. + Resolves: rhbz#1244827 + +* Thu Jul 23 2015 David Vossel - 3.9.5-50 +- Sync SAP Hana agents with upstream + + Resolves: rhbz#1244827 - Resolves: rhbz#1242339 +* Wed Jul 22 2015 David Vossel - 3.9.5-49 +- Place SAP Hana agents in sap-hana subpackage -* Mon Jul 6 2015 David Vossel - 3.9.5-40.5 + Resolves: rhbz#1244827 + +* Fri Jul 10 2015 David Vossel - 3.9.5-48 +- add support for oracle resource agents + + Resolves: rhbz#1232376 + +* Thu Jun 25 2015 David Vossel - 3.9.5-47 - NovaCompute and NovaEvacuate updates +- dhcpd chroot fix +- redis 0byte error fix - Resolves: rhbz#1238716 + Resolves: rhbz#1214360 + Resolves: rhbz#1227293 + Resolves: rhbz#1231032 -* Thu Jun 11 2015 David Vossel - 3.9.5-40.4 +* Thu Jun 25 2015 David Vossel - 3.9.5-46 +- iface-vlan agent +- Allow partial activation when physical volumes are missing.
+- Properly handle 'includes' during apache config parsing - Support for NovaCompute resource-agent - Resolves: rhbz#1229383 + Resolves: rhbz#1160365 + Resolves: rhbz#1214781 + Resolves: rhbz#1223615 + Resolves: rhbz#1214360 + +* Wed Apr 29 2015 David Vossel - 3.9.5-45 +- Fix clvmd usage of daemon_options +- Use better default nfsserver start timeouts +- Make nfsserver preserve options in /etc/sysconfig/nfs +- Add link_status_only option to ethmonitor agent +- Add support for nginx agent +- Add support for db2 agent +- CTDB agent updates + + Resolves: rhbz#1171162 + Resolves: rhbz#1173193 + Resolves: rhbz#1182787 + Resolves: rhbz#1213971 + Resolves: rhbz#1183136 + Resolves: rhbz#1059988 + Resolves: rhbz#1077888 + +* Tue Apr 28 2015 David Vossel - 3.9.5-44 +- For IPsrcaddr, properly handle misconfiguration in a way that + doesn't result in fencing. +- Return exit reason for invalid netmask in IPaddr2 + + Resolves: rhbz#1200756 + Resolves: rhbz#773399 + +* Mon Apr 27 2015 David Vossel - 3.9.5-43 +- Add activate_vgs option to clvmd to control activating volume + groups + + Resolves: rhbz#1198681 + +* Thu Apr 23 2015 David Vossel - 3.9.5-42 +- Improve galera resource-agent to not require use of read-only + mode to retrieve last known write sequence number. + + Resolves: rhbz#1170376 -* Thu Feb 5 2015 David Vossel - 3.9.5-40.3 +* Thu Feb 5 2015 David Vossel - 3.9.5-41 - Support for redis resource-agent - Resolves: rhbz#1201002 + Resolves: rhbz#1189187 * Mon Jan 26 2015 David Vossel - 3.9.5-20.2 - Support for rabbitmq-cluster resource-agent