diff --git a/SOURCES/bz1344225-garbd-Introduces-garbd-resource-agent.patch b/SOURCES/bz1344225-garbd-Introduces-garbd-resource-agent.patch new file mode 100644 index 0000000..af18286 --- /dev/null +++ b/SOURCES/bz1344225-garbd-Introduces-garbd-resource-agent.patch @@ -0,0 +1,474 @@ +From beb8dd713fa3a15ca01738de33f2031d1e5925d9 Mon Sep 17 00:00:00 2001 +From: Damien Ciabrini +Date: Wed, 1 Jun 2016 17:14:04 +0200 +Subject: [PATCH 1/2] garbd: Introduces garbd resource-agent + +--- + heartbeat/garbd | 417 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 417 insertions(+) + create mode 100755 heartbeat/garbd + +diff --git a/heartbeat/garbd b/heartbeat/garbd +new file mode 100755 +index 0000000..950df76 +--- /dev/null ++++ b/heartbeat/garbd +@@ -0,0 +1,417 @@ ++#!/bin/sh ++# ++# Copyright (c) 2015 Damien Ciabrini ++# All Rights Reserved. ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of version 2 of the GNU General Public License as ++# published by the Free Software Foundation. ++# ++# This program is distributed in the hope that it would be useful, but ++# WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. ++# ++# Further, this software is distributed without any warranty that it is ++# free of the rightful claim of any third person regarding infringement ++# or the like. Any license provided herein, whether implied or ++# otherwise, applies only to this software file. Patent licenses, if ++# any, provided herein do not apply to combinations of this program with ++# other software, or any other product whatsoever. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program; if not, write the Free Software Foundation, ++# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. ++# ++ ++## ++# README. ++# ++# Resource agent for garbd, the Galera arbitrator ++# ++# You can use this agent if you run an even number of galera nodes, ++# and you want an additional node to avoid split-brain situations. ++# ++# garbd requires that a Galera cluster is running, so make sure to ++# add a proper ordering constraint to the cluster, e.g.: ++# ++# pcs constraint order galera-master then garbd ++# ++# If you add garbd to the cluster while Galera is not running, you ++# might want to disable it before setting up ordering constraint, e.g.: ++# ++# pcs resource create garbd garbd \ ++# wsrep_cluster_address=gcomm://node1:4567,node2:4567 \ ++# meta target-role=stopped ++# ++# Use location constraints to avoid running galera and garbd on ++# the same node, e.g.: ++# ++# pcs constraint colocation add garbd with galera-master -INFINITY ++# pcs constraint location garbd prefers node3=INFINITY ++# ++## ++ ++####################################################################### ++# Initialization: ++ ++: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} ++. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ++ ++####################################################################### ++# Set default paramenter values ++ ++OCF_RESKEY_binary_default="/usr/sbin/garbd" ++OCF_RESKEY_log_default="/var/log/garbd.log" ++OCF_RESKEY_pid_default="/var/run/garbd.pid" ++OCF_RESKEY_user_default="mysql" ++if [ "X${HOSTOS}" = "XOpenBSD" ];then ++ OCF_RESKEY_group_default="_mysql" ++else ++ OCF_RESKEY_group_default="mysql" ++fi ++ ++: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}} ++: ${OCF_RESKEY_log=${OCF_RESKEY_log_default}} ++: ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}} ++: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}} ++: ${OCF_RESKEY_group=${OCF_RESKEY_group_default}} ++ ++usage() { ++ cat < ++ ++ ++1.0 ++ ++ ++Resource script for managing Galera arbitrator. ++ ++Manages a galera arbitrator instance ++ ++ ++ ++ ++Location of the Galera arbitrator binary ++ ++garbd server binary ++ ++ ++ ++ ++ ++User running the garbd process ++ ++garbd user ++ ++ ++ ++ ++ ++Group running garbd (for logfile permissions) ++ ++garbd group ++ ++ ++ ++ ++ ++The logfile to be used for garbd. ++ ++Galera arbitrator log file ++ ++ ++ ++ ++ ++The pidfile to be used for garbd. ++ ++Galera arbitrator pidfile ++ ++ ++ ++ ++ ++Additional parameters which are passed to garbd on startup. ++ ++Additional parameters to pass to garbd ++ ++ ++ ++ ++ ++The galera cluster address. This takes the form of: ++gcomm://node:port,node:port,node:port ++ ++Unlike Galera servers, port is mandatory for garbd. ++ ++Galera cluster address ++ ++ ++ ++ ++ ++The group name of the Galera cluster to connect to. ++ ++Galera cluster name ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++END ++} ++ ++ ++garbd_start() ++{ ++ local rc ++ local pid ++ local start_wait ++ local garbd_params ++ ++ garbd_status info ++ rc=$? ++ if [ $rc -eq $OCF_SUCCESS ]; then ++ ocf_exit_reason "garbd started outside of the cluster's control" ++ return $OCF_ERR_GENERIC; ++ fi ++ ++ touch $OCF_RESKEY_log ++ chown $OCF_RESKEY_user:$OCF_RESKEY_group $OCF_RESKEY_log ++ chmod 0640 $OCF_RESKEY_log ++ [ -x /sbin/restorecon ] && /sbin/restorecon $OCF_RESKEY_log ++ ++ garbd_params="--address=${OCF_RESKEY_wsrep_cluster_address} \ ++ --group ${OCF_RESKEY_wsrep_cluster_name} \ ++ --log ${OCF_RESKEY_log}" ++ ++ if [ ! -z "${OCF_RESKEY_options}" ]; then ++ garbd_params="${garbd_params} --options=${OCF_RESKEY_options}" ++ fi ++ ++ # garbd has no parameter to run as a specific user, ++ # so we need to start it by our own means ++ pid=$(su - -s /bin/sh $OCF_RESKEY_user -c "${OCF_RESKEY_binary} ${garbd_params} >/dev/null 2>&1 & echo \$!") ++ ++ # garbd doesn't create a pidfile either, so we create our own ++ echo $pid > $OCF_RESKEY_pid ++ if [ $? -ne 0 ]; then ++ ocf_exit_reason "Cannot create pidfile for garbd at $OCF_RESKEY_pid (rc=$?), please check your installation" ++ return $OCF_ERR_GENERIC ++ fi ++ ++ # Spin waiting for garbd to connect to the cluster. ++ # Let the CRM/LRM time us out if required. ++ start_wait=1 ++ while [ $start_wait -eq 1 ]; do ++ garbd_monitor info ++ rc=$? ++ if [ $rc -eq $OCF_NOT_RUNNING ]; then ++ ocf_exit_reason "garbd failed to start (pid=$pid), check logs in ${OCF_RESKEY_log}" ++ return $OCF_ERR_GENERIC ++ elif [ $rc -eq $OCF_SUCCESS ]; then ++ start_wait=0 ++ fi ++ sleep 2 ++ done ++ ++ ocf_log info "garbd connected to cluster \"${OCF_RESKEY_wsrep_cluster_name}\"" ++ return $OCF_SUCCESS ++} ++ ++garbd_status() ++{ ++ local loglevel=$1 ++ local rc ++ ocf_pidfile_status $OCF_RESKEY_pid ++ rc=$? 
++ ++ if [ $rc -eq 0 ]; then ++ return $OCF_SUCCESS ++ elif [ $rc -eq 2 ]; then ++ return $OCF_NOT_RUNNING ++ else ++ # clean up if pidfile is stale ++ if [ $rc -eq 1 ]; then ++ ocf_log $loglevel "garbd not running: removing old PID file" ++ rm -f $OCF_RESKEY_pid ++ fi ++ return $OCF_ERR_GENERIC ++ fi ++} ++ ++garbd_monitor() ++{ ++ local rc ++ local pid ++ local loglevel=$1 ++ ++ # Set loglevel to info during probe ++ if ocf_is_probe; then ++ loglevel="info" ++ fi ++ ++ garbd_status $loglevel ++ rc=$? ++ ++ # probe just wants to know if garbd is running or not ++ if [ ocf_is_probe -a $rc -ne $OCF_SUCCESS ]; then ++ rc=$OCF_NOT_RUNNING ++ fi ++ ++ # Consider garbd is working if it's connected to at least ++ # one node in the galera cluster. ++ # Note: a Galera node in Non-Primary state will be ++ # stopped by the galera RA. So we can assume that ++ # garbd will always be connected to the right partition ++ if [ $rc -eq $OCF_SUCCESS ]; then ++ pid=`cat $OCF_RESKEY_pid 2> /dev/null ` ++ netstat -tnp 2>/dev/null | grep -s -q "ESTABLISHED.*${pid}/" ++ if [ $? -ne 0 ]; then ++ ocf_log $loglevel "garbd disconnected from cluster \"${OCF_RESKEY_wsrep_cluster_name}\"" ++ rc=$OCF_ERR_GENERIC ++ fi ++ fi ++ ++ return $rc ++} ++ ++garbd_stop() ++{ ++ local rc ++ local pid ++ ++ if [ ! -f $OCF_RESKEY_pid ]; then ++ ocf_log info "garbd is not running" ++ return $OCF_SUCCESS ++ fi ++ ++ pid=`cat $OCF_RESKEY_pid 2> /dev/null ` ++ ++ ocf_log info "stopping garbd" ++ ++ # make sure the process is stopped ++ ocf_stop_processes TERM 10 $pid ++ rc=$? ++ ++ if [ $rc -ne 0 ]; then ++ return $OCF_ERR_GENERIC ++ else ++ rm -f $OCF_RESKEY_pid ++ ocf_log info "garbd stopped" ++ return $OCF_SUCCESS ++ fi ++} ++ ++garbd_validate() ++{ ++ if ! have_binary "$OCF_RESKEY_binary"; then ++ ocf_exit_reason "Setup problem: couldn't find command: $OCF_RESKEY_binary" ++ return $OCF_ERR_INSTALLED; ++ fi ++ ++ if ! have_binary "netstat"; then ++ ocf_exit_reason "Setup problem: couldn't find command: netstat" ++ return $OCF_ERR_INSTALLED; ++ fi ++ ++ if [ -z "$OCF_RESKEY_wsrep_cluster_address" ]; then ++ ocf_exit_reason "garbd must be configured with a wsrep_cluster_address value." ++ return $OCF_ERR_CONFIGURED ++ fi ++ ++ # unlike galera RA, ports must be set in cluster address for garbd ++ # https://github.com/codership/galera/issues/98 ++ for node in $(echo "$OCF_RESKEY_wsrep_cluster_address" | sed 's/gcomm:\/\///g' | tr -d ' ' | tr -s ',' ' '); do ++ echo $node | grep -s -q ':[1-9][0-9]*$' ++ if [ $? -ne 0 ]; then ++ ocf_exit_reason "wsrep_cluster_address must specify ports (gcomm://node1:port,node2:port)." ++ return $OCF_ERR_CONFIGURED ++ fi ++ done ++ ++ # Ensure that the encryption method is set if garbd is configured ++ # to use SSL. ++ echo $OCF_RESKEY_options | grep -s -q -i -E '\bsocket.ssl_(key|cert)=' ++ if [ $? -eq 0 ]; then ++ echo $OCF_RESKEY_options | grep -s -q -i -E '\bsocket.ssl_cipher=' ++ if [ $? -ne 0 ]; then ++ ocf_exit_reason "option socket.ssl_cipher must be set if SSL is enabled." ++ return $OCF_ERR_CONFIGURED ++ fi ++ fi ++ ++ if [ -z "$OCF_RESKEY_wsrep_cluster_name" ]; then ++ ocf_exit_reason "garbd must be configured with a wsrep_cluster_name value." ++ return $OCF_ERR_CONFIGURED ++ fi ++ ++ if ! getent passwd $OCF_RESKEY_user >/dev/null 2>&1; then ++ ocf_exit_reason "User $OCF_RESKEY_user doesn't exist" ++ return $OCF_ERR_INSTALLED ++ fi ++ ++ if ! 
getent group $OCF_RESKEY_group >/dev/null 2>&1; then ++ ocf_exit_reason "Group $OCF_RESKEY_group doesn't exist" ++ return $OCF_ERR_INSTALLED ++ fi ++ ++ return $OCF_SUCCESS ++} ++ ++case "$1" in ++ meta-data) meta_data ++ exit $OCF_SUCCESS;; ++ usage|help) usage ++ exit $OCF_SUCCESS;; ++esac ++ ++garbd_validate ++rc=$? ++ ++# trap configuration errors early, but don't block stop in such cases ++LSB_STATUS_STOPPED=3 ++if [ $rc -ne 0 ]; then ++ case "$1" in ++ stop) exit $OCF_SUCCESS;; ++ status) exit $LSB_STATUS_STOPPED;; ++ *) exit $rc;; ++ esac ++fi ++ ++# What kind of method was invoked? ++case "$1" in ++ start) garbd_start;; ++ stop) garbd_stop;; ++ status) garbd_status err;; ++ monitor) garbd_monitor err;; ++ promote) garbd_promote;; ++ demote) garbd_demote;; ++ validate-all) exit $OCF_SUCCESS;; ++ ++ *) usage ++ exit $OCF_ERR_UNIMPLEMENTED;; ++esac +-- +2.5.5 + + +From f36298aa97fc4cbed3e2eff28d6821f4314becbe Mon Sep 17 00:00:00 2001 +From: Damien Ciabrini +Date: Fri, 3 Jun 2016 18:27:38 +0200 +Subject: [PATCH 2/2] garbd: fix install and man page + +--- + doc/man/Makefile.am | 1 + + heartbeat/Makefile.am | 1 + + 2 files changed, 2 insertions(+) + +diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am +index 5e28895..25fb29b 100644 +--- a/doc/man/Makefile.am ++++ b/doc/man/Makefile.am +@@ -105,6 +105,7 @@ man_MANS = ocf_heartbeat_AoEtarget.7 \ + ocf_heartbeat_exportfs.7 \ + ocf_heartbeat_fio.7 \ + ocf_heartbeat_galera.7 \ ++ ocf_heartbeat_garbd.7 \ + ocf_heartbeat_iSCSILogicalUnit.7 \ + ocf_heartbeat_iSCSITarget.7 \ + ocf_heartbeat_iface-bridge.7 \ +diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am +index b70c104..df0e3b8 100644 +--- a/heartbeat/Makefile.am ++++ b/heartbeat/Makefile.am +@@ -76,6 +76,7 @@ ocf_SCRIPTS = ClusterMon \ + Filesystem \ + fio \ + galera \ ++ garbd \ + ids \ + iscsi \ + ICP \ +-- +2.5.5 + diff --git a/SOURCES/bz1344228-rabbitmq-cluster-return-code-69-not-running.patch b/SOURCES/bz1344228-rabbitmq-cluster-return-code-69-not-running.patch new file mode 100644 index 0000000..7fc59b1 --- /dev/null +++ b/SOURCES/bz1344228-rabbitmq-cluster-return-code-69-not-running.patch @@ -0,0 +1,73 @@ +diff -uNr a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster +--- a/heartbeat/rabbitmq-cluster 2016-06-03 16:17:09.794967156 +0200 ++++ b/heartbeat/rabbitmq-cluster 2016-06-03 16:27:29.777803932 +0200 +@@ -167,8 +167,13 @@ + rmq_delete_nodename + return $OCF_NOT_RUNNING + ;; ++ 69) ++ ocf_log info "RabbitMQ server is not running" ++ rmq_delete_nodename ++ return $OCF_NOT_RUNNING ++ ;; + *) +- ocf_log err "Unexpected return code from '$RMQ_CTL cluster status' exit code: $rc" ++ ocf_log err "Unexpected return code from '$RMQ_CTL cluster_status' exit code: $rc" + rmq_delete_nodename + return $OCF_ERR_GENERIC + ;; +From 41657b4108211725878b6b46883ff6cc72e44fa9 Mon Sep 17 00:00:00 2001 +From: Peter Lemenkov +Date: Mon, 4 Jul 2016 17:09:16 +0200 +Subject: [PATCH] More RabbitMQ POSIX error codes + +We must add the following POSIX error codes in order to detect node +failure: + +* 68 - EX_NOHOST +* 69 - EX_UNAVAILABLE +* 70 - EX_SOFTWARE +* 75 - EX_TEMPFAIL +* 78 - EX_CONFIG + +The following commits introduced these return values: + +* rabbitmq/rabbitmq-server@7984540175d0b8852025165b6b6a0ac05d692c98 +* rabbitmq/rabbitmq-common@92ae50e5964d4f079c7b2abed1caaa8ab54a439b + +For the error codes meanings go to: + +* http://www.sbras.ru/cgi-bin/www/unix_help/unix-man?sysexits+3 +* http://linux.die.net/include/sysexits.h +* 
https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=misc/sysexits.h;hb=HEAD + +Note that the following error valies do not mean that the node is +stopped and therefore doesn't covered by this commit: + +* 64 - EX_USAGE +* 65 - EX_DATAERR +* 67 - EX_NOUSER + +Signed-off-by: Peter Lemenkov +--- + heartbeat/rabbitmq-cluster | 7 +------ + 1 file changed, 1 insertion(+), 6 deletions(-) + +diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster +index b9ae38e..651b837 100755 +--- a/heartbeat/rabbitmq-cluster ++++ b/heartbeat/rabbitmq-cluster +@@ -162,12 +162,7 @@ rmq_monitor() { + + return $OCF_SUCCESS + ;; +- 2) +- ocf_log info "RabbitMQ server is not running" +- rmq_delete_nodename +- return $OCF_NOT_RUNNING +- ;; +- 69) ++ 2|68|69|70|75|78) + ocf_log info "RabbitMQ server is not running" + rmq_delete_nodename + return $OCF_NOT_RUNNING diff --git a/SOURCES/bz1347536-saphana-mcos-support.patch b/SOURCES/bz1347536-saphana-mcos-support.patch new file mode 100644 index 0000000..1532f94 --- /dev/null +++ b/SOURCES/bz1347536-saphana-mcos-support.patch @@ -0,0 +1,1778 @@ +diff -uNr a/heartbeat/SAPHana b/heartbeat/SAPHana +--- a/heartbeat/SAPHana 2016-04-26 12:01:55.620889964 +0200 ++++ b/heartbeat/SAPHana 2016-04-26 12:03:17.240897137 +0200 +@@ -2,9 +2,9 @@ + # + # SAPHana + # +-# Description: Manages two single SAP HANA Instance in System Replication ++# Description: Manages two single SAP HANA Instance in System Replication + # Planned: do also manage scale-up scenarios +-# currently the SAPHana is dependent of the analysis of ++# currently the SAPHana is dependent of the analysis of + # SAPHanaTopology + # For supported scenarios please read the README file provided + # in the same software package (rpm) +@@ -16,16 +16,17 @@ + # Support: linux@sap.com + # License: GNU General Public License (GPL) + # Copyright: (c) 2013,2014 SUSE Linux Products GmbH ++# Copyright: (c) 2015 SUSE Linux GmbH + # +-# An example usage: ++# An example usage: + # See usage() function below for more details... + # + # OCF instance parameters: +-# OCF_RESKEY_SID +-# OCF_RESKEY_InstanceNumber +-# OCF_RESKEY_DIR_EXECUTABLE (optional, well known directories will be searched by default) +-# OCF_RESKEY_DIR_PROFILE (optional, well known directories will be searched by default) +-# OCF_RESKEY_INSTANCE_PROFILE (optional, well known directories will be searched by default) ++# OCF_RESKEY_SID ++# OCF_RESKEY_InstanceNumber ++# OCF_RESKEY_DIR_EXECUTABLE (optional, well known directories will be searched by default) ++# OCF_RESKEY_DIR_PROFILE (optional, well known directories will be searched by default) ++# OCF_RESKEY_INSTANCE_PROFILE (optional, well known directories will be searched by default) + # OCF_RESKEY_PREFER_SITE_TAKEOVER (optional, default is no) + # OCF_RESKEY_DUPLICATE_PRIMARY_TIMEOUT (optional, time difference needed between two last-primary-tiemstampe (lpt)) + # OCF_RESKEY_SAPHanaFilter (optional, should only be set if been told by support or for debugging purposes) +@@ -71,7 +72,7 @@ + info ) + case "$shf" in + all) skip=0 +- ;; ++ ;; + none ) + skip=1 + ;; +@@ -80,13 +81,13 @@ + mtype=${mtype#fh} + echo "$shf"| grep -iq ${mtype}; search=$? + if [ $search -eq 0 ]; then +- skip=0 ++ skip=0 + else + skip=1 + fi + ;; + esac +- ;; ++ ;; + esac + if [ $skip -eq 0 ]; then + ocf_log "$level" "$message" +@@ -103,8 +104,8 @@ + local rc=0 + methods=$(saphana_methods) + methods=$(echo $methods | tr ' ' '|') +- cat <<-! 
+- usage: $0 ($methods) ++ cat <<-EOF ++ usage: $0 ($methods) + + $0 manages a SAP HANA Instance as an HA resource. + +@@ -118,8 +119,17 @@ + The 'validate-all' operation reports whether the parameters are valid + The 'methods' operation reports on the methods $0 supports + +- ! +- return $rc ++EOF ++ return $rc ++} ++ ++function backup_global_and_nameserver() { ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=0 ++ cp /hana/shared/LNX/global/hdb/custom/config/global.ini /hana/shared/LNX/global/hdb/custom/config/global.ini.$(date +"%s") ++ cp /hana/shared/LNX/global/hdb/custom/config/nameserver.ini /hana/shared/LNX/global/hdb/custom/config/nameserver.ini.$(date +"%s") ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc + } + + # +@@ -130,11 +140,12 @@ + function saphana_meta_data() { + super_ocf_log info "FLOW $FUNCNAME ($*)" + local rc=0 +- cat < + + +-0.149.7 ++0.151.1 + + Manages two SAP HANA instances in system replication (SR). + +@@ -157,7 +168,7 @@ + 2. landscapeHostConfiguration + The interface is used to monitor a HANA system. The python script is named landscapeHostConfiguration.py. + landscapeHostConfiguration.py has some detailed output about HANA system status +- and node roles. For our monitor the overall status is relevant. This overall ++ and node roles. For our monitor the overall status is relevant. This overall + status is reported by the returncode of the script: + 0: Internal Fatal, 1: ERROR, 2: WARNING, 3: INFO, 4: OK + The SAPHana resource agent will interpret returncodes 0 as FATAL, 1 as not-running or ERROR and and returncodes 2+3+4 as RUNNING. +@@ -168,14 +179,14 @@ + system replication takeover (sr_takeover) or to register a former primary to a newer one (sr_register). + + 4. hdbsql / systemReplicationStatus +- Interface is SQL query into HANA (system replication table). The hdbsql query will be replaced by a python script ++ Interface is SQL query into HANA (system replication table). The hdbsql query will be replaced by a python script + "systemReplicationStatus.py" in SAP HANA SPS8 or 9. + As long as we need to use hdbsql you need to setup secure store users for linux user root to be able to + access the SAP HANA database. You need to configure a secure store user key "SAPHANA${SID}SR" which can connect the SAP +- HANA database: ++ HANA database: + + 5. saphostctrl +- The interface saphostctrl uses the function ListInstances to figure out the virtual host name of the ++ The interface saphostctrl uses the function ListInstances to figure out the virtual host name of the + SAP HANA instance. This is the hostname used during the HANA installation. + + +@@ -207,7 +218,7 @@ + + + Time difference needed between to primary time stamps, if a dual-primary situation occurs +- Time difference needed between to primary time stamps, ++ Time difference needed between to primary time stamps, + if a dual-primary situation occurs. If the time difference is + less than the time gap, then the cluster hold one or both instances in a "WAITING" status. This is to give an admin + a chance to react on a failover. A failed former primary will be registered after the time difference is passed. After +@@ -231,12 +242,8 @@ + + + +- Define SAPHana resource agent messages to be printed +- Define SAPHana resource agent messages to be printed. +- This parameter should only be set if requested by support. The default is sufficient for normal operation. 
+- Values: ra-act-lpa-dec-flow +- You could specify any combination of the above values like "ra-act-flow" +- ++ OUTDATED PARAMETER ++ OUTDATED PARAMETER + + + +@@ -271,7 +278,7 @@ + for m in start stop status monitor promote demote notify validate-all methods meta-data usage; do + echo "$m" + done +- return $rc ++ return $rc + } + + # +@@ -298,7 +305,7 @@ + local remoteNode="" + local rc=1 + for cl in ${otherNodes[@]}; do +- vHost=$(get_hana_attribute $cl ${ATTR_NAME_HANA_VHOST[@]}) ++ vHost=$(get_hana_attribute $cl ${ATTR_NAME_HANA_VHOST[@]} "$cl") + if [ "$vHost" = "$remoteHost" ]; then # we found the correct node + remoteNode=$cl + rc=0 +@@ -347,9 +354,10 @@ + } + + # +-# function: get_hana_attribute ++# function: get_hana_attribute + # params: NODE ATTR [STORE] + # globals: - ++# output: attribute value + # + function get_hana_attribute() + { +@@ -358,14 +366,20 @@ + local attr_node=$1 + local attr_name=$2 + local attr_store=${3:-reboot} # DONE: PRIO5 get this (optional) from parameter +- local attr_default=${5:-} ++ local attr_default=${4:-} ++ local dstr + local attr_val="" +- attr_val=$(crm_attribute -N ${attr_node} -G -n "$attr_name" -l $attr_store -q -d "$attr_default"); rc=$? +- if [ $debug_attributes -eq 1 ]; then +- dstr=$(date) +- echo "$dstr: SAPHana: crm_attribute -N ${attr_node} -G -n \"$attr_name\" -l $attr_store -q --> $attr_val" >> /var/log/fhATTRIBUTE +- fi +- echo "$attr_val" ++ dstr=$(date) ++ case "$attr_store" in ++ reboot | forever ) ++ echo "$dstr: SAPHana: crm_attribute -N ${attr_node} -G -n \"$attr_name\" -l $attr_store -q" >> /var/log/fhATTRIBUTE ++ crm_attribute -N ${attr_node} -G -n "$attr_name" -l $attr_store -q -d "$attr_default" 2>>/var/log/fhATTRIBUTE; rc=$? ++ ;; ++ props ) ++ echo "$dstr: SAPHana: crm_attribute -G -n \"$attr_name\" -t crm_config -q" >> /var/log/fhATTRIBUTE ++ crm_attribute -G -n "$attr_name" -t crm_config -q -d "$attr_default" 2>>/var/log/fhATTRIBUTE; rc=$? ++ ;; ++ esac + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc + } +@@ -388,11 +402,17 @@ + attr_old=$(get_hana_attribute $attr_node $attr_name $attr_store $attr_default); get_rc=$? + if [ "$attr_old" != "$attr_value" ]; then + super_ocf_log debug "DBG: SET attribute $attr_name for node ${attr_node} to ${attr_value} former ($attr_old) get_rc=$get_rc " +- crm_attribute -N $attr_node -v $attr_value -n "$attr_name" -l $attr_store; rc=$? +- if [ $debug_attributes -eq 1 ]; then +- dstr=$(date) +- echo "$dstr: SAPHana: crm_attribute -N $attr_node -v $attr_value -n \"$attr_name\" -l $attr_store" >> /var/log/fhATTRIBUTE +- fi ++ dstr=$(date) ++ case "$attr_store" in ++ reboot | forever ) ++ echo "$dstr: SAPHana: crm_attribute -N $attr_node -v $attr_value -n \"$attr_name\" -l $attr_store" >> /var/log/fhATTRIBUTE ++ crm_attribute -N $attr_node -v $attr_value -n "$attr_name" -l $attr_store 2>>/var/log/fhATTRIBUTE; rc=$? ++ ;; ++ props ) ++ echo "$dstr: SAPHana: crm_attribute -v $attr_value -n \"$attr_name\" -t crm_config -s SAPHanaSR" >> /var/log/fhATTRIBUTE ++ crm_attribute -v $attr_value -n "$attr_name" -t crm_config -s SAPHanaSR 2>>/var/log/fhATTRIBUTE; rc=$? 
++ ;; ++ esac + else + super_ocf_log debug "DBG: LET attribute $attr_name for node ${attr_node} still be ${attr_value}" + rc=0 +@@ -408,7 +428,8 @@ + # + function assert() { + super_ocf_log info "FLOW $FUNCNAME ($*)" +- local err_msg=$1 local default_rc=$OCF_NOT_RUNNING ++ local err_msg=$1 ++ local default_rc=$OCF_NOT_RUNNING + # DONE: Check, if we need to destinguish between probe and others + if ocf_is_probe; then + default_exit=$OCF_NOT_RUNNING +@@ -435,7 +456,7 @@ + local score=0 + if [ -n "$1" ]; then + score=$1 +- fi ++ fi + # DONE: PRIO2: Only adjust master if value is really different (try to check that) + oldscore=$(${HA_SBIN_DIR}/crm_master -G -q -l reboot) + if [ "$oldscore" != "$score" ]; then +@@ -452,7 +473,7 @@ + # + # function: scoring_crm_master - score instance due to role ans sync match (table SCORING_TABLE_PREFERRED_SITE_TAKEOVER) + # params: NODE_ROLES NODE_SYNC_STATUS +-# globals: SCORING_TABLE_PREFERRED_SITE_TAKEOVER[@], ++# globals: SCORING_TABLE_PREFERRED_SITE_TAKEOVER[@], + # + scoring_crm_master() + { +@@ -467,7 +488,7 @@ + if grep "$rolePatt" <<< "$roles"; then + if grep "$syncPatt" <<< "$sync"; then + skip=1 +- myScore=$score ++ myScore=$score + fi + fi + fi +@@ -496,7 +517,7 @@ + # function: saphana_init - initialize variables for the resource agent + # params: InstanceName + # globals: OCF_*(r), SID(w), sid(rw), sidadm(w), InstanceName(w), InstanceNr(w), SAPVIRHOST(w), PreferSiteTakeover(w), +-# globals: sr_name(w), remoteHost(w), otherNodes(w) ++# globals: sr_name(w), remoteHost(w), otherNodes(w), rem_SR_name(w) + # globals: ATTR_NAME_HANA_SYNC_STATUS(w), ATTR_NAME_HANA_CLONE_STATE(w) + # globals: DIR_EXECUTABLE(w), SAPSTARTSRV(w), SAPCONTROL(w), DIR_PROFILE(w), SAPSTARTPROFILE(w), LD_LIBRARY_PATH(w), PATH(w) + # globals: LPA_DIRECTORY(w), SIDInstanceName(w), remoteNode(w), hdbSrQueryTimeout(w) +@@ -506,6 +527,8 @@ + super_ocf_log info "FLOW $FUNCNAME ($*)" + local rc=$OCF_SUCCESS + local vName ++ local clN ++ # local site + # two parameter models (for transition only) + # OLD: InstanceName + # NEW: SID InstanceNumber +@@ -528,11 +551,10 @@ + # + # if saphostctrl does not know the answer, try to fallback to attribute provided by SAPHanaTopology + # +- vName=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_VHOST[@]}); ++ vName=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_VHOST[@]} "$NODENAME"); + fi + SAPVIRHOST=${vName} + PreferSiteTakeover="$OCF_RESKEY_PREFER_SITE_TAKEOVER" +- SAPHanaFilter="${OCF_RESKEY_SAPHanaFilter:-ra-act-dec-lpa}" + AUTOMATED_REGISTER="${OCF_RESKEY_AUTOMATED_REGISTER:-false}" + LPA_DIRECTORY=/var/lib/SAPHanaRA + LPA_ATTR=("lpa_${sid}_lpt" "forever") +@@ -591,6 +613,8 @@ + *openais* ) otherNodes=($(crm_node -l | awk '$3 == "member" { if ($2 != me) { print $2 }}' me=${NODENAME}));; + *cman* ) otherNodes=($(crm_node -l | awk '{for (i=1; i<=NF; i++) { if ($i != me) { print $i }}}' me=${NODENAME}));; + esac ++ # ++ # + + remoteHost=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_REMOTEHOST[@]}); + if [ -z "$remoteHost" ]; then +@@ -611,9 +635,13 @@ + # ATTR_NAME_HANA_SITE + sr_name=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_SITE[@]}); + sr_mode=$(get_hana_attribute "${NODENAME}" ${ATTR_NAME_HANA_SRMODE[@]}) ++ + if [ -z "$sr_mode" ]; then + sr_mode="sync" + fi ++ if [ -n "$remoteNode" ]; then ++ rem_SR_name=$(get_hana_attribute ${remoteNode} ${ATTR_NAME_HANA_SITE[@]}); ++ fi + super_ocf_log debug "DBG: sr_name=$sr_name, remoteHost=$remoteHost, remoteNode=$remoteNode, sr_mode=$sr_mode" + # optional OCF parameters, we try to 
guess which directories are correct + if [ -z "$OCF_RESKEY_DIR_EXECUTABLE" ] +@@ -706,7 +734,7 @@ + then + runninginst=$(echo "$output" | grep '^0 : ' | cut -d' ' -f3) + if [ "$runninginst" != "$InstanceName" ] +- then ++ then + super_ocf_log warn "ACT: sapstartsrv is running for instance $runninginst, that service will be killed" + restart=1 + else +@@ -784,38 +812,113 @@ + node_full_status=$(su - ${sidadm} -c "hdbnsutil -sr_state" 2>/dev/null ) + node_status=$(echo "$node_full_status" | awk '$1=="mode:" {print $2}') + super_ocf_log debug "DBG: check_for_primary: node_status=$node_status" ++ # TODO: PRIO2: Maybe we need to use a fallback interface when hdbnsitil does not answer properly -> lookup in config files? ++ # This might also solve some problems when we could not figure-out the ilocal or remote site name + for i in 1 2 3 4 5 6 7 8 9; do + case "$node_status" in +- primary ) +- super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_PRIMARY" +- return $HANA_STATE_PRIMARY;; ++ primary ) ++ super_ocf_log info "FLOW: $FUNCNAME rc=HANA_STATE_PRIMARY" ++ return $HANA_STATE_PRIMARY;; + syncmem | sync | async ) +- super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_SECONDARY" +- return $HANA_STATE_SECONDARY;; +- none ) # have seen that mode on second side BEFEORE we registered it as replica +- super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_STANDALONE" +- return $HANA_STATE_STANDALONE;; ++ super_ocf_log info "FLOW: $FUNCNAME rc=HANA_STATE_SECONDARY" ++ return $HANA_STATE_SECONDARY;; ++ none ) # have seen that mode on second side BEFEORE we registered it as replica ++ super_ocf_log info "FLOW: $FUNCNAME rc=HANA_STATE_STANDALONE" ++ return $HANA_STATE_STANDALONE;; + * ) +- super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: <$node_status>" +- dump=$( echo $node_status | hexdump -C ); +- super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: DUMP <$dump>" +- node_full_status=$(su - ${sidadm} -c "hdbnsutil -sr_state" 2>/dev/null ) +- node_status=$(echo "$node_full_status" | awk '$1=="mode:" {print $2}') +- super_ocf_log debug "DEC: check_for_primary: loop=$i: node_status=$node_status" +- # TODO: PRIO1: Maybe we need to keep the old value for P/S/N, if hdbnsutil just crashes ++ super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: <$node_status>" ++ dump=$( echo $node_status | hexdump -C ); ++ super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: DUMP <$dump>" ++ node_full_status=$(su - ${sidadm} -c "hdbnsutil -sr_state" 2>/dev/null ) ++ node_status=$(echo "$node_full_status" | awk '$1=="mode:" {print $2}') ++ super_ocf_log debug "DEC: check_for_primary: loop=$i: node_status=$node_status" ++ # TODO: PRIO1: Maybe we need to keep the old value for P/S/N, if hdbnsutil just crashes + esac; + done + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc + } + ++# function: analyze_hana_sync_statusSRS ++# params: - ++# globals: DIR_EXECUTABLE(r), FULL_SR_STATUS(w), remoteNode ++# ++# systemReplicationStatus.py return-codes: ++# NoHSR = 10 ++# Error = 11 ++# Unkown = 12 ++# Initializing = 13 ++# Syncing = 14 ++# Active = 15 ++function analyze_hana_sync_statusSRS() ++{ ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=-1 srRc=0 all_nodes_other_side="" n="" siteParam="" ++ if [ -n "$rem_SR_name" ]; then ++ siteParam="--site=$rem_SR_name" ++ fi ++ FULL_SR_STATUS=$(su - $sidadm -c "python $DIR_EXECUTABLE/python_support/systemReplicationStatus.py $siteParam" 2>/dev/null); srRc=$? 
++ super_ocf_log info "DEC $FUNCNAME systemReplicationStatus.py (to site '$rem_SR_name')-> $srRc" ++ super_ocf_log info "FLOW $FUNCNAME systemReplicationStatus.py (to site '$rem_SR_name')-> $srRc" ++ # ++ # TODO: PRIO2: Here we might also need to filter additional sites (if multi tier should be supported) ++ # And is the check for return code capable for chains? ++ # ++ if [ $srRc -eq 15 ]; then ++ # Fix for a HANA BUG, where a non-working SR resulted in RC 15: ++ if grep -q "ACTIVE" <<< "$FULL_SR_STATUS"; then ++ super_ocf_log info "FLOW $FUNCNAME SOK" ++ set_hana_attribute "$remoteNode" "SOK" ${ATTR_NAME_HANA_SYNC_STATUS[@]} ++ super_ocf_log info "ACT site=$sr_name, seting SOK for secondary (1)" ++ lpa_set_lpt 30 "$remoteNode" ++ rc=0; ++ else ++ # ok we should be careful and set secondary to SFAIL ++ super_ocf_log info "FLOW $FUNCNAME SFAIL" ++ set_hana_attribute "$remoteNode" "SFAIL" ${ATTR_NAME_HANA_SYNC_STATUS[@]} ++ super_ocf_log info "ACT site=$sr_name, seting SFAIL for secondary (6) - srRc=$srRc lss=$lss No ACTIVES found in cmd output" ++ # TODO: PRIO1 - P004: need to check LSS again to avoid dying primary to block (SFAIL) secondary ++ lpa_set_lpt 10 "$remoteNode" ++ fi ++ elif [ $srRc -le 11 ]; then # 11 and 10 ++ # if systemReplicationStatus is ERROR and landscapeHostConfiguration is down than do NOT set SFAIL ++ get_hana_landscape_status; lss=$? ++ if [ $lss -lt 2 ]; then ++ # keep everithing like it was ++ rc=2 ++ else ++ # ok we should be careful and set secondary to SFAIL ++ super_ocf_log info "FLOW $FUNCNAME SFAIL" ++ set_hana_attribute "$remoteNode" "SFAIL" ${ATTR_NAME_HANA_SYNC_STATUS[@]} ++ super_ocf_log info "ACT site=$sr_name, seting SFAIL for secondary (5) - srRc=$srRc lss=$lss" ++ # TODO: PRIO1 - P004: need to check LSS again to avoid dying primary to block (SFAIL) secondary ++ lpa_set_lpt 10 "$remoteNode" ++ rc=1 ++ fi ++ else ++ super_ocf_log info "FLOW $FUNCNAME SFAIL" ++ set_hana_attribute "$remoteNode" "SFAIL" ${ATTR_NAME_HANA_SYNC_STATUS[@]} ++ super_ocf_log info "ACT site=$sr_name, seting SFAIL for secondary (2) - srRc=$srRc" ++ # TODO: PRIO1 - P004: need to check LSS again to avoid dying primary to block (SFAIL) secondary ++ lpa_set_lpt 10 "$remoteNode" ++ rc=1; ++ fi ++ super_ocf_log info "FLOW $FUNCNAME PRIM+LPA" ++ super_ocf_log info "DBG PRIM" ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ + # +-# function: analyze_hana_sync_status - query and check hana system replication status ++#### ++#### OLD HDBSQL STUFF FOR SPS6,7,8 AND SCALE-UP ONLY ++#### ++# function: analyze_hana_sync_statusSQL - query and check hana system replication status + # params: - + # globals: DIR_EXECUTABLE(r), remoteHost(r) + # get the HANA sync status +-# +-function analyze_hana_sync_status() ++# ++function analyze_hana_sync_statusSQL() + { + super_ocf_log info "FLOW $FUNCNAME ($*)" + local -a clusterNodes=() +@@ -863,35 +966,9 @@ + # TODO PRIO1: REMOVE remoteNode dependency - set SFAIL + set_hana_attribute "$remoteNode" "SFAIL" ${ATTR_NAME_HANA_SYNC_STATUS[@]} + fi +- # first get a list of all secondary hosts, than a list of all secondary hosts, if the is ANY failure at this site +- # TODO: PRIO9: for first we assume there is only ONE secondary site (like ROT) +- # TODO: PRIO3: should we loop over all cluster nodes fetching their roles-attribute? To minimize sql-queries? +- # +- all_secondary_hosts=$(timeout $hdbSrQueryTimeout hdbsql -a -x -U $secUser $query_secondaries ); sqlrc=$? 
+- all_secondary_hosts=$(echo $all_secondary_hosts | dequote); +- if [ "$sqlrc" -eq 0 ]; then +- all_broken_secondary_hosts=$(timeout $hdbSrQueryTimeout hdbsql -a -x -U $secUser $query_failed_secondaries); sqlrc=$? +- all_broken_secondary_hosts=$(echo $all_broken_secondary_hosts | dequote); +- if [ "$sqlrc" -eq 0 ]; then +- if [ -n "$all_broken_secondary_hosts" ]; then +- # +- # we have a broken secondary site - set all hosts to "SFAIL" +- # +- # Note: since HANA hostname can be different from nodename we need to check all vhost attributes +- for n in $all_broken_secondary_hosts; do +- for cl in ${otherNodes[@]}; do +- vHost=$(get_hana_attribute $cl ${ATTR_NAME_HANA_VHOST[@]}) +- if [ "$vHost" = "$n" ]; then # we found the correct node +- set_hana_attribute $cl "SFAIL" ${ATTR_NAME_HANA_SYNC_STATUS[@]} +- fi +- done +- done +- fi +- fi +- fi + else + case "$sqlrc" in +- 19 ) ++ 19 ) + # return codes 19: license error -> set SFAIL! + # DONE: PRIO1: We should NOT set SFAIL, if HDB is exactly broken now + # When HDB breaks during monitor this could prevent a prositive remote failover +@@ -901,7 +978,7 @@ + done + ;; + esac +- fi ++ fi + return $rc + } + +@@ -932,10 +1009,18 @@ + local remoteInstance=""; + remoteInstance=$InstanceNr + if ocf_is_true ${AUTOMATED_REGISTER}; then ++ # ++ # ++ # ++ # ++ # + super_ocf_log info "ACT: REGISTER: hdbnsutil -sr_register --remoteHost=$remoteHost --remoteInstance=$remoteInstance --mode=$sr_mode --name=$sr_name" ++ # ++ # + su - $sidadm -c "hdbnsutil -sr_register --remoteHost=$remoteHost --remoteInstance=$remoteInstance --mode=$sr_mode --name=$sr_name"; rc=$? ++ # backup_global_and_nameserver + else +- super_ocf_log info "ACT: IGNORE REGISTER because AUTOMATED_REGISTER is set to FALSE" ++ super_ocf_log info "ACT: SAPHANA DROP REGISTER because AUTOMATED_REGISTER is set to FALSE" + rc=1 + fi + super_ocf_log info "FLOW $FUNCNAME rc=$rc" +@@ -945,7 +1030,7 @@ + # + # function: saphana_status - pure status check + # params: - +-# globals: SIDInstanceName, OCF_*, ++# globals: SIDInstanceName, OCF_*, + function saphana_status() { + local binDeam="hdb.sap${SIDInstanceName}" rc=0 + binDeam=${binDeam:0:15} # Process name is limited to the first 15 characters +@@ -956,13 +1041,13 @@ + # + # function: saphana_start - start a hana instance + # params: - +-# globals: OCF_*, SAPCONTROL, InstanceNr, SID, InstanceName, ++# globals: OCF_*, SAPCONTROL, InstanceNr, SID, InstanceName, + # + function saphana_start() { + super_ocf_log info "FLOW $FUNCNAME ($*)" + local rc=$OCF_NOT_RUNNING + local output="" +- local loopcount=0 ++ local loopcount=0 + check_sapstartsrv + rc=$? + # +@@ -1000,11 +1085,11 @@ + # saphana_stop: Stop the SAP instance + # + function saphana_stop() { +- super_ocf_log info "FLOW $FUNCNAME ($*)" +- local output="" +- local rc=0 +- check_sapstartsrv; rc=$? +- if [ $rc -eq $OCF_SUCCESS ]; then ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local output="" ++ local rc=0 ++ check_sapstartsrv; rc=$? ++ if [ $rc -eq $OCF_SUCCESS ]; then + output=$($SAPCONTROL -nr $InstanceNr -function Stop) + rc=$? 
+ super_ocf_log info "ACT: Stopping SAP Instance $SID-$InstanceName: $output" +@@ -1032,7 +1117,7 @@ + # function: saphana_validate - validation of (some) variables/parameters + # params: - + # globals: OCF_*(r), SID(r), InstanceName(r), InstanceNr(r), SAPVIRHOST(r) +-# saphana_validate: Check the symantic of the input parameters ++# saphana_validate: Check the symantic of the input parameters + # + function saphana_validate() { + super_ocf_log info "FLOW $FUNCNAME ($*)" +@@ -1060,12 +1145,12 @@ + # + # function: saphana_start_primary - handle startup of PRIMARY in M/S + # params: +-# globals: OCF_*(r), NODENAME, ATTR_NAME_*, HANA_STATE_*, ++# globals: OCF_*(r), NODENAME, ATTR_NAME_*, HANA_STATE_*, + # + function saphana_start_primary() + { + super_ocf_log info "FLOW $FUNCNAME ($*)" +- local primary_status sync_attr score_master rc=$OCF_NOT_RUNNING ++ local primary_status sync_attr score_master rc=$OCF_NOT_RUNNING + local lss sqlrc; + local rc=0 + local lpa_dec=4 +@@ -1074,7 +1159,7 @@ + # we will be a master (PRIMARY) so checking, if the is an OTHER master + # + super_ocf_log debug "DBG: saphana_primary - check_for_primary reports HANA_STATE_PRIMARY" +- # ++ # + lpa_init_lpt $HANA_STATE_PRIMARY + lpa_check_lpt_status; lpa_dec=$? + get_hana_landscape_status; lss=$? +@@ -1139,7 +1224,7 @@ + 1 ) # landcape says we are down, lets start and adjust scores and return code + super_ocf_log info "LPA: landcape: DOWN, LPA: start ==> start instance" + saphana_start +- rc=$? ++ rc=$? + LPTloc=$(date '+%s') + lpa_set_lpt $LPTloc + ;; +@@ -1152,7 +1237,7 @@ + # DONE: PRIO3: check if this reaction is correct - tell cluster about failed start + super_ocf_log info "LPA: landcape: UP, LPA: register ==> take down" + set_crm_master -inf +- rc=$OCF_NOT_RUNNING ++ rc=$OCF_NOT_RUNNING + ;; + 1 ) # lets try to register + # DONE: PRIO2: Like Action in start_secondary +@@ -1160,7 +1245,7 @@ + super_ocf_log info "DEC: AN OTHER HANA IS AVAILABLE ==> LETS REGISTER" + set_crm_master 0 + if wait_for_primary_master 1; then +- register_hana_secondary ++ register_hana_secondary + check_for_primary; primary_status=$? + if [ $primary_status -eq $HANA_STATE_SECONDARY ]; then + super_ocf_log info "ACT: Register successful" +@@ -1169,11 +1254,11 @@ + set_crm_master 0 + saphana_start_secondary + rc=$? +- lpa_set_lpt 30 ++ lpa_set_lpt 10 + else + super_ocf_log err "ACT: Register failed" + rc=$OCF_NOT_RUNNING +- fi ++ fi + else + # lets check next monitor, if we can register + rc=$OCF_SUCCESS +@@ -1185,6 +1270,9 @@ + case "$lss" in + 2 | 3 | 4 ) # as we ARE up we just keep it up + # TODO: PRIO3: I now change from "just keep it up to take that down" ++# TODO: PRIO1 differ lpt_advice!! ++# 2 => DOWN ++# 3 => KEEP + # TODO: PRIO3: OCF_SUCCESS, OCF_NOT_RUNNING or OCF_ERR_xxxx ? 
+ set_crm_master -9000 + #scoring_crm_master "$my_role" "$my_sync" +@@ -1193,7 +1281,7 @@ + 1 ) # we are down, so we should wait --> followup in next monitor + super_ocf_log info "LPA: landcape: DOWN, LPA: wait ==> keep waiting" + # TODO: PRIO3: Check, if WAITING is correct here +- set_hana_attribute ${NODENAME} "WAITING" ${ATTR_NAME_HANA_CLONE_STATE[@]} ++ set_hana_attribute ${NODENAME} "WAITING4LPA" ${ATTR_NAME_HANA_CLONE_STATE[@]} + set_crm_master -9000 + rc=$OCF_SUCCESS + ;; +@@ -1202,7 +1290,7 @@ + fail ) # process a lpa FAIL + super_ocf_log info "LPA: LPA reports FAIL" + set_crm_master -inf +- rc=$OCF_NOT_RUNNING ++ rc=$OCF_NOT_RUNNING + ;; + esac + super_ocf_log info "FLOW $FUNCNAME rc=$rc" +@@ -1278,12 +1366,12 @@ + # + # function: saphana_start_secondary - handle startup of PRIMARY in M/S + # params: +-# globals: OCF_*(r), NODENAME, ATTR_NAME_*, ++# globals: OCF_*(r), NODENAME, ATTR_NAME_*, + # + function saphana_start_secondary() + { + super_ocf_log info "FLOW $FUNCNAME ($*)" +- local primary_status sync_attr score_master rc=$OCF_NOT_RUNNING ++ local primary_status sync_attr score_master rc=$OCF_NOT_RUNNING + local sqlrc; + set_crm_master 0 + # +@@ -1291,9 +1379,9 @@ + # + lpa_push_lpt 10 + lpa_set_lpt 10 +- # ++ # + ####### LPA - end +- # ++ # + # + # we would be slave (secondary) + # we first need to check, if there are Master Nodes, because the Scecondary only starts +@@ -1311,16 +1399,16 @@ + # It seams the stating secondary could not start because of stopping primary + # so this is a WAITING situation + super_ocf_log info "ACT: PRIMARY seams to be down now ==> WAITING" +- set_hana_attribute ${NODENAME} "WAITING" ${ATTR_NAME_HANA_CLONE_STATE[@]} ++ set_hana_attribute ${NODENAME} "WAITING4PRIM" ${ATTR_NAME_HANA_CLONE_STATE[@]} + set_crm_master -INFINITY + rc=$OCF_SUCCESS + fi + else +- lpa_set_lpt 30 ++ lpa_set_lpt 10 + fi + else + super_ocf_log info "ACT: wait_for_primary_master ==> WAITING" +- set_hana_attribute ${NODENAME} "WAITING" ${ATTR_NAME_HANA_CLONE_STATE[@]} ++ set_hana_attribute ${NODENAME} "WAITING4PRIM" ${ATTR_NAME_HANA_CLONE_STATE[@]} + set_crm_master -INFINITY + rc=$OCF_SUCCESS + fi +@@ -1329,11 +1417,71 @@ + } + + # ++# function: saphana_check_local_instance ++# params: ++# output: ++# rc: rc=0 (UP) rc=1 (DOWN) ++# globals: ++# ++function saphana_check_local_instance() ++{ ++ local rc=1 ++ local count=0 ++ local SERVNO ++ local output ++ local MONITOR_SERVICES="hdbnameserver|hdbdaemon" # TODO: PRIO1: exact list of Services ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ check_sapstartsrv ++ rc=$? ++ if [ $rc -eq $OCF_SUCCESS ] ++ then ++ output=$($SAPCONTROL -nr $InstanceNr -function GetProcessList -format script) ++ # we have to parse the output, because the returncode doesn't tell anything about the instance status ++ for SERVNO in `echo "$output" | grep '^[0-9] ' | cut -d' ' -f1 | sort -u` ++ do ++ local COLOR=`echo "$output" | grep "^$SERVNO dispstatus: " | cut -d' ' -f3` ++ local SERVICE=`echo "$output" | grep "^$SERVNO name: " | cut -d' ' -f3` ++ local STATE=0 ++ local SEARCH ++ ++ case $COLOR in ++ GREEN|YELLOW) STATE=$OCF_SUCCESS;; ++ *) STATE=$OCF_NOT_RUNNING;; ++ esac ++ ++ SEARCH=`echo "$MONITOR_SERVICES" | sed 's/\+/\\\+/g' | sed 's/\./\\\./g'` ++ if [ `echo "$SERVICE" | egrep -c "$SEARCH"` -eq 1 ] ++ then ++ if [ $STATE -eq $OCF_NOT_RUNNING ] ++ then ++ [ "$MONLOG" != "NOLOG" ] && ocf_log err "SAP instance service $SERVICE is not running with status $COLOR !" 
++ rc=$STATE ++ fi ++ count=1 ++ fi ++ done ++ ++ if [ $count -eq 0 -a $rc -eq $OCF_SUCCESS ] ++ then ++ if ocf_is_probe ++ then ++ rc=1 ++ else ++ [ "$MONLOG" != "NOLOG" ] && ocf_log err "The SAP instance does not run any services which this RA could monitor!" ++ rc=1 ++ fi ++ fi ++ fi ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc ++} ++ ++# + # function: lpa_get_lpt - get lpt from cluster + # params: NODE + # output: LPT + # rc: rc=0: OK, rc=1: InternalERROR, rc=2: ERROR +-# globals: LPA_ATTR_*, ++# globals: LPA_ATTR_*, + # + function lpa_get_lpt() { + super_ocf_log info "FLOW $FUNCNAME ($*)" +@@ -1348,7 +1496,7 @@ + rc=2 + fi + super_ocf_log info "FLOW $FUNCNAME rc=$rc" +- return $rc ++ return $rc + } + + # +@@ -1372,7 +1520,7 @@ + rc=0 + fi + super_ocf_log info "FLOW $FUNCNAME rc=$rc" +- return $rc ++ return $rc + } + + # +@@ -1398,7 +1546,7 @@ + rc=2 + fi + super_ocf_log info "FLOW $FUNCNAME rc=$rc" +- return $rc ++ return $rc + } + + # +@@ -1422,15 +1570,15 @@ + rc=2 + else + rc=0 +- fi ++ fi + super_ocf_log info "FLOW $FUNCNAME rc=$rc" +- return $rc ++ return $rc + } + + # + # function: lpa_init_lpt - initialize local lpt, if needed + # params: HANA_STATE +-# globals: HANA_STATE_*(r), LPA_DIRECTORY(r), sid(r), NODENAME(r), ++# globals: HANA_STATE_*(r), LPA_DIRECTORY(r), sid(r), NODENAME(r), + # lpa_init_lpt + # + # Returncodes: +@@ -1439,7 +1587,7 @@ + # Initializing (if NO local LPT-file): + # SECONDARY sets to 0 + # PRIMARY sets to 1 +-# ++# + function lpa_init_lpt() { + super_ocf_log info "FLOW $FUNCNAME ($*)" + local rc=1 +@@ -1458,11 +1606,11 @@ + LPTloc=10 + lpa_push_lpt "10"; rc=$? + else +- rc=2 ++ rc=2 + fi + lpa_set_lpt $LPTloc + super_ocf_log info "FLOW $FUNCNAME rc=$rc" +- return $rc ++ return $rc + } + + # +@@ -1472,6 +1620,10 @@ + # lpa_check_lpt_status + # + # Returncodes: ++# 0: start ++# 1: register than start ++# 2: wait4gab ++# 3: wait4other + # + # Initializing (if NO local LPT-file): + # SECONDARY sets to 10 +@@ -1480,20 +1632,20 @@ + # LPRlocal OR LPTremore ARE real lpt (>1000) + # THEN: + # Bigger LPR wins, if delta-gab is OK +-# LPTlocal >> LPTremore ===> rc=0 (start) ++# LPTlocal >> LPTremore ===> rc=0 (start) + # LPTRemote >> LPTlocal ===> rc=1 (register) +-# Stalemate in all other cases ==> STALEMATE-HANDLING ===> rc=2 (wait) ++# Stalemate in all other cases ==> STALEMATE-HANDLING ===> rc=2 (wait4gab) + # LPRlocal AND LPTremore ARE NOT real lpt (<=1000) + # THEN: + # Bigger LPT wins +-# LPTlocal > LPTremore ===> rc=0 (start) ++# LPTlocal > LPTremore ===> rc=0 (start) + # LPTRemote > LPTlocal ===> rc=1 (register) +-# Stalemate in all other cases ==> STALEMATE-HANDLING ===> rc=2 (wait) ++# Stalemate in all other cases ==> STALEMATE-HANDLING ===> rc=2 (wait4gab) + # LPTRemote is not initialized or node not kown in cluster (crm_mon -l) (0) + # TODO: PRIO1: Need to introduce a return-code 3 for remote sides lpa not ready + # THEN: + # WAIT ==> like STALEMATE-HANDLING ===> rc=2 (wait) +-# ++# + function lpa_check_lpt_status() { + super_ocf_log info "FLOW $FUNCNAME ($*)" + local rc=0 +@@ -1501,6 +1653,8 @@ + local LPTrem=-1 + local LPTMark=1000 + local delta=0 ++ local remSn_name="" ++ local remHost="" + # + # First GET LPT from ATTR-FILE-DEFAULT + # +@@ -1550,7 +1704,20 @@ + fi + fi + super_ocf_log info "FLOW $FUNCNAME rc=$rc" +- return $rc ++ return $rc ++} ++ ++# function: is_the_master_nameserver ++# params: - ++# rc: 0: yes, local node is THE master nameserver ++# 1: else ++# globals: ++function is_the_master_nameserver() ++{ ++ super_ocf_log 
info "FLOW $FUNCNAME ($*)" ++ local rc=0 ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc + } + + # +@@ -1574,11 +1741,12 @@ + check_for_primary; primary_status=$? + if [ $primary_status -eq $HANA_STATE_PRIMARY ]; then + saphana_start_primary; rc=$? +- else ++ else ++ lpa_set_lpt 10 + saphana_start_secondary; rc=$? +- lpa_set_lpt 30 +- fi ++ fi + fi ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc + } + +@@ -1596,7 +1764,7 @@ + check_for_primary; primary_status=$? + if [ $primary_status -eq $HANA_STATE_SECONDARY ]; then + lpa_set_lpt 10 +- fi ++ fi + saphana_stop; rc=$? + return $rc + } +@@ -1637,7 +1805,7 @@ + DEMOTED ) + promoted=0; + ;; +- WAITING ) ++ WAITING* ) + # DONE: lpa_check_lpt_status to come out of here :) + # DONE: PRIO2: CHECK IF THE FIX FOR COMING OUT OF WAITING IS CORRECT + get_hana_landscape_status; lss=$? +@@ -1648,7 +1816,8 @@ + lpa_set_lpt $LPTloc + fi + lpa_check_lpt_status; lparc=$? +- if [ $lparc -ne 2 ]; then ++ # TODO: PRIO1: Need to differ lpa_check_lpt_status return codes ++ if [ $lparc -lt 2 ]; then + # lpa - no need to wait any longer - lets try a new start + saphana_start_clone + rc=$? +@@ -1663,7 +1832,7 @@ + super_ocf_log info "LPA: Dual primary detected and AUTOMATED_REGISTER='false' ==> WAITING" + fi + return $OCF_SUCCESS +- fi ++ fi + promoted=0; + ;; + UNDEFINED ) +@@ -1682,13 +1851,13 @@ + get_hana_landscape_status; lss=$? + super_ocf_log debug "DBG: saphana_monitor_clone: get_hana_landscape_status=$lss" + case "$lss" in +- 0 ) # FATAL or ERROR ++ 0 ) # FATAL or ERROR + rc=$OCF_ERR_GENERIC + ;; +- 1 ) # DOWN or ERROR ++ 1 ) # DOWN or ERROR + # DONE: PRIO2: Maybe we need to differ between 0 and 1. While 0 is a fatal sap error, 1 is down/error + if ocf_is_probe; then +- # ++ # + # leave master score untouched, only set return code + # + rc=$OCF_NOT_RUNNING +@@ -1699,7 +1868,7 @@ + # For Migration it would be good to decrease master score + # For Reload locally we should NOT adjust the master score + # ===> Should we rely on the migration threshold? +- # set_crm_master ++ # set_crm_master + if ocf_is_true "${PreferSiteTakeover}" ; then + # + # DONE: PRIO1: first check, if remote site is already (and still) in sync +@@ -1708,7 +1877,7 @@ + # TODO PRIO1: REMOVE remoteNode dependency - get_sync_status + remoteSync=$(get_hana_attribute $remoteNode ${ATTR_NAME_HANA_SYNC_STATUS[@]}) + case "$remoteSync" in +- SOK ) ++ SOK | PRIM ) + super_ocf_log info "DEC: PreferSiteTakeover selected so decrease promotion score here (and reset lpa)" + set_crm_master 5 + if check_for_primary_master; then +@@ -1718,11 +1887,11 @@ + SFAIL ) + super_ocf_log info "DEC: PreferSiteTakeover selected BUT remoteHost is not in sync (SFAIL) ==> local restart preferred" + ;; +- * ) ++ * ) + super_ocf_log info "DEC: PreferSiteTakeover selected BUT remoteHost is not in sync ($remoteSync) ==> local restart preferred" + ;; +- esac +- else ++ esac ++ else + # TODO: PRIO5: SCALE-OUT ONLY? 
Implement for local restart + # It maybe that for the local restart we only need to decrease the secondaries promotion score + #super_ocf_log info "DEC: PreferSiteTakeover selected so decrease promotion score here" +@@ -1765,8 +1934,12 @@ + case "$my_role" in + [12]:P:*:master:* ) # primary is down or may not anser hdbsql query so drop analyze_hana_sync_status + ;; +- [34]:P:*:master:* ) # primary is up and should now be able to anser hdbsql query +- analyze_hana_sync_status ++ [34]:P:*:*:* ) # primary is up and should now be able to anser hdbsql query ++ if [ -f $DIR_EXECUTABLE/python_support/systemReplicationStatus.py ]; then ++ analyze_hana_sync_statusSRS ++ else ++ analyze_hana_sync_statusSQL ++ fi + ;; + esac + rem_role=$(get_hana_attribute ${remoteNode} ${ATTR_NAME_HANA_ROLES[@]}) +@@ -1776,9 +1949,9 @@ + [234]:P:* ) # dual primary, but other instance marked as PROMOTED by the cluster + lpa_check_lpt_status; again_lpa_rc=$? + if [ $again_lpa_rc -eq 2 ]; then +- super_ocf_log info "DEC: Dual primary detected, other instance is PROMOTED and lpa stalemate ==> local restart" +- lpa_set_lpt 10 +- lpa_push_lpt 10 ++ super_ocf_log info "DEC: Dual primary detected, other instance is PROMOTED and lpa stalemate ==> local restart" ++ lpa_set_lpt 10 ++ lpa_push_lpt 10 + rc=$OCF_NOT_RUNNING + fi + ;; +@@ -1812,13 +1985,13 @@ + function saphana_monitor_secondary() + { + super_ocf_log info "FLOW $FUNCNAME ($*)" +- local rc=$OCF_ERR_GENERIC +- local promoted=0 ++ local rc=$OCF_ERR_GENERIC ++ local promoted=0 + local init_attribute=0 + local lss + # + # OK, we are running as HANA SECONDARY +- # ++ # + if ! lpa_get_lpt ${NODENAME}; then + lpa_set_lpt 10 + lpa_push_lpt 10 +@@ -1863,7 +2036,7 @@ + super_ocf_log debug "DBG: saphana_monitor_clone: HANA_STATE_SECONDARY" + # + # old method was: saphana_monitor - new method is get_hana_landscape_status +- get_hana_landscape_status; lss=$? ++ get_hana_landscape_status; lss=$? 
+ super_ocf_log debug "DBG: saphana_monitor_clone: get_hana_landscape_status=$lss" + case "$lss" in + 0 ) # FATAL +@@ -1919,11 +2092,11 @@ + # a) returning 7 here and force cluster a restart of the slave + # b) starting the instance here inside the monitor -> may result in longer runtime, timeouts + # +- # first check with the status function (OS tools) if there could be something like a SAP instance running +- # as we do not know here, if we are in master or slave state we do not want to start our monitoring +- # agents (sapstartsrv) on the wrong host +- local rc=$OCF_ERR_GENERIC +- local promoted=0 ++ # first check with the status function (OS tools) if there could be something like a SAP instance running ++ # as we do not know here, if we are in master or slave state we do not want to start our monitoring ++ # agents (sapstartsrv) on the wrong host ++ local rc=$OCF_ERR_GENERIC ++ local promoted=0 + local init_attribute=0 + local lpaRc=0 + local mRc=0 +@@ -1973,7 +2146,7 @@ + # function: saphana_promote_clone - promote a hana clone + # params: - + # globals: OCF_*(r), NODENAME(r), HANA_STATE_*, SID(r), InstanceName(r), +-# saphana_promote_clone: ++# saphana_promote_clone: + # In a Master/Slave configuration get Master being the primary OR by running hana takeover + # + function saphana_promote_clone() { +@@ -2017,7 +2190,7 @@ + rc=$OCF_SUCCESS; + else + rc=$OCF_FAILED_MASTER +- fi ++ fi + ;; + * ) + super_ocf_log err "ACT: HANA SYNC STATUS IS NOT 'SOK' SO THIS HANA SITE COULD NOT BE PROMOTED" +@@ -2039,10 +2212,10 @@ + # + # function: saphana_demote_clone - demote a hana clone instance + # params: - +-# globals: OCF_*(r), NODENAME(r), ++# globals: OCF_*(r), NODENAME(r), + # saphana_demote_clone +-# the HANA System Replication (SR) runs in a Master/Slave +-# While we could not change a HANA instance to be really demoted, we only mark the status for ++# the HANA System Replication (SR) runs in a Master/Slave ++# While we could not change a HANA instance to be really demoted, we only mark the status for + # correct monitor return codes + # + function saphana_demote_clone() { +@@ -2056,9 +2229,9 @@ + } + + # +-# function: main - main function to operate ++# function: main - main function to operate + # params: ACTION +-# globals: OCF_*(r), SID(w), sidadm(w), InstanceName(w), SAPVIRHOST(w), DIR_EXECUTABLE(w), ++# globals: OCF_*(r), SID(w), sidadm(w), InstanceName(w), SAPVIRHOST(w), DIR_EXECUTABLE(w), + # globals: SAPSTARTSRV(w), SAPCONTROL(w), DIR_PROFILE(w), SAPSTARTPROFILE(w), ACTION(w), CLACT(w), ra_rc(rw), $0(r), %ENV(r) + # + +@@ -2073,7 +2246,7 @@ + SAPCONTROL="" + DIR_PROFILE="" + SAPSTARTPROFILE="" +-SAPHanaFilter="${OCF_RESKEY_SAPHanaFilter:-ra-act-dec-lpa}" ++SAPHanaFilter="ra-act-dec-lpa" + + NODENAME=$(crm_node -n) + +@@ -2100,7 +2273,7 @@ + exit $OCF_SUCCESS;; + *);; + esac +-saphana_init ++saphana_init + + if ! ocf_is_root + then +@@ -2141,7 +2314,7 @@ + saphana_$ACTION$CLACT + ra_rc=$? + ;; +- validate-all) ++ validate-all) + saphana_validate + ra_rc=$? + ;; +@@ -2149,12 +2322,13 @@ + lpa_check_lpt_status + ra_rc=$? 
+ ;; +- *) # seams to be a unknown request +- saphana_methods ++ *) # seams to be a unknown request ++ saphana_methods + ra_rc=$OCF_ERR_UNIMPLEMENTED + ;; + esac + timeE=$(date '+%s') + (( timeR = timeE - timeB )) ++#super_ocf_log info "RA ==== SAPHanaFilter=$SAPHanaFilter" + super_ocf_log info "RA ==== end action $ACTION$CLACT with rc=${ra_rc} ($THE_VERSION) (${timeR}s)====" + exit ${ra_rc} +diff -uNr a/heartbeat/SAPHanaTopology b/heartbeat/SAPHanaTopology +--- a/heartbeat/SAPHanaTopology 2016-04-26 12:01:55.620889964 +0200 ++++ b/heartbeat/SAPHanaTopology 2016-04-26 12:03:18.033887556 +0200 +@@ -16,7 +16,7 @@ + # Copyright: (c) 2014 SUSE Linux Products GmbH + # (c) 2015 SUSE Linux GmbH + # +-# An example usage: ++# An example usage: + # See usage() function below for more details... + # + # OCF instance parameters: +@@ -41,7 +41,6 @@ + HANA_STATE_DEFECT=3 + + debug_attributes=0 +- + SH=/bin/sh + + # +@@ -57,7 +56,7 @@ + local shf="${SAPHanaFilter:-all}" + #ocf_log "info" "super_ocf_log: f:$shf l:$level m:$message" + # message levels: (dbg)|info|warn|err|error +- # ++ # + # message types: (ACT|RA|FLOW|DBG|LPA|DEC + case "$level" in + dbg | debug | warn | err | error ) skip=0 +@@ -65,7 +64,7 @@ + info ) + case "$shf" in + all) skip=0 +- ;; ++ ;; + none ) + skip=1 + ;; +@@ -74,13 +73,13 @@ + mtype=${mtype#fh} + echo "$shf"| grep -iq ${mtype}; search=$? + if [ $search -eq 0 ]; then +- skip=0 ++ skip=0 + else + skip=1 + fi + ;; + esac +- ;; ++ ;; + esac + if [ $skip -eq 0 ]; then + ocf_log "$level" "$message" +@@ -126,15 +125,15 @@ + + + +- 0.149.6 ++ 0.151.1 + Analyzes SAP HANA System Replication Topology. + This RA analyzes the SAP HANA topology and "sends" all findings via the node status attributes to + all nodes in the cluster. These attributes are taken by the SAPHana RA to control the SAP Hana Databases. + In addition it starts and monitors the local saphostagent. + +-1. Interface to monitor a HANA system: landscapeHostConfiguration.py ++1. Interface to monitor a HANA system: landscapeHostConfiguration.py + landscapeHostConfiguration.py has some detailed output about HANA system status +-and node roles. For our monitor the overall status is relevant. This overall ++and node roles. For our monitor the overall status is relevant. This overall + status is reported by the returncode of the script: + 0: Internal Fatal + 1: ERROR +@@ -150,7 +149,7 @@ + system replication takeover (sr_takeover) or to register a former primary to a newer one (sr_register). + + 3. saphostctrl +- The interface saphostctrl uses the function ListInstances to figure out the virtual host name of the ++ The interface saphostctrl uses the function ListInstances to figure out the virtual host name of the + SAP HANA instance. This is the hostname used during the HANA installation. + + +@@ -172,13 +171,8 @@ + + + +- Define type of SAPHanaTopology RA messages to be printed +- Define type of SAPHanaTopology RA messages to be printed. +-Define SAPHana resource agent messages to be printed. +- This parameter should only be set if requested by support. The default is sufficient for normal operation. 
+- Values: ra-act-lpa-dec-flow +- You could specify any combination of the above values like "ra-act-flow" +- ++ OUTDATED ++ OUTDATED + + + +@@ -197,7 +191,7 @@ + } + + # +-# function: get_hana_attribute ++# function: get_hana_attribute + # params: NODE ATTR [STORE] + # globals: - + # +@@ -208,16 +202,19 @@ + local attr_node=$1 + local attr_name=$2 + local attr_store=${3:-reboot} # DONE: PRIO5 get this (optional) from parameter +- local attr_val="" +- attr_val=$(crm_attribute -N ${attr_node} -G -n "$attr_name" -l $attr_store -q); rc=$? +- if [ $debug_attributes -eq 1 ]; then +- dstr=$(date) +- echo "$dstr: SAPHanaTopology: crm_attribute -N ${attr_node} -G -n \"$attr_name\" -l $attr_store -q --> $attr_val" >> /var/log/fhATTRIBUTE +- fi +- echo "$attr_val" +- if [ $rc -ne 0 ]; then +- super_ocf_log debug "DBG: ATTRIBUTE-FAILURE: crm_attribute -N $attr_node -G -n "$attr_name" -l $attr_store -q" +- fi ++ local attr_default=${4:-} ++ local dstr ++ dstr=$(date) ++ case "$attr_store" in ++ reboot | forever ) ++ echo "$dstr: SAPHanaTopology: crm_attribute -N ${attr_node} -G -n \"$attr_name\" -l $attr_store -q" >> /var/log/fhATTRIBUTE ++ crm_attribute -N ${attr_node} -G -n "$attr_name" -l $attr_store -q -d "$attr_default" 2>>/var/log/fhATTRIBUTE; rc=$? ++ ;; ++ props ) ++ echo "$dstr: SAPHanaTopology: crm_attribute -G -n \"$attr_name\" -t crm_config -q" >> /var/log/fhATTRIBUTE ++ crm_attribute -G -n "$attr_name" -t crm_config -q -d "$attr_default" 2>>/var/log/fhATTRIBUTE; rc=$? ++ ;; ++ esac + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc + } +@@ -234,19 +231,24 @@ + local attr_value=$2 + local attr_name=$3 + local attr_store=${4:-reboot} # DONE: PRIO5 get this (optional) from parameter ++ local attr_default=${5:-} + local rc=1 +- local attr_old +- attr_old=$(get_hana_attribute $attr_node $attr_name $attr_store); get_rc=$? ++ local attr_old="" ++ local dstr ++ dstr=$(date) ++ attr_old=$(get_hana_attribute $attr_node $attr_name $attr_store $attr_default); get_rc=$? + if [ "$attr_old" != "$attr_value" ]; then + super_ocf_log debug "DBG: SET attribute $attr_name for node ${attr_node} to ${attr_value} former ($attr_old) get_rc=$get_rc " +- if [ $debug_attributes -eq 1 ]; then +- dstr=$(date) +- echo "$dstr: SAPHanaTopology: crm_attribute -N $attr_node -v $attr_value -n \"$attr_name\" -l $attr_store" >> /var/log/fhATTRIBUTE +- fi +- crm_attribute -N $attr_node -v "$attr_value" -n "$attr_name" -l $attr_store; rc=$? +- if [ $rc -ne 0 ]; then +- super_ocf_log debug "DBG: ATTRIBUTE-FAILURE: crm_attribute -N $attr_node -v $attr_value -n "$attr_name" -l $attr_store" +- fi ++ case "$attr_store" in ++ reboot | forever ) ++ echo "$dstr: SAPHanaTopology: crm_attribute -N $attr_node -v $attr_value -n \"$attr_name\" -l $attr_store" >> /var/log/fhATTRIBUTE ++ crm_attribute -N $attr_node -v $attr_value -n "$attr_name" -l $attr_store 2>>/var/log/fhATTRIBUTE; rc=$? ++ ;; ++ props ) ++ echo "$dstr: SAPHanaTopology: crm_attribute -v $attr_value -n \"$attr_name\" -t crm_config -s SAPHanaSR" >> /var/log/fhATTRIBUTE ++ crm_attribute -v $attr_value -n "$attr_name" -t crm_config -s SAPHanaSR 2>>/var/log/fhATTRIBUTE; rc=$? ++ ;; ++ esac + else + super_ocf_log debug "DBG: LET attribute $attr_name for node ${attr_node} still be ${attr_value}" + rc=0 +@@ -299,7 +301,7 @@ + # + # yes it is a clone config - check, if its configured well + # +- if [ "$OCF_RESKEY_CRM_meta_clone_node_max" -ne 1 ] ; then ++ if [ "$OCF_RESKEY_CRM_meta_clone_node_max" -ne 1 ] ; then + super_ocf_log err "ACT: Clone options misconfigured. 
(expect: clone_node_max=1)" + exit $OCF_ERR_CONFIGURED + fi +@@ -314,8 +316,8 @@ + # + # function: sht_init - initialize variables for the resource agent + # params: - +-# globals: OCF_*(r), SID(w), sid(rw), sidadm(w), InstanceName(w), InstanceNr(w), +-# globals: meta_notify_master_uname(w), HANA_SR_TOLOPOGY(w), sr_name(w), remoteHost(w) ++# globals: OCF_*(r), SID(w), sid(rw), sidadm(w), InstanceName(w), InstanceNr(w), ++# globals: meta_notify_master_uname(w), HANA_SR_TOLOPOGY(w), sr_name(w), remoteHost(w) + # globals: ATTR_NAME_HANA_SYNC_STATUS(w), ATTR_NAME_HANA_PRIMARY_AT(w), ATTR_NAME_HANA_CLONE_STATE(w) + # globals: DIR_EXECUTABLE(w), SAPSTARTSRV(w), SAPCONTROL(w), DIR_PROFILE(w), SAPSTARTPROFILE(w), LD_LIBRARY_PATH(w), PATH(w), nodelist(w) + # sht_init : Define global variables with default values, if optional parameters are not set +@@ -327,6 +329,8 @@ + local myInstanceName="" + local rc=$OCF_SUCCESS + local hdbANSWER="" ++ local siteID ++ local siteNAME + HOSTEXECNAME=saphostexec + USRSAP=/usr/sap + SAPSERVICE_PATH=${USRSAP}/sapservices +@@ -340,10 +344,9 @@ + super_ocf_log debug "DBG2: Used new method to get SID ($SID) and InstanceNr ($InstanceNr)" + sid=$(echo "$SID" | tr [:upper:] [:lower:]) + sidadm="${sid}adm" +- SAPHanaFilter="${OCF_RESKEY_SAPHanaFilter:-ra-act-dec-lpa}" + ocf_env=$(env | grep 'OCF_RESKEY_CRM') + super_ocf_log debug "DBG3: OCF: $ocf_env" +- ATTR_NAME_HANA_SYNC_STATUS=("hana_${sid}_sync_state" "reboot") # SOK, SFAIL, UNKNOWN? ++ ATTR_NAME_HANA_SYNC_STATUS=("hana_${sid}_sync_state" "reboot") # SOK, SFAIL, UNKNOWN? + ATTR_NAME_HANA_PRIMARY_AT=("hana_${sid}_primary_at" "reboot") # Not really used + ATTR_NAME_HANA_CLONE_STATE=("hana_${sid}_clone_state" "reboot") # UKNOWN?, DEMOTED, PROMOTED + ATTR_NAME_HANA_REMOTEHOST=("hana_${sid}_remoteHost" "forever") +@@ -352,8 +355,14 @@ + ATTR_NAME_HANA_SRMODE=("hana_${sid}_srmode" "forever") + ATTR_NAME_HANA_VHOST=("hana_${sid}_vhost" "forever") + ATTR_NAME_HANA_STATUS=("hana_${sid}_status" "reboot") +- ++ # ++ # new "central" attributes ++ # ++ ATTR_NAME_HANA_FILTER=("hana_${sid}_glob_filter" "props" "ra-act-dec-lpa") + # optional OCF parameters, we try to guess which directories are correct ++ ++ SAPHanaFilter=$(get_hana_attribute "X" ${ATTR_NAME_HANA_FILTER[@]}) ++ + if [ -z "$OCF_RESKEY_DIR_EXECUTABLE" ] + then + DIR_EXECUTABLE="/usr/sap/$SID/$InstanceName/exe" +@@ -387,19 +396,32 @@ + # we need: mode=primary|sync|syncmem|...; site name=; mapping/=/ (multiple lines) + case $(crm_attribute --type crm_config --name cluster-infrastructure -q) in + *corosync* ) nodelist=$(crm_node -l | awk '{ print $2 }');; +- *openais* ) nodelist=$(crm_node -l | awk '/member/ {print $2}');; +- *cman* ) nodelist=$(crm_node -l);; ++ *openais* ) nodelist=$(crm_node -l | awk '/member/ {print $2}');; ++ *cman* ) nodelist=$(crm_node -l);; + esac + #### SAP-CALL +- hdbANSWER=$(su - ${sidadm} -c "hdbnsutil -sr_state --sapcontrol=1" 2>/dev/null) +- super_ocf_log debug "DBG2: hdbANSWER=\$\(su - ${sidadm} -c \"hdbnsutil -sr_state --sapcontrol=1\"\)" +- site=$(echo "$hdbANSWER" | awk -F= '/site name/ {print $2}') ++ # hdbnsutil was a bit unstable in some tests so we recall the tool, if it fails to report the srmode ++ for i in 1 2 3 4 5 6 7 8 9; do ++ hdbANSWER=$(su - ${sidadm} -c "hdbnsutil -sr_state --sapcontrol=1" 2>/dev/null) ++ super_ocf_log debug "DBG2: hdbANSWER=\$\(su - ${sidadm} -c \"hdbnsutil -sr_state --sapcontrol=1\"\)" ++ srmode=$(echo "$hdbANSWER" | awk -F= '/mode/ {print $2}') ++ case "$srmode" in ++ primary | syncmem | sync | async | 
none ) ++ # we can leave the loop as we already got a result ++ break ++ ;; ++ * ) ++ # lets pause a bit to give hdbnsutil a chance to answer next time ++ sleep 2 ++ ;; ++ esac ++ done ++ # TODO PRIO3: Implement a file lookup, if we did not get a result ++ siteID=$(echo "$hdbANSWER" | awk -F= '/site id/ {print $2}') ++ siteNAME=$(echo "$hdbANSWER" | awk -F= '/site name/ {print $2}') ++ site=$siteNAME + srmode=$(echo "$hdbANSWER" | awk -F= '/mode/ {print $2}') +- if [ $debug_attributes -eq 1 ]; then +- dstr=$(date) +- echo "$dstr: SAPHanaTopology: srmode=$srmode" >> /var/log/fhATTRIBUTE +- fi +- MAPPING=$(echo "$hdbANSWER" | awk -F[=/] '$1 == "mapping" && $3 != site { print $4 }' site=$site) ++ MAPPING=$(echo "$hdbANSWER" | awk -F[=/] '$1 ~ "mapping" && $3 !~ site { print $4 }' site=$site) + super_ocf_log debug "DBG: site=$site, mode=$srmode, MAPPING=$MAPPING" + # + # filter all non-cluster mappings +@@ -413,12 +435,12 @@ + echo $hanaVHost; + fi; + done; +- done ) ++ done ) + super_ocf_log info "DEC: site=$site, mode=$srmode, MAPPING=$MAPPING, hanaRemoteHost=$hanaRemoteHost" + super_ocf_log debug "DBG: site=$site, mode=$srmode, MAPPING=$MAPPING, hanaRemoteHost=$hanaRemoteHost" + super_ocf_log info "FLOW $FUNCNAME rc=$OCF_SUCCESS" + return $OCF_SUCCESS +-} ++} + + # + # function: check_for_primary - check if local SAP HANA is configured as primary +@@ -428,32 +450,30 @@ + function check_for_primary() { + super_ocf_log info "FLOW $FUNCNAME ($*)" + local rc=0 +- # DONE: Change stderr location!! +- #sidadm=lnxadm +- #node_status=$(check_for_primary_single) +- node_status=$srmode +- super_ocf_log debug "DBG2: check_for_primary: node_status=$node_status" +- super_ocf_log debug "DBG: check_for_primary: node_status=$node_status" +- for i in 1 2 3 4 5 6 7 8 9; do +- case "$node_status" in +- primary ) ++ node_status=$srmode ++ super_ocf_log debug "DBG2: check_for_primary: node_status=$node_status" ++ super_ocf_log debug "DBG: check_for_primary: node_status=$node_status" ++ for i in 1 2 3 4 5 6 7 8 9; do ++ case "$node_status" in ++ primary ) + super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_PRIMARY" + return $HANA_STATE_PRIMARY;; + syncmem | sync | async ) + super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_SECONDARY" + return $HANA_STATE_SECONDARY;; +- none ) # have seen that mode on second side BEFEORE we registered it as replica ++ none ) # have seen that mode on second side BEFEORE we registered it as replica + super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_STANDALONE" + return $HANA_STATE_STANDALONE;; + * ) +- super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: <$node_status>" +- dump=$( echo $node_status | hexdump -C ); +- super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: DUMP <$dump>" +- #### SAP-CALL +- node_full_status=$(su - ${sidadm} -c "hdbnsutil -sr_state" 2>/dev/null ) +- node_status=$(echo "$node_full_status" | awk '$1=="mode:" {print $2}') +- super_ocf_log info "DEC: check_for_primary: loop=$i: node_status=$node_status" +- # TODO: PRIO1: Maybe we need to keep the old value for P/S/N, if hdbnsutil just crashes ++ # TODO: PRIO1: Should we set SFAIL? 
++ # TODO: PRIO2: Maybe we need to keep the old value for P/S/N, if hdbnsutil just crashes ++ dump=$( echo $node_status | hexdump -C ); ++ super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: DUMP: <$dump>" ++ #### SAP-CALL ++ node_full_status=$(su - ${sidadm} -c "hdbnsutil -sr_state" 2>/dev/null ) ++ node_status=$(echo "$node_full_status" | awk '$1=="mode:" {print $2}') ++ super_ocf_log info "DEC: check_for_primary: loop=$i: node_status=$node_status" ++ # TODO: PRIO1: Maybe we need to keep the old value for P/S/N, if hdbnsutil just crashes + esac; + done + super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_DEFECT" +@@ -464,7 +484,7 @@ + # + # function: start_saphostagent + # params: - +-# globals: ++# globals: HOSTEXEC_PATH(r), HOSTEXEC_PROFILE_PATH(r) + # + function start_saphostagent() + { +@@ -478,7 +498,7 @@ + # + # function: stop_saphostagent + # params: - +-# globals: ++# globals: HOSTEXEC_PATH(r) + # + function stop_saphostagent() + { +@@ -496,6 +516,8 @@ + function check_saphostagent() + { + local rc=1 ++ # TODO: PRIO3: should the path been removed like "saphostexec" instead of "/usr/sap/hostctrl/exe/saphostexec" ++ # or should we use ${HOSTEXEC_PATH} instead? + pgrep -f /usr/sap/hostctrl/exe/saphostexec; rc=$? + return $rc + } +@@ -509,15 +531,16 @@ + # sht_start : Start the SAP HANA instance + # + function sht_start() { +- + super_ocf_log info "FLOW $FUNCNAME ($*)" + + local rc=$OCF_NOT_RUNNING + local output="" +- local loopcount=0 ++ local loopcount=0 + +- mkdir -p /var/lib/SAPHana +- touch /var/lib/SAPHana/SAPTopologyON ++ # TODO: PRIO3: move the string "$HA_RSCTMP/SAPHana/SAPTopologyON" to a variable ++ # TODO: PRIO3: move the file to the clusters tmp directory? ++ mkdir -p $HA_RSCTMP/SAPHana ++ touch $HA_RSCTMP/SAPHana/SAPTopologyON + if ! check_saphostagent; then + start_saphostagent + fi +@@ -532,16 +555,16 @@ + # function: sht_stop - stop a hana instance + # params: - + # globals: OCF_*(r), SAPCONTROL(r), SID(r), InstanceName(r) +-# sht_stop: Stop the SAP instance ++# sht_stop: Stop the SAP HANA Topology Resource + # + function sht_stop() { + super_ocf_log info "FLOW $FUNCNAME ($*)" + local output="" + local rc=0 + +- rm /var/lib/SAPHana/SAPTopologyON ++ rm $HA_RSCTMP/SAPHana/SAPTopologyON + rc=$OCF_SUCCESS +- ++ + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc + } +@@ -557,13 +580,13 @@ + super_ocf_log info "FLOW $FUNCNAME ($*)" + local rc=0 + +- if [ -f /var/lib/SAPHana/SAPTopologyON ]; then ++ if [ -f $HA_RSCTMP/SAPHana/SAPTopologyON ]; then + rc=$OCF_SUCCESS + else + rc=$OCF_NOT_RUNNING + fi + +- super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc + } + +@@ -575,37 +598,37 @@ + # sht_status: Lightweight check of SAP instance only with OS tools + # + function sht_status() { +- super_ocf_log info "FLOW $FUNCNAME ($*)" +- local rc=0 ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=0 + +- sht_monitor; rc=$? +- return $rc ++ sht_monitor; rc=$? 
++ return $rc + } + + + # + # function: sht_validate - validation of (some) variables/parameters + # params: - +-# globals: OCF_*(r), SID(r), InstanceName(r), InstanceNr(r), +-# sht_validate: Check the symantic of the input parameters ++# globals: OCF_*(r), SID(r), InstanceName(r), InstanceNr(r), ++# sht_validate: Check the symantic of the input parameters + # + function sht_validate() { +- super_ocf_log info "FLOW $FUNCNAME ($*)" +- local rc=$OCF_SUCCESS +- if [ $(echo "$SID" | grep -c '^[A-Z][A-Z0-9][A-Z0-9]$') -ne 1 ] +- then +- super_ocf_log err "ACT: Parsing instance profile name: '$SID' is not a valid SID!" +- rc=$OCF_ERR_ARGS +- fi ++ super_ocf_log info "FLOW $FUNCNAME ($*)" ++ local rc=$OCF_SUCCESS ++ if [ $(echo "$SID" | grep -c '^[A-Z][A-Z0-9][A-Z0-9]$') -ne 1 ] ++ then ++ super_ocf_log err "ACT: Parsing instance profile name: '$SID' is not a valid SID!" ++ rc=$OCF_ERR_ARGS ++ fi + +- if [ $(echo "$InstanceNr" | grep -c '^[0-9][0-9]$') -ne 1 ] +- then +- super_ocf_log err "ACT: Parsing instance profile name: '$InstanceNr' is not a valid instance number!" +- rc=$OCF_ERR_ARGS +- fi ++ if [ $(echo "$InstanceNr" | grep -c '^[0-9][0-9]$') -ne 1 ] ++ then ++ super_ocf_log err "ACT: Parsing instance profile name: '$InstanceNr' is not a valid instance number!" ++ rc=$OCF_ERR_ARGS ++ fi + +- super_ocf_log info "FLOW $FUNCNAME rc=$rc" +- return $rc ++ super_ocf_log info "FLOW $FUNCNAME rc=$rc" ++ return $rc + } + + # +@@ -661,15 +684,15 @@ + + if ocf_is_probe; then + super_ocf_log debug "DBG2: PROBE ONLY" ++ sht_monitor; rc=$? + else + super_ocf_log debug "DBG2: REGULAR MONITOR" + if ! check_saphostagent; then + start_saphostagent + fi +- fi + # + # First check, if we are PRIMARY or SECONDARY +- # ++ # + super_ocf_log debug "DBG2: HANA SID $SID" + super_ocf_log debug "DBG2: HANA InstanceName $InstanceName" + super_ocf_log debug "DBG2: HANA InstanceNr $InstanceNr" +@@ -721,8 +744,8 @@ + set_hana_attribute ${NODENAME} "$site" ${ATTR_NAME_HANA_SITE[@]} + fi + case "$hanaPrim" in +- P ) ;; +- S ) # only secondary may propargate its sync status ++ P ) ;; ++ S ) # only secondary may propargate its sync status + case $(crm_attribute --type crm_config --name cluster-infrastructure -q) in + *corosync* ) nodelist=$(crm_node -l | awk '{ print $2 }');; + *openais* ) nodelist=$(crm_node -l | awk '/member/ {print $2}');; +@@ -732,8 +755,10 @@ + for n in ${nodelist}; do + set_hana_attribute ${n} "$srmode" ${ATTR_NAME_HANA_SRMODE[@]} + done +- ;; ++ ;; + esac ++ # ++ fi # end ocf_is_NOT_probe + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc + } +@@ -752,7 +777,7 @@ + } + + # +-# function: main - main function to operate ++# function: main - main function to operate + # params: ACTION + # globals: OCF_*(r), SID(w), sidadm(w), InstanceName(w), DIR_EXECUTABLE(w), ACTION(w), CLACT(w), ra_rc(rw), $0(r), %ENV(r) + # +@@ -763,7 +788,7 @@ + InstanceName="" + InstanceNr="" + DIR_EXECUTABLE="" +-SAPHanaFilter="${OCF_RESKEY_SAPHanaFilter:-ra-act-dec-lpa}" ++SAPHanaFilter="ra-act-dec-lpa" + NODENAME=$(crm_node -n) + + if [ $# -ne 1 ] +@@ -785,11 +810,11 @@ + exit $OCF_SUCCESS;; + notify) sht_notify + exit $OCF_SUCCESS;; +- admin-setup) admin-setup +- exit $OCF_SUCCESS;; ++ admin-setup) admin-setup ++ exit $OCF_SUCCESS;; + *);; + esac +-sht_init ++sht_init + + if ! ocf_is_root + then +@@ -810,7 +835,6 @@ + exit $OCF_ERR_ARGS + fi + +- + if is_clone + then + CLACT=_clone +@@ -830,12 +854,12 @@ + sht_$ACTION$CLACT + ra_rc=$? + ;; +- validate-all) ++ validate-all) + sht_validate + ra_rc=$? 
+ ;; +- *) # seams to be a unknown request +- sht_methods ++ *) # seams to be a unknown request ++ sht_methods + ra_rc=$OCF_ERR_UNIMPLEMENTED + ;; + esac diff --git a/SOURCES/bz1351446-1-rabbitmq-cluster-dump-restore-users-3.6.x.patch b/SOURCES/bz1351446-1-rabbitmq-cluster-dump-restore-users-3.6.x.patch new file mode 100644 index 0000000..47975b4 --- /dev/null +++ b/SOURCES/bz1351446-1-rabbitmq-cluster-dump-restore-users-3.6.x.patch @@ -0,0 +1,102 @@ +From f00a952bd5e133cad30689d9edcc98f5d33a71a9 Mon Sep 17 00:00:00 2001 +From: Peter Lemenkov +Date: Thu, 16 Jun 2016 16:44:48 +0200 +Subject: [PATCH] Enable dump/restore users from RabbitMQ ver. 3.6.x + +RabbitMQ changed internal_users scheme since ver. 3.6.0. See the +following links for further details: + +* rabbitmq/rabbitmq-server#270 +* rabbitmq/rabbitmq-server#310 +* rabbitmq/rabbitmq-common@9c86a7401cf464dc20527890192c5dc0fe43b6c8 +* rabbitmq/rabbitmq-server@93b5a3a8092f52063cbca3ab661c7c6bae43c512 + +CC @oalbrigt + +Signed-off-by: Peter Lemenkov +--- + heartbeat/rabbitmq-cluster | 64 ++++++++++++++++++++++++++++++++++++---------- + 1 file changed, 50 insertions(+), 14 deletions(-) + +diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster +index 0724901..facca35 100755 +--- a/heartbeat/rabbitmq-cluster ++++ b/heartbeat/rabbitmq-cluster +@@ -342,14 +342,40 @@ rmq_start() { + rmq_join_existing "$join_list" + rc=$? + +- # Restore users (if any) +- BaseDataDir=`dirname $RMQ_DATA_DIR` +- if [ -f $BaseDataDir/users.erl ] ; then +- rabbitmqctl eval " +- {ok, [Users]} = file:consult(\"$BaseDataDir/users.erl\"), +- lists:foreach(fun(X) -> mnesia:dirty_write(rabbit_user, X) end, Users). +- " +- rm -f $BaseDataDir/users.erl ++ # Restore users (if any) ++ BaseDataDir=`dirname $RMQ_DATA_DIR` ++ if [ -f $BaseDataDir/users.erl ] ; then ++ rabbitmqctl eval " ++ ++ [WildPattern] = ets:select(mnesia_gvar, [ { {{rabbit_user, wild_pattern}, '\\\$1'}, [], ['\\\$1'] } ]), ++ ++ %% Read users first ++ {ok, [Users]} = file:consult(\"$BaseDataDir/users.erl\"), ++ ++ Upgrade = fun ++ ({internal_user, A, B, C}) -> {internal_user, A, B, C, rabbit_password_hashing_md5}; ++ ({internal_user, A, B, C, D}) -> {internal_user, A, B, C, D} ++ end, ++ ++ Downgrade = fun ++ ({internal_user, A, B, C}) -> {internal_user, A, B, C}; ++ ({internal_user, A, B, C, rabbit_password_hashing_md5}) -> {internal_user, A, B, C}; ++ %% Incompatible scheme, so we will loose user's password ('B' value) during conversion. ++ %% Unfortunately, this case will require manual intervention - user have to run: ++ %% rabbitmqctl change_password ++ ({internal_user, A, B, C, _}) -> {internal_user, A, B, C} ++ end, ++ ++ case WildPattern of ++ %% Version < 3.6.0 ++ {internal_user,'_','_','_'} -> ++ lists:foreach(fun(X) -> mnesia:dirty_write(rabbit_user, Downgrade(X)) end, Users); ++ %% Version >= 3.6.0 ++ {internal_user,'_','_','_','_'} -> ++ lists:foreach(fun(X) -> mnesia:dirty_write(rabbit_user, Upgrade(X)) end, Users) ++ end. ++ " ++ rm -f $BaseDataDir/users.erl + fi + + if [ $rc -ne 0 ]; then +@@ -362,12 +388,22 @@ rmq_start() { + } + + rmq_stop() { +- # Backup users +- BaseDataDir=`dirname $RMQ_DATA_DIR` +- rabbitmqctl eval " +- Users = mnesia:dirty_select(rabbit_user, [{ {internal_user, '\\\$1', '_', '_'}, [{'/=', '\\\$1', <<\"guest\">>}], ['\\\$_'] } ]), +- file:write_file(\"$BaseDataDir/users.erl\", io_lib:fwrite(\"~p.~n\", [Users])). 
+- " ++ # Backup users ++ BaseDataDir=`dirname $RMQ_DATA_DIR` ++ rabbitmqctl eval " ++ [WildPattern] = ets:select(mnesia_gvar, [ { {{rabbit_user, wild_pattern}, '\\\$1'}, [], ['\\\$1'] } ]), ++ ++ Users = case WildPattern of ++ %% Version < 3.6.0 ++ {internal_user,'_','_','_'} -> ++ mnesia:dirty_select(rabbit_user, [{ {internal_user, '\\\$1', '_', '_'}, [{'/=', '\\\$1', <<\"guest\">>}], ['\\\$_'] } ]); ++ %% Version >= 3.6.0 ++ {internal_user,'_','_','_','_'} -> ++ mnesia:dirty_select(rabbit_user, [{ {internal_user, '\\\$1', '_', '_', '_'}, [{'/=', '\\\$1', <<\"guest\">>}], ['\\\$_'] } ]) ++ end, ++ ++ file:write_file(\"$BaseDataDir/users.erl\", io_lib:fwrite(\"~p.~n\", [Users])). ++ " + + rmq_monitor + if [ $? -eq $OCF_NOT_RUNNING ]; then diff --git a/SOURCES/bz1351446-2-rabbitmq-cluster-dump-restore-users-3.6.x.patch b/SOURCES/bz1351446-2-rabbitmq-cluster-dump-restore-users-3.6.x.patch new file mode 100644 index 0000000..32a05c3 --- /dev/null +++ b/SOURCES/bz1351446-2-rabbitmq-cluster-dump-restore-users-3.6.x.patch @@ -0,0 +1,37 @@ +From 74b3cff4fce5483d126b16131db53f8bd5804c82 Mon Sep 17 00:00:00 2001 +From: Peter Lemenkov +Date: Tue, 21 Jun 2016 15:48:07 +0200 +Subject: [PATCH] Don't run scriptlets if Mnesia isn't available + +See this rhbz for further details and symptoms: + +https://bugzilla.redhat.com/1343905 + +Signed-off-by: Peter Lemenkov +--- + heartbeat/rabbitmq-cluster | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster +index facca35..18e3206 100755 +--- a/heartbeat/rabbitmq-cluster ++++ b/heartbeat/rabbitmq-cluster +@@ -346,6 +346,8 @@ rmq_start() { + BaseDataDir=`dirname $RMQ_DATA_DIR` + if [ -f $BaseDataDir/users.erl ] ; then + rabbitmqctl eval " ++ %% Run only if Mnesia is ready, otherwise exit. ++ lists:any(fun({mnesia,_,_}) -> true; ({_,_,_}) -> false end, application:which_applications()) orelse halt(), + + [WildPattern] = ets:select(mnesia_gvar, [ { {{rabbit_user, wild_pattern}, '\\\$1'}, [], ['\\\$1'] } ]), + +@@ -391,6 +393,9 @@ rmq_stop() { + # Backup users + BaseDataDir=`dirname $RMQ_DATA_DIR` + rabbitmqctl eval " ++ %% Run only if Mnesia is still available, otherwise exit. ++ lists:any(fun({mnesia,_,_}) -> true; ({_,_,_}) -> false end, application:which_applications()) orelse halt(), ++ + [WildPattern] = ets:select(mnesia_gvar, [ { {{rabbit_user, wild_pattern}, '\\\$1'}, [], ['\\\$1'] } ]), + + Users = case WildPattern of diff --git a/SOURCES/bz1351446-3-rabbitmq-cluster-dump-restore-users-3.6.x.patch b/SOURCES/bz1351446-3-rabbitmq-cluster-dump-restore-users-3.6.x.patch new file mode 100644 index 0000000..2d1abe8 --- /dev/null +++ b/SOURCES/bz1351446-3-rabbitmq-cluster-dump-restore-users-3.6.x.patch @@ -0,0 +1,53 @@ +From 279bae7ec9a571a4d52b0d876850e27772eb0933 Mon Sep 17 00:00:00 2001 +From: Jiri Stransky +Date: Thu, 23 Jun 2016 12:55:06 +0200 +Subject: [PATCH] RabbitMQ: Forget node before 2nd joining attempt + +If a first attempt at joining an existing cluster has failed and we +resort to wiping the local RabbitMQ data, make sure we also request the +local node to be forgotten from the existing cluster before we make the +join attempt, otherwise the node will be rejected. 
+--- + heartbeat/rabbitmq-cluster | 19 +++++++++++++++++++ + 1 file changed, 19 insertions(+) + +diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster +index 0724901..b9ae38e 100755 +--- a/heartbeat/rabbitmq-cluster ++++ b/heartbeat/rabbitmq-cluster +@@ -279,6 +279,22 @@ rmq_join_existing() + return $OCF_SUCCESS + } + ++rmq_forget_cluster_node_remotely() { ++ local running_cluster_nodes="$1" ++ local node_to_forget="$2" ++ ++ ocf_log info "Forgetting $node_to_forget via nodes [ $(echo $running_cluster_nodes | tr '\n' ' ') ]." ++ for running_cluster_node in $running_cluster_nodes; do ++ rabbitmqctl -n $running_cluster_node forget_cluster_node $node_to_forget ++ if [ $? = 0 ]; then ++ ocf_log info "Succeeded forgetting $node_to_forget via $running_cluster_node." ++ return ++ else ++ ocf_log err "Failed to forget node $node_to_forget via $running_cluster_node." ++ fi ++ done ++} ++ + rmq_notify() { + node_list="${OCF_RESKEY_CRM_meta_notify_stop_uname}" + mode="${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation}" +@@ -336,9 +352,12 @@ rmq_start() { + rmq_join_existing "$join_list" + if [ $? -ne 0 ]; then + ocf_log info "node failed to join, wiping data directory and trying again" ++ local local_rmq_node="$(${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l forever --query --name $RMQ_CRM_ATTR_COOKIE_LAST_KNOWN -q)" ++ + # if the graceful join fails, use the hammer and reset all the data. + rmq_stop + rmq_wipe_data ++ rmq_forget_cluster_node_remotely "$join_list" "$local_rmq_node" + rmq_join_existing "$join_list" + rc=$? + diff --git a/SOURCES/bz1351446-4-rabbitmq-automatic-cluster-recovery.patch b/SOURCES/bz1351446-4-rabbitmq-automatic-cluster-recovery.patch new file mode 100644 index 0000000..d51cfe7 --- /dev/null +++ b/SOURCES/bz1351446-4-rabbitmq-automatic-cluster-recovery.patch @@ -0,0 +1,39 @@ +commit 1621dbb60454840d469f3a0e317a97d94510f7ab +Author: John Eckersberg +Date: Tue Jul 26 13:47:39 2016 -0400 + + rabbitmq: Allow automatic cluster recovery before forcing it + + When joining a node into an existing cluster, check to see if it is + already clustered before force removing it from the cluster and + re-adding. If the clustering is already functional there's no need to + force it again. + +diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster +index 651b837..966dd64 100755 +--- a/heartbeat/rabbitmq-cluster ++++ b/heartbeat/rabbitmq-cluster +@@ -238,6 +238,11 @@ rmq_start_first() + return $rc + } + ++rmq_is_clustered() ++{ ++ $RMQ_CTL eval 'rabbit_mnesia:is_clustered().' 
| grep -q true ++} ++ + rmq_join_existing() + { + local join_list="$1" +@@ -249,6 +254,11 @@ rmq_join_existing() + return $OCF_ERR_GENERIC + fi + ++ if rmq_is_clustered; then ++ ocf_log info "Successfully re-joined existing rabbitmq cluster automatically" ++ return $OCF_SUCCESS ++ fi ++ + # unconditionally join the cluster + $RMQ_CTL stop_app > /dev/null 2>&1 + for node in $(echo "$join_list"); do diff --git a/SPECS/resource-agents.spec b/SPECS/resource-agents.spec index 92ff257..eaf2820 100644 --- a/SPECS/resource-agents.spec +++ b/SPECS/resource-agents.spec @@ -32,7 +32,7 @@ Name: resource-agents Summary: Open Source HA Reusable Cluster Resource Scripts Version: 3.9.5 -Release: 54%{?dist}.10 +Release: 54%{?dist}.16 License: GPLv2+ and LGPLv2+ URL: https://github.com/ClusterLabs/resource-agents %if 0%{?fedora} || 0%{?centos_version} || 0%{?rhel} @@ -137,6 +137,13 @@ Patch92: bz1318744-galera-crash-recovery.patch Patch93: bz1318744-galera-heuristic-recovered.patch Patch94: bz1318744-galera-no-grastate.patch Patch95: bz1332435-nfsserver-var-lib-nfs-fix.patch +Patch96: bz1344225-garbd-Introduces-garbd-resource-agent.patch +Patch97: bz1344228-rabbitmq-cluster-return-code-69-not-running.patch +Patch98: bz1347536-saphana-mcos-support.patch +Patch99: bz1351446-1-rabbitmq-cluster-dump-restore-users-3.6.x.patch +Patch100: bz1351446-2-rabbitmq-cluster-dump-restore-users-3.6.x.patch +Patch101: bz1351446-3-rabbitmq-cluster-dump-restore-users-3.6.x.patch +Patch102: bz1351446-4-rabbitmq-automatic-cluster-recovery.patch Obsoletes: heartbeat-resources <= %{version} Provides: heartbeat-resources = %{version} @@ -202,6 +209,7 @@ A set of scripts to interface with several services to operate in a High Availability environment for both Pacemaker and rgmanager service managers. +%ifarch x86_64 %package sap License: GPLv2+ Summary: SAP cluster resource agents and connector script @@ -217,6 +225,7 @@ Requires: perl The SAP resource agents and connector script interface with Pacemaker to allow SAP instances to be managed in a cluster environment. +%endif %ifarch x86_64 %package sap-hana @@ -337,6 +346,13 @@ exit 1 %patch93 -p1 %patch94 -p1 %patch95 -p1 +%patch96 -p1 -F2 +%patch97 -p1 +%patch98 -p1 +%patch99 -p1 +%patch100 -p1 +%patch101 -p1 -F2 +%patch102 -p1 %build if [ ! -f configure ]; then @@ -344,6 +360,7 @@ if [ ! -f configure ]; then fi chmod 755 heartbeat/galera +chmod 755 heartbeat/garbd chmod 755 heartbeat/mysql-common.sh chmod 755 heartbeat/nfsnotify chmod 755 heartbeat/docker @@ -572,6 +589,7 @@ rm -rf %{buildroot} ccs_update_schema > /dev/null 2>&1 ||: %endif +%ifarch x86_64 %files sap %defattr(-,root,root) %{_sbindir}/sap_redhat_cluster_connector @@ -580,6 +598,7 @@ ccs_update_schema > /dev/null 2>&1 ||: %{_mandir}/man7/*SAP* %exclude %{_mandir}/man7/*SAPHana* %exclude /usr/lib/ocf/resource.d/heartbeat/SAPHana* +%endif %ifarch x86_64 %files sap-hana @@ -589,6 +608,31 @@ ccs_update_schema > /dev/null 2>&1 ||: %endif %changelog +* Wed Jul 27 2016 Andrew Beekhof - 3.9.5-54.16 +- rabbit: Allow automatic cluster recovery before forcing it + + Resolves: rhbz#1351446 + +* Tue Jul 5 2016 Oyvind Albrigtsen - 3.9.5-54.15 +- rabbitmq-cluster: add return codes for not running + + Resolves: rhbz#1344228 + +* Thu Jun 30 2016 Oyvind Albrigtsen - 3.9.5-54.14 +- rabbitmq-cluster: support dump/restore users for RabbitMQ v. 
3.6.x + + Resolves: rhbz#1351446 + +* Fri Jun 17 2016 Oyvind Albrigtsen - 3.9.5-54.13 +- SAP HANA: add Multiple Components One System (MCOS) support + + Resolves: rhbz#1347536 + +* Thu Jun 9 2016 Oyvind Albrigtsen - 3.9.5-54.12 +- garbd: Introduces garbd resource-agent + + Resolves: rhbz#1344225 + * Tue May 3 2016 Oyvind Albrigtsen - 3.9.5-54.10 - nfsserver: fix nfs-idmapd fails to start due to var-lib-nfs-rpc_pipefs.mount being active @@ -660,7 +704,7 @@ ccs_update_schema > /dev/null 2>&1 ||: Resolves: rhbz#1244827 -* Wed Jul 10 2015 David Vossel - 3.9.5-48 +* Fri Jul 10 2015 David Vossel - 3.9.5-48 - add support for oracle resource agents Resolves: rhbz#1232376 @@ -779,7 +823,7 @@ ccs_update_schema > /dev/null 2>&1 ||: Resolves: rhbz#1116166 Resolves: rhbz#1128933 -* Tue Sep 17 2014 David Vossel - 3.9.5-32 +* Wed Sep 17 2014 David Vossel - 3.9.5-32 - Fixes iSCSILogicalUnit syntax error - Fixes mysql stop operation when db storage is unavailable @@ -1000,7 +1044,7 @@ Resolves: rhbz# 773395 * Thu Jul 05 2012 Chris Feist - 3.9.2-3.4 - Fix location of lvm (change from /sbin to /usr/sbin) -* Tue Apr 04 2012 Jon Ciesla - 3.9.2-3.3 +* Wed Apr 04 2012 Jon Ciesla - 3.9.2-3.3 - Rebuilt to fix rawhide dependency issues (caused by move of fsck from /sbin to /usr/sbin).
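
The SAPHanaTopology and SAPHana hunks above replace the per-resource OCF_RESKEY_SAPHanaFilter parameter with a cluster-wide property: the agents now read hana_<sid>_glob_filter from the crm_config section SAPHanaSR, falling back to the default ra-act-dec-lpa. Below is a minimal sketch, not part of the patch set, of how that property can be queried or overridden from the command line; it assumes a lowercase SID of "ha1" (substitute your own SID), and the second filter value is only an illustration of the dash-separated message-type list the agents grep for.

  # query the global filter; -d supplies the default when the attribute is unset
  crm_attribute -G -n hana_ha1_glob_filter -t crm_config -q -d ra-act-dec-lpa

  # temporarily add "flow" messages for a support case, then restore the default
  crm_attribute -v ra-act-dec-lpa-flow -n hana_ha1_glob_filter -t crm_config -s SAPHanaSR
  crm_attribute -v ra-act-dec-lpa -n hana_ha1_glob_filter -t crm_config -s SAPHanaSR

These calls mirror the crm_attribute invocations used by the patched get_hana_attribute/set_hana_attribute "props" branch, so what the agents read is exactly what is set here.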