diff -uNr a/heartbeat/SAPHana b/heartbeat/SAPHana --- a/heartbeat/SAPHana 2016-10-14 10:09:56.479051279 +0200 +++ b/heartbeat/SAPHana 2016-10-14 10:29:23.990066292 +0200 @@ -2,8 +2,8 @@ # # SAPHana # -# Description: Manages two single SAP HANA Instance in System Replication -# Planned: do also manage scale-up scenarios +# Description: Manages two SAP HANA Databases in System Replication +# Planned: do also manage scale-out scenarios # currently the SAPHana is dependent of the analysis of # SAPHanaTopology # For supported scenarios please read the README file provided @@ -16,7 +16,7 @@ # Support: linux@sap.com # License: GNU General Public License (GPL) # Copyright: (c) 2013,2014 SUSE Linux Products GmbH -# Copyright: (c) 2015 SUSE Linux GmbH +# (c) 2015-2016 SUSE Linux GmbH # # An example usage: # See usage() function below for more details... @@ -29,12 +29,13 @@ # OCF_RESKEY_INSTANCE_PROFILE (optional, well known directories will be searched by default) # OCF_RESKEY_PREFER_SITE_TAKEOVER (optional, default is no) # OCF_RESKEY_DUPLICATE_PRIMARY_TIMEOUT (optional, time difference needed between two last-primary-tiemstampe (lpt)) -# OCF_RESKEY_SAPHanaFilter (optional, should only be set if been told by support or for debugging purposes) +# OCF_RESKEY_SAPHanaFilter (outdated, replaced by cluster property hana_${sid}_glob_filter) # # ####################################################################### # # Initialization: +SAPHanaVersion="0.152.17" timeB=$(date '+%s') : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} @@ -43,6 +44,12 @@ # ####################################################################### # +log_attributes=false +if ocf_is_true "$log_attributes"; then + log_attr_file="/var/log/fhATTRIBUTES" +else + log_attr_file="/dev/null" +fi HANA_STATE_PRIMARY=0 HANA_STATE_SECONDARY=1 @@ -107,7 +114,7 @@ cat <<-EOF usage: $0 ($methods) - $0 manages a SAP HANA Instance as an HA resource. + $0 manages two SAP HANA databases (scale-up) in system replication. 
The 'start' operation starts the HANA instance or bring the "clone instance" to a WAITING status The 'stop' operation stops the HANA instance @@ -145,15 +152,14 @@ -0.151.1 +$SAPHanaVersion -Manages two SAP HANA instances in system replication (SR). +Manages two SAP HANA database systems in system replication (SR). -The SAPHanaSR resource agent manages two SAP Hana instances (databases) which are configured -in system replication. This first version is limited to the scale-up scenario. Scale-Out is -not supported in this version. +The SAPHanaSR resource agent manages two SAP HANA database systems which are configured +in system replication. SAPHana supports Scale-Up scenarios. -Managing the two SAP HANA instances means that the resource agent controls the start/stop of the +Managing the two SAP HANA database systems means that the resource agent controls the start/stop of the instances. In addition the resource agent is able to monitor the SAP HANA databases to check their availability on landscape host configuration level. For this monitoring the resource agent relies on interfaces provided by SAP. A third task of the resource agent is to also check the synchronisation status @@ -205,9 +211,10 @@ Should cluster/RA prefer to switchover to slave instance instead of restarting master locally? Default="yes" no: Do prefer restart locally yes: Do prefer takever to remote site + never: Do never run a sr_takeover (promote) at the secondary side. THIS VALUE IS CURRENTLY NOT SUPPORTED. Local or site recover preferred? - + Define, if a former primary should automatically be registered. @@ -220,7 +227,7 @@ Time difference needed between to primary time stamps, if a dual-primary situation occurs Time difference needed between to primary time stamps, if a dual-primary situation occurs. If the time difference is - less than the time gap, then the cluster hold one or both instances in a "WAITING" status. 
This is to give an admin + less than the time gap, then the cluster holds one or both instances in a "WAITING" status. This is to give an admin a chance to react on a failover. A failed former primary will be registered after the time difference is passed. After this registration to the new primary all data will be overwritten by the system replication. @@ -290,6 +297,45 @@ local rc=0; tr -d '"'; return $rc } +# functions: ver_lt, ver_le, ver_gt, ver_ge - compare two HANA version strings via ocf_version_cmp (rc 0: $1 is lower, rc 1: equal, rc 2: $1 is higher; any other rc counts as "false") +function ver_lt() { + ocf_version_cmp "$1" "$2" + test $? -eq 0 && return 0 || return 1 +} + +function ver_le() { + ocf_version_cmp "$1" "$2" + test $? -eq 0 -o $? -eq 1 && return 0 || return 1 +} + +function ver_gt() { + ocf_version_cmp "$1" "$2" + test $? -eq 2 && return 0 || return 1 +} + +function ver_ge() { + ocf_version_cmp "$1" "$2" + test $? -eq 2 -o $? -eq 1 && return 0 || return 1 +} +# +# function: version - compare HANA version strings; usage: version VER1 OP VER2 [OP VER3 ...] with OP one of LE|LT|GE|GT (or "<=", "<", ">=", ">"); returns 1 for an unknown OP or an argument count other than 3 or >= 5 +# +function version() { + if [ $# -eq 3 ]; then + case "$2" in + LE | le | "<=" ) ver_le "$1" "$3";; + LT | lt | "<" ) ver_lt "$1" "$3";; + GE | ge | ">=" ) ver_ge "$1" "$3";; + GT | gt | ">" ) ver_gt "$1" "$3";; + * ) return 1; + esac + elif [ $# -ge 5 ]; then + version "$1" "$2" "$3" && shift 2 && version "$@" # chained form: check the first pair, then continue with VER2 OP VER3 ... + else + return 1; + fi +} + # # function: remoteHost2remoteNode - convert a SAP remoteHost to the cluster node name # params: remoteHost @@ -372,12 +418,16 @@ dstr=$(date) case "$attr_store" in reboot | forever ) - echo "$dstr: SAPHana: crm_attribute -N ${attr_node} -G -n \"$attr_name\" -l $attr_store -q" >> /var/log/fhATTRIBUTE - crm_attribute -N ${attr_node} -G -n "$attr_name" -l $attr_store -q -d "$attr_default" 2>>/var/log/fhATTRIBUTE; rc=$? + if ocf_is_true "$log_attributes"; then + echo "$dstr: SAPHana: crm_attribute -N ${attr_node} -G -n \"$attr_name\" -l $attr_store -q" >> $log_attr_file + fi + crm_attribute -N ${attr_node} -G -n "$attr_name" -l $attr_store -q -d "$attr_default" 2>>$log_attr_file; rc=$? 
;; props ) - echo "$dstr: SAPHana: crm_attribute -G -n \"$attr_name\" -t crm_config -q" >> /var/log/fhATTRIBUTE - crm_attribute -G -n "$attr_name" -t crm_config -q -d "$attr_default" 2>>/var/log/fhATTRIBUTE; rc=$? + if ocf_is_true "$log_attributes"; then + echo "$dstr: SAPHana: crm_attribute -G -n \"$attr_name\" -t crm_config -q" >> $log_attr_file + fi + crm_attribute -G -n "$attr_name" -t crm_config -q -d "$attr_default" 2>>$log_attr_file; rc=$? ;; esac super_ocf_log info "FLOW $FUNCNAME rc=$rc" @@ -405,12 +455,16 @@ dstr=$(date) case "$attr_store" in reboot | forever ) - echo "$dstr: SAPHana: crm_attribute -N $attr_node -v $attr_value -n \"$attr_name\" -l $attr_store" >> /var/log/fhATTRIBUTE - crm_attribute -N $attr_node -v $attr_value -n "$attr_name" -l $attr_store 2>>/var/log/fhATTRIBUTE; rc=$? + if ocf_is_true "$log_attributes"; then + echo "$dstr: SAPHana: crm_attribute -N $attr_node -v $attr_value -n \"$attr_name\" -l $attr_store" >> $log_attr_file + fi + crm_attribute -N $attr_node -v $attr_value -n "$attr_name" -l $attr_store 2>>$log_attr_file; rc=$? ;; props ) - echo "$dstr: SAPHana: crm_attribute -v $attr_value -n \"$attr_name\" -t crm_config -s SAPHanaSR" >> /var/log/fhATTRIBUTE - crm_attribute -v $attr_value -n "$attr_name" -t crm_config -s SAPHanaSR 2>>/var/log/fhATTRIBUTE; rc=$? + if ocf_is_true "$log_attributes"; then + echo "$dstr: SAPHana: crm_attribute -v $attr_value -n \"$attr_name\" -t crm_config -s SAPHanaSR" >> $log_attr_file + fi + crm_attribute -v $attr_value -n "$attr_name" -t crm_config -s SAPHanaSR 2>>$log_attr_file; rc=$? 
;; esac else @@ -460,6 +514,10 @@ # DONE: PRIO2: Only adjust master if value is really different (try to check that) oldscore=$(${HA_SBIN_DIR}/crm_master -G -q -l reboot) if [ "$oldscore" != "$score" ]; then + dstr=$(date) + if ocf_is_true "$log_attributes"; then + echo "$dstr: SAPHana: crm_master -v $score -l reboot " >> $log_attr_file + fi super_ocf_log debug "DBG: SET crm master: $score (old: $oldscore)" ${HA_SBIN_DIR}/crm_master -v $score -l reboot; rc=$? else @@ -471,9 +529,9 @@ } # -# function: scoring_crm_master - score instance due to role ans sync match (table SCORING_TABLE_PREFERRED_SITE_TAKEOVER) +# function: scoring_crm_master - score instance due to role ans sync match (table SCORING_TABLE) # params: NODE_ROLES NODE_SYNC_STATUS -# globals: SCORING_TABLE_PREFERRED_SITE_TAKEOVER[@], +# globals: SCORING_TABLE[@], # scoring_crm_master() { @@ -482,7 +540,7 @@ local sync="$2" local skip=0 local myScore="" - for scan in "${SCORING_TABLE_PREFERRED_SITE_TAKEOVER[@]}"; do + for scan in "${SCORING_TABLE[@]}"; do if [ $skip -eq 0 ]; then read rolePatt syncPatt score <<< $scan if grep "$rolePatt" <<< "$roles"; then @@ -494,7 +552,7 @@ fi done super_ocf_log debug "DBG: scoring_crm_master adjust score $myScore" - # TODO: PRIO1: DO Not Score, If we did not found our role/sync at this moment - bsc#919925 + # DONE: PRIO1: DO Not Score, If we did not found our role/sync at this moment - bsc#919925 if [ -n "$myScore" ]; then set_crm_master $myScore fi @@ -514,28 +572,91 @@ } # +# function: HANA_CALL +# params: timeout-in-seconds cmd-line +# globals: sid(r), SID(r), InstanceName(r) +# +function HANA_CALL() +{ + # + # TODO: PRIO 5: remove 'su - ${sidadm} later, when SAP HANA resoled issue with + # root-user-called hdbnsutil -sr_state (which creates root-owned shared memory file in /var/lib/hdb/SID/shmgrp) + # TODO: PRIO 5: Maybe make "su" optional by a parameter + local timeOut=0 + local onTimeOut="" + local rc=0 + local use_su=1 # Default to be changed later (see TODO 
above) + local pre_cmd="" + local cmd="" + local pre_script="" + local output="" + while [ $# -gt 0 ]; do + case "$1" in + --timeout ) timeOut=$2; shift;; + --use-su ) use_su=1;; + --on-timeout ) onTimeOut="$2"; shift;; + --cmd ) shift; cmd="$*"; break;; # everything after --cmd is taken as the command line + esac + shift + done + + if [ $use_su -eq 1 ]; then + pre_cmd="su - ${sid}adm -c" + pre_script="true" + else + # as root user we need the library path to the SAP kernel to be able to call sapcontrol + # check, if we already added DIR_EXECUTABLE at the beginning of LD_LIBRARY_PATH (compare the first path element only) + MY_LD_LIBRARY_PATH=$LD_LIBRARY_PATH + if [ "${LD_LIBRARY_PATH%%:*}" != "$DIR_EXECUTABLE" ]; then + MY_LD_LIBRARY_PATH=$DIR_EXECUTABLE${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH + fi + pre_cmd="bash -c" + pre_script="LD_LIBRARY_PATH=$MY_LD_LIBRARY_PATH; export LD_LIBRARY_PATH" + fi + case $timeOut in + 0 | inf ) + output=$($pre_cmd "$pre_script; /usr/sap/$SID/$InstanceName/HDBSettings.sh $cmd"); rc=$? + ;; + * ) + output=$(timeout $timeOut $pre_cmd "$pre_script; /usr/sap/$SID/$InstanceName/HDBSettings.sh $cmd"); rc=$? + # + # on timeout ... 
+ # + if [ $rc -eq 124 -a -n "$onTimeOut" ]; then + local second_output="" + second_output=$($pre_cmd "$pre_script; /usr/sap/$SID/$InstanceName/HDBSettings.sh $onTimeOut"); + fi + ;; + esac + echo "$output" + return $rc; +} + +# # function: saphana_init - initialize variables for the resource agent # params: InstanceName -# globals: OCF_*(r), SID(w), sid(rw), sidadm(w), InstanceName(w), InstanceNr(w), SAPVIRHOST(w), PreferSiteTakeover(w), -# globals: sr_name(w), remoteHost(w), otherNodes(w), rem_SR_name(w) +# globals: OCF_*(r), SID(w), sid(rw), sidadm(w), InstanceName(w), InstanceNr(w), SAPVIRHOST(w), PreferSiteTakeover(w), +# globals: sr_name(w), remoteHost(w), otherNodes(w), remSR_name(w) # globals: ATTR_NAME_HANA_SYNC_STATUS(w), ATTR_NAME_HANA_CLONE_STATE(w) # globals: DIR_EXECUTABLE(w), SAPSTARTSRV(w), SAPCONTROL(w), DIR_PROFILE(w), SAPSTARTPROFILE(w), LD_LIBRARY_PATH(w), PATH(w) # globals: LPA_DIRECTORY(w), SIDInstanceName(w), remoteNode(w), hdbSrQueryTimeout(w) +# globals: NODENAME(w), vNAME(w), hdbver(w), # saphana_init : Define global variables with default values, if optional parameters are not set # function saphana_init() { super_ocf_log info "FLOW $FUNCNAME ($*)" local rc=$OCF_SUCCESS - local vName local clN # local site # two parameter models (for transition only) # OLD: InstanceName # NEW: SID InstanceNumber + NODENAME=$(crm_node -n) SID=$OCF_RESKEY_SID InstanceNr=$OCF_RESKEY_InstanceNumber SIDInstanceName="${SID}_HDB${InstanceNr}" InstanceName="HDB${InstanceNr}" + export SAPSYSTEMNAME=$SID super_ocf_log debug "DBG: Used new method to get SID ($SID) and InstanceNr ($InstanceNr)" sid=$(echo "$SID" | tr [:upper:] [:lower:]) sidadm="${sid}adm" @@ -544,15 +665,23 @@ # DONE: PRIO4: SAPVIRHOST might be different to NODENAME # DONE: PRIO1: ASK: Is the output format of ListInstances fix? Could we take that as an API? 
Answer: Yes # try to catch: Inst Info : LNX - 42 - lv9041 - 740, patch 36, changelist 1444691 - # We rely on the following format: SID is word#4, NR is work#6, vHost is word#8 - vName=$(/usr/sap/hostctrl/exe/saphostctrl -function ListInstances \ - | awk '$4 == SID && $6=NR { print $8 }' SID=$SID NR=$InstanceNr) + # We rely on the following format: SID is word#4, SYSNR is work#6, vHost is word#8 + if [ -e /usr/sap/hostctrl/exe/saphostctrl ]; then + vName=$(/usr/sap/hostctrl/exe/saphostctrl -function ListInstances \ + | awk '$4 == SID && $6 == SYSNR { print $8 }' SID=$SID SYSNR=$InstanceNr 2>/dev/null ) + super_ocf_log debug "DBG: ListInstances: $(/usr/sap/hostctrl/exe/saphostctrl -function ListInstances)" + else + super_ocf_log error "ERR: SAPHOSTAGENT is not installed at /usr/sap/hostctrl/exe (saphostctrl missing)" + fi if [ -z "$vName" ]; then # # if saphostctrl does not know the answer, try to fallback to attribute provided by SAPHanaTopology # vName=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_VHOST[@]} "$NODENAME"); fi + if [ -z "$vName" ]; then # last fallback if we are not able to figure out the virtual host name + vName="$NODENAME" + fi SAPVIRHOST=${vName} PreferSiteTakeover="$OCF_RESKEY_PREFER_SITE_TAKEOVER" AUTOMATED_REGISTER="${OCF_RESKEY_AUTOMATED_REGISTER:-false}" @@ -571,6 +700,12 @@ ATTR_NAME_HANA_SRMODE=("hana_${sid}_srmode" "forever") ATTR_NAME_HANA_VHOST=("hana_${sid}_vhost" "forever") ATTR_NAME_HANA_STATUS=("hana_${sid}_status" "reboot") + ATTR_NAME_HANA_OPERATION_MODE=("hana_${sid}_op_mode" "forever") + # + # new "central" attributes + # + ATTR_NAME_HANA_FILTER=("hana_${sid}_glob_filter" "props" "ra-act-dec-lpa") + SAPHanaFilter=$(get_hana_attribute "X" ${ATTR_NAME_HANA_FILTER[@]}) # # TODO: PRIO4: Table for non-preferred-site-takeover # @@ -591,9 +726,7 @@ ) SCORING_TABLE_PREFERRED_LOCAL_RESTART=( "[0-9]*:P:[^:]*:master .* 150" - "[0-9]*:P:[^:]*:slave .* 140" - "[0-9]*:P:[^:]*:\? 
.* 0" - "[0-9]*:P:[^:]*:- .* 0" + "[0-9]*:P:[^:]*:.* .* 140" "[0-9]*:S:[^:]*:master SOK 100" "[0-9]*:S:[^:]*:master SFAIL -INFINITY" "[0-9]*:S:[^:]*:slave SOK 10" @@ -602,6 +735,25 @@ "[0-9]*:S:[^:]*:- .* 0" ".* .* -1" ) + SCORING_TABLE_PREFERRED_NEVER=( + "[234]*:P:[^:]*:master .* 150" + "[015-9]*:P:[^:]*:master .* 90" + "[0-9]*:P:[^:]*:.* .* -INFINITY" + "[0-9]*:S:[^:]*:.* .* -INFINITY" + ".* .* -INFINITY" + ) + if ocf_is_true $PreferSiteTakeover; then + SCORING_TABLE=("${SCORING_TABLE_PREFERRED_SITE_TAKEOVER[@]}") + else + case "$PreferSiteTakeover" in + never|NEVER|Never ) + SCORING_TABLE=("${SCORING_TABLE_PREFERRED_NEVER[@]}") + ;; + * ) + SCORING_TABLE=("${SCORING_TABLE_PREFERRED_LOCAL_RESTART[@]}") + ;; + esac + fi # DUPLICATE_PRIMARY_TIMEOUT="${OCF_RESKEY_DUPLICATE_PRIMARY_TIMEOUT:-7200}" super_ocf_log debug "DBG: DUPLICATE_PRIMARY_TIMEOUT=$DUPLICATE_PRIMARY_TIMEOUT" @@ -615,7 +767,7 @@ esac # # - + # remoteHost=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_REMOTEHOST[@]}); if [ -z "$remoteHost" ]; then if [ ${#otherNodes[@]} -eq 1 ]; then # we are a 2 node cluster, lets assume the other is the remote-host @@ -640,7 +792,7 @@ sr_mode="sync" fi if [ -n "$remoteNode" ]; then - rem_SR_name=$(get_hana_attribute ${remoteNode} ${ATTR_NAME_HANA_SITE[@]}); + remSR_name=$(get_hana_attribute ${remoteNode} ${ATTR_NAME_HANA_SITE[@]}); fi super_ocf_log debug "DBG: sr_name=$sr_name, remoteHost=$remoteHost, remoteNode=$remoteNode, sr_mode=$sr_mode" # optional OCF parameters, we try to guess which directories are correct @@ -671,26 +823,21 @@ # SAPSTARTPROFILE="$(ls -1 $DIR_PROFILE/${OCF_RESKEY_INSTANCE_PROFILE:-${SID}_${InstanceName}_*})" fi - # as root user we need the library path to the SAP kernel to be able to call sapcontrol - # check, if we already added DIR_EXECUTABLE at the beginning of LD_LIBRARY_PATH - if [ "${LD_LIBRARY_PATH%%*:}" != "$DIR_EXECUTABLE" ] - then - LD_LIBRARY_PATH=$DIR_EXECUTABLE${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH - export LD_LIBRARY_PATH - 
fi PATH=${PATH}:${DIR_EXECUTABLE}; export PATH + local ges_ver + ges_ver=$(HANA_CALL --timeout 10 --cmd "HDB version" | tr -d " " | awk -F: '$1 == "version" {print $2}') + hdbver=${ges_ver%.*.*} + # + # since rev 111.00 we should use a new hdbnsutil option to get the -sr_state + # since rev 112.03 the old option is changed and we should use -sr_stateConfiguration where ever possible + # + hdbState="hdbnsutil -sr_state" + hdbMap="hdbnsutil -sr_state" + if version "$hdbver" ">=" "1.00.111"; then + hdbState="hdbnsutil -sr_stateConfiguration" + hdbMap="hdbnsutil -sr_stateHostMapping" + fi super_ocf_log info "FLOW $FUNCNAME rc=$OCF_SUCCESS" - ############################# - # TODO: PRIO9: To be able to call landscapeHostConfig.py without su (so as root) - # TODO: PRIO9: Research for environment script .htacces or something like that - #export SAPSYSTEMNAME=ZLF - #export DIR_INSTANCE=/usr/sap/ZLF/HDB02 - #export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$DIR_INSTANCE/exe:$DIR_INSTANCE/exe/Python/lib - #export PYTHONPATH=$DIR_INSTANCE/$HOST:$DIR_INSTANCE/exe/python_support:$DIR_INSTANCE/exe - #export PYTHONHOME=$DIR_INSTANCE/exe/Python - #export SAP_RETRIEVAL_PATH=$DIR_INSTANCE/$HOST - #export DIR_EXECUTABLE=$DIR_INSTANCE/exe - ############################# return $OCF_SUCCESS } @@ -765,7 +912,11 @@ # or ownership - they will be recreated by sapstartsrv during next start rm -f /tmp/.sapstream5${InstanceNr}13 rm -f /tmp/.sapstream5${InstanceNr}14 - $SAPSTARTSRV pf=$SAPSTARTPROFILE -D -u $sidadm + ( + export PATH="$DIR_EXECUTABLE${PATH:+:}$PATH" + export LD_LIBRARY_PATH="$DIR_EXECUTABLE${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH" + $SAPSTARTSRV pf=$SAPSTARTPROFILE -D -u $sidadm + ) # now make sure the daemon has been started and is able to respond local srvrc=1 while [ $srvrc -eq 1 -a $(pgrep -f "sapstartsrv.*$runninginst" | wc -l) -gt 0 ] @@ -809,31 +960,47 @@ function check_for_primary() { super_ocf_log info "FLOW $FUNCNAME ($*)" local rc=$HANA_STATE_DEFECT - node_full_status=$(su - 
${sidadm} -c "hdbnsutil -sr_state" 2>/dev/null ) - node_status=$(echo "$node_full_status" | awk '$1=="mode:" {print $2}') - super_ocf_log debug "DBG: check_for_primary: node_status=$node_status" - # TODO: PRIO2: Maybe we need to use a fallback interface when hdbnsitil does not answer properly -> lookup in config files? + # TODO: PRIO 3: Check beginning from which SPS does SAP support HDBSettings.sh? + # TODO: Limit the runtime of hdbnsutil and use getParameter.py as fallback + # TODO: PRIO2: Maybe we need to use a fallback interface when hdbnsutil does not answer properly -> lookup in config files? # This might also solve some problems when we could not figure-out the ilocal or remote site name - for i in 1 2 3 4 5 6 7 8 9; do + local chkMethod="" + for chkMethod in hU hU hU gP; do # hU: query hdbnsutil (up to three attempts), gP: getParameter.py as last resort + case "$chkMethod" in + gP ) + local gpKeys="" + gpKeys=$(echo --key=global.ini/system_replication/{mode,site_name,site_id}) + node_full_status=$(HANA_CALL --timeout 60 --cmd "HDBSettings.sh getParameter.py $gpKeys --sapcontrol=1" 2>&1 | awk -F/ 'BEGIN {out=0} /^SAPCONTROL-OK: <begin>/ { out=1 } /^SAPCONTROL-OK: <end>/ { out=0 } /=/ {if (out==1) {print $3} }') + node_status=$(echo "$node_full_status" | awk -F= '$1=="mode" {print $2}') + super_ocf_log info "ACT: Using getParameter.py as fallback - node_status=$node_status" + ;; + hU | * ) + # DONE: PRIO1: Beginning from SAP HANA rev 112.03 -sr_state is no longer supported + node_full_status=$(HANA_CALL --timeout 60 --cmd "$hdbState" 2>/dev/null ) + node_status=$(echo "$node_full_status" | awk '$1=="mode:" {print $2}') + super_ocf_log debug "DBG: check_for_primary: node_status=$node_status" + ;; + esac case "$node_status" in primary ) - super_ocf_log info "FLOW: $FUNCNAME rc=HANA_STATE_PRIMARY" - return $HANA_STATE_PRIMARY;; + rc=$HANA_STATE_PRIMARY + break;; syncmem | sync | async ) - super_ocf_log info "FLOW: $FUNCNAME rc=HANA_STATE_SECONDARY" - return $HANA_STATE_SECONDARY;; + rc=$HANA_STATE_SECONDARY + break;; none ) # have seen that mode on 
second side BEFEORE we registered it as replica - super_ocf_log info "FLOW: $FUNCNAME rc=HANA_STATE_STANDALONE" - return $HANA_STATE_STANDALONE;; + rc=$HANA_STATE_STANDALONE + break;; * ) super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: <$node_status>" dump=$( echo $node_status | hexdump -C ); super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: DUMP <$dump>" - node_full_status=$(su - ${sidadm} -c "hdbnsutil -sr_state" 2>/dev/null ) - node_status=$(echo "$node_full_status" | awk '$1=="mode:" {print $2}') + # TODO: Limit the runtime of hdbnsutil and use getParameter.py as fallback + # SAP_CALL - super_ocf_log debug "DEC: check_for_primary: loop=$i: node_status=$node_status" + super_ocf_log debug "DEC: check_for_primary: loop=$chkMethod: node_status=$node_status" # TODO: PRIO1: Maybe we need to keep the old value for P/S/N, if hdbnsutil just crashes esac; + sleep 2 # status not recognized: pause before trying the next method done super_ocf_log info "FLOW $FUNCNAME rc=$rc" return $rc @@ -854,12 +1021,18 @@ { super_ocf_log info "FLOW $FUNCNAME ($*)" local rc=-1 srRc=0 all_nodes_other_side="" n="" siteParam="" - if [ -n "$rem_SR_name" ]; then - siteParam="--site=$rem_SR_name" + if [ -n "$remSR_name" ]; then + siteParam="--site=$remSR_name" fi - FULL_SR_STATUS=$(su - $sidadm -c "python $DIR_EXECUTABLE/python_support/systemReplicationStatus.py $siteParam" 2>/dev/null); srRc=$? - super_ocf_log info "DEC $FUNCNAME systemReplicationStatus.py (to site '$rem_SR_name')-> $srRc" - super_ocf_log info "FLOW $FUNCNAME systemReplicationStatus.py (to site '$rem_SR_name')-> $srRc" + # TODO: Get rid of the su by using a new interface: + # SAPSYSTEMNAME=SLE /usr/sap/SLE/HDB00/HDBSettings.sh systemReplicationStatus.py $siteParam + # TODO: Check beginning from which SPS does SAP support HDBSettings.sh? + # TODO: Limit the runtime of systemReplicationStatus.py + # SAP_CALL + # FULL_SR_STATUS=$(su - $sidadm -c "python $DIR_EXECUTABLE/python_support/systemReplicationStatus.py $siteParam" 2>/dev/null); srRc=$? 
+ FULL_SR_STATUS=$(HANA_CALL --timeout 60 --cmd "systemReplicationStatus.py" 2>/dev/null); srRc=$? + super_ocf_log info "DEC $FUNCNAME systemReplicationStatus.py (to site '$remSR_name')-> $srRc" + super_ocf_log info "FLOW $FUNCNAME systemReplicationStatus.py (to site '$remSR_name')-> $srRc" # # TODO: PRIO2: Here we might also need to filter additional sites (if multi tier should be supported) # And is the check for return code capable for chains? @@ -890,7 +1063,7 @@ # ok we should be careful and set secondary to SFAIL super_ocf_log info "FLOW $FUNCNAME SFAIL" set_hana_attribute "$remoteNode" "SFAIL" ${ATTR_NAME_HANA_SYNC_STATUS[@]} - super_ocf_log info "ACT site=$sr_name, seting SFAIL for secondary (5) - srRc=$srRc lss=$lss" + super_ocf_log info "ACT site=$sr_name, setting SFAIL for secondary (5) - srRc=$srRc lss=$lss" # TODO: PRIO1 - P004: need to check LSS again to avoid dying primary to block (SFAIL) secondary lpa_set_lpt 10 "$remoteNode" rc=1 @@ -898,7 +1071,7 @@ else super_ocf_log info "FLOW $FUNCNAME SFAIL" set_hana_attribute "$remoteNode" "SFAIL" ${ATTR_NAME_HANA_SYNC_STATUS[@]} - super_ocf_log info "ACT site=$sr_name, seting SFAIL for secondary (2) - srRc=$srRc" + super_ocf_log info "ACT site=$sr_name, setting SFAIL for secondary (2) - srRc=$srRc" # TODO: PRIO1 - P004: need to check LSS again to avoid dying primary to block (SFAIL) secondary lpa_set_lpt 10 "$remoteNode" rc=1; @@ -992,14 +1165,28 @@ super_ocf_log info "FLOW $FUNCNAME ($*)" local rc=0 # - su - $sidadm -c "python $DIR_EXECUTABLE/python_support/landscapeHostConfiguration.py" 1>/dev/null 2>/dev/null; rc=$? + # TODO: Get rid of the su by using a new interface: + # SAPSYSTEMNAME=SLE /usr/sap/SLE/HDB00/HDBSettings.sh landscapeHostConfiguration.py + # TODO: Check beginning from which SPS does SAP support HDBSettings.sh? + # DONE: Limit the runtime of landscapeHostConfiguration.py + HANA_CALL --timeout 60 --cmd "landscapeHostConfiguration.py" 1>/dev/null 2>/dev/null; rc=$? 
+ if [ $rc -eq 124 ]; then + # TODO: PRIO 1: Check, if we should loop here like 'for i in 1 2 3 ...' ? + # landscape timeout + sleep 20 + HANA_CALL --timeout 60 --cmd "landscapeHostConfiguration.py" 1>/dev/null 2>/dev/null; rc=$? + if [ $rc -eq 124 ]; then + # TODO PRIO2: How to handle still hanging lss - current solution is to say "FATAL" + rc=0 + fi + fi return $rc; } # # function: register_hana_secondary - register local hana as secondary to the other site # params: - -# globals: sidadm(r), remoteHost(r), InstanceNr(r), sr_mode(r), sr_name(r) +# globals: sidadm(r), remoteHost(r), InstanceNr(r), sr_mode(r), sr_name(r), hdbver(r) # register_hana_secondary # function register_hana_secondary() @@ -1007,17 +1194,31 @@ super_ocf_log info "FLOW $FUNCNAME ($*)" local rc=2; local remoteInstance=""; + local newParameter=0 remoteInstance=$InstanceNr + + + if version "$hdbver" ">=" "1.00.110"; then + newParameter=1 + fi + if ocf_is_true ${AUTOMATED_REGISTER}; then - # - # - # - # - # - super_ocf_log info "ACT: REGISTER: hdbnsutil -sr_register --remoteHost=$remoteHost --remoteInstance=$remoteInstance --mode=$sr_mode --name=$sr_name" - # - # - su - $sidadm -c "hdbnsutil -sr_register --remoteHost=$remoteHost --remoteInstance=$remoteInstance --mode=$sr_mode --name=$sr_name"; rc=$? + # TODO: Get rid of the su by using a new interface: + # SAPSYSTEMNAME=SLE /usr/sap/SLE/HDB00/HDBSettings.sh hdbnsutil -sr_register ... + # TODO: Check beginning from which SPS does SAP support HDBSettings.sh? + # TODO: Limit the runtime of hdbnsutil -sr_register ???? 
+ if [ $newParameter -eq 1 ]; then + local hanaOM="" + hanaOM=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_OPERATION_MODE[@]}) + if [ -n "$hanaOM" ]; then + hanaOM="--operationMode=$hanaOM" + fi + super_ocf_log info "ACT: REGISTER: hdbnsutil -sr_register --remoteHost=$remoteHost --remoteInstance=$remoteInstance --replicationMode=$sr_mode $hanaOM --name=$sr_name" + HANA_CALL --timeout inf --use-su --cmd "hdbnsutil -sr_register --remoteHost=$remoteHost --remoteInstance=$remoteInstance --replicationMode=$sr_mode $hanaOM --name=$sr_name"; rc=$? + else + super_ocf_log info "ACT: REGISTER: hdbnsutil -sr_register --remoteHost=$remoteHost --remoteInstance=$remoteInstance --mode=$sr_mode --name=$sr_name" + HANA_CALL --timeout inf --use-su --cmd "hdbnsutil -sr_register --remoteHost=$remoteHost --remoteInstance=$remoteInstance --mode=$sr_mode --name=$sr_name"; rc=$? + fi # backup_global_and_nameserver else super_ocf_log info "ACT: SAPHANA DROP REGISTER because AUTOMATED_REGISTER is set to FALSE" @@ -1051,7 +1252,7 @@ check_sapstartsrv rc=$? # - # TODO: ASK: PRIO5: For SCALE-OUT - do we need to use an other call like StartSystem? Or better to use the HDB command? + # DONE: ASK: PRIO5: For SCALE-OUT - do we need to use an other call like StartSystem? Or better to use the HDB command? # if [ $rc -eq $OCF_SUCCESS ]; then output=$($SAPCONTROL -nr $InstanceNr -function Start) @@ -1169,7 +1370,7 @@ 0 ) # LPA says start-up lpa_advice="start" # TODO: PRIO1: We need to do a special handling for remote being a 234-Secondary in SR Status SOK - # if ( remote_role like [234]:S ) && ( remote_sync_status is SOK|PRIM ) && ( PreferSiteTakeover ) + # if ( remote_role like [234]:S ) && ( remote_sync_status is SOK|PRIM ) && ( PreferSiteTakeover ) # then lpa_advice="wait" remoteRole=$(get_hana_attribute $remoteNode ${ATTR_NAME_HANA_ROLES[@]}) remoteSync=$(get_hana_attribute $remoteNode ${ATTR_NAME_HANA_SYNC_STATUS[@]}) @@ -1193,17 +1394,20 @@ 1) # LPA says register! 
lpa_advice="register" ;; - 2) # LPA says wait for second LPT + 2) # LPA says wait for older LPA to expire + lpa_advice="wait" + ;; + 3) # LPA says to wait for remote LPA to be reported/announced lpa_advice="wait" ;; - 3 | 4 ) # LPA says something is completely wrong - FAIL resource # TODO: PRIO1: RC3 for waiting remote side to report lss + 4) # LPA says something is completely wrong - FAIL resource # TODO: PRIO1: RC3 for waiting remote side to report lss lpa_advice="fail" ;; - * ) # LPA failed with an unkonown status - FAIL resource + *) # LPA failed with an unkonown status - FAIL resource lpa_advice="fail" ;; esac - + # DONE: PRIO2: Do we need to differ 0 and 1 here? While 0 is a fatal SAP error, 1 for down/error if [ $lss -eq 0 ]; then super_ocf_log err "ACT: get_hana_landscape_status reports FATAL" @@ -1218,7 +1422,7 @@ 2 | 3 | 4 ) # as landcape says we are up - just set the scores and return code super_ocf_log info "LPA: landcape: UP, LPA: start ==> keep running" LPTloc=$(date '+%s') - lpa_set_lpt $LPTloc + lpa_set_lpt $LPTloc $NODENAME rc=$OCF_SUCCESS ;; 1 ) # landcape says we are down, lets start and adjust scores and return code @@ -1226,7 +1430,7 @@ saphana_start rc=$? LPTloc=$(date '+%s') - lpa_set_lpt $LPTloc + lpa_set_lpt $LPTloc $NODENAME ;; esac scoring_crm_master "$my_role" "$my_sync" @@ -1250,11 +1454,11 @@ if [ $primary_status -eq $HANA_STATE_SECONDARY ]; then super_ocf_log info "ACT: Register successful" lpa_push_lpt 10 - lpa_set_lpt 10 + lpa_set_lpt 10 $NODENAME set_crm_master 0 saphana_start_secondary rc=$? 
- lpa_set_lpt 10 + lpa_set_lpt 10 $NODENAME else super_ocf_log err "ACT: Register failed" rc=$OCF_NOT_RUNNING @@ -1279,11 +1483,19 @@ rc=$OCF_ERR_GENERIC ;; 1 ) # we are down, so we should wait --> followup in next monitor - super_ocf_log info "LPA: landcape: DOWN, LPA: wait ==> keep waiting" - # TODO: PRIO3: Check, if WAITING is correct here - set_hana_attribute ${NODENAME} "WAITING4LPA" ${ATTR_NAME_HANA_CLONE_STATE[@]} - set_crm_master -9000 - rc=$OCF_SUCCESS + # DONE: PRIO3: Check, if WAITING is correct here + if ocf_is_true "$AUTOMATED_REGISTER" ; then + super_ocf_log info "LPA: landcape: DOWN, LPA: wait ==> keep waiting" + super_ocf_log info "RA: landcape: DOWN, LPA: wait ==> keep waiting" + set_hana_attribute ${NODENAME} "WAITING4LPA" ${ATTR_NAME_HANA_CLONE_STATE[@]} + set_crm_master -9000 + rc=$OCF_SUCCESS + else + super_ocf_log warning "LPA: OLD primary needs manual registration (AUTOMATED_REGISTER='false')" + set_hana_attribute ${NODENAME} "WAITING4REG" ${ATTR_NAME_HANA_CLONE_STATE[@]} + set_crm_master -9000 + rc=$OCF_NOT_RUNNING + fi ;; esac ;; @@ -1309,22 +1521,24 @@ local ch ch_role # # get actual list of cluster members - # + # if [ -n "$otherNodes" ]; then for ch in ${otherNodes[@]}; do if [ $rc -eq 1 ]; then ch_role=$(get_hana_attribute ${ch} ${ATTR_NAME_HANA_ROLES[@]}) -# TODO: PRIO3: check if [0-9], [234] or [34] is correct -# TODO: PRIO4: Do we need different checks like "any-primary-master" or "running-primary-master" ? -# grep '[0-9]*:P:[^:]*:master:' <<< $ch_role && rc=0 -# grep '[34]:P:[^:]*:master:' <<< $ch_role && rc=0 -# Match "Running+Available Primary" Master -> Match field 1: 3/4, 2: P, 4: master - awk -F: 'BEGIN { rc=1 } - $1 ~ "[34]" && $2 ="P" && $4="master" { rc=0 } - END { exit rc }' <<< $ch_role ; rc=$? + # TODO: PRIO3: check if [0-9], [234] or [34] is correct + # TODO: PRIO4: Do we need different checks like "any-primary-master" or "running-primary-master" ? 
+ # grep '[0-9]*:P:[^:]*:master:' <<< $ch_role && rc=0 + # grep '[34]:P:[^:]*:master:' <<< $ch_role && rc=0 + # Match "Running+Available Primary" Master -> Match field 1: 3/4, 2: P, 4: master + super_ocf_log debug "DBG: check_for_primary_master (3) ch_role=$ch_role" + awk -F: 'BEGIN { rc=1 } + $1 ~ "[34]" && $2 == "P" && $4 == "master" { rc=0 } + END { exit rc }' <<< $ch_role ; rc=$? + super_ocf_log debug "DBG: check_for_primary_master (4) rc=$rc" fi done - fi + fi super_ocf_log info "FLOW $FUNCNAME rc=$rc" return $rc } @@ -1378,7 +1592,7 @@ ####### LPA - begin # lpa_push_lpt 10 - lpa_set_lpt 10 + lpa_set_lpt 10 $NODENAME # ####### LPA - end # @@ -1404,7 +1618,7 @@ rc=$OCF_SUCCESS fi else - lpa_set_lpt 10 + lpa_set_lpt 10 $NODENAME fi else super_ocf_log info "ACT: wait_for_primary_master ==> WAITING" @@ -1454,7 +1668,7 @@ then if [ $STATE -eq $OCF_NOT_RUNNING ] then - [ "$MONLOG" != "NOLOG" ] && ocf_log err "SAP instance service $SERVICE is not running with status $COLOR !" + [ "$MONLOG" != "NOLOG" ] && ocf_log err "SAP instance service $SERVICE status color is $COLOR !" rc=$STATE fi count=1 @@ -1511,13 +1725,17 @@ local crm_rc=1 local lpt=$1 local clpt=-1 - local node=${2:-${NODENAME}} + local node=$2 set_hana_attribute ${node} "$lpt" ${LPA_ATTR[@]}; crm_rc=$? 
- clpt=$(lpa_get_lpt $NODENAME) - if [ "$lpt" != "$clpt" ]; then - rc=2 + if [ -n "$node" ]; then + clpt=$(lpa_get_lpt $NODENAME) + if [ "$lpt" != "$clpt" ]; then + rc=2 + else + rc=0 + fi else - rc=0 + super_ocf_log info "DEC: lpa_set_lpt ignore to change value for empty node name" fi super_ocf_log info "FLOW $FUNCNAME rc=$rc" return $rc @@ -1608,7 +1826,7 @@ else rc=2 fi - lpa_set_lpt $LPTloc + lpa_set_lpt $LPTloc $NODENAME super_ocf_log info "FLOW $FUNCNAME rc=$rc" return $rc } @@ -1621,9 +1839,10 @@ # # Returncodes: # 0: start -# 1: register than start -# 2: wait4gab -# 3: wait4other +# 1: register (then start) +# 2: wait4gab (WAIT4LPA - Older LPA needs to expire) +# 3: wait4other (WAIT4LPA - Remote LPA needs to be announced) +# 4: lpa internal error # # Initializing (if NO local LPT-file): # SECONDARY sets to 10 @@ -1648,7 +1867,7 @@ # function lpa_check_lpt_status() { super_ocf_log info "FLOW $FUNCNAME ($*)" - local rc=0 + local rc=4 local LPTloc=-1 local LPTrem=-1 local LPTMark=1000 @@ -1666,16 +1885,16 @@ if [ -z "$LPTloc" -o "$LPTloc" -eq -1 -o "$lparc" -ne 0 ]; then # last option - try to initialize as PRIMARY lpa_push_lpt 20 - lpa_set_lpt 20 + lpa_set_lpt 20 $NODENAME LPTloc=20 # DEFAULT fi fi - # TODO PRIO1: REMOVE remoteNode dependency - lpa_get_lpt + # TODO PRIO1: REMOVE remoteNode dependency - lpa_get_lpt LPTrem=$(lpa_get_lpt $remoteNode); lparc=$? 
if [ $lparc -ne 0 ]; then # LPT of the other node could not be evaluated - LPA says WAIT super_ocf_log debug "DBG: LPA: LPTloc=$LPTloc, LPTrem undefined ==> WAIT" - rc=2 + rc=3 else super_ocf_log debug "DBG: LPA: LPTloc ($LPTloc) LPTrem ($LPTrem) delta ($delta)" if [ $LPTloc -lt $LPTMark -a $LPTrem -lt $LPTMark ]; then @@ -1683,11 +1902,11 @@ else delta=$DUPLICATE_PRIMARY_TIMEOUT # at least one of the lpts is a real timestamp so include delta-gap fi - if (( delta < LPTloc - LPTrem )); then + if (( delta < LPTloc - LPTrem )); then # We are the winner - LPA says STARTUP super_ocf_log debug "DBG: LPA: LPTloc wins $LPTloc > $LPTrem + $delta ==> START" rc=0 - elif (( delta < LPTrem - LPTloc )); then + elif (( delta < LPTrem - LPTloc )); then if ocf_is_true "$AUTOMATED_REGISTER" ; then # The other one has won - LPA says REGISTER super_ocf_log debug "DBG: LPA: LPTrem wins $LPTrem > $LPTloc + $delta ==> REGISTER" @@ -1697,12 +1916,12 @@ rc=2 fi - else + else super_ocf_log debug "DBG: LPA: Difference between LPTloc and LPTrem is less than delta ($delta) ==> WAIT" # TODO: PRIO3: ADD STALEMATE-HANDLING HERE; currently admin should set one of the lpa to 20 rc=2 - fi - fi + fi + fi super_ocf_log info "FLOW $FUNCNAME rc=$rc" return $rc } @@ -1716,6 +1935,7 @@ { super_ocf_log info "FLOW $FUNCNAME ($*)" local rc=0 + # always true for scale-up super_ocf_log info "FLOW $FUNCNAME rc=$rc" return $rc } @@ -1728,23 +1948,15 @@ # function saphana_start_clone() { super_ocf_log info "FLOW $FUNCNAME ($*)" - local primary_status sync_attr score_master rc=$OCF_NOT_RUNNING + local primary_status sync_attr score_master rc=$OCF_NOT_RUNNING local sqlrc; - local chkusr; - # TODO: PRIO4: remove check_secstore_users later - secUser=$(check_secstore_users SAPHANA${SID}SR SLEHALOC RHELHALOC) ; chkusr=$? 
- if [ $chkusr -ne 0 ]; then - super_ocf_log err "ACT: Secure store users are missing (see best practice manual how to setup the users)" - rc=$OCF_ERR_CONFIGURED + set_hana_attribute ${NODENAME} "DEMOTED" ${ATTR_NAME_HANA_CLONE_STATE[@]} + check_for_primary; primary_status=$? + if [ $primary_status -eq $HANA_STATE_PRIMARY ]; then + saphana_start_primary; rc=$? else - set_hana_attribute ${NODENAME} "DEMOTED" ${ATTR_NAME_HANA_CLONE_STATE[@]} - check_for_primary; primary_status=$? - if [ $primary_status -eq $HANA_STATE_PRIMARY ]; then - saphana_start_primary; rc=$? - else - lpa_set_lpt 10 - saphana_start_secondary; rc=$? - fi + lpa_set_lpt 10 $NODENAME + saphana_start_secondary; rc=$? fi super_ocf_log info "FLOW $FUNCNAME rc=$rc" return $rc @@ -1761,9 +1973,10 @@ local rc=0 local primary_status="x" set_hana_attribute ${NODENAME} "UNDEFINED" ${ATTR_NAME_HANA_CLONE_STATE[@]} + super_ocf_log debug "DBG: SET UNDEFINED" check_for_primary; primary_status=$? if [ $primary_status -eq $HANA_STATE_SECONDARY ]; then - lpa_set_lpt 10 + lpa_set_lpt 10 $NODENAME fi saphana_stop; rc=$? return $rc @@ -1813,26 +2026,42 @@ # seems admin already decided that for us? -> we are running - set DEMOTED promoted=0; LPTloc=$(date '+%s') - lpa_set_lpt $LPTloc + lpa_set_lpt $LPTloc $NODENAME fi lpa_check_lpt_status; lparc=$? - # TODO: PRIO1: Need to differ lpa_check_lpt_status return codes - if [ $lparc -lt 2 ]; then - # lpa - no need to wait any longer - lets try a new start - saphana_start_clone - rc=$? 
- super_ocf_log info "FLOW $FUNCNAME rc=$rc" - return $rc - else - lpa_init_lpt $HANA_STATE_PRIMARY - # still waiting for second site to report lpa-lpt - if ocf_is_true "$AUTOMATED_REGISTER" ; then - super_ocf_log info "LPA: Still waiting for remote site to report LPA status" - else - super_ocf_log info "LPA: Dual primary detected and AUTOMATED_REGISTER='false' ==> WAITING" - fi - return $OCF_SUCCESS - fi + # DONE: PRIO1: Need to differ lpa_check_lpt_status return codes + case "$lparc" in + 0 | 1 ) + # lpa - no need to wait any longer - lets try a new start + saphana_start_clone + rc=$? + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc + ;; + 2 ) + lpa_init_lpt $HANA_STATE_PRIMARY + # still waiting for second site to expire + if ocf_is_true "$AUTOMATED_REGISTER" ; then + super_ocf_log info "LPA: Still waiting for remote site to report LPA status" + else + super_ocf_log info "LPA: Dual primary detected and AUTOMATED_REGISTER='false' ==> WAITING" + super_ocf_log info "LPA: You need to manually sr_register the older primary" + fi + return $OCF_SUCCESS + ;; + 3 ) + lpa_init_lpt $HANA_STATE_PRIMARY + # still waiting for second site to report lpa-lpt + super_ocf_log info "LPA: Still waiting for remote site to report LPA status" + return $OCF_SUCCESS + ;; + 4 ) + # lpa internal error + # TODO PRIO3: Impplement special handling for this issue - should we fail the ressource? + super_ocf_log info "LPA: LPA reports an internal error" + return $OCF_SUCCESS + ;; + esac promoted=0; ;; UNDEFINED ) @@ -1848,7 +2077,7 @@ ;; esac fi - get_hana_landscape_status; lss=$? + get_hana_landscape_status; lss=$? 
super_ocf_log debug "DBG: saphana_monitor_clone: get_hana_landscape_status=$lss" case "$lss" in 0 ) # FATAL or ERROR @@ -1876,19 +2105,20 @@ # # TODO PRIO1: REMOVE remoteNode dependency - get_sync_status remoteSync=$(get_hana_attribute $remoteNode ${ATTR_NAME_HANA_SYNC_STATUS[@]}) + # TODO HANDLING OF "NEVER" case "$remoteSync" in SOK | PRIM ) super_ocf_log info "DEC: PreferSiteTakeover selected so decrease promotion score here (and reset lpa)" set_crm_master 5 if check_for_primary_master; then - lpa_set_lpt 20 + lpa_set_lpt 20 $NODENAME fi ;; SFAIL ) - super_ocf_log info "DEC: PreferSiteTakeover selected BUT remoteHost is not in sync (SFAIL) ==> local restart preferred" + super_ocf_log info "DEC: PreferSiteTakeover selected BUT remoteHost is not in sync (SFAIL) ==> local restart preferred" ;; * ) - super_ocf_log info "DEC: PreferSiteTakeover selected BUT remoteHost is not in sync ($remoteSync) ==> local restart preferred" + super_ocf_log info "DEC: PreferSiteTakeover selected BUT remoteHost is not in sync ($remoteSync) ==> local restart preferred" ;; esac else @@ -1916,7 +2146,7 @@ rc=$OCF_SUCCESS else LPTloc=$(date '+%s') - lpa_set_lpt $LPTloc + lpa_set_lpt $LPTloc $NODENAME lpa_push_lpt $LPTloc if [ "$promoted" -eq 1 ]; then set_hana_attribute "$NODENAME" "PRIM" ${ATTR_NAME_HANA_SYNC_STATUS[@]} @@ -1931,12 +2161,14 @@ fi my_sync=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_SYNC_STATUS[@]}) my_role=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_ROLES[@]}) - case "$my_role" in + case "$my_role" in [12]:P:*:master:* ) # primary is down or may not anser hdbsql query so drop analyze_hana_sync_status ;; [34]:P:*:*:* ) # primary is up and should now be able to anser hdbsql query if [ -f $DIR_EXECUTABLE/python_support/systemReplicationStatus.py ]; then - analyze_hana_sync_statusSRS + if [ "$promote_attr" = "PROMOTED" ]; then + analyze_hana_sync_statusSRS + fi else analyze_hana_sync_statusSQL fi @@ -1949,8 +2181,8 @@ [234]:P:* ) # dual primary, but other instance 
marked as PROMOTED by the cluster lpa_check_lpt_status; again_lpa_rc=$? if [ $again_lpa_rc -eq 2 ]; then - super_ocf_log info "DEC: Dual primary detected, other instance is PROMOTED and lpa stalemate ==> local restart" - lpa_set_lpt 10 + super_ocf_log info "DEC: Dual primary detected, other instance is PROMOTED and lpa stalemate ==> local restart" + lpa_set_lpt 10 $NODENAME lpa_push_lpt 10 rc=$OCF_NOT_RUNNING fi @@ -1993,7 +2225,7 @@ # OK, we are running as HANA SECONDARY # if ! lpa_get_lpt ${NODENAME}; then - lpa_set_lpt 10 + lpa_set_lpt 10 $NODENAME lpa_push_lpt 10 fi promote_attr=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_CLONE_STATE[@]}) @@ -2042,17 +2274,25 @@ 0 ) # FATAL # DONE: PRIO1: Maybe we need to differ between 0 and 1. While 0 is a fatal sap error, 1 is down/error # TODO: PRIO3: is OCF_ERR_GENERIC best option? - lpa_set_lpt 10 + lpa_set_lpt 10 $NODENAME rc=$OCF_ERR_GENERIC ;; 1 ) # ERROR - lpa_set_lpt 10 + lpa_set_lpt 10 $NODENAME rc=$OCF_NOT_RUNNING ;; 2 | 3 | 4 ) # WARN INFO OK rc=$OCF_SUCCESS - lpa_set_lpt 30 + lpa_set_lpt 30 $NODENAME sync_attr=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_SYNC_STATUS[@]}) + local hanaOM="" + local hanaOut1="" + # TODO: PRIO 3: check, if using getParameter.py is the best option to analyze the set operationMode + # DONE: PRIO 3: Should we default to logreplay for SAP HANA >= SPS11 ? 
+ hanaOut1=$(HANA_CALL --timeout 10 --use-su --cmd "getParameter.py --key=global.ini/system_replication/operation_mode --sapcontrol=1") + hanaFilter1=$(echo "$hanaOut1" | awk -F/ 'BEGIN {out=0} /^SAPCONTROL-OK: <begin>/ { out=1 } /^SAPCONTROL-OK: <end>/ { out=0 } /=/ {if (out==1) {print $3} }') + hanaOM=$(echo "$hanaFilter1" | awk -F= '$1=="operation_mode" {print $2}') + set_hana_attribute ${NODENAME} "$hanaOM" ${ATTR_NAME_HANA_OPERATION_MODE[@]} super_ocf_log debug "DBG: sync_attr=$sync_attr" case "$sync_attr" in "SOK" ) # This is a possible node to promote, when primary is missing @@ -2112,7 +2352,7 @@ fi # # First check, if we are PRIMARY or SECONDARY - # + # check_for_primary; primary_status=$? if [ $primary_status -eq $HANA_STATE_PRIMARY ]; then # FIX: bsc#919925 Leaving Node Maintenance stops HANA Resource Agent @@ -2145,7 +2385,7 @@ # # function: saphana_promote_clone - promote a hana clone # params: - -# globals: OCF_*(r), NODENAME(r), HANA_STATE_*, SID(r), InstanceName(r), +# globals: OCF_*(r), NODENAME(r), HANA_STATE_*, SID(r), InstanceName(r), # saphana_promote_clone: # In a Master/Slave configuration get Master being the primary OR by running hana takeover # @@ -2169,7 +2409,7 @@ else if [ $primary_status -eq $HANA_STATE_SECONDARY ]; then # - # we are SECONDARY/SLAVE and need to takepover ... + # we are SECONDARY/SLAVE and need to takeover ... promote on the replica (secondary) side... # promote on the replica side... # hana_sync=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_SYNC_STATUS[@]}) @@ -2178,9 +2418,14 @@ super_ocf_log info "ACT: !!!!!!! Promote REPLICA $SID-$InstanceName to be primary. !!!!!!" LPTloc=$(date '+%s') # lpa_set_lpt 20 $remoteNode - lpa_set_lpt $LPTloc + lpa_set_lpt $LPTloc $NODENAME lpa_push_lpt $LPTloc - su - $sidadm -c "hdbnsutil -sr_takeover" + # TODO: Get rid of the su by using a new interface: + # SAPSYSTEMNAME=SLE /usr/sap/SLE/HDB00/HDBSettings.sh hdbnsutil -sr_takeover ...
+ # TODO: Check beginning from which SPS does SAP support HDBSettings.sh? + # TODO: Limit the runtime of hdbnsutil -sr_takeover ???? + # SAP_CALL + HANA_CALL --timeout inf --use-su --cmd "hdbnsutil -sr_takeover" # # now gain check, if we are primary NOW # @@ -2248,7 +2493,6 @@ SAPSTARTPROFILE="" SAPHanaFilter="ra-act-dec-lpa" -NODENAME=$(crm_node -n) if [ $# -ne 1 ] @@ -2306,8 +2550,7 @@ fi # What kind of method was invoked? -THE_VERSION=$(saphana_meta_data | grep ' - 0.151.1 + $SAPHanaVersion Analyzes SAP HANA System Replication Topology. This RA analyzes the SAP HANA topology and "sends" all findings via the node status attributes to all nodes in the cluster. These attributes are taken by the SAPHana RA to control the SAP Hana Databases. @@ -207,12 +215,12 @@ dstr=$(date) case "$attr_store" in reboot | forever ) - echo "$dstr: SAPHanaTopology: crm_attribute -N ${attr_node} -G -n \"$attr_name\" -l $attr_store -q" >> /var/log/fhATTRIBUTE - crm_attribute -N ${attr_node} -G -n "$attr_name" -l $attr_store -q -d "$attr_default" 2>>/var/log/fhATTRIBUTE; rc=$? + echo "$dstr: SAPHanaTopology: crm_attribute -N ${attr_node} -G -n \"$attr_name\" -l $attr_store -q" >> $log_attr_file + crm_attribute -N ${attr_node} -G -n "$attr_name" -l $attr_store -q -d "$attr_default" 2>>$log_attr_file; rc=$? ;; props ) - echo "$dstr: SAPHanaTopology: crm_attribute -G -n \"$attr_name\" -t crm_config -q" >> /var/log/fhATTRIBUTE - crm_attribute -G -n "$attr_name" -t crm_config -q -d "$attr_default" 2>>/var/log/fhATTRIBUTE; rc=$? + echo "$dstr: SAPHanaTopology: crm_attribute -G -n \"$attr_name\" -t crm_config -q" >> $log_attr_file + crm_attribute -G -n "$attr_name" -t crm_config -q -d "$attr_default" 2>>$log_attr_file; rc=$? 
;; esac super_ocf_log info "FLOW $FUNCNAME rc=$rc" @@ -282,6 +290,53 @@ } # +# function: dequote - filter: remove quotes (") from stdin +# params: - +# globals: - +function dequote() +{ + local rc=0; tr -d '"'; return $rc +} + +# function: version: compare two HANA version strings +function ver_lt() { + ocf_version_cmp $1 $2 + test $? -eq 0 && return 0 || return 1 +} + +function ver_le() { + ocf_version_cmp $1 $2 + test $? -eq 0 -o $? -eq 1 && return 0 || return 1 +} + +function ver_gt() { + ocf_version_cmp $1 $2 + test $? -eq 2 && return 0 || return 1 +} + +function ver_ge() { + ocf_version_cmp $1 $2 + test $? -eq 2 -o $? -eq 1 && return 0 || return 1 +} +# +# function: version: compare two HANA version strings +# +function version() { + if [ $# -eq 3 ]; then + case "$2" in + LE | le | "<=" ) ver_le $1 $3;; + LT | lt | "<" ) ver_lt $1 $3;; + GE | ge | ">=" ) ver_ge $1 $3;; + GT | gt | ">" ) ver_gt $1 $3;; + * ) return 1; + esac + elif [ $# -ge 5 ]; then + version $1 $2 $3 && shift 2 && version $* + else + return 1; + fi +} +# # function: is_clone - report, if resource is configured as a clone (also master/slave) # params: - # globals: OCF_*(r) @@ -314,12 +369,74 @@ } # +# function: HANA_CALL - run a command via HDBSettings.sh, print its output and return its exit code +# params: --timeout SECONDS|inf, --use-su, --on-timeout CMDLINE, --cmd CMDLINE (--cmd must be last, it takes all remaining args) +# globals: sid(r), SID(r), InstanceName(r), DIR_EXECUTABLE(r), LD_LIBRARY_PATH(r), MY_LD_LIBRARY_PATH(w) +# +function HANA_CALL() +{ + # + # TODO: PRIO 5: remove 'su - ${sidadm} later, when SAP HANA resolved issue with + # root-user-called hdbnsutil -sr_state (which creates root-owned shared memory file in /var/lib/hdb/SID/shmgrp) + # TODO: PRIO 5: Maybe make "su" optional by a parameter + local timeOut=0 + local onTimeOut="" + local rc=0 + local use_su=1 # Default to be changed later (see TODO above) + local pre_cmd="" + local cmd="" + local pre_script="" + local output="" + while [ $# -gt 0 ]; do + case "$1" in + --timeout ) timeOut=$2; shift;; + --use-su ) use_su=1;; + --on-timeout ) onTimeOut="$2"; shift;; + --cmd ) shift; cmd="$*"; break;; + esac + shift + done + + if [ $use_su -eq 1 ]; then +
pre_cmd="su - ${sid}adm -c" + pre_script="true" + else + # as root user we need the library path to the SAP kernel to be able to call sapcontrol + # check, if we already added DIR_EXECUTABLE at the beginning of LD_LIBRARY_PATH + if [ "${LD_LIBRARY_PATH%%*:}" != "$DIR_EXECUTABLE" ] + then + MY_LD_LIBRARY_PATH=$DIR_EXECUTABLE${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH + fi + pre_cmd="bash -c" + pre_script="LD_LIBRARY_PATH=$MY_LD_LIBRARY_PATH; export LD_LIBRARY_PATH" + fi + case "$timeOut" in + 0 | inf ) + output=$($pre_cmd "$pre_script; /usr/sap/$SID/$InstanceName/HDBSettings.sh $cmd"); rc=$? + ;; + * ) + output=$(timeout $timeOut $pre_cmd "$pre_script; /usr/sap/$SID/$InstanceName/HDBSettings.sh $cmd"); rc=$? + # + # on timeout (rc 124) run the optional --on-timeout command; its output is discarded + # + if [ $rc -eq 124 -a -n "$onTimeOut" ]; then + local second_output="" + second_output=$($pre_cmd "$pre_script; /usr/sap/$SID/$InstanceName/HDBSettings.sh $onTimeOut"); + fi + ;; + esac + echo "$output" + return $rc; +} + +# # function: sht_init - initialize variables for the resource agent # params: - # globals: OCF_*(r), SID(w), sid(rw), sidadm(w), InstanceName(w), InstanceNr(w), -# globals: meta_notify_master_uname(w), HANA_SR_TOLOPOGY(w), sr_name(w), remoteHost(w) +# globals: meta_notify_master_uname(w), HANA_SR_TOLOPOGY(w), sr_name(w) # globals: ATTR_NAME_HANA_SYNC_STATUS(w), ATTR_NAME_HANA_PRIMARY_AT(w), ATTR_NAME_HANA_CLONE_STATE(w) # globals: DIR_EXECUTABLE(w), SAPSTARTSRV(w), SAPCONTROL(w), DIR_PROFILE(w), SAPSTARTPROFILE(w), LD_LIBRARY_PATH(w), PATH(w), nodelist(w) +# globals: NODENAME(w), hdbver(w) # sht_init : Define global variables with default values, if optional parameters are not set # # @@ -331,12 +448,14 @@ local hdbANSWER="" local siteID local siteNAME + local chkMethod="" HOSTEXECNAME=saphostexec USRSAP=/usr/sap SAPSERVICE_PATH=${USRSAP}/sapservices SAPHOSTCTRL_PATH=${USRSAP}/hostctrl/exe HOSTEXEC_PATH=${SAPHOSTCTRL_PATH}/${HOSTEXECNAME} HOSTEXEC_PROFILE_PATH=${SAPHOSTCTRL_PATH}/host_profile + NODENAME=$(crm_node -n)
SID=$OCF_RESKEY_SID InstanceNr=$OCF_RESKEY_InstanceNumber myInstanceName="${SID}_HDB${InstanceNr}" @@ -382,13 +501,6 @@ DIR_PROFILE="$OCF_RESKEY_DIR_PROFILE" fi - # as root user we need the library path to the SAP kernel to be able to call sapcontrol - # check, if we already added DIR_EXECUTABLE at the beginning of LD_LIBRARY_PATH - if [ "${LD_LIBRARY_PATH%%*:}" != "$DIR_EXECUTABLE" ] - then - LD_LIBRARY_PATH=$DIR_EXECUTABLE${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH - export LD_LIBRARY_PATH - fi PATH=${PATH}:${DIR_EXECUTABLE} # @@ -399,12 +511,45 @@ *openais* ) nodelist=$(crm_node -l | awk '/member/ {print $2}');; *cman* ) nodelist=$(crm_node -l);; esac + # + # get HANA version + # + local ges_ver + ges_ver=$(HANA_CALL --timeout 10 --cmd "HDB version" | tr -d " " | awk -F: '$1 == "version" {print $2}') + hdbver=${ges_ver%.*.*} + # + # since rev 111.00 we should use a new hdbnsutil option to get the -sr_state + # since rev 112.03 the old option is changed and we should use -sr_stateConfiguration where ever possible + # + hdbState="hdbnsutil -sr_state" + hdbMap="hdbnsutil -sr_state" + if version "$hdbver" ">=" "1.00.111"; then + hdbState="hdbnsutil -sr_stateConfiguration" + hdbMap="hdbnsutil -sr_stateHostMapping" + fi #### SAP-CALL # hdbnsutil was a bit unstable in some tests so we recall the tool, if it fails to report the srmode - for i in 1 2 3 4 5 6 7 8 9; do - hdbANSWER=$(su - ${sidadm} -c "hdbnsutil -sr_state --sapcontrol=1" 2>/dev/null) - super_ocf_log debug "DBG2: hdbANSWER=\$\(su - ${sidadm} -c \"hdbnsutil -sr_state --sapcontrol=1\"\)" - srmode=$(echo "$hdbANSWER" | awk -F= '/mode/ {print $2}') + for chkMethod in hU hU hU gP ; do + # DONE: Limit the runtime of hdbnsutil. 
+ # TODO: Use getParameter.py if we get no answer + # SAP_CALL + #super_ocf_log debug "DBG2: hdbANSWER=$hdbANSWER" + #srmode=$(echo "$hdbANSWER" | awk -F= '/mode/ {print $2}') + case "$chkMethod" in + gP ) # call getParameter (gP) + local gpKeys="" + gpKeys=$(echo --key=global.ini/system_replication/{mode,site_name,site_id}) + hdbANSWER=$(HANA_CALL --timeout 60 --cmd "HDBSettings.sh getParameter.py $gpKeys --sapcontrol=1" 2>&1 | awk -F/ 'BEGIN {out=0} /^SAPCONTROL-OK: <begin>/ { out=1 } /^SAPCONTROL-OK: <end>/ { out=0 } /=/ {if (out==1) {print $3} }') + srmode=$(echo "$hdbANSWER" | awk -F= '$1=="mode" {print $2}') + super_ocf_log info "ACT: hdbnsutil not answering - using global.ini as fallback - srmode=$srmode" + ;; + hU | * ) # call hdbnsUtil (hU) ( also for unknown chkMethod ) + # DONE: PRIO1: Begginning from SAP HANA rev 112.03 -sr_state is not longer supported + hdbANSWER=$(HANA_CALL --timeout 60 --cmd "$hdbState --sapcontrol=1" 2>/dev/null) + super_ocf_log debug "DBG2: hdbANSWER=$hdbANSWER" + srmode=$(echo "$hdbANSWER" | awk -F= '$1=="mode" {print $2}') + ;; + esac case "$srmode" in primary | syncmem | sync | async | none ) # we can leave the loop as we already got a result @@ -417,27 +562,51 @@ esac done # TODO PRIO3: Implement a file lookup, if we did not get a result - siteID=$(echo "$hdbANSWER" | awk -F= '/site id/ {print $2}') - siteNAME=$(echo "$hdbANSWER" | awk -F= '/site name/ {print $2}') + siteID=$(echo "$hdbANSWER" | awk -F= '/site.id/ {print $2}') # allow 'site_id' AND 'site id' + siteNAME=$(echo "$hdbANSWER" | awk -F= '/site.name/ {print $2}') site=$siteNAME srmode=$(echo "$hdbANSWER" | awk -F= '/mode/ {print $2}') - MAPPING=$(echo "$hdbANSWER" | awk -F[=/] '$1 ~ "mapping" && $3 !~ site { print $4 }' site=$site) - super_ocf_log debug "DBG: site=$site, mode=$srmode, MAPPING=$MAPPING" # - # filter all non-cluster mappings + # for rev >= 111 we use the new mapping query # - # DONE: PRIO2: Need mapping between HANA HOSTS not cluster NODES - local hanaVHost -
hanaRemoteHost=$(for n1 in $nodelist; do - hanaVHost=$(get_hana_attribute ${n1} ${ATTR_NAME_HANA_VHOST[@]}) - for n2 in $MAPPING; do - if [ "$hanaVHost" == "$n2" ]; then - echo $hanaVHost; - fi; - done; - done ) - super_ocf_log info "DEC: site=$site, mode=$srmode, MAPPING=$MAPPING, hanaRemoteHost=$hanaRemoteHost" - super_ocf_log debug "DBG: site=$site, mode=$srmode, MAPPING=$MAPPING, hanaRemoteHost=$hanaRemoteHost" + if version "$hdbver" ">=" "1.00.111"; then + hdbANSWER=$(HANA_CALL --timeout 60 --cmd "$hdbMap --sapcontrol=1" 2>/dev/null) + fi + MAPPING=$(echo "$hdbANSWER" | awk -F[=/] '$1 == "mapping" && $3 != site { print $4 }' site=$site) + super_ocf_log debug "DBG: site=$site, mode=$srmode, MAPPING=$MAPPING" + if [ -n "$MAPPING" ]; then + # we have a mapping from HANA, lets use it + # + # filter all non-cluster mappings + # + local hanaVHost="" + local n1="" + hanaRemoteHost="" + for n1 in $nodelist; do + hanaVHost=$(get_hana_attribute ${n1} ${ATTR_NAME_HANA_VHOST[@]}) + for n2 in $MAPPING; do + if [ "$hanaVHost" == "$n2" ]; then + hanaRemoteHost="$hanaVHost" + fi; + done; + done + super_ocf_log info "DEC: site=$site, mode=$srmode, MAPPING=$MAPPING, hanaRemoteHost=$hanaRemoteHost" + super_ocf_log debug "DBG: site=$site, mode=$srmode, MAPPING=$MAPPING, hanaRemoteHost=$hanaRemoteHost" + else + # HANA DID NOT TOLD THE MAPPING, LETS TRY TO USE THE SITE ATTRIBUTES + local n1="" + local hanaSite="" + for n1 in $nodelist; do + # TODO: PRIO9 - For multi tier with more than 2 chain/star members IN the cluster we might need to be + # able to catch more than one remoteHost + # currently having more than 2 HANA in a chain/star members IN the cluster is not allowed, the third must be external + if [ "$NODENAME" != "$n1" ]; then + hanaSite=$(get_hana_attribute ${n1} ${ATTR_NAME_HANA_SITE[@]}) + hanaRemoteHost="$n1" + fi + done + super_ocf_log info "DEC: site=$site, mode=$srmode, hanaRemoteHost=$hanaRemoteHost - found by remote site ($hanaSite)" + fi super_ocf_log info "FLOW 
$FUNCNAME rc=$OCF_SUCCESS" return $OCF_SUCCESS } @@ -446,38 +615,29 @@ # function: check_for_primary - check if local SAP HANA is configured as primary # params: - # globals: HANA_STATE_PRIMARY(r), HANA_STATE_SECONDARY(r), HANA_STATE_DEFECT(r), HANA_STATE_STANDALONE(r) +# srmode(r) # function check_for_primary() { super_ocf_log info "FLOW $FUNCNAME ($*)" local rc=0 - node_status=$srmode - super_ocf_log debug "DBG2: check_for_primary: node_status=$node_status" - super_ocf_log debug "DBG: check_for_primary: node_status=$node_status" - for i in 1 2 3 4 5 6 7 8 9; do - case "$node_status" in - primary ) - super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_PRIMARY" - return $HANA_STATE_PRIMARY;; - syncmem | sync | async ) - super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_SECONDARY" - return $HANA_STATE_SECONDARY;; - none ) # have seen that mode on second side BEFEORE we registered it as replica - super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_STANDALONE" - return $HANA_STATE_STANDALONE;; - * ) - # TODO: PRIO1: Should we set SFAIL? 
- # TODO: PRIO2: Maybe we need to keep the old value for P/S/N, if hdbnsutil just crashes - dump=$( echo $node_status | hexdump -C ); - super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: DUMP: <$dump>" - #### SAP-CALL - node_full_status=$(su - ${sidadm} -c "hdbnsutil -sr_state" 2>/dev/null ) - node_status=$(echo "$node_full_status" | awk '$1=="mode:" {print $2}') - super_ocf_log info "DEC: check_for_primary: loop=$i: node_status=$node_status" - # TODO: PRIO1: Maybe we need to keep the old value for P/S/N, if hdbnsutil just crashes - esac; - done - super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_DEFECT" - return $HANA_STATE_DEFECT + super_ocf_log debug "DBG: check_for_primary: srmode=$srmode" + case "$srmode" in + primary ) + super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_PRIMARY" + rc=$HANA_STATE_PRIMARY;; + syncmem | sync | async ) + super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_SECONDARY" + rc=$HANA_STATE_SECONDARY;; + none ) # have seen that mode on second side BEFEORE we registered it as replica + super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_STANDALONE" + rc=$HANA_STATE_STANDALONE;; + * ) + dump=$( echo $srmode | hexdump -C ); + super_ocf_log err "ACT: check_for_primary: we didn't expect srmode to be: DUMP: <$dump>" + rc=$HANA_STATE_DEFECT + esac; + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc } @@ -653,7 +813,7 @@ function sht_stop_clone() { super_ocf_log info "FLOW $FUNCNAME ($*)" local rc=0 - check_for_primary; primary_status=$? + check_for_primary; primary_status=$? if [ $primary_status -eq $HANA_STATE_PRIMARY ]; then hanaPrim="P" elif [ $primary_status -eq $HANA_STATE_SECONDARY ]; then @@ -663,7 +823,7 @@ else hanaPrim="-" fi - set_hana_attribute "${NODENAME}" "1:$hanaPrim:-:-:-:-" ${ATTR_NAME_HANA_ROLES[@]} + set_hana_attribute "${NODENAME}" "1:$hanaPrim:-:-:-:-" ${ATTR_NAME_HANA_ROLES[@]} sht_stop; rc=$? return $rc } @@ -718,28 +878,49 @@ fi # DONE: PRIO1: ASK: Is the output format of ListInstances fix? 
Could we take that as an API? # try to catch: Inst Info : LNX - 42 - lv9041 - 740, patch 36, changelist 1444691 - # We rely on the following format: SID is word#4, NR is work#6, vHost is word#8 + # We rely on the following format: SID is word#4, SYSNR is word#6, vHost is word#8 #### SAP-CALL vName=$(/usr/sap/hostctrl/exe/saphostctrl -function ListInstances \ - | awk '$4 == SID && $6=NR { print $8 }' SID=$SID NR=$InstanceNr 2>/dev/null ) + | awk '$4 == SID && $6 == SYSNR { print $8 }' SID=$SID SYSNR=$InstanceNr 2>/dev/null ) # super_ocf_log debug "DBG: ListInstances: $(/usr/sap/hostctrl/exe/saphostctrl -function ListInstances)" if [ -n "$vName" ]; then - set_hana_attribute ${NODENAME} "$vName" ${ATTR_NAME_HANA_VHOST[@]} + set_hana_attribute ${NODENAME} "$vName" ${ATTR_NAME_HANA_VHOST[@]} else vName=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_VHOST[@]}) fi #site=$(get_site_name) #### SAP-CALL - hanaANSWER=$(su - $sidadm -c "python exe/python_support/landscapeHostConfiguration.py" 2>/dev/null); hanalrc="$?" - hanarole=$(echo "$hanaANSWER" | tr -d ' ' | awk -F'|' '$2 == host { printf "%s:%s:%s:%s\n",$10,$11,$12,$13 } ' host=${vName}) + # SAP_CALL + #hanaANSWER=$(su - $sidadm -c "python exe/python_support/landscapeHostConfiguration.py" 2>/dev/null); hanalrc="$?" + # + # since rev 09x SAP has added the --sapcontrol option for the landscapeHostConfiguration interface + # we begin to use --sapcontrol with rev 100 + # since rev 120 we need to use the --sapcontrol, because SAP changed the tool output + # + if version "$hdbver" ">=" "1.00.100"; then + hanaANSWER=$(HANA_CALL --timeout 60 --cmd "landscapeHostConfiguration.py --sapcontrol=1" 2>/dev/null); hanalrc="$?" + # TODO: PRIO9: Do we need to check the lines: 'SAPCONTROL-OK: ' and 'SAPCONTROL-OK: '? 
+ hanarole=$(echo "$hanaANSWER" | tr -d ' ' | \ + awk -F= '$1 == "nameServerConfigRole" {f1=$2} + $1 == "nameServerActualRole" {f2=$2} + $1 == "indexServerConfigRole" {f3=$2} + $1 == "indexServerActualRole" {f4=$2} + END { printf "%s:%s:%s:%s\n", f1, f2, f3,f4 }') + else + # + # old code for backward compatability + # + hanaANSWER=$(HANA_CALL --timeout 60 --cmd "landscapeHostConfiguration.py" 2>/dev/null); hanalrc="$?" + hanarole=$(echo "$hanaANSWER" | tr -d ' ' | awk -F'|' '$2 == host { printf "%s:%s:%s:%s\n",$10,$11,$12,$13 } ' host=${vName}) + fi #if [ -z "$MAPPING" ]; then # super_ocf_log info "ACT: Did not find remote Host at this moment" #fi # FH TODO PRIO3: TRY TO GET RID OF "ATTR_NAME_HANA_REMOTEHOST" if [ -n "$hanaRemoteHost" ]; then - set_hana_attribute ${NODENAME} "$hanaRemoteHost" ${ATTR_NAME_HANA_REMOTEHOST[@]} + set_hana_attribute ${NODENAME} "$hanaRemoteHost" ${ATTR_NAME_HANA_REMOTEHOST[@]} fi - set_hana_attribute ${NODENAME} "$hanalrc:$hanaPrim:$hanarole" ${ATTR_NAME_HANA_ROLES[@]} + set_hana_attribute ${NODENAME} "$hanalrc:$hanaPrim:$hanarole" ${ATTR_NAME_HANA_ROLES[@]} if [ -n "$site" ]; then set_hana_attribute ${NODENAME} "$site" ${ATTR_NAME_HANA_SITE[@]} fi @@ -748,8 +929,8 @@ S ) # only secondary may propargate its sync status case $(crm_attribute --type crm_config --name cluster-infrastructure -q) in *corosync* ) nodelist=$(crm_node -l | awk '{ print $2 }');; - *openais* ) nodelist=$(crm_node -l | awk '/member/ {print $2}');; - *cman* ) nodelist=$(crm_node -l);; + *openais* ) nodelist=$(crm_node -l | awk '/member/ {print $2}');; + *cman* ) nodelist=$(crm_node -l);; esac for n in ${nodelist}; do @@ -789,7 +970,6 @@ InstanceNr="" DIR_EXECUTABLE="" SAPHanaFilter="ra-act-dec-lpa" -NODENAME=$(crm_node -n) if [ $# -ne 1 ] then @@ -846,8 +1026,7 @@ fi fi -THE_VERSION=$(sht_meta_data | grep ' +# License: GPL v2+ +my $Version="0.18.2016.02.16.1"; # +################################################################## use POSIX; use strict; +use 
Sys::Syslog; +use Sys::Hostname; +use File::Path; +use Getopt::Long; +use lib '/usr/share/SAPHanaSR/tests'; +use SAPHanaSRTools; + +################################### +## this part is not for scale out and currently NOT zero-config + +my $ClusterNodes=2; +my $ClusterPrimaries=1; +my $ClusterSecondaries=1; +my %Name; +my %Host; +my $host = hostname(); +my $varlib='/var/lib/SAPHanaTD'; +my $testfile='SAPHanaTD.status'; +my $testcount=0; +my $first_test=1; my $sid=""; -my $table_title = "Host \\ Attr"; -my %Name; +my @sids; +my $ino=""; +my $sortBy=""; +my $table_titleH = "Host"; +#my %Name; my %Host; +my %Site; +my %Global; +my %HName; +my %SName; +my %GName; +my $help; +my $version; +my $cibFile=""; + +sub init() +{ + my $result = GetOptions ("sid=s" => \@sids, + "sort=s" => \$sortBy, + "cib=s" => \$cibFile, + "version" => \$version, + "help" => \$help, + ); + return 0; +} + +init(); + +if ( $help ) { + printf "SAPHanaSR-showAttr {[--sid=]} [--sort=] [--cib=]\n"; + printf ""; + exit 0; +} +if ( $version ) { + printf "%s\n", $Version; + exit 0; +} + +if ( $cibFile ne "" ) { + printf "Using cib file %s\n", $cibFile; +} sub max { # thanks to http://www.perlunity.de/perl/forum/thread_018329.shtml my $a = shift; @@ -21,113 +80,75 @@ return $a > $b ? 
$a : $b; } -sub print_attr_host() -{ - my ($HKey, $AKey); - printf "%-22s", "Attribute \\ Host"; - foreach $HKey (sort keys %Host) { - printf "%-16s ", $HKey; - } - printf "\n"; - - printf "%s\n", "-" x 120 ; - - foreach $AKey (sort keys %Name) { - printf "%-22s", $AKey; - foreach $HKey (sort keys %Host) { - printf "%-16.16s ", $Host{$HKey} -> {$AKey}; - } - - printf "\n"; - } - return 0; -} - -sub print_host_attr() -{ - my ($AKey, $HKey, $len, $line_len, $hclen); - $hclen=$Name{_hosts}->{_length}; - $line_len=$hclen+1; - printf "%-$hclen.${hclen}s ", "$table_title"; - foreach $AKey (sort keys %Name) { - if ($AKey ne "_hosts") { - $len = $Name{$AKey}->{_length}; - $line_len=$line_len+$len+1; - printf "%-$len.${len}s ", $Name{$AKey}->{_title}; +sub read_cib($) { + my $sid = shift(); + if ( $cibFile eq "" ) { + printf "Open live cib\n"; + open CIB, "cibadmin -Ql |" or die "CIB could not be read from cluster"; + } else { + open CIB, "<$cibFile" or die "CIB file $cibFile not found or not able to read it"; + } + while (<CIB>) { + chomp; + my ($host, $name, $site, $value); + if ( $_ =~ /cib-last-written="([^"]*)"/ ) { + printf "CIB-time: %s\n", $1; } - } - printf "\n"; - printf "%s\n", "-" x $line_len ; - foreach $HKey (sort keys %Host) { - printf "%-$hclen.${hclen}s ", $HKey; - foreach $AKey (sort keys %Name) { - if ($AKey ne "_hosts") { - $len = $Name{$AKey}->{_length}; - printf "%-$len.${len}s ", $Host{$HKey} -> {$AKey}; - } - } - printf "\n"; - } - return 0; -} - -open ListInstances, "/usr/sap/hostctrl/exe/saphostctrl -function ListInstances|"; -while (<ListInstances>) { - # try to catch: Inst Info : LNX - 42 - lv9041 - 740, patch 36, changelist 1444691 - chomp; - if ( $_ =~ /:\s+([A-Z][A-Z0-9][A-Z0-9])\s+-/ ) { - $sid=tolower("$1"); - } -} -close ListInstances; - - -open CIB, "cibadmin -Ql |"; -while (<CIB>) { - chomp; - my ($host, $name, $value); - my $found=0; - if ( $_ =~ /nvpair.*name="(\w+_${sid}_\w+)"/ ) { - $name=$1; - # find attribute in forever and reboot store :) - if ( $_ =~
/id="(status|nodes)-([a-zA-Z0-9\_\-]+)-/ ) { - $host=$2; - } - if ( $_ =~ /value="([^"]+)"/ ) { - $value=$1; - $found=1; - } - } - if ( $found == 1 ) { - # - # handle the hosts name and table-title - # - $Host{$host}->{$name}=${value}; - if ( defined ($Name{_hosts}->{_length})) { - $Name{_hosts}->{_length} = max($Name{_hosts}->{_length}, length($host )); - } else { - $Name{_hosts}->{_length} = length($host ); + if ( $_ =~ /node_state id=".+" uname="([a-zA-Z0-9\-\_]+)" .*crmd="([a-zA-Z0-9\-\_]+)"/ ) { + insertAttribute($sid, \%Host, \%HName, $1, "node_status", $2); } - $Name{_hosts}->{_length} = max($Name{_hosts}->{_length}, length( $table_title)); - # - # now handle the attributes name and value - # - $Name{$name}->{$host}=${value}; - if ( defined ($Name{$name}->{_length})) { - $Name{$name}->{_length} = max($Name{$name}->{_length}, length($value )); - } else { - $Name{$name}->{_length} = length($value ); + if ( $_ =~ /nvpair.*name="([a-zA-Z0-9\_\-]+_${sid}_([a-zA-Z0-9\-\_]+))"/ ) { + $name=$1; + if ( $_ =~ /id=.(status|nodes)-([a-zA-Z0-9\_\-]+)-/ ) { + # found attribute in nodes forever and reboot store + $host=$2; + if ( $_ =~ /value="([^"]+)"/ ) { + $value=$1; + insertAttribute($sid, \%Host, \%HName, $host, $name, $value); + } + } elsif ( $_ =~ /id=.SAPHanaSR-[a-zA-Z0-9\_\-]+_site_[a-zA-Z0-9\-]+_([a-zA-Z0-9\_\-]+)/) { + # found a site attribute + $site=$1; + if ( $name =~ /[a-zA-Z0-9\_\-]+_site_([a-zA-Z0-9\-]+)/ ) { + $name = $1; + } + if ( $_ =~ /value="([^"]+)"/ ) { + $value=$1; + insertAttribute($sid, \%Site, \%SName, $site, $name, $value); + } + } elsif ( $_ =~ /id=.SAPHanaSR-[a-zA-Z0-9\_\-]+_glob_[a-zA-Z0-9\_\-]+/) { + # found a global attribute + $host="GLOBAL"; + if ( $name =~ /([a-zA-Z0-9\_\-]+)_glob_([a-zA-Z0-9\_\-]+)/ ) { + $name = $2; + } + if ( $_ =~ /value="([^"]+)"/ ) { + $value=$1; + insertAttribute($sid, \%Global, \%GName, "global", $name, $value); + } + } } - if ( $name =~ /hana_${sid}_(.*)/ ) { - $Name{$name}->{_title} = $1; - } else { - 
$Name{$name}->{_title} = $name; - } - $Name{$name}->{_length} = max($Name{$name}->{_length}, length( $Name{$name}->{_title})); - # printf "%-8s %-20s %-30s\n", $1, $2, $3; - } + } + close CIB; } -close CIB; -#print_attr_host; -print_host_attr; +if ( 0 == @sids ) { + my $sid_ino_list; + ( $sid_ino_list ) = get_sid_and_InstNr(); + @sids = split(",", $sid_ino_list); + +} + +foreach $sid (@sids) { + ( $sid, $ino ) = split(":", $sid); + $sid=tolower("$sid"); + %Host=(); + %HName=(); + read_cib($sid); + get_hana_attributes($sid); + if ( keys(%Host) == 0 ) { + printf "No attributes found for SID=%s\n", $sid; + } else { + print_host_attr(\%Host, \%HName, "Hosts", $sortBy); + } +}