Blob Blame History Raw
--- ClusterLabs-resource-agents-e711383f/heartbeat/aws-vpc-route53.in	2018-06-29 14:05:02.000000000 +0200
+++ /home/oalbrigt/src/resource-agents/heartbeat/aws-vpc-route53.in	2019-11-07 12:24:18.822111495 +0100
@@ -152,9 +152,15 @@
 END
 }
 
-ec2ip_validate() {
+r53_validate() {
 	ocf_log debug "function: validate"
 
+	# Check for required binaries
+	ocf_log debug "Checking for required binaries"
+	for command in curl dig; do
+		check_binary "$command"
+	done
+
 	# Full name
 	[[ -z "$OCF_RESKEY_fullname" ]] && ocf_log error "Full name parameter not set $OCF_RESKEY_fullname!" && exit $OCF_ERR_CONFIGURED
 
@@ -175,32 +181,111 @@
 	ocf_log debug "ok"
 
 	if [ -n "$OCF_RESKEY_profile" ]; then
-		AWS_PROFILE_OPT="--profile $OCF_RESKEY_profile"
+		AWS_PROFILE_OPT="--profile $OCF_RESKEY_profile --cli-connect-timeout 10"
 	else
-		AWS_PROFILE_OPT="--profile default"
+		AWS_PROFILE_OPT="--profile default --cli-connect-timeout 10"
 	fi
 
 	return $OCF_SUCCESS
 }
 
-ec2ip_monitor() {
-	ec2ip_validate
+r53_monitor() {
+	#
+	# For every start or probe action the agent will call the Route53 API to check for the DNS record,
+	# otherwise it will try to get results directly by querying the DNS using "dig".
+	# Due to complexity in some DNS architectures "dig" can fail, and if this happens
+	# the monitor will fallback to the Route53 API call.
+	#
+	# There will be no failure, failover or restart of the agent if the monitor operation fails,
+	# hence the monitor operation only returns $OCF_SUCCESS from this function
+	#
+	# In case the monitor operation detects a wrong or non-existent Route53 DNS entry,
+	# it will try to fix the existing one, or create it again
+	#
+	#
+	ARECORD=""
+	IPREGEX="^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$"
+	r53_validate
 	ocf_log debug "Checking Route53 record sets"
-	IPADDRESS="$(ec2metadata aws ip | grep local-ipv4 | /usr/bin/awk '{ print $2 }')"
-	ARECORD="$(aws $AWS_PROFILE_OPT route53 list-resource-record-sets --hosted-zone-id $OCF_RESKEY_hostedzoneid --query "ResourceRecordSets[?Name=='$OCF_RESKEY_fullname']" | grep RESOURCERECORDS | /usr/bin/awk '{ print $2 }' )"
-	ocf_log debug "Found IP address: $ARECORD ."
-	if [ "${ARECORD}" == "${IPADDRESS}" ]; then
-		ocf_log debug "ARECORD $ARECORD found"
+	#
+	IPADDRESS="$(curl -s http://169.254.169.254/latest/meta-data/local-ipv4)"
+	#
+	if [ "$__OCF_ACTION" = "start" ] || ocf_is_probe ; then
+		#
+		cmd="aws $AWS_PROFILE_OPT route53 list-resource-record-sets --hosted-zone-id $OCF_RESKEY_hostedzoneid --query ResourceRecordSets[?Name=='$OCF_RESKEY_fullname']"
+		ocf_log info "Route53 Agent Starting or probing - executing monitoring API call: $cmd"
+		CLIRES="$($cmd 2>&1)"
+		rc=$?
+		ocf_log debug "awscli returned code: $rc"
+		if [ $rc -ne 0 ]; then
+			CLIRES=$(echo $CLIRES | grep -v '^$')
+			ocf_log warn "Route53 API returned an error: $CLIRES"
+			ocf_log warn "Skipping cluster action due to API call error"
+			return $OCF_ERR_GENERIC
+		fi
+		ARECORD=$(echo $CLIRES | grep RESOURCERECORDS | awk '{ print $5 }')
+		#
+		if ocf_is_probe; then
+			#
+			# Prevent R53 record change during probe
+			#
+			if [[ $ARECORD =~ $IPREGEX ]] && [ "$ARECORD" != "$IPADDRESS" ]; then
+				ocf_log debug "Route53 DNS record $ARECORD found at probing, disregarding"
+				return $OCF_NOT_RUNNING
+			fi
+		fi
+	else
+		#
+		cmd="dig +retries=3 +time=5 +short $OCF_RESKEY_fullname 2>/dev/null"
+		ocf_log info "executing monitoring command : $cmd"
+		ARECORD="$($cmd)"
+		rc=$?
+		ocf_log debug "dig return code: $rc"
+		#
+		if  [[ ! $ARECORD =~ $IPREGEX ]] || [ $rc -ne 0 ]; then
+			ocf_log info "Fallback to Route53 API query due to DNS resolution failure"
+			cmd="aws $AWS_PROFILE_OPT route53 list-resource-record-sets --hosted-zone-id $OCF_RESKEY_hostedzoneid --query ResourceRecordSets[?Name=='$OCF_RESKEY_fullname']"
+			ocf_log debug "executing monitoring API call: $cmd"
+			CLIRES="$($cmd 2>&1)"
+			rc=$?
+			ocf_log debug "awscli return code: $rc"
+			if [ $rc -ne 0 ]; then
+				CLIRES=$(echo $CLIRES | grep -v '^$')
+				ocf_log warn "Route53 API returned an error: $CLIRES"
+				ocf_log warn "Monitor skipping cluster action due to API call error"
+				return $OCF_SUCCESS
+			fi
+			ARECORD=$(echo $CLIRES | grep RESOURCERECORDS | awk '{ print $5 }')
+		fi
+		#
+	fi
+	ocf_log info "Route53 DNS record pointing $OCF_RESKEY_fullname to IP address $ARECORD"
+	#
+	if [ "$ARECORD" == "$IPADDRESS" ]; then
+		ocf_log info "Route53 DNS record $ARECORD found"
+		return $OCF_SUCCESS
+	elif [[ $ARECORD =~ $IPREGEX ]] && [ "$ARECORD" != "$IPADDRESS" ]; then
+		ocf_log info "Route53 DNS record points to a different host, setting DNS record on Route53 to this host"
+		_update_record "UPSERT" "$IPADDRESS"
 		return $OCF_SUCCESS
 	else
-		ocf_log debug "No ARECORD found"
-		return $OCF_NOT_RUNNING
+		ocf_log info "No Route53 DNS record found, setting DNS record on Route53 to this host"
+		_update_record "UPSERT" "$IPADDRESS"
+		return $OCF_SUCCESS
 	fi
 
 	return $OCF_SUCCESS
 }
 
 _update_record() {
+	#
+	# This function is the one that will actually execute Route53's API call
+	# and configure the DNS record using the correct API calls and parameters
+	#
+	# It creates a temporary JSON file under /tmp with the required API payload
+	#
+	# Failures in this function are critical and will cause the agent to fail
+	#
 	update_action="$1"
 	IPADDRESS="$2"
 	ocf_log info "Updating Route53 $OCF_RESKEY_hostedzoneid with $IPADDRESS for $OCF_RESKEY_fullname"
@@ -209,19 +294,19 @@
 		ocf_exit_reason "Failed to create temporary file for record update"
 		exit $OCF_ERR_GENERIC
 	fi
-	cat >>"${ROUTE53RECORD}" <<-EOF
+	cat >>"$ROUTE53RECORD" <<-EOF
 	{
 		  "Comment": "Update record to reflect new IP address for a system ",
 		  "Changes": [
 			  {
-				  "Action": "${update_action}",
+				  "Action": "$update_action",
 				  "ResourceRecordSet": {
-					  "Name": "${OCF_RESKEY_fullname}",
+					  "Name": "$OCF_RESKEY_fullname",
 					  "Type": "A",
-					  "TTL": ${OCF_RESKEY_ttl},
+					  "TTL": $OCF_RESKEY_ttl,
 					  "ResourceRecords": [
 						  {
-							  "Value": "${IPADDRESS}"
+							  "Value": "$IPADDRESS"
 						  }
 					  ]
 				  }
@@ -229,46 +314,53 @@
 		  ]
 	}
 	EOF
-	cmd="aws --profile ${OCF_RESKEY_profile} route53 change-resource-record-sets --hosted-zone-id ${OCF_RESKEY_hostedzoneid} \
-	  --change-batch file://${ROUTE53RECORD} "
+	cmd="aws --profile $OCF_RESKEY_profile route53 change-resource-record-sets --hosted-zone-id $OCF_RESKEY_hostedzoneid --change-batch file://$ROUTE53RECORD "
 	ocf_log debug "Executing command: $cmd"
-	CHANGEID=$($cmd | grep CHANGEINFO |	 /usr/bin/awk -F'\t' '{ print $3 }' )
-	ocf_log debug "Change id: ${CHANGEID}"
-	rmtempfile ${ROUTE53RECORD}
-	CHANGEID=$(echo $CHANGEID |cut -d'/' -f 3 |cut -d'"' -f 1 )
-	ocf_log debug "Change id: ${CHANGEID}"
+	CLIRES="$($cmd 2>&1)"
+	rc=$?
+	ocf_log debug "awscli returned code: $rc"
+	if [ $rc -ne 0 ]; then
+		CLIRES=$(echo $CLIRES | grep -v '^$')
+		ocf_log warn "Route53 API returned an error: $CLIRES"
+		ocf_log warn "Skipping cluster action due to API call error"
+		return $OCF_ERR_GENERIC
+	fi
+	CHANGEID=$(echo $CLIRES | awk '{ print $12 }')
+	ocf_log debug "Change id: $CHANGEID"
+	rmtempfile $ROUTE53RECORD
+	CHANGEID=$(echo $CHANGEID | cut -d'/' -f 3 | cut -d'"' -f 1 )
+	ocf_log debug "Change id: $CHANGEID"
 	STATUS="PENDING"
-	MYSECONDS=2
+	MYSECONDS=20
 	while [ "$STATUS" = 'PENDING' ]; do
-		sleep	${MYSECONDS}
-		STATUS="$(aws --profile ${OCF_RESKEY_profile} route53 get-change --id $CHANGEID | grep CHANGEINFO |  /usr/bin/awk -F'\t' '{ print $4 }' |cut -d'"' -f 2 )"
-		ocf_log debug "Waited for ${MYSECONDS} seconds and checked execution of Route 53 update status: ${STATUS} "
+		sleep $MYSECONDS
+		STATUS="$(aws --profile $OCF_RESKEY_profile route53 get-change --id $CHANGEID | grep CHANGEINFO | awk -F'\t' '{ print $4 }' |cut -d'"' -f 2 )"
+		ocf_log debug "Waited for $MYSECONDS seconds and checked execution of Route 53 update status: $STATUS "
 	done
 }
 
-ec2ip_stop() {
-	ocf_log info "Bringing down Route53 agent. (Will remove ARECORD)"
-	IPADDRESS="$(ec2metadata aws ip | grep local-ipv4 | /usr/bin/awk '{ print $2 }')"
-	ARECORD="$(aws $AWS_PROFILE_OPT route53 list-resource-record-sets --hosted-zone-id $OCF_RESKEY_hostedzoneid --query "ResourceRecordSets[?Name=='$OCF_RESKEY_fullname']" | grep RESOURCERECORDS | /usr/bin/awk '{ print $2 }' )"
-	ocf_log debug "Found IP address: $ARECORD ."
-	if [ ${ARECORD} != ${IPADDRESS} ]; then
-		ocf_log debug "No ARECORD found"
-		return $OCF_SUCCESS
-	else
-		# determine IP address
-		IPADDRESS="$(ec2metadata aws ip | grep local-ipv4 | /usr/bin/awk '{ print $2 }')"
-		# Patch file
-		ocf_log debug "Deleting IP address to ${IPADDRESS}"
-		return $OCF_SUCCESS
-	fi
-
-	_update_record "DELETE" "$IPADDRESS"
+r53_stop() {
+	#
+	# Stop operation doesn't perform any API call or try to remove the DNS record;
+	# this is mostly because removal is not necessarily mandatory or desired.
+	# The start and monitor functions will take care of changing the DNS record
+	# if the agent starts on a different cluster node
+	#
+	ocf_log info "Bringing down Route53 agent. (Will NOT remove Route53 DNS record)"
 	return $OCF_SUCCESS
 }
 
-ec2ip_start() {
-	IPADDRESS="$(ec2metadata aws ip | grep local-ipv4 | /usr/bin/awk '{ print $2 }')"
-	_update_record "UPSERT" "$IPADDRESS"
+r53_start() {
+	#
+	# Start agent and config DNS in Route53
+	#
+	ocf_log info "Starting Route53 DNS update...."
+	IPADDRESS="$(curl -s http://169.254.169.254/latest/meta-data/local-ipv4)"
+	r53_monitor
+	if [ $? != $OCF_SUCCESS ]; then
+		ocf_log info "Could not start agent - check configurations"
+		return $OCF_ERR_GENERIC
+	fi
 	return $OCF_SUCCESS
 }
 
@@ -284,16 +376,16 @@
 		exit $OCF_SUCCESS
 		;;
 	monitor)
-		ec2ip_monitor
+		r53_monitor
 		;;
 	stop)
-		ec2ip_stop
+		r53_stop
 		;;
 	validate-all)
-		ec2ip_validate
+		r53_validate
 		;;
 	start)
-		ec2ip_start
+		r53_start
 		;;
 	*)
 		usage