Blame SOURCES/bz1834193-2-fence_aws-fix-race-condition.patch

b1c6fc
--- a/agents/aws/fence_aws.py	2020-03-26 10:31:03.653171381 +0100
b1c6fc
+++ b/agents/aws/fence_aws.py	2020-03-24 16:21:16.942155519 +0100
b1c6fc
@@ -3,14 +3,33 @@
b1c6fc
 import sys, re
b1c6fc
 import logging
b1c6fc
 import atexit
b1c6fc
+import requests
b1c6fc
 sys.path.append("@FENCEAGENTSLIBDIR@")
b1c6fc
 from fencing import *
b1c6fc
-from fencing import fail, fail_usage, EC_TIMED_OUT, run_delay
b1c6fc
+from fencing import fail, fail_usage, run_delay, EC_STATUS, SyslogLibHandler
b1c6fc
 
b1c6fc
 import boto3
b1c6fc
 from botocore.exceptions import ClientError, EndpointConnectionError, NoRegionError
b1c6fc
 
b1c6fc
+logger = logging.getLogger("fence_aws")  # module-wide logger used by this agent
b1c6fc
+logger.propagate = False  # records stay on this logger's own handlers and are not passed to the root logger
b1c6fc
+logger.setLevel(logging.INFO)  # default threshold; main() lowers it to DEBUG when --verbose is given
b1c6fc
+logger.addHandler(SyslogLibHandler())  # handler class imported from fencing; presumably forwards records to syslog - TODO confirm
b1c6fc
+logging.getLogger('botocore.vendored').propagate = False  # keep records from botocore's vendored libraries away from the root logger
b1c6fc
+	
b1c6fc
+def get_instance_id():  # Return this node's EC2 instance ID from the metadata service, or None if the lookup fails.
b1c6fc
+	try:
b1c6fc
+		r = requests.get('http://169.254.169.254/latest/meta-data/instance-id', timeout=5)  # bounded wait so the agent cannot hang on an unreachable endpoint
b1c6fc
+		r.raise_for_status()  # turn 4xx/5xx replies into HTTPError instead of returning the error body as an ID
b1c6fc
+		return r.content.decode("UTF-8")
b1c6fc
+	except requests.exceptions.HTTPError as http_err:  # fully qualified: HTTPError is not imported by name in the visible imports
b1c6fc
+		logger.error('HTTP error occurred while trying to access EC2 metadata server: %s', http_err)
b1c6fc
+	except Exception as err:
b1c6fc
+		logger.error('A fatal error occurred while trying to access EC2 metadata server: %s', err)
b1c6fc
+	return None
b1c6fc
+
b1c6fc
 def get_nodes_list(conn, options):
b1c6fc
+	logger.info("Starting monitor operation")
b1c6fc
 	result = {}
b1c6fc
 	try:
b1c6fc
 		for instance in conn.instances.all():
b1c6fc
@@ -19,13 +38,17 @@
b1c6fc
 		fail_usage("Failed: Incorrect Access Key or Secret Key.")
b1c6fc
 	except EndpointConnectionError:
b1c6fc
 		fail_usage("Failed: Incorrect Region.")
b1c6fc
-
b1c6fc
+	except Exception as e:
b1c6fc
+		logger.error("Failed to get node list: %s", e)
b1c6fc
+	logger.debug("Monitor operation OK: %s",result)
b1c6fc
 	return result
b1c6fc
 
b1c6fc
 def get_power_status(conn, options):
b1c6fc
+	logger.debug("Starting status operation")
b1c6fc
 	try:
b1c6fc
 		instance = conn.instances.filter(Filters=[{"Name": "instance-id", "Values": [options["--plug"]]}])
b1c6fc
 		state = list(instance)[0].state["Name"]
b1c6fc
+		logger.info("Status operation for EC2 instance %s returned state: %s",options["--plug"],state.upper())
b1c6fc
 		if state == "running":
b1c6fc
 			return "on"
b1c6fc
 		elif state == "stopped":
b1c6fc
@@ -38,20 +61,49 @@
b1c6fc
 	except EndpointConnectionError:
b1c6fc
 		fail_usage("Failed: Incorrect Region.")
b1c6fc
 	except IndexError:
b1c6fc
+		fail(EC_STATUS)
b1c6fc
+	except Exception as e:
b1c6fc
+		logging.error("Failed to get power status: %s", e)
b1c6fc
+		fail(EC_STATUS)
b1c6fc
+
b1c6fc
+def get_self_power_status(conn, instance_id):  # Return "ok" if instance_id is running, "alert" for any other state, "fail" if no such instance is found.
b1c6fc
+	try:
b1c6fc
+		instance = conn.instances.filter(Filters=[{"Name": "instance-id", "Values": [instance_id]}])
b1c6fc
+		state = list(instance)[0].state["Name"]  # IndexError here means the filter matched no instance
b1c6fc
+		if state == "running":
b1c6fc
+			logger.debug("Captured my (%s) state and it %s - returning OK - Proceeding with fencing",instance_id,state.upper())
b1c6fc
+			return "ok"
b1c6fc
+		else:
b1c6fc
+			logger.debug("Captured my (%s) state it is %s - returning Alert - Unable to fence other nodes",instance_id,state.upper())
b1c6fc
+			return "alert"
b1c6fc
+	# logger (not the root logging module) is used above so the records reach this agent's own handlers.
b1c6fc
+	except ClientError:
b1c6fc
+		fail_usage("Failed: Incorrect Access Key or Secret Key.")
b1c6fc
+	except EndpointConnectionError:
b1c6fc
+		fail_usage("Failed: Incorrect Region.")
b1c6fc
+	except IndexError:
b1c6fc
 		return "fail"
b1c6fc
 
b1c6fc
 def set_power_status(conn, options):
b1c6fc
-	if (options["--action"]=="off"):
b1c6fc
-		conn.instances.filter(InstanceIds=[options["--plug"]]).stop(Force=True)
b1c6fc
-	elif (options["--action"]=="on"):
b1c6fc
-		conn.instances.filter(InstanceIds=[options["--plug"]]).start()
b1c6fc
-
b1c6fc
+	my_instance = get_instance_id()  # ID of the node running this agent; None if the metadata lookup failed
b1c6fc
+	try:  # stop ("off") or start ("on") the instance named by --plug
b1c6fc
+		if (options["--action"]=="off"):
b1c6fc
+			if (get_self_power_status(conn,my_instance) == "ok"):  # only fence the target while this node itself is reported as running
b1c6fc
+				conn.instances.filter(InstanceIds=[options["--plug"]]).stop(Force=True)
b1c6fc
+				logger.info("Called StopInstance API call for %s", options["--plug"])
b1c6fc
+			else:
b1c6fc
+				logger.info("Skipping fencing as instance is not in running status")  # "instance" here is this node, not the target in --plug
b1c6fc
+		elif (options["--action"]=="on"):
b1c6fc
+			conn.instances.filter(InstanceIds=[options["--plug"]]).start()  # power-on is not gated on this node's own state
b1c6fc
+	except Exception as e:  # failures are logged here and not re-raised
b1c6fc
+		logger.error("Failed to power %s %s: %s", \
b1c6fc
+				options["--action"], options["--plug"], e)
b1c6fc
 
b1c6fc
 def define_new_opts():
b1c6fc
 	all_opt["region"] = {
b1c6fc
 		"getopt" : "r:",
b1c6fc
 		"longopt" : "region",
b1c6fc
-		"help" : "-r, --region=[name]            Region, e.g. us-east-1",
b1c6fc
+		"help" : "-r, --region=[region]           Region, e.g. us-east-1",
b1c6fc
 		"shortdesc" : "Region.",
b1c6fc
 		"required" : "0",
b1c6fc
 		"order" : 2
b1c6fc
@@ -59,7 +111,7 @@
b1c6fc
 	all_opt["access_key"] = {
b1c6fc
 		"getopt" : "a:",
b1c6fc
 		"longopt" : "access-key",
b1c6fc
-		"help" : "-a, --access-key=[name]         Access Key",
b1c6fc
+		"help" : "-a, --access-key=[key]         Access Key",
b1c6fc
 		"shortdesc" : "Access Key.",
b1c6fc
 		"required" : "0",
b1c6fc
 		"order" : 3
b1c6fc
@@ -67,23 +119,32 @@
b1c6fc
 	all_opt["secret_key"] = {
b1c6fc
 		"getopt" : "s:",
b1c6fc
 		"longopt" : "secret-key",
b1c6fc
-		"help" : "-s, --secret-key=[name]         Secret Key",
b1c6fc
+		"help" : "-s, --secret-key=[key]         Secret Key",
b1c6fc
 		"shortdesc" : "Secret Key.",
b1c6fc
 		"required" : "0",
b1c6fc
 		"order" : 4
b1c6fc
 	}
b1c6fc
+	all_opt["boto3_debug"] = {
b1c6fc
+		"getopt" : "b:",
b1c6fc
+		"longopt" : "boto3_debug",
b1c6fc
+		"help" : "-b, --boto3_debug=[option]      Boto3 and Botocore library debug logging",
b1c6fc
+		"shortdesc": "Boto Lib debug",
b1c6fc
+		"required": "0",
b1c6fc
+		"order": 5
b1c6fc
+	}
b1c6fc
 
b1c6fc
 # Main agent method
b1c6fc
 def main():
b1c6fc
 	conn = None
b1c6fc
 
b1c6fc
-	device_opt = ["port", "no_password", "region", "access_key", "secret_key"]
b1c6fc
+	device_opt = ["port", "no_password", "region", "access_key", "secret_key", "boto3_debug"]
b1c6fc
 
b1c6fc
 	atexit.register(atexit_handler)
b1c6fc
 
b1c6fc
 	define_new_opts()
b1c6fc
 
b1c6fc
 	all_opt["power_timeout"]["default"] = "60"
b1c6fc
+	all_opt["boto3_debug"]["default"] = "off"
b1c6fc
 
b1c6fc
 	options = check_input(device_opt, process_input(device_opt))
b1c6fc
 
b1c6fc
@@ -99,6 +160,28 @@
b1c6fc
 
b1c6fc
 	run_delay(options)
b1c6fc
 
b1c6fc
+	if options.get("--verbose") is not None:
b1c6fc
+		lh = logging.FileHandler('/var/log/fence_aws_debug.log')
b1c6fc
+		logger.addHandler(lh)
b1c6fc
+		lhf = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
b1c6fc
+		lh.setFormatter(lhf)
b1c6fc
+		logger.setLevel(logging.DEBUG)
b1c6fc
+	
b1c6fc
+	if options["--boto3_debug"] != "on":
b1c6fc
+		boto3.set_stream_logger('boto3',logging.INFO)
b1c6fc
+		boto3.set_stream_logger('botocore',logging.INFO)
b1c6fc
+		logging.getLogger('botocore').propagate = False
b1c6fc
+		logging.getLogger('boto3').propagate = False
b1c6fc
+	else:
b1c6fc
+		log_format = logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
b1c6fc
+		logging.getLogger('botocore').propagate = False
b1c6fc
+		logging.getLogger('boto3').propagate = False
b1c6fc
+		fdh = logging.FileHandler('/var/log/fence_aws_boto3.log')
b1c6fc
+		fdh.setFormatter(log_format)
b1c6fc
+		logging.getLogger('boto3').addHandler(fdh)
b1c6fc
+		logging.getLogger('botocore').addHandler(fdh)
b1c6fc
+		logging.debug("Boto debug level is %s and sending debug info to /var/log/fence_aws_boto3.log", options["--boto3_debug"])
b1c6fc
+
b1c6fc
 	region = options.get("--region")
b1c6fc
 	access_key = options.get("--access-key")
b1c6fc
 	secret_key = options.get("--secret-key")
b1c6fc
@@ -106,12 +189,12 @@
b1c6fc
 		conn = boto3.resource('ec2', region_name=region,
b1c6fc
 				      aws_access_key_id=access_key,
b1c6fc
 				      aws_secret_access_key=secret_key)
b1c6fc
-	except:
b1c6fc
-		fail_usage("Failed: Unable to connect to AWS. Check your configuration.")
b1c6fc
+	except Exception as e:
b1c6fc
+		fail_usage("Failed: Unable to connect to AWS: " + str(e))
b1c6fc
 
b1c6fc
 	# Operate the fencing device
b1c6fc
 	result = fence_action(conn, options, set_power_status, get_power_status, get_nodes_list)
b1c6fc
 	sys.exit(result)
b1c6fc
 
b1c6fc
 if __name__ == "__main__":
b1c6fc
-	main()
b1c6fc
+	main()
b1c6fc
\ No newline at end of file