From e80e142092c53102a46886e9748b8e25465ce4f6 Mon Sep 17 00:00:00 2001 From: Marek 'marx' Grac Date: Wed, 20 Jan 2016 11:32:21 +0100 Subject: [PATCH] fence_compute: Sync with master branch --- fence/agents/compute/fence_compute.py | 180 ++++++++++++++++++++++++++-------- tests/data/metadata/fence_compute.xml | 16 +-- 2 files changed, 150 insertions(+), 46 deletions(-) diff --git a/fence/agents/compute/fence_compute.py b/fence/agents/compute/fence_compute.py index 82d9c46..d9fe54a 100644 --- a/fence/agents/compute/fence_compute.py +++ b/fence/agents/compute/fence_compute.py @@ -19,6 +19,9 @@ REDHAT_COPYRIGHT="Copyright (C) Red Hat, Inc. 2004-2010 All rights reserved." override_status = "" nova = None +EVACUABLE_TAG = "evacuable" +TRUE_TAGS = ['true'] + def get_power_status(_, options): global override_status @@ -32,8 +35,8 @@ def get_power_status(_, options): if nova: try: services = nova.services.list(host=options["--plug"]) - for service in services: + logging.debug("Status of %s is %s" % (service.binary, service.state)) if service.binary == "nova-compute": if service.state == "up": status = "on" @@ -49,31 +52,91 @@ def get_power_status(_, options): # NOTE(sbauza); We mimic the host-evacuate module since it's only a contrib # module which is not stable def _server_evacuate(server, on_shared_storage): - success = True + success = False error_message = "" try: - nova.servers.evacuate(server=server['uuid'], on_shared_storage=on_shared_storage) + logging.debug("Resurrecting instance: %s" % server) + (response, dictionary) = nova.servers.evacuate(server=server, on_shared_storage=on_shared_storage) + + if response == None: + error_message = "No response while evacuating instance" + elif response.status_code == 200: + success = True + error_message = response.reason + else: + error_message = response.reason + except Exception as e: - success = False error_message = "Error while evacuating instance: %s" % e return { - "server_uuid": server['uuid'], - "evacuate_accepted": success, - "error_message": error_message, + "uuid": server, + "accepted": success, + "reason": error_message, } -def _host_evacuate(host, on_shared_storage): - hypervisors = nova.hypervisors.search(host, servers=True) - response = [] - for hyper in hypervisors: - if hasattr(hyper, 'servers'): - for server in hyper.servers: - response.append(_server_evacuate(server, on_shared_storage)) +def _is_server_evacuable(server, evac_flavors, evac_images): + if server.flavor.get('id') in evac_flavors: + return True + if server.image.get('id') in evac_images: + return True + return False + +def _get_evacuable_flavors(): + result = [] + flavors = nova.flavors.list() + # Since the detailed view for all flavors doesn't provide the extra specs, + # we need to call each of the flavor to get them. + for flavor in flavors: + if flavor.get_keys().get(EVACUABLE_TAG).strip().lower() in TRUE_TAGS: + result.append(flavor.id) + return result + +def _get_evacuable_images(): + result = [] + images = nova.images.list(detailed=True) + for image in images: + if hasattr(image, 'metadata'): + if image.metadata.get(EVACUABLE_TAG).strip.lower() in TRUE_TAGS: + result.append(image.id) + return result + +def _host_evacuate(options): + result = True + servers = nova.servers.list(search_opts={'host': options["--plug"]}) + if options["--instance-filtering"] == "False": + evacuables = servers + else: + flavors = _get_evacuable_flavors() + images = _get_evacuable_images() + # Identify all evacuable servers + evacuables = [server for server in servers + if _is_server_evacuable(server, flavors, images)] + + if options["--no-shared-storage"] != "False": + on_shared_storage = False + else: + on_shared_storage = True + + for server in evacuables: + if hasattr(server, 'id'): + response = _server_evacuate(server.id, on_shared_storage) + if response["accepted"]: + logging.debug("Evacuated %s from %s: %s" % + (response["uuid"], options["--plug"], response["reason"])) + else: + logging.error("Evacuation of %s on %s failed: %s" % + (response["uuid"], options["--plug"], response["reason"])) + result = False + else: + logging.error("Could not evacuate instance: %s" % server.to_dict()) + # Should a malformed instance result in a failed evacuation? + # result = False + return result def set_attrd_status(host, status, options): logging.debug("Setting fencing status for %s to %s" % (host, status)) - run_command(options, "attrd_updater -p -n evacute -Q -N %s -v %s" % (host, status)) + run_command(options, "attrd_updater -p -n evacuate -Q -N %s -U %s" % (host, status)) def set_power_status(_, options): global override_status @@ -86,28 +149,53 @@ def set_power_status(_, options): if options["--action"] == "on": if get_power_status(_, options) == "on": + # Forcing the service back up in case it was disabled nova.services.enable(options["--plug"], 'nova-compute') + try: + # Forcing the host back up + nova.services.force_down( + options["--plug"], "nova-compute", force_down=False) + except Exception as e: + # In theory, if foce_down=False fails, that's for the exact + # same possible reasons that below with force_down=True + # eg. either an incompatible version or an old client. + # Since it's about forcing back to a default value, there is + # no real worries to just consider it's still okay even if the + # command failed + logging.info("Exception from attempt to force " + "host back up via nova API: " + "%s: %s" % (e.__class__.__name__, e)) else: # Pretend we're 'on' so that the fencing library doesn't loop forever waiting for the node to boot override_status = "on" return - # need to wait for nova to update its internal status or we - # cannot call host-evacuate - while get_power_status(_, options) != "off": - # Loop forever if need be. - # - # Some callers (such as Pacemaker) will have a timer - # running and kill us if necessary - logging.debug("Waiting for nova to update it's internal state") - time.sleep(1) - - if options["--no-shared-storage"] != "False": - on_shared_storage = False - else: - on_shared_storage = True + try: + nova.services.force_down( + options["--plug"], "nova-compute", force_down=True) + except Exception as e: + # Something went wrong when we tried to force the host down. + # That could come from either an incompatible API version + # eg. UnsupportedVersion or VersionNotFoundForAPIMethod + # or because novaclient is old and doesn't include force_down yet + # eg. AttributeError + # In that case, fallbacking to wait for Nova to catch the right state. + + logging.error("Exception from attempt to force host down via nova API: " + "%s: %s" % (e.__class__.__name__, e)) + # need to wait for nova to update its internal status or we + # cannot call host-evacuate + while get_power_status(_, options) != "off": + # Loop forever if need be. + # + # Some callers (such as Pacemaker) will have a timer + # running and kill us if necessary + logging.debug("Waiting for nova to update it's internal state for %s" % options["--plug"]) + time.sleep(1) + + if not _host_evacuate(options): + sys.exit(1) - _host_evacuate(options["--plug"], on_shared_storage) return def get_plugs_list(_, options): @@ -117,9 +205,9 @@ def get_plugs_list(_, options): hypervisors = nova.hypervisors.list() for hypervisor in hypervisors: longhost = hypervisor.hypervisor_hostname - if options["--action"] == "list" and options["--domain"] != "": - shorthost = longhost.replace("." + options["--domain"], - "") + if options["--domain"] != "": + shorthost = longhost.replace("." + options["--domain"], "") + result[longhost] = ("", None) result[shorthost] = ("", None) else: result[longhost] = ("", None) @@ -164,7 +252,7 @@ def define_new_opts(): "order": 5, } all_opt["record-only"] = { - "getopt" : "", + "getopt" : "r:", "longopt" : "record-only", "help" : "--record-only Record the target as needing evacuation but as yet do not intiate it", "required" : "0", @@ -172,6 +260,15 @@ def define_new_opts(): "default" : "False", "order": 5, } + all_opt["instance-filtering"] = { + "getopt" : "", + "longopt" : "instance-filtering", + "help" : "--instance-filtering Only evacuate instances create from images and flavors with evacuable=true", + "required" : "0", + "shortdesc" : "Only evacuate flagged instances", + "default" : "False", + "order": 5, + } all_opt["no-shared-storage"] = { "getopt" : "", "longopt" : "no-shared-storage", @@ -187,17 +284,17 @@ def main(): global nova atexit.register(atexit_handler) - device_opt = ["login", "passwd", "tenant-name", "auth-url", + device_opt = ["login", "passwd", "tenant-name", "auth-url", "fabric_fencing", "on_target", "no_login", "no_password", "port", "domain", "no-shared-storage", "endpoint-type", - "record-only"] + "record-only", "instance-filtering"] define_new_opts() all_opt["shell_timeout"]["default"] = "180" options = check_input(device_opt, process_input(device_opt)) docs = {} - docs["shortdesc"] = "Fence agent for nova compute nodes" - docs["longdesc"] = "fence_nova_host is a Nova fencing notification agent" + docs["shortdesc"] = "Fence agent for the automatic resurrection of OpenStack compute instances" + docs["longdesc"] = "Used to tell Nova that compute nodes are down and to reschedule flagged instances" docs["vendorurl"] = "" show_docs(options, docs) @@ -213,7 +310,10 @@ def main(): if options["--action"] != "list" and options["--domain"] != "" and options.has_key("--plug"): options["--plug"] = options["--plug"] + "." + options["--domain"] - if options["--record-only"] != "False": + if options["--record-only"] in [ "2", "Disabled", "disabled" ]: + sys.exit(0) + + elif options["--record-only"] in [ "1", "True", "true", "Yes", "yes"]: if options["--action"] == "on": set_attrd_status(options["--plug"], "no", options) sys.exit(0) @@ -222,7 +322,7 @@ def main(): set_attrd_status(options["--plug"], "yes", options) sys.exit(0) - elif options["--action"] in ["status", "monitor"]: + elif options["--action"] in ["monitor", "status"]: sys.exit(0) # The first argument is the Nova client version diff --git a/tests/data/metadata/fence_compute.xml b/tests/data/metadata/fence_compute.xml index 846a861..98bed4e 100644 --- a/tests/data/metadata/fence_compute.xml +++ b/tests/data/metadata/fence_compute.xml @@ -1,6 +1,6 @@ - -fence_nova_host is a Nova fencing notification agent + +Used to tell Nova that compute nodes are down and to reschedule flagged instances @@ -35,7 +35,7 @@ - + Fencing Action @@ -48,6 +48,11 @@ DNS domain in which hosts live + + + + Only evacuate flagged instances + @@ -55,7 +60,7 @@ - + Only record the target as needing evacuation @@ -115,9 +120,8 @@ - + - -- 2.4.3