From e80e142092c53102a46886e9748b8e25465ce4f6 Mon Sep 17 00:00:00 2001
From: Marek 'marx' Grac <mgrac@redhat.com>
Date: Wed, 20 Jan 2016 11:32:21 +0100
Subject: [PATCH] fence_compute: Sync with master branch
---
fence/agents/compute/fence_compute.py | 180 ++++++++++++++++++++++++++--------
tests/data/metadata/fence_compute.xml | 16 +--
2 files changed, 150 insertions(+), 46 deletions(-)
diff --git a/fence/agents/compute/fence_compute.py b/fence/agents/compute/fence_compute.py
index 82d9c46..d9fe54a 100644
--- a/fence/agents/compute/fence_compute.py
+++ b/fence/agents/compute/fence_compute.py
@@ -19,6 +19,9 @@ REDHAT_COPYRIGHT="Copyright (C) Red Hat, Inc. 2004-2010 All rights reserved."
override_status = ""
nova = None
+EVACUABLE_TAG = "evacuable"
+TRUE_TAGS = ['true']
+
def get_power_status(_, options):
global override_status
@@ -32,8 +35,8 @@ def get_power_status(_, options):
if nova:
try:
services = nova.services.list(host=options["--plug"])
-
for service in services:
+ logging.debug("Status of %s is %s" % (service.binary, service.state))
if service.binary == "nova-compute":
if service.state == "up":
status = "on"
@@ -49,31 +52,91 @@ def get_power_status(_, options):
# NOTE(sbauza); We mimic the host-evacuate module since it's only a contrib
# module which is not stable
def _server_evacuate(server, on_shared_storage):
- success = True
+ success = False
error_message = ""
try:
- nova.servers.evacuate(server=server['uuid'], on_shared_storage=on_shared_storage)
+ logging.debug("Resurrecting instance: %s" % server)
+ (response, dictionary) = nova.servers.evacuate(server=server, on_shared_storage=on_shared_storage)
+
+ if response is None:
+ error_message = "No response while evacuating instance"
+ elif response.status_code == 200:
+ success = True
+ error_message = response.reason
+ else:
+ error_message = response.reason
+
except Exception as e:
- success = False
error_message = "Error while evacuating instance: %s" % e
return {
- "server_uuid": server['uuid'],
- "evacuate_accepted": success,
- "error_message": error_message,
+ "uuid": server,
+ "accepted": success,
+ "reason": error_message,
}
-def _host_evacuate(host, on_shared_storage):
- hypervisors = nova.hypervisors.search(host, servers=True)
- response = []
- for hyper in hypervisors:
- if hasattr(hyper, 'servers'):
- for server in hyper.servers:
- response.append(_server_evacuate(server, on_shared_storage))
+def _is_server_evacuable(server, evac_flavors, evac_images):
+ if server.flavor.get('id') in evac_flavors:
+ return True
+ if server.image.get('id') in evac_images:
+ return True
+ return False
+
+def _get_evacuable_flavors():
+ result = []
+ flavors = nova.flavors.list()
+ # Since the detailed view for all flavors doesn't provide the extra specs,
+ # we need to call each of the flavor to get them.
+ for flavor in flavors:
+ if flavor.get_keys().get(EVACUABLE_TAG, '').strip().lower() in TRUE_TAGS:
+ result.append(flavor.id)
+ return result
+
+def _get_evacuable_images():
+ result = []
+ images = nova.images.list(detailed=True)
+ for image in images:
+ if hasattr(image, 'metadata'):
+ if image.metadata.get(EVACUABLE_TAG, '').strip().lower() in TRUE_TAGS:
+ result.append(image.id)
+ return result
+
+def _host_evacuate(options):
+ result = True
+ servers = nova.servers.list(search_opts={'host': options["--plug"]})
+ if options["--instance-filtering"] == "False":
+ evacuables = servers
+ else:
+ flavors = _get_evacuable_flavors()
+ images = _get_evacuable_images()
+ # Identify all evacuable servers
+ evacuables = [server for server in servers
+ if _is_server_evacuable(server, flavors, images)]
+
+ if options["--no-shared-storage"] != "False":
+ on_shared_storage = False
+ else:
+ on_shared_storage = True
+
+ for server in evacuables:
+ if hasattr(server, 'id'):
+ response = _server_evacuate(server.id, on_shared_storage)
+ if response["accepted"]:
+ logging.debug("Evacuated %s from %s: %s" %
+ (response["uuid"], options["--plug"], response["reason"]))
+ else:
+ logging.error("Evacuation of %s on %s failed: %s" %
+ (response["uuid"], options["--plug"], response["reason"]))
+ result = False
+ else:
+ logging.error("Could not evacuate instance: %s" % server.to_dict())
+ # Should a malformed instance result in a failed evacuation?
+ # result = False
+ return result
def set_attrd_status(host, status, options):
logging.debug("Setting fencing status for %s to %s" % (host, status))
- run_command(options, "attrd_updater -p -n evacute -Q -N %s -v %s" % (host, status))
+ run_command(options, "attrd_updater -p -n evacuate -Q -N %s -U %s" % (host, status))
def set_power_status(_, options):
global override_status
@@ -86,28 +149,53 @@ def set_power_status(_, options):
if options["--action"] == "on":
if get_power_status(_, options) == "on":
+ # Forcing the service back up in case it was disabled
nova.services.enable(options["--plug"], 'nova-compute')
+ try:
+ # Forcing the host back up
+ nova.services.force_down(
+ options["--plug"], "nova-compute", force_down=False)
+ except Exception as e:
+ # In theory, if force_down=False fails, that's for the exact
+ # same possible reasons that below with force_down=True
+ # eg. either an incompatible version or an old client.
+ # Since it's about forcing back to a default value, there is
+ # no real worries to just consider it's still okay even if the
+ # command failed
+ logging.info("Exception from attempt to force "
+ "host back up via nova API: "
+ "%s: %s" % (e.__class__.__name__, e))
else:
# Pretend we're 'on' so that the fencing library doesn't loop forever waiting for the node to boot
override_status = "on"
return
- # need to wait for nova to update its internal status or we
- # cannot call host-evacuate
- while get_power_status(_, options) != "off":
- # Loop forever if need be.
- #
- # Some callers (such as Pacemaker) will have a timer
- # running and kill us if necessary
- logging.debug("Waiting for nova to update it's internal state")
- time.sleep(1)
-
- if options["--no-shared-storage"] != "False":
- on_shared_storage = False
- else:
- on_shared_storage = True
+ try:
+ nova.services.force_down(
+ options["--plug"], "nova-compute", force_down=True)
+ except Exception as e:
+ # Something went wrong when we tried to force the host down.
+ # That could come from either an incompatible API version
+ # eg. UnsupportedVersion or VersionNotFoundForAPIMethod
+ # or because novaclient is old and doesn't include force_down yet
+ # eg. AttributeError
+ # In that case, fallbacking to wait for Nova to catch the right state.
+
+ logging.error("Exception from attempt to force host down via nova API: "
+ "%s: %s" % (e.__class__.__name__, e))
+ # need to wait for nova to update its internal status or we
+ # cannot call host-evacuate
+ while get_power_status(_, options) != "off":
+ # Loop forever if need be.
+ #
+ # Some callers (such as Pacemaker) will have a timer
+ # running and kill us if necessary
+ logging.debug("Waiting for nova to update it's internal state for %s" % options["--plug"])
+ time.sleep(1)
+
+ if not _host_evacuate(options):
+ sys.exit(1)
- _host_evacuate(options["--plug"], on_shared_storage)
return
def get_plugs_list(_, options):
@@ -117,9 +205,9 @@ def get_plugs_list(_, options):
hypervisors = nova.hypervisors.list()
for hypervisor in hypervisors:
longhost = hypervisor.hypervisor_hostname
- if options["--action"] == "list" and options["--domain"] != "":
- shorthost = longhost.replace("." + options["--domain"],
- "")
+ if options["--domain"] != "":
+ shorthost = longhost.replace("." + options["--domain"], "")
+ result[longhost] = ("", None)
result[shorthost] = ("", None)
else:
result[longhost] = ("", None)
@@ -164,7 +252,7 @@ def define_new_opts():
"order": 5,
}
all_opt["record-only"] = {
- "getopt" : "",
+ "getopt" : "r:",
"longopt" : "record-only",
"help" : "--record-only Record the target as needing evacuation but as yet do not intiate it",
"required" : "0",
@@ -172,6 +260,15 @@ def define_new_opts():
"default" : "False",
"order": 5,
}
+ all_opt["instance-filtering"] = {
+ "getopt" : "",
+ "longopt" : "instance-filtering",
+ "help" : "--instance-filtering Only evacuate instances created from images and flavors with evacuable=true",
+ "required" : "0",
+ "shortdesc" : "Only evacuate flagged instances",
+ "default" : "False",
+ "order": 5,
+ }
all_opt["no-shared-storage"] = {
"getopt" : "",
"longopt" : "no-shared-storage",
@@ -187,17 +284,17 @@ def main():
global nova
atexit.register(atexit_handler)
- device_opt = ["login", "passwd", "tenant-name", "auth-url",
+ device_opt = ["login", "passwd", "tenant-name", "auth-url", "fabric_fencing", "on_target",
"no_login", "no_password", "port", "domain", "no-shared-storage", "endpoint-type",
- "record-only"]
+ "record-only", "instance-filtering"]
define_new_opts()
all_opt["shell_timeout"]["default"] = "180"
options = check_input(device_opt, process_input(device_opt))
docs = {}
- docs["shortdesc"] = "Fence agent for nova compute nodes"
- docs["longdesc"] = "fence_nova_host is a Nova fencing notification agent"
+ docs["shortdesc"] = "Fence agent for the automatic resurrection of OpenStack compute instances"
+ docs["longdesc"] = "Used to tell Nova that compute nodes are down and to reschedule flagged instances"
docs["vendorurl"] = ""
show_docs(options, docs)
@@ -213,7 +310,10 @@ def main():
if options["--action"] != "list" and options["--domain"] != "" and options.has_key("--plug"):
options["--plug"] = options["--plug"] + "." + options["--domain"]
- if options["--record-only"] != "False":
+ if options["--record-only"] in [ "2", "Disabled", "disabled" ]:
+ sys.exit(0)
+
+ elif options["--record-only"] in [ "1", "True", "true", "Yes", "yes"]:
if options["--action"] == "on":
set_attrd_status(options["--plug"], "no", options)
sys.exit(0)
@@ -222,7 +322,7 @@ def main():
set_attrd_status(options["--plug"], "yes", options)
sys.exit(0)
- elif options["--action"] in ["status", "monitor"]:
+ elif options["--action"] in ["monitor", "status"]:
sys.exit(0)
# The first argument is the Nova client version
diff --git a/tests/data/metadata/fence_compute.xml b/tests/data/metadata/fence_compute.xml
index 846a861..98bed4e 100644
--- a/tests/data/metadata/fence_compute.xml
+++ b/tests/data/metadata/fence_compute.xml
@@ -1,6 +1,6 @@
<?xml version="1.0" ?>
-<resource-agent name="fence_compute" shortdesc="Fence agent for nova compute nodes" >
-<longdesc>fence_nova_host is a Nova fencing notification agent</longdesc>
+<resource-agent name="fence_compute" shortdesc="Fence agent for the automatic resurrection of OpenStack compute instances" >
+<longdesc>Used to tell Nova that compute nodes are down and to reschedule flagged instances</longdesc>
<vendor-url></vendor-url>
<parameters>
<parameter name="port" unique="0" required="1">
@@ -35,7 +35,7 @@
</parameter>
<parameter name="action" unique="0" required="1">
<getopt mixed="-o, --action=[action]" />
- <content type="string" default="reboot" />
+ <content type="string" default="off" />
<shortdesc lang="en">Fencing Action</shortdesc>
</parameter>
<parameter name="login" unique="0" required="0">
@@ -48,6 +48,11 @@
<content type="string" />
<shortdesc lang="en">DNS domain in which hosts live</shortdesc>
</parameter>
+ <parameter name="instance-filtering" unique="0" required="0">
+ <getopt mixed="--instance-filtering" />
+ <content type="boolean" default="False" />
+ <shortdesc lang="en">Only evacuate flagged instances</shortdesc>
+ </parameter>
<parameter name="no-shared-storage" unique="0" required="0">
<getopt mixed="--no-shared-storage" />
<content type="boolean" default="False" />
@@ -55,7 +60,7 @@
</parameter>
<parameter name="record-only" unique="0" required="0">
<getopt mixed="--record-only" />
- <content type="boolean" default="False" />
+ <content type="string" default="False" />
<shortdesc lang="en">Only record the target as needing evacuation</shortdesc>
</parameter>
<parameter name="verbose" unique="0" required="0">
@@ -115,9 +120,8 @@
</parameter>
</parameters>
<actions>
- <action name="on" automatic="0"/>
+ <action name="on" on_target="1" automatic="1"/>
<action name="off" />
- <action name="reboot" />
<action name="status" />
<action name="list" />
<action name="list-status" />
--
2.4.3