commit 0fa5ce2c14fa36610630469c14c07537eb4f4807
Author: Andrew Beekhof <andrew@beekhof.net>
Date: Wed Oct 1 16:56:59 2014 +1000
Import: pacemaker-rollup-be1e835
diff --git a/attrd/Makefile.am b/attrd/Makefile.am
index 802a3fa..9d5e223 100644
--- a/attrd/Makefile.am
+++ b/attrd/Makefile.am
@@ -32,25 +32,12 @@ attrd_LDADD = $(top_builddir)/lib/cluster/libcrmcluster.la \
$(top_builddir)/lib/cib/libcib.la \
$(CLUSTERLIBS)
-if BUILD_HEARTBEAT_SUPPORT
-attrd_SOURCES += legacy.c
-else
-
-if BUILD_CS_SUPPORT
-
-if BUILD_CS_PLUGIN
-attrd_SOURCES += legacy.c
-else
-# Only build the new version where CPG is exclusively used for communication
+if BUILD_ATOMIC_ATTRD
attrd_SOURCES += main.c commands.c
-endif
-
else
attrd_SOURCES += legacy.c
endif
-endif
-
clean-generic:
rm -f *.log *.debug *.xml *~
diff --git a/attrd/commands.c b/attrd/commands.c
index 038e7e4..c48ef1b 100644
--- a/attrd/commands.c
+++ b/attrd/commands.c
@@ -17,6 +17,8 @@
*/
#include <crm_internal.h>
+#include <sys/types.h>
+#include <regex.h>
#include <glib.h>
#include <crm/msg_xml.h>
@@ -63,7 +65,7 @@ typedef struct attribute_value_s {
void write_attribute(attribute_t *a);
void write_or_elect_attribute(attribute_t *a);
-void attrd_peer_update(crm_node_t *peer, xmlNode *xml, bool filter);
+void attrd_peer_update(crm_node_t *peer, xmlNode *xml, const char *host, bool filter);
void attrd_peer_sync(crm_node_t *peer, xmlNode *xml);
void attrd_peer_remove(uint32_t nodeid, const char *host, gboolean uncache, const char *source);
@@ -191,16 +193,41 @@ attrd_client_message(crm_client_t *client, xmlNode *xml)
char *host = crm_element_value_copy(xml, F_ATTRD_HOST);
const char *attr = crm_element_value(xml, F_ATTRD_ATTRIBUTE);
const char *value = crm_element_value(xml, F_ATTRD_VALUE);
+ const char *regex = crm_element_value(xml, F_ATTRD_REGEX);
- a = g_hash_table_lookup(attributes, attr);
+ if(attr == NULL && regex) {
+ GHashTableIter aIter;
+ regex_t *r_patt = calloc(1, sizeof(regex_t));
+
+ crm_debug("Setting %s to %s", regex, value);
+ if (regcomp(r_patt, regex, REG_EXTENDED)) {
+ crm_err("Bad regex '%s' for update", regex);
+ regfree(r_patt);
+ free(r_patt);
+ return;
+ }
- if(host == NULL) {
+ g_hash_table_iter_init(&aIter, attributes);
+ while (g_hash_table_iter_next(&aIter, (gpointer *) & attr, NULL)) {
+ int status = regexec(r_patt, attr, 0, NULL, 0);
+
+ if(status == 0) {
+ crm_trace("Matched %s with %s", attr, regex);
+ crm_xml_add(xml, F_ATTRD_ATTRIBUTE, attr);
+ send_attrd_message(NULL, xml);
+ }
+ }
+ return;
+
+ } else if(host == NULL) {
crm_trace("Inferring host");
host = strdup(attrd_cluster->uname);
crm_xml_add(xml, F_ATTRD_HOST, host);
crm_xml_add_int(xml, F_ATTRD_HOST_ID, attrd_cluster->nodeid);
}
+ a = g_hash_table_lookup(attributes, attr);
+
if (value) {
int offset = 1;
int int_value = 0;
@@ -254,6 +281,7 @@ attrd_client_message(crm_client_t *client, xmlNode *xml)
}
if(broadcast) {
+ /* Ends up at attrd_peer_message() */
send_attrd_message(NULL, xml);
}
}
@@ -265,6 +293,7 @@ attrd_peer_message(crm_node_t *peer, xmlNode *xml)
const char *v = crm_element_value(xml, F_ATTRD_VERSION);
const char *op = crm_element_value(xml, F_ATTRD_TASK);
const char *election_op = crm_element_value(xml, F_CRM_TASK);
+ const char *host = crm_element_value(xml, F_ATTRD_HOST);
if(election_op) {
enum election_result rc = 0;
@@ -293,7 +322,7 @@ attrd_peer_message(crm_node_t *peer, xmlNode *xml)
const char *name = crm_element_value(xml, F_ATTRD_ATTRIBUTE);
crm_trace("Compatibility update of %s from %s", name, peer->uname);
- attrd_peer_update(peer, xml, FALSE);
+ attrd_peer_update(peer, xml, host, FALSE);
} else if(safe_str_eq(op, "flush")) {
const char *name = crm_element_value(xml, F_ATTRD_ATTRIBUTE);
@@ -336,13 +365,12 @@ attrd_peer_message(crm_node_t *peer, xmlNode *xml)
}
if(safe_str_eq(op, "update")) {
- attrd_peer_update(peer, xml, FALSE);
+ attrd_peer_update(peer, xml, host, FALSE);
} else if(safe_str_eq(op, "sync")) {
attrd_peer_sync(peer, xml);
} else if(safe_str_eq(op, "peer-remove")) {
- const char *host = crm_element_value(xml, F_ATTRD_HOST);
attrd_peer_remove(0, host, TRUE, peer->uname);
} else if(safe_str_eq(op, "sync-response")
@@ -351,7 +379,8 @@ attrd_peer_message(crm_node_t *peer, xmlNode *xml)
crm_notice("Processing %s from %s", op, peer->uname);
for (child = __xml_first_child(xml); child != NULL; child = __xml_next(child)) {
- attrd_peer_update(peer, child, TRUE);
+ host = crm_element_value(child, F_ATTRD_HOST);
+ attrd_peer_update(peer, child, host, TRUE);
}
}
}
@@ -409,12 +438,11 @@ attrd_peer_remove(uint32_t nodeid, const char *host, gboolean uncache, const cha
}
void
-attrd_peer_update(crm_node_t *peer, xmlNode *xml, bool filter)
+attrd_peer_update(crm_node_t *peer, xmlNode *xml, const char *host, bool filter)
{
bool changed = FALSE;
attribute_value_t *v = NULL;
- const char *host = crm_element_value(xml, F_ATTRD_HOST);
const char *attr = crm_element_value(xml, F_ATTRD_ATTRIBUTE);
const char *value = crm_element_value(xml, F_ATTRD_VALUE);
@@ -424,6 +452,19 @@ attrd_peer_update(crm_node_t *peer, xmlNode *xml, bool filter)
a = create_attribute(xml);
}
+ if(host == NULL) {
+ GHashTableIter vIter;
+ g_hash_table_iter_init(&vIter, a->values);
+
+ crm_debug("Setting %s for all hosts to %s", attr, value);
+
+ xml_remove_prop(xml, F_ATTRD_HOST_ID);
+ while (g_hash_table_iter_next(&vIter, (gpointer *) & host, NULL)) {
+ attrd_peer_update(peer, xml, host, filter);
+ }
+ return;
+ }
+
v = g_hash_table_lookup(a->values, host);
if(v == NULL) {
diff --git a/cib/messages.c b/cib/messages.c
index 4b79912..9c66349 100644
--- a/cib/messages.c
+++ b/cib/messages.c
@@ -292,6 +292,11 @@ cib_process_upgrade_server(const char *op, int options, const char *section, xml
crm_xml_add(up, F_TYPE, "cib");
crm_xml_add(up, F_CIB_OPERATION, CIB_OP_UPGRADE);
crm_xml_add(up, F_CIB_SCHEMA_MAX, get_schema_name(new_version));
+ crm_xml_add(up, F_CIB_DELEGATED, host);
+ crm_xml_add(up, F_CIB_CLIENTID, crm_element_value(req, F_CIB_CLIENTID));
+ crm_xml_add(up, F_CIB_CALLOPTS, crm_element_value(req, F_CIB_CALLOPTS));
+ crm_xml_add(up, F_CIB_CALLID, crm_element_value(req, F_CIB_CALLID));
+
send_cluster_message(NULL, crm_msg_cib, up, FALSE);
free_xml(up);
diff --git a/configure.ac b/configure.ac
index 40adffe..1edff40 100644
--- a/configure.ac
+++ b/configure.ac
@@ -75,6 +75,7 @@ CC_IN_CONFIGURE=yes
export CC_IN_CONFIGURE
LDD=ldd
+BUILD_ATOMIC_ATTRD=1
dnl ========================================================================
dnl Compiler characteristics
@@ -1260,6 +1261,7 @@ case $SUPPORT_HEARTBEAT in
dnl objdump -x ${libdir}/libccmclient.so | grep SONAME | awk '{print $2}'
AC_DEFINE_UNQUOTED(CCM_LIBRARY, "libccmclient.so.1", Library to load for ccm support)
AC_DEFINE_UNQUOTED(HEARTBEAT_LIBRARY, "libhbclient.so.1", Library to load for heartbeat support)
+ BUILD_ATOMIC_ATTRD=0
else
SUPPORT_HEARTBEAT=0
fi
@@ -1341,6 +1343,7 @@ SUPPORT_PLUGIN=0
if test $SUPPORT_CS = 1 -a x$HAVE_confdb = x1; then
dnl Need confdb to support cman and the plugins
SUPPORT_PLUGIN=1
+ BUILD_ATOMIC_ATTRD=0
LCRSODIR=`$PKGCONFIG corosync --variable=lcrsodir`
STACKS="$STACKS corosync-plugin"
COROSYNC_LIBS="$COROSYNC_LIBS $confdb_LIBS"
@@ -1382,6 +1385,9 @@ AM_CONDITIONAL(BUILD_CS_SUPPORT, test $SUPPORT_CS = 1)
AM_CONDITIONAL(BUILD_CS_PLUGIN, test $SUPPORT_PLUGIN = 1)
AM_CONDITIONAL(BUILD_CMAN, test $SUPPORT_CMAN = 1)
+AM_CONDITIONAL(BUILD_ATOMIC_ATTRD, test $BUILD_ATOMIC_ATTRD = 1)
+AC_DEFINE_UNQUOTED(HAVE_ATOMIC_ATTRD, $BUILD_ATOMIC_ATTRD, Support the new atomic attrd)
+
AC_SUBST(SUPPORT_CMAN)
AC_SUBST(SUPPORT_CS)
@@ -1401,6 +1407,9 @@ else
PCMK_FEATURES="$PCMK_FEATURES $STACKS"
fi
+if test ${BUILD_ATOMIC_ATTRD} = 1; then
+ PCMK_FEATURES="$PCMK_FEATURES atomic-attrd"
+fi
AC_SUBST(CLUSTERLIBS)
AC_SUBST(LCRSODIR)
@@ -1871,6 +1880,7 @@ tools/Makefile \
tools/crm_report \
tools/report.common \
tools/cibsecret \
+ tools/crm_mon.upstart \
xml/Makefile \
lib/gnu/Makefile \
)
diff --git a/crmd/lrm.c b/crmd/lrm.c
index db0bffb..44634fb 100644
--- a/crmd/lrm.c
+++ b/crmd/lrm.c
@@ -1162,7 +1162,7 @@ get_lrm_resource(lrm_state_t * lrm_state, xmlNode * resource, xmlNode * op_msg,
if (!rsc) {
fsa_data_t *msg_data = NULL;
- crm_err("Could not add resource %s to LRM", id);
+ crm_err("Could not add resource %s to LRM %s", id, lrm_state->node_name);
register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL);
}
}
@@ -1175,13 +1175,17 @@ delete_resource(lrm_state_t * lrm_state,
const char *id,
lrmd_rsc_info_t * rsc,
GHashTableIter * gIter,
- const char *sys, const char *host, const char *user, ha_msg_input_t * request)
+ const char *sys,
+ const char *host,
+ const char *user,
+ ha_msg_input_t * request,
+ gboolean unregister)
{
int rc = pcmk_ok;
crm_info("Removing resource %s for %s (%s) on %s", id, sys, user ? user : "internal", host);
- if (rsc) {
+ if (rsc && unregister) {
rc = lrm_state_unregister_rsc(lrm_state, id, 0);
}
@@ -1224,6 +1228,7 @@ do_lrm_invoke(long long action,
const char *user_name = NULL;
const char *target_node = NULL;
gboolean is_remote_node = FALSE;
+ gboolean crm_rsc_delete = FALSE;
if (input->xml != NULL) {
/* Remote node operations are routed here to their remote connections */
@@ -1259,6 +1264,8 @@ do_lrm_invoke(long long action,
crm_trace("LRM command from: %s", from_sys);
if (safe_str_eq(crm_op, CRM_OP_LRM_DELETE)) {
+ /* remember this delete op came from crm_resource */
+ crm_rsc_delete = TRUE;
operation = CRMD_ACTION_DELETE;
} else if (safe_str_eq(crm_op, CRM_OP_LRM_REFRESH)) {
@@ -1370,13 +1377,17 @@ do_lrm_invoke(long long action,
} else if (safe_str_eq(operation, CRM_OP_REPROBE) || safe_str_eq(crm_op, CRM_OP_REPROBE)) {
GHashTableIter gIter;
rsc_history_t *entry = NULL;
+ gboolean unregister = is_remote_lrmd_ra(NULL, NULL, entry->id) ? FALSE : TRUE;
crm_notice("Forcing the status of all resources to be redetected");
g_hash_table_iter_init(&gIter, lrm_state->resource_history);
while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) {
+ /* Only unregister the resource during a reprobe if it is not a remote
+ * connection resource. Otherwise, unregistering the connection will
+ * terminate remote-node membership */
delete_resource(lrm_state, entry->id, &entry->rsc, &gIter, from_sys, from_host,
- user_name, NULL);
+ user_name, NULL, unregister);
}
/* Now delete the copy in the CIB */
@@ -1499,6 +1510,7 @@ do_lrm_invoke(long long action,
free(op_key);
} else if (rsc != NULL && safe_str_eq(operation, CRMD_ACTION_DELETE)) {
+ gboolean unregister = TRUE;
#if ENABLE_ACL
int cib_rc = delete_rsc_status(lrm_state, rsc->id, cib_dryrun | cib_sync_call, user_name);
@@ -1523,7 +1535,11 @@ do_lrm_invoke(long long action,
return;
}
#endif
- delete_resource(lrm_state, rsc->id, rsc, NULL, from_sys, from_host, user_name, input);
+ if (crm_rsc_delete == TRUE && is_remote_lrmd_ra(NULL, NULL, rsc->id)) {
+ unregister = FALSE;
+ }
+
+ delete_resource(lrm_state, rsc->id, rsc, NULL, from_sys, from_host, user_name, input, unregister);
} else if (rsc != NULL) {
do_lrm_rsc_op(lrm_state, rsc, operation, input->xml, input->msg);
diff --git a/crmd/remote_lrmd_ra.c b/crmd/remote_lrmd_ra.c
index 98f59c8..f3dedeb 100644
--- a/crmd/remote_lrmd_ra.c
+++ b/crmd/remote_lrmd_ra.c
@@ -251,6 +251,8 @@ connection_takeover_timeout_cb(gpointer data)
crm_debug("takeover event timed out for node %s", cmd->rsc_id);
cmd->takeover_timeout_id = 0;
+ lrm_state = lrm_state_find(cmd->rsc_id);
+
handle_remote_ra_stop(lrm_state, cmd);
free_cmd(cmd);
@@ -379,6 +381,11 @@ remote_lrm_op_callback(lrmd_event_data_t * op)
cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
} else {
+
+ if (safe_str_eq(cmd->action, "start")) {
+ /* clear PROBED value if it happens to be set after start completes. */
+ update_attrd(lrm_state->node_name, CRM_OP_PROBED, NULL, NULL, TRUE);
+ }
lrm_state_reset_tables(lrm_state);
cmd->rc = PCMK_OCF_OK;
cmd->op_status = PCMK_LRM_OP_DONE;
diff --git a/crmd/te_actions.c b/crmd/te_actions.c
index 926996b..a3aa78b 100644
--- a/crmd/te_actions.c
+++ b/crmd/te_actions.c
@@ -546,17 +546,26 @@ te_update_job_count(crm_action_t * action, int offset)
return;
}
- if (safe_str_eq(task, CRMD_ACTION_MIGRATE) || safe_str_eq(task, CRMD_ACTION_MIGRATED)) {
+ /* if we have a router node, this means the action is being performed
+ * on a remote node. For now, we count all actions occurring on a
+ * remote node against the job list on the cluster node hosting
+ * the connection resources */
+ target = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE);
+
+ if ((target == NULL) &&
+ (safe_str_eq(task, CRMD_ACTION_MIGRATE) || safe_str_eq(task, CRMD_ACTION_MIGRATED))) {
+
const char *t1 = crm_meta_value(action->params, XML_LRM_ATTR_MIGRATE_SOURCE);
const char *t2 = crm_meta_value(action->params, XML_LRM_ATTR_MIGRATE_TARGET);
te_update_job_count_on(t1, offset, TRUE);
te_update_job_count_on(t2, offset, TRUE);
-
- } else {
-
- te_update_job_count_on(target, offset, FALSE);
+ return;
+ } else if (target == NULL) {
+ target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
}
+
+ te_update_job_count_on(target, offset, FALSE);
}
static gboolean
@@ -597,6 +606,8 @@ te_should_perform_action_on(crm_graph_t * graph, crm_action_t * action, const ch
}
}
+ crm_trace("Peer %s has not hit their limit yet. current jobs = %d limit= %d limit", target, r->jobs, limit);
+
return TRUE;
}
@@ -611,7 +622,15 @@ te_should_perform_action(crm_graph_t * graph, crm_action_t * action)
return TRUE;
}
- if (safe_str_eq(task, CRMD_ACTION_MIGRATE) || safe_str_eq(task, CRMD_ACTION_MIGRATED)) {
+ /* if we have a router node, this means the action is being performed
+ * on a remote node. For now, we count all actions occurring on a
+ * remote node against the job list on the cluster node hosting
+ * the connection resources */
+ target = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE);
+
+ if ((target == NULL) &&
+ (safe_str_eq(task, CRMD_ACTION_MIGRATE) || safe_str_eq(task, CRMD_ACTION_MIGRATED))) {
+
target = crm_meta_value(action->params, XML_LRM_ATTR_MIGRATE_SOURCE);
if(te_should_perform_action_on(graph, action, target) == FALSE) {
return FALSE;
@@ -619,7 +638,7 @@ te_should_perform_action(crm_graph_t * graph, crm_action_t * action)
target = crm_meta_value(action->params, XML_LRM_ATTR_MIGRATE_TARGET);
- } else {
+ } else if (target == NULL) {
target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
}
diff --git a/crmd/te_events.c b/crmd/te_events.c
index afe3072..b81a13e 100644
--- a/crmd/te_events.c
+++ b/crmd/te_events.c
@@ -161,10 +161,6 @@ update_failcount(xmlNode * event, const char *event_node_uuid, int rc, int targe
do_update = TRUE;
value = failed_stop_offset;
- } else if (safe_str_eq(task, CRMD_ACTION_STOP)) {
- do_update = TRUE;
- value = failed_stop_offset;
-
} else if (safe_str_eq(task, CRMD_ACTION_PROMOTE)) {
do_update = TRUE;
diff --git a/crmd/throttle.c b/crmd/throttle.c
index 04a3cf1..6e853ae 100644
--- a/crmd/throttle.c
+++ b/crmd/throttle.c
@@ -430,7 +430,7 @@ throttle_mode(void)
unsigned int blocked = 0;
enum throttle_state_e mode = throttle_none;
-#ifndef ON_SOLARIS
+#ifdef ON_SOLARIS
return throttle_none;
#endif
@@ -508,44 +508,41 @@ static void
throttle_send_command(enum throttle_state_e mode)
{
xmlNode *xml = NULL;
+ static enum throttle_state_e last = -1;
- xml = create_request(CRM_OP_THROTTLE, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL);
- crm_xml_add_int(xml, F_CRM_THROTTLE_MODE, mode);
- crm_xml_add_int(xml, F_CRM_THROTTLE_MAX, throttle_job_max);
+ if(mode != last) {
+ crm_info("New throttle mode: %.4x (was %.4x)", mode, last);
+ last = mode;
- send_cluster_message(NULL, crm_msg_crmd, xml, TRUE);
- free_xml(xml);
+ xml = create_request(CRM_OP_THROTTLE, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL);
+ crm_xml_add_int(xml, F_CRM_THROTTLE_MODE, mode);
+ crm_xml_add_int(xml, F_CRM_THROTTLE_MAX, throttle_job_max);
- crm_info("Updated throttle state to %.4x", mode);
+ send_cluster_message(NULL, crm_msg_crmd, xml, TRUE);
+ free_xml(xml);
+ }
}
static gboolean
throttle_timer_cb(gpointer data)
{
static bool send_updates = FALSE;
- static enum throttle_state_e last = -1;
-
enum throttle_state_e now = throttle_none;
- if(send_updates == FALSE) {
- /* Optimize for the true case */
- if(compare_version(fsa_our_dc_version, "3.0.8") < 0) {
- crm_trace("DC version %s doesn't support throttling", fsa_our_dc_version);
-
- } else {
- send_updates = TRUE;
- }
- }
-
if(send_updates) {
now = throttle_mode();
- }
+ throttle_send_command(now);
+
+ } else if(compare_version(fsa_our_dc_version, "3.0.8") < 0) {
+ /* Optimize for the true case */
+ crm_trace("DC version %s doesn't support throttling", fsa_our_dc_version);
- if(send_updates && now != last) {
- crm_debug("New throttle mode: %.4x (was %.4x)", now, last);
+ } else {
+ send_updates = TRUE;
+ now = throttle_mode();
throttle_send_command(now);
- last = now;
}
+
return TRUE;
}
@@ -595,9 +592,11 @@ throttle_update_job_max(const char *preference)
void
throttle_init(void)
{
- throttle_records = g_hash_table_new_full(
- crm_str_hash, g_str_equal, NULL, throttle_record_free);
- throttle_timer = mainloop_timer_add("throttle", 30* 1000, TRUE, throttle_timer_cb, NULL);
+ if(throttle_records == NULL) {
+ throttle_records = g_hash_table_new_full(
+ crm_str_hash, g_str_equal, NULL, throttle_record_free);
+ throttle_timer = mainloop_timer_add("throttle", 30 * 1000, TRUE, throttle_timer_cb, NULL);
+ }
throttle_update_job_max(NULL);
mainloop_timer_start(throttle_timer);
diff --git a/cts/CTS.py b/cts/CTS.py
index 04189f2..f4198c4 100644
--- a/cts/CTS.py
+++ b/cts/CTS.py
@@ -225,10 +225,13 @@ class CtsLab:
class NodeStatus:
def __init__(self, env):
- pass
+ self.Env = env
def IsNodeBooted(self, node):
'''Return TRUE if the given node is booted (responds to pings)'''
+ if self.Env["docker"]:
+ return RemoteFactory().getInstance()("localhost", "docker inspect --format {{.State.Running}} %s | grep -q true" % node, silent=True) == 0
+
return RemoteFactory().getInstance()("localhost", "ping -nq -c1 -w1 %s" % node, silent=True) == 0
def IsSshdUp(self, node):
@@ -442,6 +445,9 @@ class ClusterManager(UserDict):
self.debug("Quorum: %d Len: %d" % (q, len(self.Env["nodes"])))
return peer_list
+ for n in self.Env["nodes"]:
+ peer_state[n] = "unknown"
+
# Now see if any states need to be updated
self.debug("looking for: " + repr(stonith.regexes))
shot = stonith.look(0)
@@ -457,7 +463,8 @@ class ClusterManager(UserDict):
peer_state[peer] = "complete"
self.__instance_errorstoignore.append(self.templates["Pat:Fencing_ok"] % peer)
- elif re.search(self.templates["Pat:Fencing_start"] % n, shot):
+ elif peer_state[n] != "complete" and re.search(self.templates["Pat:Fencing_start"] % n, shot):
+ # TODO: Correctly detect multiple fencing operations for the same host
peer = n
peer_state[peer] = "in-progress"
self.__instance_errorstoignore.append(self.templates["Pat:Fencing_start"] % peer)
diff --git a/cts/CTSlab.py b/cts/CTSlab.py
index 314c347..9b336a5 100755
--- a/cts/CTSlab.py
+++ b/cts/CTSlab.py
@@ -107,9 +107,9 @@ if __name__ == '__main__':
if Environment["ListTests"] == 1:
Tests = TestList(cm, Audits)
- Environment.log("Total %d tests"%len(Tests))
+ LogFactory().log("Total %d tests"%len(Tests))
for test in Tests :
- Environment.log(str(test.name));
+ LogFactory().log(str(test.name));
sys.exit(0)
elif len(Environment["tests"]) == 0:
diff --git a/cts/CTStests.py b/cts/CTStests.py
index 918dff0..cd5b7ce 100644
--- a/cts/CTStests.py
+++ b/cts/CTStests.py
@@ -83,6 +83,7 @@ class CTSTest:
self.passed = 1
self.is_loop = 0
self.is_unsafe = 0
+ self.is_docker_unsafe = 0
self.is_experimental = 0
self.is_container = 0
self.is_valgrind = 0
@@ -224,6 +225,8 @@ class CTSTest:
return 0
elif self.is_experimental and not self.Env["experimental-tests"]:
return 0
+ elif self.is_docker_unsafe and self.Env["docker"]:
+ return 0
elif self.is_container and not self.Env["container-tests"]:
return 0
elif self.Env["benchmark"] and self.benchmark == 0:
@@ -1359,6 +1362,8 @@ class ComponentFail(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "ComponentFail"
+ # TODO: make this work correctly in docker.
+ self.is_docker_unsafe = 1
self.startall = SimulStartLite(cm)
self.complist = cm.Components()
self.patterns = []
@@ -1419,6 +1424,15 @@ class ComponentFail(CTSTest):
self.okerrpatterns.append(self.templates["Pat:ChildRespawn"] %(node, chosen.name))
self.okerrpatterns.append(self.templates["Pat:ChildExit"])
+ if chosen.name == "stonith":
+ # Ignore actions for STONITH resources
+ (rc, lines) = self.rsh(node, "crm_resource -c", None)
+ for line in lines:
+ if re.search("^Resource", line):
+ r = AuditResource(self.CM, line)
+ if r.rclass == "stonith":
+ self.okerrpatterns.append(self.templates["LogActions: Recover.*%s"] % r.id)
+
# supply a copy so self.patterns doesnt end up empty
tmpPats = []
tmpPats.extend(self.patterns)
@@ -2512,6 +2526,7 @@ class RemoteLXC(CTSTest):
self.startall = SimulStartLite(cm)
self.num_containers = 2
self.is_container = 1
+ self.is_docker_unsafe = 1
self.failed = 0
self.fail_string = ""
@@ -2624,6 +2639,7 @@ class RemoteBaremetal(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "RemoteBaremetal"
+ self.is_docker_unsafe = 1
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
self.stop = StopTest(cm)
diff --git a/cts/environment.py b/cts/environment.py
index de1d099..d741452 100644
--- a/cts/environment.py
+++ b/cts/environment.py
@@ -71,6 +71,7 @@ class Environment:
self["loop-tests"] = 1
self["scenario"] = "random"
self["stats"] = 0
+ self["docker"] = 0
self.RandomGen = random.Random()
self.logger = LogFactory()
@@ -143,7 +144,9 @@ class Environment:
# GoodThing(tm).
try:
n = node.strip()
- gethostbyname_ex(n)
+ if self.data["docker"] == 0:
+ gethostbyname_ex(n)
+
self.Nodes.append(n)
except:
self.logger.log(node+" not found in DNS... aborting")
@@ -191,7 +194,10 @@ class Environment:
return "crm-lha"
elif self.data["Stack"] == "corosync 2.x":
- return "crm-mcp"
+ if self["docker"]:
+ return "crm-mcp-docker"
+ else:
+ return "crm-mcp"
elif self.data["Stack"] == "corosync (cman)":
return "crm-cman"
@@ -342,6 +348,10 @@ class Environment:
elif args[i] == "--qarsh":
RemoteFactory().enable_qarsh()
+ elif args[i] == "--docker":
+ self["docker"] = 1
+ RemoteFactory().enable_docker()
+
elif args[i] == "--stonith" or args[i] == "--fencing":
skipthis=1
if args[i+1] == "1" or args[i+1] == "yes":
@@ -352,6 +362,9 @@ class Environment:
self["DoStonith"]=1
self["stonith-type"] = "fence_xvm"
self["stonith-params"] = "pcmk_arg_map=domain:uname,delay=0"
+ elif args[i+1] == "docker":
+ self["DoStonith"]=1
+ self["stonith-type"] = "fence_docker_cts"
elif args[i+1] == "scsi":
self["DoStonith"]=1
self["stonith-type"] = "fence_scsi"
@@ -644,6 +657,7 @@ class Environment:
print "\t [--container-tests] include pacemaker_remote tests that run in lxc container resources"
print "\t [--oprofile 'node list'] list of cluster nodes to run oprofile on]"
print "\t [--qarsh] use the QARSH backdoor to access nodes instead of SSH"
+ print "\t [--docker] Indicates nodes are docker nodes."
print "\t [--seed random_seed]"
print "\t [--set option=value]"
print "\t "
diff --git a/cts/lxc_autogen.sh.in b/cts/lxc_autogen.sh.in
index 6900b67..e11532b 100755
--- a/cts/lxc_autogen.sh.in
+++ b/cts/lxc_autogen.sh.in
@@ -72,6 +72,7 @@ if [ $verify -eq 1 ]; then
virsh -c lxc:/// list --all > /dev/null 2>&1
if [ $? -ne 0 ]; then
echo "Could not connect 'virsh -c lxc:///' check that libvirt lxc driver is installed"
+ # yum install -y libvirt-daemon-driver-lxc libvirt-daemon-lxc libvirt-login-shell
exit 1
fi
diff --git a/cts/patterns.py b/cts/patterns.py
index f651965..8d34e1c 100644
--- a/cts/patterns.py
+++ b/cts/patterns.py
@@ -364,9 +364,12 @@ class crm_cs_v0(BasePatterns):
self.components["stonith-ignore"] = [
"LogActions: Recover Fencing",
"Updating failcount for Fencing",
+ "error: crm_ipc_read: Connection to stonith-ng failed",
+ "error: mainloop_gio_callback: Connection to stonith-ng.*closed (I/O condition=17)",
+ "crit: tengine_stonith_connection_destroy: Fencing daemon connection failed",
"error: te_connect_stonith:.*Sign-in failed: triggered a retry",
"STONITH connection failed, finalizing .* pending operations.",
- "process_lrm_event:.*Operation Fencing.* Error"
+ "process_lrm_event:.*Operation Fencing.* Error",
]
self.components["stonith-ignore"].extend(self.components["common-ignore"])
@@ -409,6 +412,20 @@ class crm_mcp(crm_cs_v0):
# "Pat:We_stopped" : "%s.*Stopped Corosync Cluster Engine",
# })
+class crm_mcp_docker(crm_mcp):
+ '''
+ The crm version 4 cluster manager class.
+ It implements the things we need to talk to and manipulate
+ crm clusters running on top of native corosync (no plugins)
+ '''
+ def __init__(self, name):
+ crm_mcp.__init__(self, name)
+
+ self.commands.update({
+ "StartCmd" : "pcmk_start",
+ "StopCmd" : "pcmk_stop",
+ })
+
class crm_cman(crm_cs_v0):
'''
The crm version 3 cluster manager class.
@@ -454,6 +471,8 @@ class PatternSelector:
crm_cman(name)
elif name == "crm-mcp":
crm_mcp(name)
+ elif name == "crm-mcp-docker":
+ crm_mcp_docker(name)
def get_variant(self, variant):
if patternvariants.has_key(variant):
diff --git a/cts/remote.py b/cts/remote.py
index c8253c3..7920fc9 100644
--- a/cts/remote.py
+++ b/cts/remote.py
@@ -261,6 +261,12 @@ class RemoteFactory:
def new(self, silent=False):
return RemoteExec(RemoteFactory.rsh, silent)
+ def enable_docker(self):
+ print "Using DOCKER backend for connections to cluster nodes"
+
+ RemoteFactory.rsh.Command = "/usr/libexec/phd/docker/phd_docker_remote_cmd "
+ RemoteFactory.rsh.CpCommand = "/usr/libexec/phd/docker/phd_docker_cp"
+
def enable_qarsh(self):
# http://nstraz.wordpress.com/2008/12/03/introducing-qarsh/
print "Using QARSH for connections to cluster nodes"
diff --git a/cts/watcher.py b/cts/watcher.py
index d33e580..5e6ee43 100644
--- a/cts/watcher.py
+++ b/cts/watcher.py
@@ -165,7 +165,11 @@ class FileObj(SearchObj):
global log_watcher_bin
self.debug("Installing %s on %s" % (log_watcher_bin, host))
- self.rsh(host, '''echo "%s" > %s''' % (log_watcher, log_watcher_bin), silent=True)
+
+ os.system("cat << END >> %s\n%s\nEND" %(log_watcher_bin, log_watcher))
+ os.system("chmod 755 %s" %(log_watcher_bin))
+
+ self.rsh.cp(log_watcher_bin, "root@%s:%s" % (host, log_watcher_bin))
has_log_watcher[host] = 1
self.harvest()
@@ -176,7 +180,8 @@ class FileObj(SearchObj):
if match:
last_offset = self.offset
self.offset = match.group(1)
- #if last_offset == "EOF": self.debug("Got %d lines, new offset: %s" % (len(lines), self.offset))
+ #if last_offset == "EOF": self.debug("Got %d lines, new offset: %s" % (len(outLines), self.offset))
+ self.debug("Got %d lines, new offset: %s %s" % (len(outLines), self.offset, repr(self.delegate)))
elif re.search("^CTSwatcher:.*truncated", line):
self.log(line)
@@ -199,7 +204,7 @@ class FileObj(SearchObj):
global log_watcher_bin
return self.rsh.call_async(self.host,
- "python %s -t %s -p CTSwatcher: -l 200 -f %s -o %s" % (log_watcher_bin, self.name, self.filename, self.offset),
+ "python %s -t %s -p CTSwatcher: -l 200 -f %s -o %s -t %s" % (log_watcher_bin, self.name, self.filename, self.offset, self.name),
completionDelegate=self)
def setend(self):
@@ -208,7 +213,7 @@ class FileObj(SearchObj):
global log_watcher_bin
(rc, lines) = self.rsh(self.host,
- "python %s -t %s -p CTSwatcher: -l 2 -f %s -o %s" % (log_watcher_bin, self.name, self.filename, "EOF"),
+ "python %s -t %s -p CTSwatcher: -l 2 -f %s -o %s -t %s" % (log_watcher_bin, self.name, self.filename, "EOF", self.name),
None, silent=True)
for line in lines:
@@ -386,7 +391,7 @@ class LogWatcher(RemoteExec):
def async_complete(self, pid, returncode, outLines, errLines):
# TODO: Probably need a lock for updating self.line_cache
- self.logger.debug("%s: Got %d lines from %d" % (self.name, len(outLines), pid))
+ self.logger.debug("%s: Got %d lines from %d (total %d)" % (self.name, len(outLines), pid, len(self.line_cache)))
if len(outLines):
self.cache_lock.acquire()
self.line_cache.extend(outLines)
@@ -407,7 +412,7 @@ class LogWatcher(RemoteExec):
for t in pending:
t.join(60.0)
if t.isAlive():
- self.logger.log("%s: Aborting after 20s waiting for %d logging commands" % (self.name, repr(t)))
+ self.logger.log("%s: Aborting after 20s waiting for %s logging commands" % (self.name, repr(t)))
return
#print "Got %d lines" % len(self.line_cache)
@@ -484,9 +489,6 @@ class LogWatcher(RemoteExec):
if len(self.line_cache) == 0 and end < time.time():
self.debug("Single search terminated: start=%d, end=%d, now=%d, lines=%d" % (begin, end, time.time(), lines))
return None
- elif len(self.line_cache) == 0:
- self.debug("Single search timed out: start=%d, end=%d, now=%d, lines=%d" % (begin, end, time.time(), lines))
- return None
else:
self.debug("Waiting: start=%d, end=%d, now=%d, lines=%d" % (begin, end, time.time(), len(self.line_cache)))
time.sleep(1)
@@ -520,6 +522,7 @@ class LogWatcher(RemoteExec):
self.unmatched = self.regexes
self.matched = returnresult
self.regexes = save_regexes
+ self.end()
return None
returnresult.append(oneresult)
diff --git a/extra/resources/remote b/extra/resources/remote
index 9e0482b..9f141a2 100644
--- a/extra/resources/remote
+++ b/extra/resources/remote
@@ -62,11 +62,11 @@ meta_data() {
</parameter>
</parameters>
<actions>
- <action name="start" timeout="15" />
- <action name="stop" timeout="15" />
- <action name="monitor" timeout="15" />
- <action name="migrate_to" timeout="15" />
- <action name="migrate_from" timeout="15" />
+ <action name="start" timeout="40" />
+ <action name="stop" timeout="40" />
+ <action name="monitor" timeout="30" />
+ <action name="migrate_to" timeout="60" />
+ <action name="migrate_from" timeout="60" />
<action name="meta-data" timeout="5" />
</actions>
</resource-agent>
diff --git a/fencing/commands.c b/fencing/commands.c
index a4e9f30..577ea95 100644
--- a/fencing/commands.c
+++ b/fencing/commands.c
@@ -1094,7 +1094,10 @@ stonith_device_action(xmlNode * msg, char **output)
device = g_hash_table_lookup(device_list, id);
}
- if (device) {
+ if (device && device->api_registered == FALSE) {
+ rc = -ENODEV;
+
+ } else if (device) {
cmd = create_async_command(msg);
if (cmd == NULL) {
free_device(device);
diff --git a/fencing/main.c b/fencing/main.c
index 5ae36cf..b03659e 100644
--- a/fencing/main.c
+++ b/fencing/main.c
@@ -415,7 +415,7 @@ topology_remove_helper(const char *node, int level)
xmlNode *data = create_xml_node(NULL, F_STONITH_LEVEL);
xmlNode *notify_data = create_xml_node(NULL, STONITH_OP_LEVEL_DEL);
- crm_xml_add(data, "origin", __FUNCTION__);
+ crm_xml_add(data, F_STONITH_ORIGIN, __FUNCTION__);
crm_xml_add_int(data, XML_ATTR_ID, level);
crm_xml_add(data, F_STONITH_TARGET, node);
diff --git a/include/crm/services.h b/include/crm/services.h
index e8bc172..5310709 100644
--- a/include/crm/services.h
+++ b/include/crm/services.h
@@ -152,6 +152,7 @@ enum nagios_exitcode {
int status;
int sequence;
int expected_rc;
+ int synchronous;
char *stderr_data;
char *stdout_data;
diff --git a/include/crm_internal.h b/include/crm_internal.h
index ba78da2..3eb88de 100644
--- a/include/crm_internal.h
+++ b/include/crm_internal.h
@@ -220,7 +220,7 @@ gboolean crm_remote_recv(crm_remote_t * remote, int total_timeout /*ms */ , int
xmlNode *crm_remote_parse_buffer(crm_remote_t * remote);
int crm_remote_tcp_connect(const char *host, int port);
int crm_remote_tcp_connect_async(const char *host, int port, int timeout, /*ms */
- void *userdata, void (*callback) (void *userdata, int sock));
+ int *timer_id, void *userdata, void (*callback) (void *userdata, int sock));
# ifdef HAVE_GNUTLS_GNUTLS_H
/*!
@@ -276,6 +276,7 @@ int crm_read_pidfile(const char *filename);
# define attrd_channel T_ATTRD
# define F_ATTRD_KEY "attr_key"
# define F_ATTRD_ATTRIBUTE "attr_name"
+# define F_ATTRD_REGEX "attr_regex"
# define F_ATTRD_TASK "task"
# define F_ATTRD_VALUE "attr_value"
# define F_ATTRD_SET "attr_set"
diff --git a/lib/cluster/membership.c b/lib/cluster/membership.c
index 9410506..24700e5 100644
--- a/lib/cluster/membership.c
+++ b/lib/cluster/membership.c
@@ -389,7 +389,9 @@ crm_find_peer(unsigned int id, const char *uname)
}
} else if(uname && by_id->uname) {
- crm_warn("Node '%s' and '%s' share the same cluster nodeid: %u", by_id->uname, by_name->uname, id);
+ crm_dump_peer_hash(LOG_INFO, __FUNCTION__);
+ crm_warn("Node '%s' and '%s' share the same cluster nodeid: %u %s", by_id->uname, by_name->uname, id, uname);
+ crm_abort(__FILE__, __FUNCTION__, __LINE__, "member weirdness", TRUE, TRUE);
} else if(id && by_name->id) {
crm_warn("Node %u and %u share the same name: '%s'", by_id->id, by_name->id, uname);
diff --git a/lib/common/ipc.c b/lib/common/ipc.c
index c1801a4..f26225f 100644
--- a/lib/common/ipc.c
+++ b/lib/common/ipc.c
@@ -806,7 +806,7 @@ crm_ipc_connect(crm_ipc_t * client)
#ifdef HAVE_IPCS_GET_BUFFER_SIZE
client->max_buf_size = qb_ipcc_get_buffer_size(client->ipc);
- if (client->max_buf_size < client->buf_size) {
+ if (client->max_buf_size > client->buf_size) {
free(client->buffer);
client->buffer = calloc(1, client->max_buf_size);
client->buf_size = client->max_buf_size;
diff --git a/lib/common/remote.c b/lib/common/remote.c
index 0a7cd93..e2492b9 100644
--- a/lib/common/remote.c
+++ b/lib/common/remote.c
@@ -737,11 +737,12 @@ check_connect_finished(gpointer userdata)
static int
internal_tcp_connect_async(int sock,
const struct sockaddr *addr, socklen_t addrlen, int timeout /* ms */ ,
- void *userdata, void (*callback) (void *userdata, int sock))
+ int *timer_id, void *userdata, void (*callback) (void *userdata, int sock))
{
int rc = 0;
int flag = 0;
int interval = 500;
+ int timer;
struct tcp_async_cb_data *cb_data = NULL;
if ((flag = fcntl(sock, F_GETFL)) >= 0) {
@@ -782,7 +783,10 @@ internal_tcp_connect_async(int sock,
* Something about the way mainloop is currently polling prevents this from working at the
* moment though. */
crm_trace("fd %d: scheduling to check if connect finished in %dms second", sock, interval);
- g_timeout_add(interval, check_connect_finished, cb_data);
+ timer = g_timeout_add(interval, check_connect_finished, cb_data);
+ if (timer_id) {
+ *timer_id = timer;
+ }
return 0;
}
@@ -809,10 +813,11 @@ internal_tcp_connect(int sock, const struct sockaddr *addr, socklen_t addrlen)
* \internal
* \brief tcp connection to server at specified port
* \retval negative, failed to connect.
+ * \retval positive, sock fd
*/
int
-crm_remote_tcp_connect_async(const char *host, int port, int timeout, /*ms */
- void *userdata, void (*callback) (void *userdata, int sock))
+crm_remote_tcp_connect_async(const char *host, int port, int timeout, /*ms */
+ int *timer_id, void *userdata, void (*callback) (void *userdata, int sock))
{
char buffer[256];
struct addrinfo *res = NULL;
@@ -877,8 +882,7 @@ crm_remote_tcp_connect_async(const char *host, int port, int timeout, /*ms */
if (callback) {
if (internal_tcp_connect_async
- (sock, rp->ai_addr, rp->ai_addrlen, timeout, userdata, callback) == 0) {
- sock = 0;
+ (sock, rp->ai_addr, rp->ai_addrlen, timeout, timer_id, userdata, callback) == 0) {
goto async_cleanup; /* Success for now, we'll hear back later in the callback */
}
@@ -903,5 +907,5 @@ async_cleanup:
int
crm_remote_tcp_connect(const char *host, int port)
{
- return crm_remote_tcp_connect_async(host, port, -1, NULL, NULL);
+ return crm_remote_tcp_connect_async(host, port, -1, NULL, NULL, NULL);
}
diff --git a/lib/common/utils.c b/lib/common/utils.c
index e559c51..dc54e6d 100644
--- a/lib/common/utils.c
+++ b/lib/common/utils.c
@@ -2005,6 +2005,9 @@ attrd_update_delegate(crm_ipc_t * ipc, char command, const char *host, const cha
}
switch (command) {
+ case 'u':
+ crm_xml_add(update, F_ATTRD_TASK, "update");
+ crm_xml_add(update, F_ATTRD_REGEX, name);
case 'D':
case 'U':
case 'v':
diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
index 2837682..06b9492 100644
--- a/lib/fencing/st_client.c
+++ b/lib/fencing/st_client.c
@@ -192,7 +192,7 @@ create_device_registration_xml(const char *id, const char *namespace, const char
#endif
crm_xml_add(data, XML_ATTR_ID, id);
- crm_xml_add(data, "origin", __FUNCTION__);
+ crm_xml_add(data, F_STONITH_ORIGIN, __FUNCTION__);
crm_xml_add(data, "agent", agent);
crm_xml_add(data, "namespace", namespace);
if (rsc_provides) {
@@ -229,7 +229,7 @@ stonith_api_remove_device(stonith_t * st, int call_options, const char *name)
xmlNode *data = NULL;
data = create_xml_node(NULL, F_STONITH_DEVICE);
- crm_xml_add(data, "origin", __FUNCTION__);
+ crm_xml_add(data, F_STONITH_ORIGIN, __FUNCTION__);
crm_xml_add(data, XML_ATTR_ID, name);
rc = stonith_send_command(st, STONITH_OP_DEVICE_DEL, data, NULL, call_options, 0);
free_xml(data);
@@ -244,7 +244,7 @@ stonith_api_remove_level(stonith_t * st, int options, const char *node, int leve
xmlNode *data = NULL;
data = create_xml_node(NULL, F_STONITH_LEVEL);
- crm_xml_add(data, "origin", __FUNCTION__);
+ crm_xml_add(data, F_STONITH_ORIGIN, __FUNCTION__);
crm_xml_add(data, F_STONITH_TARGET, node);
crm_xml_add_int(data, XML_ATTR_ID, level);
rc = stonith_send_command(st, STONITH_OP_LEVEL_DEL, data, NULL, options, 0);
@@ -260,7 +260,7 @@ create_level_registration_xml(const char *node, int level, stonith_key_value_t *
crm_xml_add_int(data, XML_ATTR_ID, level);
crm_xml_add(data, F_STONITH_TARGET, node);
- crm_xml_add(data, "origin", __FUNCTION__);
+ crm_xml_add(data, F_STONITH_ORIGIN, __FUNCTION__);
for (; device_list; device_list = device_list->next) {
xmlNode *dev = create_xml_node(data, F_STONITH_DEVICE);
@@ -1255,7 +1255,7 @@ stonith_api_query(stonith_t * stonith, int call_options, const char *target,
CRM_CHECK(devices != NULL, return -EINVAL);
data = create_xml_node(NULL, F_STONITH_DEVICE);
- crm_xml_add(data, "origin", __FUNCTION__);
+ crm_xml_add(data, F_STONITH_ORIGIN, __FUNCTION__);
crm_xml_add(data, F_STONITH_TARGET, target);
crm_xml_add(data, F_STONITH_ACTION, "off");
rc = stonith_send_command(stonith, STONITH_OP_QUERY, data, &output, call_options, timeout);
@@ -1296,7 +1296,7 @@ stonith_api_call(stonith_t * stonith,
xmlNode *data = NULL;
data = create_xml_node(NULL, F_STONITH_DEVICE);
- crm_xml_add(data, "origin", __FUNCTION__);
+ crm_xml_add(data, F_STONITH_ORIGIN, __FUNCTION__);
crm_xml_add(data, F_STONITH_DEVICE, id);
crm_xml_add(data, F_STONITH_ACTION, action);
crm_xml_add(data, F_STONITH_TARGET, victim);
diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c
index 3496098..b8c5d23 100644
--- a/lib/lrmd/lrmd_client.c
+++ b/lib/lrmd/lrmd_client.c
@@ -89,6 +89,9 @@ typedef struct lrmd_private_s {
int port;
gnutls_psk_client_credentials_t psk_cred_c;
+ /* while the async connection is occurring, this is the id
+ * of the connection timeout timer. */
+ int async_timer;
int sock;
/* since tls requires a round trip across the network for a
* request/reply, there are times where we just want to be able
@@ -1101,6 +1104,8 @@ lrmd_tcp_connect_cb(void *userdata, int sock)
int rc = sock;
gnutls_datum_t psk_key = { NULL, 0 };
+ native->async_timer = 0;
+
if (rc < 0) {
lrmd_tls_connection_destroy(lrmd);
crm_info("remote lrmd connect to %s at port %d failed", native->server, native->port);
@@ -1152,14 +1157,23 @@ lrmd_tcp_connect_cb(void *userdata, int sock)
static int
lrmd_tls_connect_async(lrmd_t * lrmd, int timeout /*ms */ )
{
- int rc = 0;
+ int rc = -1;
+ int sock = 0;
+ int timer_id = 0;
+
lrmd_private_t *native = lrmd->private;
lrmd_gnutls_global_init();
- rc = crm_remote_tcp_connect_async(native->server, native->port, timeout, lrmd,
+ sock = crm_remote_tcp_connect_async(native->server, native->port, timeout, &timer_id, lrmd,
lrmd_tcp_connect_cb);
+ if (sock != -1) {
+ native->sock = sock;
+ rc = 0;
+ native->async_timer = timer_id;
+ }
+
return rc;
}
@@ -1319,6 +1333,11 @@ lrmd_tls_disconnect(lrmd_t * lrmd)
native->remote->tls_session = 0;
}
+ if (native->async_timer) {
+ g_source_remove(native->async_timer);
+ native->async_timer = 0;
+ }
+
if (native->source != NULL) {
/* Attached to mainloop */
mainloop_del_ipc_client(native->source);
diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
index b699201..7127c12 100644
--- a/lib/pengine/unpack.c
+++ b/lib/pengine/unpack.c
@@ -1756,6 +1756,7 @@ process_rsc_state(resource_t * rsc, node_t * node,
if (rsc->role > RSC_ROLE_STOPPED
&& node->details->online == FALSE && is_set(rsc->flags, pe_rsc_managed)) {
+ char *reason = NULL;
gboolean should_fence = FALSE;
/* if this is a remote_node living in a container, fence the container
@@ -1768,14 +1769,25 @@ process_rsc_state(resource_t * rsc, node_t * node,
should_fence = TRUE;
} else if (is_set(data_set->flags, pe_flag_stonith_enabled)) {
+ if (is_baremetal_remote_node(node) && is_not_set(node->details->remote_rsc->flags, pe_rsc_failed)) {
+ /* setting unseen = TRUE means that fencing of the remote node will
+ * only occur if the connection resource is not going to start somewhere.
+ * This allows connection resources on a failed cluster-node to move to
+ * another node without requiring the baremetal remote nodes to be fenced
+ * as well. */
+ node->details->unseen = TRUE;
+ reason = g_strdup_printf("because %s is active there. Fencing will be revoked if remote-node connection can be re-established on another cluster-node.", rsc->id);
+ }
should_fence = TRUE;
}
if (should_fence) {
- char *reason = g_strdup_printf("because %s is thought to be active there", rsc->id);
+ if (reason == NULL) {
+ reason = g_strdup_printf("because %s is thought to be active there", rsc->id);
+ }
pe_fence_node(data_set, node, reason);
- g_free(reason);
}
+ g_free(reason);
}
if (node->details->unclean) {
@@ -1840,6 +1852,17 @@ process_rsc_state(resource_t * rsc, node_t * node,
break;
}
+ /* ensure a remote-node connection failure forces an unclean remote-node
+ * to be fenced. By setting unseen = FALSE, the remote-node failure will
+ * result in a fencing operation regardless if we're going to attempt to
+ * reconnect to the remote-node in this transition or not. */
+ if (is_set(rsc->flags, pe_rsc_failed) && rsc->is_remote_node) {
+ node_t *tmpnode = pe_find_node(data_set->nodes, rsc->id);
+ if (tmpnode && tmpnode->details->unclean) {
+ tmpnode->details->unseen = FALSE;
+ }
+ }
+
if (rsc->role != RSC_ROLE_STOPPED && rsc->role != RSC_ROLE_UNKNOWN) {
if (is_set(rsc->flags, pe_rsc_orphan)) {
if (is_set(rsc->flags, pe_rsc_managed)) {
@@ -2160,7 +2183,7 @@ unpack_lrm_resources(node_t * node, xmlNode * lrm_rsc_list, pe_working_set_t * d
for (gIter = unexpected_containers; gIter != NULL; gIter = gIter->next) {
remote = (resource_t *) gIter->data;
if (remote->role != RSC_ROLE_STARTED) {
- crm_warn("Recovering container resource %s. Resource is unexpectedly running and involves a remote-node.");
+ crm_warn("Recovering container resource %s. Resource is unexpectedly running and involves a remote-node.", remote->container->id);
set_bit(remote->container->flags, pe_rsc_failed);
}
}
diff --git a/lib/services/dbus.c b/lib/services/dbus.c
index 8b8aee1..587589c 100644
--- a/lib/services/dbus.c
+++ b/lib/services/dbus.c
@@ -6,6 +6,14 @@
#define BUS_PROPERTY_IFACE "org.freedesktop.DBus.Properties"
+struct db_getall_data
+{
+ char *name;
+ char *target;
+ char *object;
+ void *userdata;
+ void (*callback)(const char *name, const char *value, void *userdata);
+};
static bool pcmk_dbus_error_check(DBusError *err, const char *prefix, const char *function, int line)
{
@@ -107,8 +115,9 @@ DBusMessage *pcmk_dbus_send_recv(DBusMessage *msg, DBusConnection *connection, D
method = dbus_message_get_member (msg);
// send message and get a handle for a reply
- if (!dbus_connection_send_with_reply (connection, msg, &pending, -1)) { // -1 is default timeout
+ if (!dbus_connection_send_with_reply (connection, msg, &pending, -1/* aka. DBUS_TIMEOUT_USE_DEFAULT */)) {
if(error) {
+ dbus_error_init(error);
error->message = "Call to dbus_connection_send_with_reply() failed";
error->name = "org.clusterlabs.pacemaker.SendFailed";
}
@@ -126,13 +135,7 @@ DBusMessage *pcmk_dbus_send_recv(DBusMessage *msg, DBusConnection *connection, D
reply = dbus_pending_call_steal_reply(pending);
}
- if(pcmk_dbus_find_error(method, pending, reply, error)) {
- crm_trace("Was error: '%s' '%s'", error->name, error->message);
- if(reply) {
- dbus_message_unref(reply);
- reply = NULL;
- }
- }
+ pcmk_dbus_find_error(method, pending, reply, error);
if(pending) {
/* free the pending message handle */
@@ -156,7 +159,7 @@ bool pcmk_dbus_send(DBusMessage *msg, DBusConnection *connection,
method = dbus_message_get_member (msg);
// send message and get a handle for a reply
- if (!dbus_connection_send_with_reply (connection, msg, &pending, -1)) { // -1 is default timeout
+ if (!dbus_connection_send_with_reply (connection, msg, &pending, -1/* aka. DBUS_TIMEOUT_USE_DEFAULT */)) { // -1 is default timeout
crm_err("Send with reply failed for %s", method);
return FALSE;
@@ -205,65 +208,38 @@ bool pcmk_dbus_type_check(DBusMessage *msg, DBusMessageIter *field, int expected
dbus_message_iter_init(msg, &args);
do_crm_log_alias(LOG_ERR, __FILE__, function, line,
- "Unexepcted DBus type, expected %c instead of %c in '%s'",
- expected, dtype, dbus_message_iter_get_signature(&args));
+ "Unexpected DBus type, expected %c in '%s' instead of %c",
+ expected, dbus_message_iter_get_signature(&args), dtype);
return FALSE;
}
return TRUE;
}
-char *
-pcmk_dbus_get_property(
- DBusConnection *connection, const char *target, const char *obj, const gchar * iface, const char *name)
+static char *
+pcmk_dbus_lookup_result(DBusMessage *reply, struct db_getall_data *data)
{
- DBusMessage *msg;
- DBusMessageIter args;
- DBusMessageIter dict;
- DBusMessage *reply = NULL;
- /* DBusBasicValue value; */
- const char *method = "GetAll";
- char *output = NULL;
DBusError error;
+ char *output = NULL;
+ DBusMessageIter dict;
+ DBusMessageIter args;
- /* desc = systemd_unit_property(path, BUS_NAME ".Unit", "Description"); */
-
- dbus_error_init(&error);
- crm_info("Calling: %s on %s", method, target);
- msg = dbus_message_new_method_call(target, // target for the method call
- obj, // object to call on
- BUS_PROPERTY_IFACE, // interface to call on
- method); // method name
-
- if (NULL == msg) {
- crm_err("Call to %s failed: No message", method);
- return NULL;
- }
-
- CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &iface, DBUS_TYPE_INVALID));
-
- reply = pcmk_dbus_send_recv(msg, connection, &error);
- dbus_message_unref(msg);
-
- if(error.name) {
- crm_err("Call to %s for %s failed: No reply", method, iface);
- return NULL;
-
- } else if (!dbus_message_iter_init(reply, &args)) {
- crm_err("Cannot get properties for %s from %s", obj, iface);
- return NULL;
+ if(pcmk_dbus_find_error("GetAll", (void*)&error, reply, &error)) {
+ crm_err("Cannot get properties from %s for %s", data->target, data->object);
+ goto cleanup;
}
+ dbus_message_iter_init(reply, &args);
if(!pcmk_dbus_type_check(reply, &args, DBUS_TYPE_ARRAY, __FUNCTION__, __LINE__)) {
- crm_err("Call to %s failed: Message has invalid arguments", method);
- dbus_message_unref(reply);
- return NULL;
+ crm_err("Invalid reply from %s for %s", data->target, data->object);
+ goto cleanup;
}
dbus_message_iter_recurse(&args, &dict);
while (dbus_message_iter_get_arg_type (&dict) != DBUS_TYPE_INVALID) {
DBusMessageIter sv;
DBusMessageIter v;
+ DBusBasicValue name;
DBusBasicValue value;
if(!pcmk_dbus_type_check(reply, &dict, DBUS_TYPE_DICT_ENTRY, __FUNCTION__, __LINE__)) {
@@ -277,10 +253,9 @@ pcmk_dbus_get_property(
switch(dtype) {
case DBUS_TYPE_STRING:
- dbus_message_iter_get_basic(&sv, &value);
+ dbus_message_iter_get_basic(&sv, &name);
- crm_trace("Got: %s", value.str);
- if(strcmp(value.str, name) != 0) {
+ if(data->name && strcmp(name.str, data->name) != 0) {
dbus_message_iter_next (&sv); /* Skip the value */
}
break;
@@ -289,8 +264,17 @@ pcmk_dbus_get_property(
if(pcmk_dbus_type_check(reply, &v, DBUS_TYPE_STRING, __FUNCTION__, __LINE__)) {
dbus_message_iter_get_basic(&v, &value);
- crm_trace("Result: %s", value.str);
- output = strdup(value.str);
+ crm_trace("Property %s[%s] is '%s'", data->object, name.str, value.str);
+ if(data->callback) {
+ data->callback(name.str, value.str, data->userdata);
+
+ } else {
+ output = strdup(value.str);
+ }
+
+ if(data->name) {
+ goto cleanup;
+ }
}
break;
default:
@@ -302,8 +286,82 @@ pcmk_dbus_get_property(
dbus_message_iter_next (&dict);
}
+ cleanup:
+ free(data->target);
+ free(data->object);
+ free(data->name);
+ free(data);
+
+ return output;
+}
+
+static void
+pcmk_dbus_lookup_cb(DBusPendingCall *pending, void *user_data)
+{
+ DBusMessage *reply = NULL;
+
+ if(pending) {
+ reply = dbus_pending_call_steal_reply(pending);
+ }
+
+ pcmk_dbus_lookup_result(reply, user_data);
+
+ if(reply) {
+ dbus_message_unref(reply);
+ }
+}
+
+char *
+pcmk_dbus_get_property(
+ DBusConnection *connection, const char *target, const char *obj, const gchar * iface, const char *name,
+ void (*callback)(const char *name, const char *value, void *userdata), void *userdata)
+{
+ DBusMessage *msg;
+ const char *method = "GetAll";
+ char *output = NULL;
+
+ struct db_getall_data *query_data = NULL;
+
+ /* char *state = pcmk_dbus_get_property(systemd_proxy, BUS_NAME, unit, BUS_NAME ".Unit", "ActiveState"); */
+
+ crm_debug("Calling: %s on %s", method, target);
+ msg = dbus_message_new_method_call(target, // target for the method call
+ obj, // object to call on
+ BUS_PROPERTY_IFACE, // interface to call on
+ method); // method name
+
+ if (NULL == msg) {
+ crm_err("Call to %s failed: No message", method);
+ return NULL;
+ }
+
+ CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &iface, DBUS_TYPE_INVALID));
+
+ query_data = malloc(sizeof(struct db_getall_data));
+ query_data->target = strdup(target);
+ query_data->object = strdup(obj);
+ query_data->callback = callback;
+ query_data->userdata = userdata;
+ query_data->name = NULL;
+
+ if(name) {
+ query_data->name = strdup(name);
+ }
+
+ if(query_data->callback) {
+ pcmk_dbus_send(msg, connection, pcmk_dbus_lookup_cb, query_data);
+
+ } else {
+ DBusMessage *reply = pcmk_dbus_send_recv(msg, connection, NULL);
+
+ output = pcmk_dbus_lookup_result(reply, query_data);
+ if(reply) {
+ dbus_message_unref(reply);
+ }
+ }
+
+ dbus_message_unref(msg);
- crm_trace("Property %s[%s] is '%s'", obj, name, output);
return output;
}
@@ -354,6 +412,14 @@ pcmk_dbus_watch_add(DBusWatch *watch, void *data){
}
static void
+pcmk_dbus_watch_toggle(DBusWatch *watch, void *data)
+{
+ mainloop_io_t *client = dbus_watch_get_data(watch);
+ crm_notice("DBus client %p is now %s", client, dbus_watch_get_enabled(watch)?"enabled":"disabled");
+}
+
+
+static void
pcmk_dbus_watch_remove(DBusWatch *watch, void *data){
mainloop_io_t *client = dbus_watch_get_data(watch);
@@ -404,7 +470,7 @@ pcmk_dbus_timeout_toggle(DBusTimeout *timeout, void *data){
void pcmk_dbus_connection_setup_with_select(DBusConnection *c){
dbus_connection_set_timeout_functions(
c, pcmk_dbus_timeout_add, pcmk_dbus_timeout_remove, pcmk_dbus_timeout_toggle, NULL, NULL);
- dbus_connection_set_watch_functions(c, pcmk_dbus_watch_add, pcmk_dbus_watch_remove, NULL, NULL, NULL);
+ dbus_connection_set_watch_functions(c, pcmk_dbus_watch_add, pcmk_dbus_watch_remove, pcmk_dbus_watch_toggle, NULL, NULL);
dbus_connection_set_dispatch_status_function(c, pcmk_dbus_connection_dispatch, NULL, NULL);
pcmk_dbus_connection_dispatch(c, dbus_connection_get_dispatch_status(c), NULL);
diff --git a/lib/services/pcmk-dbus.h b/lib/services/pcmk-dbus.h
index 3b7a598..ed80c5f 100644
--- a/lib/services/pcmk-dbus.h
+++ b/lib/services/pcmk-dbus.h
@@ -6,7 +6,9 @@ bool pcmk_dbus_send(DBusMessage *msg, DBusConnection *connection,
void(*done)(DBusPendingCall *pending, void *user_data), void *user_data);
DBusMessage *pcmk_dbus_send_recv(DBusMessage *msg, DBusConnection *connection, DBusError *error);
bool pcmk_dbus_type_check(DBusMessage *msg, DBusMessageIter *field, int expected, const char *function, int line);
-char *pcmk_dbus_get_property(DBusConnection *connection, const char *target, const char *obj, const gchar * iface, const char *name);
+char *pcmk_dbus_get_property(
+ DBusConnection *connection, const char *target, const char *obj, const gchar * iface, const char *name,
+ void (*callback)(const char *name, const char *value, void *userdata), void *userdata);
bool pcmk_dbus_find_error(const char *method, DBusPendingCall* pending, DBusMessage *reply, DBusError *error);
diff --git a/lib/services/services.c b/lib/services/services.c
index 7b32405..8590b56 100644
--- a/lib/services/services.c
+++ b/lib/services/services.c
@@ -473,6 +473,7 @@ handle_duplicate_recurring(svc_action_t * op, void (*action_callback) (svc_actio
gboolean
services_action_async(svc_action_t * op, void (*action_callback) (svc_action_t *))
{
+ op->synchronous = false;
if (action_callback) {
op->opaque->callback = action_callback;
}
@@ -491,7 +492,7 @@ services_action_async(svc_action_t * op, void (*action_callback) (svc_action_t *
}
if (op->standard && strcasecmp(op->standard, "systemd") == 0) {
#if SUPPORT_SYSTEMD
- return systemd_unit_exec(op, FALSE);
+ return systemd_unit_exec(op);
#endif
}
return services_os_action_execute(op, FALSE);
@@ -502,6 +503,7 @@ services_action_sync(svc_action_t * op)
{
gboolean rc = TRUE;
+ op->synchronous = true;
if (op == NULL) {
crm_trace("No operation to execute");
return FALSE;
@@ -512,7 +514,7 @@ services_action_sync(svc_action_t * op)
#endif
} else if (op->standard && strcasecmp(op->standard, "systemd") == 0) {
#if SUPPORT_SYSTEMD
- rc = systemd_unit_exec(op, TRUE);
+ rc = systemd_unit_exec(op);
#endif
} else {
rc = services_os_action_execute(op, TRUE);
diff --git a/lib/services/systemd.c b/lib/services/systemd.c
index e81d178..c967430 100644
--- a/lib/services/systemd.c
+++ b/lib/services/systemd.c
@@ -35,6 +35,9 @@
/*
/usr/share/dbus-1/interfaces/org.freedesktop.systemd1.Manager.xml
*/
+gboolean
+systemd_unit_exec_with_unit(svc_action_t * op, const char *unit);
+
struct unit_info {
const char *id;
@@ -49,6 +52,15 @@ struct unit_info {
const char *job_path;
};
+struct pcmk_dbus_data
+{
+ char *name;
+ char *unit;
+ DBusError error;
+ svc_action_t *op;
+ void (*callback)(DBusMessage *reply, svc_action_t *op);
+};
+
static DBusMessage *systemd_new_method(const char *iface, const char *method)
{
crm_trace("Calling: %s on %s", method, iface);
@@ -101,6 +113,7 @@ systemd_service_name(const char *name)
static bool
systemd_daemon_reload(void)
{
+ /* TODO: Make this asynchronous */
const char *method = "Reload";
DBusMessage *reply = NULL;
DBusMessage *msg = systemd_new_method(BUS_NAME".Manager", method);
@@ -114,21 +127,55 @@ systemd_daemon_reload(void)
return TRUE;
}
-static gboolean
-systemd_unit_by_name(const gchar * arg_name, gchar ** out_unit)
+static const char *
+systemd_loadunit_result(DBusMessage *reply, svc_action_t * op)
+{
+ const char *path = NULL;
+
+ if(pcmk_dbus_find_error("LoadUnit", (void*)&path, reply, NULL)) {
+ if(op) {
+ crm_warn("No unit found for %s", op->rsc);
+ }
+
+ } else if(pcmk_dbus_type_check(reply, NULL, DBUS_TYPE_OBJECT_PATH, __FUNCTION__, __LINE__)) {
+ dbus_message_get_args (reply, NULL,
+ DBUS_TYPE_OBJECT_PATH, &path,
+ DBUS_TYPE_INVALID);
+ }
+
+ if(op) {
+ systemd_unit_exec_with_unit(op, path);
+ }
+
+ return path;
+}
+
+
+static void
+systemd_loadunit_cb(DBusPendingCall *pending, void *user_data)
+{
+ DBusMessage *reply = NULL;
+
+ if(pending) {
+ reply = dbus_pending_call_steal_reply(pending);
+ }
+
+ systemd_loadunit_result(reply, user_data);
+
+ if(reply) {
+ dbus_message_unref(reply);
+ }
+}
+
+static char *
+systemd_unit_by_name(const gchar * arg_name, svc_action_t *op)
{
DBusMessage *msg;
DBusMessage *reply = NULL;
- const char *method = "GetUnit";
char *name = NULL;
- DBusError error;
/*
- <method name="GetUnit">
- <arg name="name" type="s" direction="in"/>
- <arg name="unit" type="o" direction="out"/>
- </method>
-
+ Equivalent to GetUnit if its already loaded
<method name="LoadUnit">
<arg name="name" type="s" direction="in"/>
<arg name="unit" type="o" direction="out"/>
@@ -139,51 +186,34 @@ systemd_unit_by_name(const gchar * arg_name, gchar ** out_unit)
return FALSE;
}
- name = systemd_service_name(arg_name);
+ msg = systemd_new_method(BUS_NAME".Manager", "LoadUnit");
+ CRM_ASSERT(msg != NULL);
- while(TRUE) {
- msg = systemd_new_method(BUS_NAME".Manager", method);
- CRM_ASSERT(msg != NULL);
+ name = systemd_service_name(arg_name);
+ CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &name, DBUS_TYPE_INVALID));
+ free(name);
- CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &name, DBUS_TYPE_INVALID));
+ if(op == NULL || op->synchronous) {
+ const char *unit = NULL;
+ char *munit = NULL;
+ DBusError error;
dbus_error_init(&error);
reply = pcmk_dbus_send_recv(msg, systemd_proxy, &error);
dbus_message_unref(msg);
- if(error.name) {
- crm_info("Call to %s failed: %s", method, error.name);
-
- } else if(pcmk_dbus_type_check(reply, NULL, DBUS_TYPE_OBJECT_PATH, __FUNCTION__, __LINE__)) {
- if(out_unit) {
- char *path = NULL;
-
- dbus_message_get_args (reply, NULL,
- DBUS_TYPE_OBJECT_PATH, &path,
- DBUS_TYPE_INVALID);
-
- *out_unit = strdup(path);
- }
- dbus_message_unref(reply);
- free(name);
- return TRUE;
+ unit = systemd_loadunit_result(reply, op);
+ if(unit) {
+ munit = strdup(unit);
}
-
- if(strcmp(method, "LoadUnit") != 0) {
- method = "LoadUnit";
- crm_debug("Cannot find %s, reloading the systemd manager configuration", name);
- systemd_daemon_reload();
- if(reply) {
- dbus_message_unref(reply);
- reply = NULL;
- }
-
- } else {
- free(name);
- return FALSE;
+ if(reply) {
+ dbus_message_unref(reply);
}
+ return munit;
}
- return FALSE;
+
+ pcmk_dbus_send(msg, systemd_proxy, systemd_loadunit_cb, op);
+ return NULL;
}
GList *
@@ -220,6 +250,10 @@ systemd_unit_listall(void)
crm_err("Call to %s failed: %s", method, error.name);
return NULL;
+ } else if (reply == NULL) {
+ crm_err("Call to %s failed: Message has no reply", method);
+ return NULL;
+
} else if (!dbus_message_iter_init(reply, &args)) {
crm_err("Call to %s failed: Message has no arguments", method);
dbus_message_unref(reply);
@@ -269,21 +303,27 @@ systemd_unit_listall(void)
gboolean
systemd_unit_exists(const char *name)
{
- return systemd_unit_by_name(name, NULL);
+ /* Note: Makes a blocking dbus call
+ * Used by resources_find_service_class() when resource class=service
+ */
+ if(systemd_unit_by_name(name, NULL)) {
+ return TRUE;
+ }
+ return FALSE;
}
static char *
systemd_unit_metadata(const char *name)
{
- char *path = NULL;
char *meta = NULL;
char *desc = NULL;
+ char *path = systemd_unit_by_name(name, NULL);
- if (systemd_unit_by_name(name, &path)) {
- CRM_ASSERT(path);
- desc = pcmk_dbus_get_property(systemd_proxy, BUS_NAME, path, BUS_NAME ".Unit", "Description");
+ if (path) {
+ /* TODO: Worth a making blocking call for? Probably not. Possibly if cached. */
+ desc = pcmk_dbus_get_property(systemd_proxy, BUS_NAME, path, BUS_NAME ".Unit", "Description", NULL, NULL);
} else {
- desc = g_strdup_printf("systemd unit file for %s", name);
+ desc = g_strdup_printf("Systemd unit file for %s", name);
}
meta = g_strdup_printf("<?xml version=\"1.0\"?>\n"
@@ -335,24 +375,15 @@ systemd_mask_error(svc_action_t *op, const char *error)
}
static void
-systemd_async_dispatch(DBusPendingCall *pending, void *user_data)
+systemd_exec_result(DBusMessage *reply, svc_action_t *op)
{
DBusError error;
- DBusMessage *reply = NULL;
- svc_action_t *op = user_data;
- dbus_error_init(&error);
- if(pending) {
- reply = dbus_pending_call_steal_reply(pending);
- }
- if(reply == NULL) {
- crm_err("No reply for %s action on %s", op->action, op->rsc);
-
- } else if(pcmk_dbus_find_error(op->action, pending, reply, &error)) {
+ if(pcmk_dbus_find_error(op->action, (void*)&error, reply, &error)) {
/* ignore "already started" or "not running" errors */
if (!systemd_mask_error(op, error.name)) {
- crm_err("%s for %s: %s", op->action, op->rsc, error.message);
+ crm_err("Could not issue %s for %s: %s (%s)", op->action, op->rsc, error.name, error.message);
}
} else {
@@ -372,6 +403,21 @@ systemd_async_dispatch(DBusPendingCall *pending, void *user_data)
}
operation_finalize(op);
+}
+
+static void
+systemd_async_dispatch(DBusPendingCall *pending, void *user_data)
+{
+ DBusError error;
+ DBusMessage *reply = NULL;
+ svc_action_t *op = user_data;
+
+ dbus_error_init(&error);
+ if(pending) {
+ reply = dbus_pending_call_steal_reply(pending);
+ }
+
+ systemd_exec_result(reply, op);
if(pending) {
dbus_pending_call_unref(pending);
@@ -383,61 +429,56 @@ systemd_async_dispatch(DBusPendingCall *pending, void *user_data)
#define SYSTEMD_OVERRIDE_ROOT "/run/systemd/system/"
+static void
+systemd_unit_check(const char *name, const char *state, void *userdata)
+{
+ svc_action_t * op = userdata;
+
+ CRM_ASSERT(state != NULL);
+
+ if (g_strcmp0(state, "active") == 0) {
+ op->rc = PCMK_OCF_OK;
+ } else if (g_strcmp0(state, "activating") == 0) {
+ op->rc = PCMK_OCF_PENDING;
+ } else {
+ op->rc = PCMK_OCF_NOT_RUNNING;
+ }
+
+ if (op->synchronous == FALSE) {
+ operation_finalize(op);
+ }
+}
+
gboolean
-systemd_unit_exec(svc_action_t * op, gboolean synchronous)
+systemd_unit_exec_with_unit(svc_action_t * op, const char *unit)
{
- DBusError error;
- char *unit = NULL;
- const char *replace_s = "replace";
- gboolean pass = FALSE;
const char *method = op->action;
- char *name = systemd_service_name(op->agent);
DBusMessage *msg = NULL;
DBusMessage *reply = NULL;
- dbus_error_init(&error);
- op->rc = PCMK_OCF_UNKNOWN_ERROR;
- CRM_ASSERT(systemd_init());
-
- crm_debug("Performing %ssynchronous %s op on systemd unit %s named '%s'",
- synchronous ? "" : "a", op->action, op->agent, op->rsc);
-
- if (safe_str_eq(op->action, "meta-data")) {
- op->stdout_data = systemd_unit_metadata(op->agent);
- op->rc = PCMK_OCF_OK;
- goto cleanup;
- }
+ CRM_ASSERT(op);
- pass = systemd_unit_by_name(op->agent, &unit);
- if (pass == FALSE) {
+ if (unit == NULL) {
crm_debug("Could not obtain unit named '%s'", op->agent);
-#if 0
- if (error && strstr(error->message, "systemd1.NoSuchUnit")) {
- op->rc = PCMK_OCF_NOT_INSTALLED;
- op->status = PCMK_LRM_OP_NOT_INSTALLED;
- }
-#endif
+ op->rc = PCMK_OCF_NOT_INSTALLED;
+ op->status = PCMK_LRM_OP_NOT_INSTALLED;
goto cleanup;
}
if (safe_str_eq(op->action, "monitor") || safe_str_eq(method, "status")) {
- char *state = pcmk_dbus_get_property(systemd_proxy, BUS_NAME, unit, BUS_NAME ".Unit", "ActiveState");
-
- if (g_strcmp0(state, "active") == 0) {
- op->rc = PCMK_OCF_OK;
- } else if (g_strcmp0(state, "activating") == 0) {
- op->rc = PCMK_OCF_PENDING;
- } else {
- op->rc = PCMK_OCF_NOT_RUNNING;
+ char *state = pcmk_dbus_get_property(systemd_proxy, BUS_NAME, unit, BUS_NAME ".Unit", "ActiveState",
+ op->synchronous?NULL:systemd_unit_check, op);
+ if (op->synchronous) {
+ systemd_unit_check("ActiveState", state, op);
+ free(state);
+ return op->rc == PCMK_OCF_OK;
}
-
- free(state);
- goto cleanup;
+ return TRUE;
} else if (g_strcmp0(method, "start") == 0) {
FILE *file_strm = NULL;
char *override_dir = g_strdup_printf("%s/%s", SYSTEMD_OVERRIDE_ROOT, unit);
- char *override_file = g_strdup_printf("%s/50-pacemaker.conf", override_dir);
+ char *override_file = g_strdup_printf("%s/%s/50-pacemaker.conf", SYSTEMD_OVERRIDE_ROOT, unit);
method = "StartUnit";
crm_build_path(override_dir, 0755);
@@ -446,11 +487,11 @@ systemd_unit_exec(svc_action_t * op, gboolean synchronous)
if (file_strm != NULL) {
int rc = fprintf(file_strm, "[Service]\nRestart=no");
if (rc < 0) {
- crm_perror(LOG_ERR, "Cannot write to systemd override file %s: %s (%d)", override_file, pcmk_strerror(errno), errno);
+ crm_perror(LOG_ERR, "Cannot write to systemd override file %s", override_file);
}
} else {
- crm_err("Cannot open systemd override file %s for writing: %s (%d)", override_file, pcmk_strerror(errno), errno);
+ crm_err("Cannot open systemd override file %s for writing", override_file);
}
if (file_strm != NULL) {
@@ -471,6 +512,7 @@ systemd_unit_exec(svc_action_t * op, gboolean synchronous)
} else if (g_strcmp0(method, "restart") == 0) {
method = "RestartUnit";
+
} else {
op->rc = PCMK_OCF_UNIMPLEMENT_FEATURE;
goto cleanup;
@@ -482,54 +524,66 @@ systemd_unit_exec(svc_action_t * op, gboolean synchronous)
CRM_ASSERT(msg != NULL);
/* (ss) */
- CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &name, DBUS_TYPE_INVALID));
- CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &replace_s, DBUS_TYPE_INVALID));
+ {
+ const char *replace_s = "replace";
+ char *name = systemd_service_name(op->agent);
+
+ CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &name, DBUS_TYPE_INVALID));
+ CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &replace_s, DBUS_TYPE_INVALID));
- if (synchronous == FALSE) {
- free(unit);
free(name);
- return pcmk_dbus_send(msg, systemd_proxy, systemd_async_dispatch, op);
}
- dbus_error_init(&error);
- reply = pcmk_dbus_send_recv(msg, systemd_proxy, &error);
-
- if(error.name) {
- /* ignore "already started" or "not running" errors */
- if(!systemd_mask_error(op, error.name)) {
- crm_err("Could not issue %s for %s: %s (%s)", method, op->rsc, error.name, unit);
- }
- goto cleanup;
-
- } else if(!pcmk_dbus_type_check(reply, NULL, DBUS_TYPE_OBJECT_PATH, __FUNCTION__, __LINE__)) {
- crm_warn("Call to %s passed but return type was unexpected", op->action);
- op->rc = PCMK_OCF_OK;
+ if (op->synchronous == FALSE) {
+ return pcmk_dbus_send(msg, systemd_proxy, systemd_async_dispatch, op);
} else {
- const char *path = NULL;
+ DBusError error;
- dbus_message_get_args (reply, NULL,
- DBUS_TYPE_OBJECT_PATH, &path,
- DBUS_TYPE_INVALID);
- crm_info("Call to %s passed: %s", op->action, path);
- op->rc = PCMK_OCF_OK;
+ reply = pcmk_dbus_send_recv(msg, systemd_proxy, &error);
+ systemd_exec_result(reply, op);
+ if(reply) {
+ dbus_message_unref(reply);
+ }
}
- cleanup:
- free(unit);
- free(name);
-
if(msg) {
dbus_message_unref(msg);
}
- if(reply) {
- dbus_message_unref(reply);
+ cleanup:
+ if (op->synchronous == FALSE) {
+ operation_finalize(op);
+ return TRUE;
}
- if (synchronous == FALSE) {
- operation_finalize(op);
+ return op->rc == PCMK_OCF_OK;
+}
+
+gboolean
+systemd_unit_exec(svc_action_t * op)
+{
+ CRM_ASSERT(op);
+ CRM_ASSERT(systemd_init());
+ op->rc = PCMK_OCF_UNKNOWN_ERROR;
+ crm_debug("Performing %ssynchronous %s op on systemd unit %s named '%s'",
+ op->synchronous ? "" : "a", op->action, op->agent, op->rsc);
+
+ if (safe_str_eq(op->action, "meta-data")) {
+ /* TODO: See if we can teach the lrmd not to make these calls synchronously */
+ op->stdout_data = systemd_unit_metadata(op->agent);
+ op->rc = PCMK_OCF_OK;
+
+ if (op->synchronous == FALSE) {
+ operation_finalize(op);
+ }
return TRUE;
}
+
+ systemd_unit_by_name(op->agent, op);
+ if (op->synchronous == FALSE) {
+ return TRUE;
+ }
+
return op->rc == PCMK_OCF_OK;
}
diff --git a/lib/services/systemd.h b/lib/services/systemd.h
index 6e1b80b..c86bafe 100644
--- a/lib/services/systemd.h
+++ b/lib/services/systemd.h
@@ -17,7 +17,7 @@
*/
G_GNUC_INTERNAL GList *systemd_unit_listall(void);
-G_GNUC_INTERNAL int systemd_unit_exec(svc_action_t * op, gboolean synchronous);
+G_GNUC_INTERNAL int systemd_unit_exec(svc_action_t * op);
G_GNUC_INTERNAL gboolean systemd_unit_exists(const gchar * name);
G_GNUC_INTERNAL gboolean systemd_unit_running(const gchar * name);
G_GNUC_INTERNAL void systemd_cleanup(void);
diff --git a/lib/services/upstart.c b/lib/services/upstart.c
index f47e8ff..4c7211d 100644
--- a/lib/services/upstart.c
+++ b/lib/services/upstart.c
@@ -275,6 +275,10 @@ get_first_instance(const gchar * job)
crm_err("Call to %s failed: %s", method, error.name);
goto done;
+ } else if(reply == NULL) {
+ crm_err("Call to %s failed: no reply", method);
+ goto done;
+
} else if (!dbus_message_iter_init(reply, &args)) {
crm_err("Call to %s failed: Message has no arguments", method);
goto done;
@@ -304,31 +308,22 @@ get_first_instance(const gchar * job)
return instance;
}
-gboolean
-upstart_job_running(const gchar * name)
+static void
+upstart_job_check(const char *name, const char *state, void *userdata)
{
- bool running = FALSE;
- char *job = NULL;
-
- if(upstart_job_by_name(name, &job)) {
- char *path = get_first_instance(job);
+ svc_action_t * op = userdata;
- if (path) {
- char *state = pcmk_dbus_get_property(
- upstart_proxy, BUS_NAME, path, UPSTART_06_API ".Instance", "state");
-
- crm_info("State of %s: %s", name, state);
- if (state) {
- running = !g_strcmp0(state, "running");
- }
- free(state);
- }
- free(path);
+ if (state && g_strcmp0(state, "running") == 0) {
+ op->rc = PCMK_OCF_OK;
+ /* } else if (g_strcmp0(state, "activating") == 0) { */
+ /* op->rc = PCMK_OCF_PENDING; */
+ } else {
+ op->rc = PCMK_OCF_NOT_RUNNING;
}
- free(job);
- crm_info("%s is%s running", name, running ? "" : " not");
- return running;
+ if (op->synchronous == FALSE) {
+ operation_finalize(op);
+ }
}
static char *
@@ -465,10 +460,24 @@ upstart_job_exec(svc_action_t * op, gboolean synchronous)
}
if (safe_str_eq(op->action, "monitor") || safe_str_eq(action, "status")) {
- if (upstart_job_running(op->agent)) {
- op->rc = PCMK_OCF_OK;
- } else {
- op->rc = PCMK_OCF_NOT_RUNNING;
+
+ char *path = get_first_instance(job);
+
+ op->rc = PCMK_OCF_NOT_RUNNING;
+ if(path) {
+ char *state = pcmk_dbus_get_property(
+ upstart_proxy, BUS_NAME, path, UPSTART_06_API ".Instance", "state",
+ op->synchronous?NULL:upstart_job_check, op);
+
+ free(job);
+ free(path);
+
+ if(op->synchronous) {
+ upstart_job_check("state", state, op);
+ free(state);
+ return op->rc == PCMK_OCF_OK;
+ }
+ return TRUE;
}
goto cleanup;
@@ -503,7 +512,7 @@ upstart_job_exec(svc_action_t * op, gboolean synchronous)
CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_BOOLEAN, &arg_wait, DBUS_TYPE_INVALID));
- if (synchronous == FALSE) {
+ if (op->synchronous == FALSE) {
free(job);
return pcmk_dbus_send(msg, upstart_proxy, upstart_async_dispatch, op);
}
@@ -545,7 +554,7 @@ upstart_job_exec(svc_action_t * op, gboolean synchronous)
dbus_message_unref(reply);
}
- if (synchronous == FALSE) {
+ if (op->synchronous == FALSE) {
operation_finalize(op);
return TRUE;
}
diff --git a/lrmd/lrmd.c b/lrmd/lrmd.c
index f3abfdb..7075b9f 100644
--- a/lrmd/lrmd.c
+++ b/lrmd/lrmd.c
@@ -874,6 +874,12 @@ stonith_action_complete(lrmd_cmd_t * cmd, int rc)
if (cmd->lrmd_op_status == PCMK_LRM_OP_CANCELLED) {
recurring = 0;
/* do nothing */
+
+ } else if (rc == -ENODEV && safe_str_eq(cmd->action, "monitor")) {
+ /* Not registered == inactive */
+ cmd->lrmd_op_status = PCMK_LRM_OP_DONE;
+ cmd->exec_rc = PCMK_OCF_NOT_RUNNING;
+
} else if (rc) {
/* Attempt to map return codes to op status if possible */
switch (rc) {
@@ -884,6 +890,7 @@ stonith_action_complete(lrmd_cmd_t * cmd, int rc)
cmd->lrmd_op_status = PCMK_LRM_OP_TIMEOUT;
break;
default:
+ /* TODO: This looks wrong. Status should be _DONE and exec_rc set to an error */
cmd->lrmd_op_status = PCMK_LRM_OP_ERROR;
}
} else {
diff --git a/lrmd/regression.py.in b/lrmd/regression.py.in
index b6b6718..a9a32ef 100755
--- a/lrmd/regression.py.in
+++ b/lrmd/regression.py.in
@@ -240,6 +240,13 @@ class Tests:
self.action_timeout = " -t 5000 "
if self.tls:
self.rsc_classes.remove("stonith")
+ if "systemd" in self.rsc_classes:
+ # the lrmd_dummy_daemon requires this module; we import it
+ # here just to guarantee it is installed before allowing this
+ # script to run. Otherwise, running without this import
+ # available would make all the systemd tests look like they fail,
+ # which is really scary looking. I'd rather see the import fail.
+ import systemd.daemon
print "Testing "+repr(self.rsc_classes)
diff --git a/mcp/pacemaker.combined.upstart.in b/mcp/pacemaker.combined.upstart.in
index 9540019..6301d10 100644
--- a/mcp/pacemaker.combined.upstart.in
+++ b/mcp/pacemaker.combined.upstart.in
@@ -30,6 +30,9 @@ pre-start script
# give it time to fail.
sleep 2
pidof corosync || { exit 1; }
+
+ # if you use crm_mon, uncomment the line below.
+ #start crm_mon
end script
post-start script
@@ -59,6 +62,9 @@ post-stop script
# and invalidate above "respawn" stanza.
#pidof crmd && killall -q -9 corosync
+ # if you use crm_mon, uncomment the line below.
+ #stop crm_mon
+
# if you use corosync-notifyd, uncomment the line below.
#stop corosync-notifyd || true
end script
diff --git a/pacemaker.spec.in b/pacemaker.spec.in
index bee6bfc..597fb3a 100644
--- a/pacemaker.spec.in
+++ b/pacemaker.spec.in
@@ -283,11 +283,13 @@ make DESTDIR=%{buildroot} docdir=%{pcmk_docdir} V=1 install
mkdir -p ${RPM_BUILD_ROOT}%{_sysconfdir}/sysconfig
mkdir -p ${RPM_BUILD_ROOT}%{_var}/lib/pacemaker/cores
install -m 644 mcp/pacemaker.sysconfig ${RPM_BUILD_ROOT}%{_sysconfdir}/sysconfig/pacemaker
+install -m 644 tools/crm_mon.sysconfig ${RPM_BUILD_ROOT}%{_sysconfdir}/sysconfig/crm_mon
%if %{with upstart_job}
mkdir -p ${RPM_BUILD_ROOT}%{_sysconfdir}/init
install -m 644 mcp/pacemaker.upstart ${RPM_BUILD_ROOT}%{_sysconfdir}/init/pacemaker.conf
install -m 644 mcp/pacemaker.combined.upstart ${RPM_BUILD_ROOT}%{_sysconfdir}/init/pacemaker.combined.conf
+install -m 644 tools/crm_mon.upstart ${RPM_BUILD_ROOT}%{_sysconfdir}/init/crm_mon.conf
%endif
# Scripts that should be executable
@@ -395,6 +397,7 @@ exit 0
%exclude %{_datadir}/pacemaker/tests
%config(noreplace) %{_sysconfdir}/sysconfig/pacemaker
+%config(noreplace) %{_sysconfdir}/sysconfig/crm_mon
%config(noreplace) %{_sysconfdir}/logrotate.d/pacemaker
%{_sbindir}/pacemakerd
@@ -451,6 +454,7 @@ exit 0
%if %{with upstart_job}
%config(noreplace) %{_sysconfdir}/init/pacemaker.conf
%config(noreplace) %{_sysconfdir}/init/pacemaker.combined.conf
+%config(noreplace) %{_sysconfdir}/init/crm_mon.conf
%endif
%files cli
diff --git a/pengine/allocate.c b/pengine/allocate.c
index f9f9f3c..8d02d9b 100644
--- a/pengine/allocate.c
+++ b/pengine/allocate.c
@@ -1680,16 +1680,41 @@ apply_remote_node_ordering(pe_working_set_t *data_set)
action,
pe_order_preserve | pe_order_implies_then | pe_order_runnable_left,
data_set);
-
} else if (safe_str_eq(action->task, "stop")) {
- custom_action_order(action->rsc,
- NULL,
- action,
- remote_rsc,
- generate_op_key(remote_rsc->id, RSC_STOP, 0),
- NULL,
- pe_order_preserve | pe_order_implies_first,
- data_set);
+ gboolean after_start = FALSE;
+
+ /* handle special case with baremetal remote where stop actions need to be
+ * ordered after the connection resource starts somewhere else. */
+ if (is_baremetal_remote_node(action->node)) {
+ node_t *cluster_node = remote_rsc->running_on ? remote_rsc->running_on->data : NULL;
+
+ /* if the cluster node that a baremetal connection resource
+ * is residing on is unclean, we can't process any operations on that
+ * remote node until after the connection starts somewhere else. */
+ if (cluster_node && cluster_node->details->unclean == TRUE) {
+ after_start = TRUE;
+ }
+ }
+
+ if (after_start) {
+ custom_action_order(remote_rsc,
+ generate_op_key(remote_rsc->id, RSC_START, 0),
+ NULL,
+ action->rsc,
+ NULL,
+ action,
+ pe_order_preserve | pe_order_implies_then | pe_order_runnable_left,
+ data_set);
+ } else {
+ custom_action_order(action->rsc,
+ NULL,
+ action,
+ remote_rsc,
+ generate_op_key(remote_rsc->id, RSC_STOP, 0),
+ NULL,
+ pe_order_preserve | pe_order_implies_first,
+ data_set);
+ }
}
}
}
diff --git a/pengine/regression.sh b/pengine/regression.sh
index 5f98215..bdc7d3a 100755
--- a/pengine/regression.sh
+++ b/pengine/regression.sh
@@ -762,9 +762,11 @@ echo ""
do_test remote-startup-probes "Baremetal remote-node startup probes"
do_test remote-startup "Startup a newly discovered remote-nodes with no status."
do_test remote-fence-unclean "Fence unclean baremetal remote-node"
+do_test remote-fence-unclean2 "Fence baremetal remote-node after cluster node fails and connection can not be recovered"
do_test remote-move "Move remote-node connection resource"
do_test remote-disable "Disable a baremetal remote-node"
do_test remote-orphaned "Properly shutdown orphaned connection resource"
+do_test remote-recover "Recover connection resource after cluster-node fails."
do_test remote-stale-node-entry "Make sure we properly handle leftover remote-node entries in the node section"
echo ""
test_results
diff --git a/pengine/test10/remote-fence-unclean2.dot b/pengine/test10/remote-fence-unclean2.dot
new file mode 100644
index 0000000..6cff564
--- /dev/null
+++ b/pengine/test10/remote-fence-unclean2.dot
@@ -0,0 +1,10 @@
+digraph "g" {
+"all_stopped" [ style=bold color="green" fontcolor="orange"]
+"fake_stop_0 rhel7-alt4" -> "all_stopped" [ style = bold]
+"fake_stop_0 rhel7-alt4" [ style=bold color="green" fontcolor="orange"]
+"stonith 'reboot' rhel7-alt4" -> "fake_stop_0 rhel7-alt4" [ style = bold]
+"stonith 'reboot' rhel7-alt4" -> "stonith_complete" [ style = bold]
+"stonith 'reboot' rhel7-alt4" [ style=bold color="green" fontcolor="black"]
+"stonith_complete" -> "all_stopped" [ style = bold]
+"stonith_complete" [ style=bold color="green" fontcolor="orange"]
+}
diff --git a/pengine/test10/remote-fence-unclean2.exp b/pengine/test10/remote-fence-unclean2.exp
new file mode 100644
index 0000000..e58b617
--- /dev/null
+++ b/pengine/test10/remote-fence-unclean2.exp
@@ -0,0 +1,49 @@
+<transition_graph cluster-delay="60s" stonith-timeout="60s" failed-stop-offset="INFINITY" failed-start-offset="INFINITY" transition_id="0">
+ <synapse id="0">
+ <action_set>
+ <pseudo_event id="6" operation="stop" operation_key="fake_stop_0">
+ <attributes CRM_meta_name="stop" CRM_meta_timeout="20000" />
+ </pseudo_event>
+ </action_set>
+ <inputs>
+ <trigger>
+ <crm_event id="8" operation="stonith" operation_key="stonith-rhel7-alt4-reboot" on_node="rhel7-alt4" on_node_uuid="rhel7-alt4"/>
+ </trigger>
+ </inputs>
+ </synapse>
+ <synapse id="1">
+ <action_set>
+ <crm_event id="8" operation="stonith" operation_key="stonith-rhel7-alt4-reboot" on_node="rhel7-alt4" on_node_uuid="rhel7-alt4">
+ <attributes CRM_meta_last_failure_fake="1411503989" CRM_meta_on_node="rhel7-alt4" CRM_meta_on_node_uuid="rhel7-alt4" CRM_meta_probe_complete="true" CRM_meta_stonith_action="reboot" />
+ </crm_event>
+ </action_set>
+ <inputs/>
+ </synapse>
+ <synapse id="2">
+ <action_set>
+ <pseudo_event id="7" operation="stonith_complete" operation_key="stonith_complete">
+ <attributes />
+ </pseudo_event>
+ </action_set>
+ <inputs>
+ <trigger>
+ <crm_event id="8" operation="stonith" operation_key="stonith-rhel7-alt4-reboot" on_node="rhel7-alt4" on_node_uuid="rhel7-alt4"/>
+ </trigger>
+ </inputs>
+ </synapse>
+ <synapse id="3">
+ <action_set>
+ <pseudo_event id="1" operation="all_stopped" operation_key="all_stopped">
+ <attributes />
+ </pseudo_event>
+ </action_set>
+ <inputs>
+ <trigger>
+ <pseudo_event id="6" operation="stop" operation_key="fake_stop_0"/>
+ </trigger>
+ <trigger>
+ <pseudo_event id="7" operation="stonith_complete" operation_key="stonith_complete"/>
+ </trigger>
+ </inputs>
+ </synapse>
+</transition_graph>
diff --git a/pengine/test10/remote-fence-unclean2.scores b/pengine/test10/remote-fence-unclean2.scores
new file mode 100644
index 0000000..10fc7fd
--- /dev/null
+++ b/pengine/test10/remote-fence-unclean2.scores
@@ -0,0 +1,13 @@
+Allocation scores:
+native_color: fake allocation score on rhel7-alt1: 0
+native_color: fake allocation score on rhel7-alt2: 0
+native_color: fake allocation score on rhel7-alt3: 0
+native_color: fake allocation score on rhel7-alt4: INFINITY
+native_color: rhel7-alt4 allocation score on rhel7-alt1: 0
+native_color: rhel7-alt4 allocation score on rhel7-alt2: 0
+native_color: rhel7-alt4 allocation score on rhel7-alt3: 0
+native_color: rhel7-alt4 allocation score on rhel7-alt4: -INFINITY
+native_color: shooter allocation score on rhel7-alt1: 0
+native_color: shooter allocation score on rhel7-alt2: 0
+native_color: shooter allocation score on rhel7-alt3: 0
+native_color: shooter allocation score on rhel7-alt4: -INFINITY
diff --git a/pengine/test10/remote-fence-unclean2.summary b/pengine/test10/remote-fence-unclean2.summary
new file mode 100644
index 0000000..bfaf77b
--- /dev/null
+++ b/pengine/test10/remote-fence-unclean2.summary
@@ -0,0 +1,30 @@
+
+Current cluster status:
+Node rhel7-alt1 (1): standby
+Node rhel7-alt2 (2): standby
+RemoteNode rhel7-alt4: UNCLEAN (offline)
+OFFLINE: [ rhel7-alt3 ]
+
+ shooter (stonith:fence_xvm): Stopped
+ rhel7-alt4 (ocf::pacemaker:remote): Stopped
+ fake (ocf::heartbeat:Dummy): Started rhel7-alt4
+
+Transition Summary:
+ * Stop fake (rhel7-alt4)
+
+Executing cluster transition:
+ * Fencing rhel7-alt4 (reboot)
+ * Pseudo action: stonith_complete
+ * Pseudo action: fake_stop_0
+ * Pseudo action: all_stopped
+
+Revised cluster status:
+Node rhel7-alt1 (1): standby
+Node rhel7-alt2 (2): standby
+OFFLINE: [ rhel7-alt3 ]
+RemoteOFFLINE: [ rhel7-alt4 ]
+
+ shooter (stonith:fence_xvm): Stopped
+ rhel7-alt4 (ocf::pacemaker:remote): Stopped
+ fake (ocf::heartbeat:Dummy): Stopped
+
diff --git a/pengine/test10/remote-fence-unclean2.xml b/pengine/test10/remote-fence-unclean2.xml
new file mode 100644
index 0000000..78fc4f1
--- /dev/null
+++ b/pengine/test10/remote-fence-unclean2.xml
@@ -0,0 +1,115 @@
+<cib crm_feature_set="3.0.9" validate-with="pacemaker-2.1" epoch="13" num_updates="8" admin_epoch="0" cib-last-written="Tue Sep 23 16:28:22 2014" have-quorum="1" dc-uuid="2">
+ <configuration>
+ <crm_config>
+ <cluster_property_set id="cib-bootstrap-options">
+ <nvpair id="cib-bootstrap-options-dc-version" name="dc-version" value="1.1.12-6da3f72"/>
+ <nvpair id="cib-bootstrap-options-cluster-infrastructure" name="cluster-infrastructure" value="corosync"/>
+ <nvpair id="cib-bootstrap-options-cluster-name" name="cluster-name" value="phd"/>
+ <nvpair id="cib-bootstrap-options-last-lrm-refresh" name="last-lrm-refresh" value="1411504087"/>
+ </cluster_property_set>
+ </crm_config>
+ <nodes>
+ <node id="1" uname="rhel7-alt1">
+ <instance_attributes id="nodes-1">
+ <nvpair id="nodes-1-standby" name="standby" value="on"/>
+ </instance_attributes>
+ </node>
+ <node id="2" uname="rhel7-alt2">
+ <instance_attributes id="nodes-2">
+ <nvpair id="nodes-2-standby" name="standby" value="on"/>
+ </instance_attributes>
+ </node>
+ <node id="3" uname="rhel7-alt3"/>
+ </nodes>
+ <resources>
+ <primitive class="stonith" id="shooter" type="fence_xvm">
+ <instance_attributes id="shooter-instance_attributes"/>
+ <operations>
+ <op id="shooter-monitor-interval-60s" interval="60s" name="monitor"/>
+ </operations>
+ </primitive>
+ <primitive class="ocf" id="rhel7-alt4" provider="pacemaker" type="remote">
+ <instance_attributes id="rhel7-alt4-instance_attributes"/>
+ <operations>
+ <op id="rhel7-alt4-start-timeout-15" interval="0s" name="start" timeout="15"/>
+ <op id="rhel7-alt4-stop-timeout-15" interval="0s" name="stop" timeout="15"/>
+ <op id="rhel7-alt4-monitor-timeout-15" interval="60s" name="monitor" timeout="15"/>
+ </operations>
+ </primitive>
+ <primitive class="ocf" id="fake" provider="heartbeat" type="Dummy">
+ <instance_attributes id="fake-instance_attributes"/>
+ <operations>
+ <op id="fake-start-timeout-20" interval="0s" name="start" timeout="20"/>
+ <op id="fake-stop-timeout-20" interval="0s" name="stop" timeout="20"/>
+ <op id="fake-monitor-interval-10" interval="10" name="monitor" timeout="20"/>
+ </operations>
+ </primitive>
+ </resources>
+ <constraints>
+ <rsc_location id="location-fake-rhel7-alt4-INFINITY" node="rhel7-alt4" rsc="fake" score="INFINITY"/>
+ </constraints>
+ </configuration>
+ <status>
+ <node_state id="2" uname="rhel7-alt2" in_ccm="true" crmd="online" crm-debug-origin="post_cache_update" join="member" expected="member">
+ <transient_attributes id="2">
+ <instance_attributes id="status-2">
+ <nvpair id="status-2-shutdown" name="shutdown" value="0"/>
+ <nvpair id="status-2-probe_complete" name="probe_complete" value="true"/>
+ </instance_attributes>
+ </transient_attributes>
+ <lrm id="2">
+ <lrm_resources>
+ <lrm_resource id="shooter" type="fence_xvm" class="stonith">
+ <lrm_rsc_op id="shooter_last_0" operation_key="shooter_stop_0" operation="stop" crm-debug-origin="build_active_RAs" crm_feature_set="3.0.9" transition-key="11:8:0:68028369-58a1-453d-bcdd-c1d1ade99959" transition-magic="0:0;11:8:0:68028369-58a1-453d-bcdd-c1d1ade99959" call-id="14" rc-code="0" op-status="0" interval="0" last-run="1411503701" last-rc-change="1411503701" exec-time="1" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8" on_node="rhel7-alt2"/>
+ </lrm_resource>
+ <lrm_resource id="rhel7-alt4" type="remote" class="ocf" provider="pacemaker">
+ <lrm_rsc_op id="rhel7-alt4_last_0" operation_key="rhel7-alt4_monitor_0" operation="monitor" crm-debug-origin="build_active_RAs" crm_feature_set="3.0.9" transition-key="8:5:7:68028369-58a1-453d-bcdd-c1d1ade99959" transition-magic="0:7;8:5:7:68028369-58a1-453d-bcdd-c1d1ade99959" call-id="1" rc-code="7" op-status="0" interval="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8" on_node="rhel7-alt2"/>
+ </lrm_resource>
+ <lrm_resource id="fake" type="Dummy" class="ocf" provider="heartbeat">
+ <lrm_rsc_op id="fake_last_0" operation_key="fake_monitor_0" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.9" transition-key="8:21:7:68028369-58a1-453d-bcdd-c1d1ade99959" transition-magic="0:7;8:21:7:68028369-58a1-453d-bcdd-c1d1ade99959" call-id="19" rc-code="7" op-status="0" interval="0" last-run="1411504086" last-rc-change="1411504086" exec-time="34" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8" on_node="rhel7-alt2" op-force-restart=" state " op-restart-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
+ </lrm_resource>
+ </lrm_resources>
+ </lrm>
+ </node_state>
+ <node_state id="1" uname="rhel7-alt1" in_ccm="true" crmd="online" crm-debug-origin="post_cache_update" join="member" expected="member">
+ <lrm id="1">
+ <lrm_resources>
+ <lrm_resource id="shooter" type="fence_xvm" class="stonith">
+ <lrm_rsc_op id="shooter_last_0" operation_key="shooter_stop_0" operation="stop" crm-debug-origin="do_update_resource" crm_feature_set="3.0.9" transition-key="11:23:0:68028369-58a1-453d-bcdd-c1d1ade99959" transition-magic="0:0;11:23:0:68028369-58a1-453d-bcdd-c1d1ade99959" call-id="19" rc-code="0" op-status="0" interval="0" last-run="1411504102" last-rc-change="1411504102" exec-time="1" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8" on_node="rhel7-alt1"/>
+ <lrm_rsc_op id="shooter_monitor_60000" operation_key="shooter_monitor_60000" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.9" transition-key="16:15:0:68028369-58a1-453d-bcdd-c1d1ade99959" transition-magic="0:0;16:15:0:68028369-58a1-453d-bcdd-c1d1ade99959" call-id="12" rc-code="0" op-status="0" interval="60000" last-rc-change="1411504079" exec-time="10" queue-time="0" op-digest="4811cef7f7f94e3a35a70be7916cb2fd" on_node="rhel7-alt1"/>
+ </lrm_resource>
+ <lrm_resource id="rhel7-alt4" type="remote" class="ocf" provider="pacemaker">
+ <lrm_rsc_op id="rhel7-alt4_last_0" operation_key="rhel7-alt4_monitor_0" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.9" transition-key="9:15:7:68028369-58a1-453d-bcdd-c1d1ade99959" transition-magic="0:7;9:15:7:68028369-58a1-453d-bcdd-c1d1ade99959" call-id="1" rc-code="7" op-status="0" interval="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8" on_node="rhel7-alt1"/>
+ </lrm_resource>
+ <lrm_resource id="fake" type="Dummy" class="ocf" provider="heartbeat">
+ <lrm_rsc_op id="fake_last_0" operation_key="fake_monitor_0" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.9" transition-key="8:18:7:68028369-58a1-453d-bcdd-c1d1ade99959" transition-magic="0:7;8:18:7:68028369-58a1-453d-bcdd-c1d1ade99959" call-id="17" rc-code="7" op-status="0" interval="0" last-run="1411504087" last-rc-change="1411504087" exec-time="29" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8" on_node="rhel7-alt1" op-force-restart=" state " op-restart-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
+ </lrm_resource>
+ </lrm_resources>
+ </lrm>
+ <transient_attributes id="1">
+ <instance_attributes id="status-1">
+ <nvpair id="status-1-shutdown" name="shutdown" value="0"/>
+ <nvpair id="status-1-probe_complete" name="probe_complete" value="true"/>
+ </instance_attributes>
+ </transient_attributes>
+ </node_state>
+ <node_state id="3" uname="rhel7-alt3" in_ccm="false" crmd="offline" crm-debug-origin="send_stonith_update" join="down" expected="down"/>
+ <node_state id="rhel7-alt4" remote_node="true" uname="rhel7-alt4" crm-debug-origin="post_cache_update">
+ <lrm id="rhel7-alt4">
+ <lrm_resources>
+ <lrm_resource id="fake" type="Dummy" class="ocf" provider="heartbeat">
+ <lrm_rsc_op id="fake_last_failure_0" operation_key="fake_monitor_0" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.9" transition-key="12:21:7:68028369-58a1-453d-bcdd-c1d1ade99959" transition-magic="0:0;12:21:7:68028369-58a1-453d-bcdd-c1d1ade99959" call-id="34" rc-code="0" op-status="0" interval="0" last-run="1411504087" last-rc-change="1411504087" exec-time="29" queue-time="1" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
+ <lrm_rsc_op id="fake_last_0" operation_key="fake_monitor_0" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.9" transition-key="12:21:7:68028369-58a1-453d-bcdd-c1d1ade99959" transition-magic="0:0;12:21:7:68028369-58a1-453d-bcdd-c1d1ade99959" call-id="34" rc-code="0" op-status="0" interval="0" last-run="1411504087" last-rc-change="1411504087" exec-time="29" queue-time="1" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8" on_node="rhel7-alt3" op-force-restart=" state " op-restart-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
+ <lrm_rsc_op id="fake_monitor_10000" operation_key="fake_monitor_10000" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.9" transition-key="16:22:0:68028369-58a1-453d-bcdd-c1d1ade99959" transition-magic="0:0;16:22:0:68028369-58a1-453d-bcdd-c1d1ade99959" call-id="35" rc-code="0" op-status="0" interval="10000" last-rc-change="1411504087" exec-time="29" queue-time="0" op-digest="4811cef7f7f94e3a35a70be7916cb2fd" on_node="rhel7-alt3"/>
+ </lrm_resource>
+ </lrm_resources>
+ </lrm>
+ <transient_attributes id="rhel7-alt4">
+ <instance_attributes id="status-rhel7-alt4">
+ <nvpair id="status-rhel7-alt4-probe_complete" name="probe_complete" value="true"/>
+ <nvpair id="status-rhel7-alt4-last-failure-fake" name="last-failure-fake" value="1411503989"/>
+ </instance_attributes>
+ </transient_attributes>
+ </node_state>
+ </status>
+</cib>
diff --git a/pengine/test10/remote-recover.dot b/pengine/test10/remote-recover.dot
new file mode 100644
index 0000000..1da6a7b
--- /dev/null
+++ b/pengine/test10/remote-recover.dot
@@ -0,0 +1,17 @@
+digraph "g" {
+"all_stopped" [ style=bold color="green" fontcolor="orange"]
+"fake_monitor_10000 rhel7-alt4" [ style=bold color="green" fontcolor="black"]
+"fake_start_0 rhel7-alt4" -> "fake_monitor_10000 rhel7-alt4" [ style = bold]
+"fake_start_0 rhel7-alt4" [ style=bold color="green" fontcolor="black"]
+"fake_stop_0 rhel7-alt4" -> "all_stopped" [ style = bold]
+"fake_stop_0 rhel7-alt4" -> "fake_start_0 rhel7-alt4" [ style = bold]
+"fake_stop_0 rhel7-alt4" [ style=bold color="green" fontcolor="black"]
+"rhel7-alt4_monitor_60000 rhel7-alt1" [ style=bold color="green" fontcolor="black"]
+"rhel7-alt4_start_0 rhel7-alt1" -> "fake_monitor_10000 rhel7-alt4" [ style = bold]
+"rhel7-alt4_start_0 rhel7-alt1" -> "fake_start_0 rhel7-alt4" [ style = bold]
+"rhel7-alt4_start_0 rhel7-alt1" -> "rhel7-alt4_monitor_60000 rhel7-alt1" [ style = bold]
+"rhel7-alt4_start_0 rhel7-alt1" [ style=bold color="green" fontcolor="black"]
+"shooter_monitor_60000 rhel7-alt1" [ style=bold color="green" fontcolor="black"]
+"shooter_start_0 rhel7-alt1" -> "shooter_monitor_60000 rhel7-alt1" [ style = bold]
+"shooter_start_0 rhel7-alt1" [ style=bold color="green" fontcolor="black"]
+}
diff --git a/pengine/test10/remote-recover.exp b/pengine/test10/remote-recover.exp
new file mode 100644
index 0000000..37e4f71
--- /dev/null
+++ b/pengine/test10/remote-recover.exp
@@ -0,0 +1,99 @@
+<transition_graph cluster-delay="60s" stonith-timeout="60s" failed-stop-offset="INFINITY" failed-start-offset="INFINITY" transition_id="0">
+ <synapse id="0">
+ <action_set>
+ <rsc_op id="8" operation="monitor" operation_key="shooter_monitor_60000" on_node="rhel7-alt1" on_node_uuid="1">
+ <primitive id="shooter" class="stonith" type="fence_xvm"/>
+ <attributes CRM_meta_interval="60000" CRM_meta_name="monitor" CRM_meta_timeout="20000" />
+ </rsc_op>
+ </action_set>
+ <inputs>
+ <trigger>
+ <rsc_op id="7" operation="start" operation_key="shooter_start_0" on_node="rhel7-alt1" on_node_uuid="1"/>
+ </trigger>
+ </inputs>
+ </synapse>
+ <synapse id="1">
+ <action_set>
+ <rsc_op id="7" operation="start" operation_key="shooter_start_0" on_node="rhel7-alt1" on_node_uuid="1">
+ <primitive id="shooter" class="stonith" type="fence_xvm"/>
+ <attributes CRM_meta_timeout="20000" />
+ </rsc_op>
+ </action_set>
+ <inputs/>
+ </synapse>
+ <synapse id="2">
+ <action_set>
+ <rsc_op id="10" operation="monitor" operation_key="rhel7-alt4_monitor_60000" on_node="rhel7-alt1" on_node_uuid="1">
+ <primitive id="rhel7-alt4" class="ocf" provider="pacemaker" type="remote"/>
+ <attributes CRM_meta_interval="60000" CRM_meta_name="monitor" CRM_meta_timeout="15000" />
+ </rsc_op>
+ </action_set>
+ <inputs>
+ <trigger>
+ <rsc_op id="9" operation="start" operation_key="rhel7-alt4_start_0" on_node="rhel7-alt1" on_node_uuid="1"/>
+ </trigger>
+ </inputs>
+ </synapse>
+ <synapse id="3">
+ <action_set>
+ <rsc_op id="9" operation="start" operation_key="rhel7-alt4_start_0" on_node="rhel7-alt1" on_node_uuid="1">
+ <primitive id="rhel7-alt4" class="ocf" provider="pacemaker" type="remote"/>
+ <attributes CRM_meta_name="start" CRM_meta_timeout="15000" />
+ </rsc_op>
+ </action_set>
+ <inputs/>
+ </synapse>
+ <synapse id="4">
+ <action_set>
+ <rsc_op id="13" operation="monitor" operation_key="fake_monitor_10000" on_node="rhel7-alt4" on_node_uuid="rhel7-alt4" router_node="rhel7-alt1">
+ <primitive id="fake" class="ocf" provider="heartbeat" type="Dummy"/>
+ <attributes CRM_meta_interval="10000" CRM_meta_name="monitor" CRM_meta_timeout="20000" />
+ </rsc_op>
+ </action_set>
+ <inputs>
+ <trigger>
+ <rsc_op id="9" operation="start" operation_key="rhel7-alt4_start_0" on_node="rhel7-alt1" on_node_uuid="1"/>
+ </trigger>
+ <trigger>
+ <rsc_op id="12" operation="start" operation_key="fake_start_0" on_node="rhel7-alt4" on_node_uuid="rhel7-alt4" router_node="rhel7-alt1"/>
+ </trigger>
+ </inputs>
+ </synapse>
+ <synapse id="5">
+ <action_set>
+ <rsc_op id="12" operation="start" operation_key="fake_start_0" on_node="rhel7-alt4" on_node_uuid="rhel7-alt4" router_node="rhel7-alt1">
+ <primitive id="fake" class="ocf" provider="heartbeat" type="Dummy"/>
+ <attributes CRM_meta_name="start" CRM_meta_timeout="20000" />
+ </rsc_op>
+ </action_set>
+ <inputs>
+ <trigger>
+ <rsc_op id="9" operation="start" operation_key="rhel7-alt4_start_0" on_node="rhel7-alt1" on_node_uuid="1"/>
+ </trigger>
+ <trigger>
+ <rsc_op id="11" operation="stop" operation_key="fake_stop_0" on_node="rhel7-alt4" on_node_uuid="rhel7-alt4" router_node="rhel7-alt1"/>
+ </trigger>
+ </inputs>
+ </synapse>
+ <synapse id="6">
+ <action_set>
+ <rsc_op id="11" operation="stop" operation_key="fake_stop_0" on_node="rhel7-alt4" on_node_uuid="rhel7-alt4" router_node="rhel7-alt1">
+ <primitive id="fake" class="ocf" provider="heartbeat" type="Dummy"/>
+ <attributes CRM_meta_name="stop" CRM_meta_timeout="20000" />
+ </rsc_op>
+ </action_set>
+ <inputs/>
+ </synapse>
+ <synapse id="7">
+ <action_set>
+ <pseudo_event id="1" operation="all_stopped" operation_key="all_stopped">
+ <attributes />
+ </pseudo_event>
+ </action_set>
+ <inputs>
+ <trigger>
+ <rsc_op id="11" operation="stop" operation_key="fake_stop_0" on_node="rhel7-alt4" on_node_uuid="rhel7-alt4" router_node="rhel7-alt1"/>
+ </trigger>
+ </inputs>
+ </synapse>
+</transition_graph>
diff --git a/pengine/test10/remote-recover.scores b/pengine/test10/remote-recover.scores
new file mode 100644
index 0000000..10fc7fd
--- /dev/null
+++ b/pengine/test10/remote-recover.scores
@@ -0,0 +1,13 @@
+Allocation scores:
+native_color: fake allocation score on rhel7-alt1: 0
+native_color: fake allocation score on rhel7-alt2: 0
+native_color: fake allocation score on rhel7-alt3: 0
+native_color: fake allocation score on rhel7-alt4: INFINITY
+native_color: rhel7-alt4 allocation score on rhel7-alt1: 0
+native_color: rhel7-alt4 allocation score on rhel7-alt2: 0
+native_color: rhel7-alt4 allocation score on rhel7-alt3: 0
+native_color: rhel7-alt4 allocation score on rhel7-alt4: -INFINITY
+native_color: shooter allocation score on rhel7-alt1: 0
+native_color: shooter allocation score on rhel7-alt2: 0
+native_color: shooter allocation score on rhel7-alt3: 0
+native_color: shooter allocation score on rhel7-alt4: -INFINITY
diff --git a/pengine/test10/remote-recover.summary b/pengine/test10/remote-recover.summary
new file mode 100644
index 0000000..8fd7480
--- /dev/null
+++ b/pengine/test10/remote-recover.summary
@@ -0,0 +1,36 @@
+
+Current cluster status:
+Node rhel7-alt2 (2): standby
+RemoteNode rhel7-alt4: UNCLEAN (offline)
+Online: [ rhel7-alt1 ]
+OFFLINE: [ rhel7-alt3 ]
+
+ shooter (stonith:fence_xvm): Stopped
+ rhel7-alt4 (ocf::pacemaker:remote): Stopped
+ fake (ocf::heartbeat:Dummy): Started rhel7-alt4
+
+Transition Summary:
+ * Start shooter (rhel7-alt1)
+ * Start rhel7-alt4 (rhel7-alt1)
+ * Restart fake (Started rhel7-alt4)
+
+Executing cluster transition:
+ * Resource action: shooter start on rhel7-alt1
+ * Resource action: rhel7-alt4 start on rhel7-alt1
+ * Resource action: fake stop on rhel7-alt4
+ * Pseudo action: all_stopped
+ * Resource action: shooter monitor=60000 on rhel7-alt1
+ * Resource action: rhel7-alt4 monitor=60000 on rhel7-alt1
+ * Resource action: fake start on rhel7-alt4
+ * Resource action: fake monitor=10000 on rhel7-alt4
+
+Revised cluster status:
+Node rhel7-alt2 (2): standby
+Online: [ rhel7-alt1 ]
+OFFLINE: [ rhel7-alt3 ]
+RemoteOnline: [ rhel7-alt4 ]
+
+ shooter (stonith:fence_xvm): Started rhel7-alt1
+ rhel7-alt4 (ocf::pacemaker:remote): Started rhel7-alt1
+ fake (ocf::heartbeat:Dummy): Started rhel7-alt4
+
diff --git a/pengine/test10/remote-recover.xml b/pengine/test10/remote-recover.xml
new file mode 100644
index 0000000..1a83dd9
--- /dev/null
+++ b/pengine/test10/remote-recover.xml
@@ -0,0 +1,114 @@
+<cib crm_feature_set="3.0.9" validate-with="pacemaker-2.1" epoch="13" num_updates="8" admin_epoch="0" cib-last-written="Tue Sep 23 16:28:22 2014" have-quorum="1" dc-uuid="2">
+ <configuration>
+ <crm_config>
+ <cluster_property_set id="cib-bootstrap-options">
+ <nvpair id="cib-bootstrap-options-dc-version" name="dc-version" value="1.1.12-6da3f72"/>
+ <nvpair id="cib-bootstrap-options-cluster-infrastructure" name="cluster-infrastructure" value="corosync"/>
+ <nvpair id="cib-bootstrap-options-cluster-name" name="cluster-name" value="phd"/>
+ <nvpair id="cib-bootstrap-options-last-lrm-refresh" name="last-lrm-refresh" value="1411504087"/>
+ </cluster_property_set>
+ </crm_config>
+ <nodes>
+ <node id="1" uname="rhel7-alt1">
+ <instance_attributes id="nodes-1">
+ </instance_attributes>
+ </node>
+ <node id="2" uname="rhel7-alt2">
+ <instance_attributes id="nodes-2">
+ <nvpair id="nodes-2-standby" name="standby" value="on"/>
+ </instance_attributes>
+ </node>
+ <node id="3" uname="rhel7-alt3"/>
+ </nodes>
+ <resources>
+ <primitive class="stonith" id="shooter" type="fence_xvm">
+ <instance_attributes id="shooter-instance_attributes"/>
+ <operations>
+ <op id="shooter-monitor-interval-60s" interval="60s" name="monitor"/>
+ </operations>
+ </primitive>
+ <primitive class="ocf" id="rhel7-alt4" provider="pacemaker" type="remote">
+ <instance_attributes id="rhel7-alt4-instance_attributes"/>
+ <operations>
+ <op id="rhel7-alt4-start-timeout-15" interval="0s" name="start" timeout="15"/>
+ <op id="rhel7-alt4-stop-timeout-15" interval="0s" name="stop" timeout="15"/>
+ <op id="rhel7-alt4-monitor-timeout-15" interval="60s" name="monitor" timeout="15"/>
+ </operations>
+ </primitive>
+ <primitive class="ocf" id="fake" provider="heartbeat" type="Dummy">
+ <instance_attributes id="fake-instance_attributes"/>
+ <operations>
+ <op id="fake-start-timeout-20" interval="0s" name="start" timeout="20"/>
+ <op id="fake-stop-timeout-20" interval="0s" name="stop" timeout="20"/>
+ <op id="fake-monitor-interval-10" interval="10" name="monitor" timeout="20"/>
+ </operations>
+ </primitive>
+ </resources>
+ <constraints>
+ <rsc_location id="location-fake-rhel7-alt4-INFINITY" node="rhel7-alt4" rsc="fake" score="INFINITY"/>
+ </constraints>
+ </configuration>
+ <status>
+ <node_state id="2" uname="rhel7-alt2" in_ccm="true" crmd="online" crm-debug-origin="post_cache_update" join="member" expected="member">
+ <transient_attributes id="2">
+ <instance_attributes id="status-2">
+ <nvpair id="status-2-shutdown" name="shutdown" value="0"/>
+ <nvpair id="status-2-probe_complete" name="probe_complete" value="true"/>
+ </instance_attributes>
+ </transient_attributes>
+ <lrm id="2">
+ <lrm_resources>
+ <lrm_resource id="shooter" type="fence_xvm" class="stonith">
+ <lrm_rsc_op id="shooter_last_0" operation_key="shooter_stop_0" operation="stop" crm-debug-origin="build_active_RAs" crm_feature_set="3.0.9" transition-key="11:8:0:68028369-58a1-453d-bcdd-c1d1ade99959" transition-magic="0:0;11:8:0:68028369-58a1-453d-bcdd-c1d1ade99959" call-id="14" rc-code="0" op-status="0" interval="0" last-run="1411503701" last-rc-change="1411503701" exec-time="1" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8" on_node="rhel7-alt2"/>
+ </lrm_resource>
+ <lrm_resource id="rhel7-alt4" type="remote" class="ocf" provider="pacemaker">
+ <lrm_rsc_op id="rhel7-alt4_last_0" operation_key="rhel7-alt4_monitor_0" operation="monitor" crm-debug-origin="build_active_RAs" crm_feature_set="3.0.9" transition-key="8:5:7:68028369-58a1-453d-bcdd-c1d1ade99959" transition-magic="0:7;8:5:7:68028369-58a1-453d-bcdd-c1d1ade99959" call-id="1" rc-code="7" op-status="0" interval="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8" on_node="rhel7-alt2"/>
+ </lrm_resource>
+ <lrm_resource id="fake" type="Dummy" class="ocf" provider="heartbeat">
+ <lrm_rsc_op id="fake_last_0" operation_key="fake_monitor_0" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.9" transition-key="8:21:7:68028369-58a1-453d-bcdd-c1d1ade99959" transition-magic="0:7;8:21:7:68028369-58a1-453d-bcdd-c1d1ade99959" call-id="19" rc-code="7" op-status="0" interval="0" last-run="1411504086" last-rc-change="1411504086" exec-time="34" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8" on_node="rhel7-alt2" op-force-restart=" state " op-restart-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
+ </lrm_resource>
+ </lrm_resources>
+ </lrm>
+ </node_state>
+ <node_state id="1" uname="rhel7-alt1" in_ccm="true" crmd="online" crm-debug-origin="post_cache_update" join="member" expected="member">
+ <lrm id="1">
+ <lrm_resources>
+ <lrm_resource id="shooter" type="fence_xvm" class="stonith">
+ <lrm_rsc_op id="shooter_last_0" operation_key="shooter_stop_0" operation="stop" crm-debug-origin="do_update_resource" crm_feature_set="3.0.9" transition-key="11:23:0:68028369-58a1-453d-bcdd-c1d1ade99959" transition-magic="0:0;11:23:0:68028369-58a1-453d-bcdd-c1d1ade99959" call-id="19" rc-code="0" op-status="0" interval="0" last-run="1411504102" last-rc-change="1411504102" exec-time="1" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8" on_node="rhel7-alt1"/>
+ <lrm_rsc_op id="shooter_monitor_60000" operation_key="shooter_monitor_60000" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.9" transition-key="16:15:0:68028369-58a1-453d-bcdd-c1d1ade99959" transition-magic="0:0;16:15:0:68028369-58a1-453d-bcdd-c1d1ade99959" call-id="12" rc-code="0" op-status="0" interval="60000" last-rc-change="1411504079" exec-time="10" queue-time="0" op-digest="4811cef7f7f94e3a35a70be7916cb2fd" on_node="rhel7-alt1"/>
+ </lrm_resource>
+ <lrm_resource id="rhel7-alt4" type="remote" class="ocf" provider="pacemaker">
+ <lrm_rsc_op id="rhel7-alt4_last_0" operation_key="rhel7-alt4_monitor_0" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.9" transition-key="9:15:7:68028369-58a1-453d-bcdd-c1d1ade99959" transition-magic="0:7;9:15:7:68028369-58a1-453d-bcdd-c1d1ade99959" call-id="1" rc-code="7" op-status="0" interval="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8" on_node="rhel7-alt1"/>
+ </lrm_resource>
+ <lrm_resource id="fake" type="Dummy" class="ocf" provider="heartbeat">
+ <lrm_rsc_op id="fake_last_0" operation_key="fake_monitor_0" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.9" transition-key="8:18:7:68028369-58a1-453d-bcdd-c1d1ade99959" transition-magic="0:7;8:18:7:68028369-58a1-453d-bcdd-c1d1ade99959" call-id="17" rc-code="7" op-status="0" interval="0" last-run="1411504087" last-rc-change="1411504087" exec-time="29" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8" on_node="rhel7-alt1" op-force-restart=" state " op-restart-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
+ </lrm_resource>
+ </lrm_resources>
+ </lrm>
+ <transient_attributes id="1">
+ <instance_attributes id="status-1">
+ <nvpair id="status-1-shutdown" name="shutdown" value="0"/>
+ <nvpair id="status-1-probe_complete" name="probe_complete" value="true"/>
+ </instance_attributes>
+ </transient_attributes>
+ </node_state>
+ <node_state id="3" uname="rhel7-alt3" in_ccm="false" crmd="offline" crm-debug-origin="send_stonith_update" join="down" expected="down"/>
+ <node_state id="rhel7-alt4" remote_node="true" uname="rhel7-alt4" crm-debug-origin="post_cache_update">
+ <lrm id="rhel7-alt4">
+ <lrm_resources>
+ <lrm_resource id="fake" type="Dummy" class="ocf" provider="heartbeat">
+ <lrm_rsc_op id="fake_last_failure_0" operation_key="fake_monitor_0" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.9" transition-key="12:21:7:68028369-58a1-453d-bcdd-c1d1ade99959" transition-magic="0:0;12:21:7:68028369-58a1-453d-bcdd-c1d1ade99959" call-id="34" rc-code="0" op-status="0" interval="0" last-run="1411504087" last-rc-change="1411504087" exec-time="29" queue-time="1" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
+ <lrm_rsc_op id="fake_last_0" operation_key="fake_monitor_0" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.9" transition-key="12:21:7:68028369-58a1-453d-bcdd-c1d1ade99959" transition-magic="0:0;12:21:7:68028369-58a1-453d-bcdd-c1d1ade99959" call-id="34" rc-code="0" op-status="0" interval="0" last-run="1411504087" last-rc-change="1411504087" exec-time="29" queue-time="1" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8" on_node="rhel7-alt3" op-force-restart=" state " op-restart-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
+ <lrm_rsc_op id="fake_monitor_10000" operation_key="fake_monitor_10000" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.9" transition-key="16:22:0:68028369-58a1-453d-bcdd-c1d1ade99959" transition-magic="0:0;16:22:0:68028369-58a1-453d-bcdd-c1d1ade99959" call-id="35" rc-code="0" op-status="0" interval="10000" last-rc-change="1411504087" exec-time="29" queue-time="0" op-digest="4811cef7f7f94e3a35a70be7916cb2fd" on_node="rhel7-alt3"/>
+ </lrm_resource>
+ </lrm_resources>
+ </lrm>
+ <transient_attributes id="rhel7-alt4">
+ <instance_attributes id="status-rhel7-alt4">
+ <nvpair id="status-rhel7-alt4-probe_complete" name="probe_complete" value="true"/>
+ <nvpair id="status-rhel7-alt4-last-failure-fake" name="last-failure-fake" value="1411503989"/>
+ </instance_attributes>
+ </transient_attributes>
+ </node_state>
+ </status>
+</cib>
diff --git a/tools/crm_attribute.c b/tools/crm_attribute.c
index 60d39b6..c37b096 100644
--- a/tools/crm_attribute.c
+++ b/tools/crm_attribute.c
@@ -235,6 +235,7 @@ main(int argc, char **argv)
/* we're updating cluster options - dont populate dest_node */
type = XML_CIB_TAG_CRMCONFIG;
+ } else if (safe_str_eq(type, XML_CIB_TAG_CRMCONFIG)) {
} else if (safe_str_neq(type, XML_CIB_TAG_TICKETS)) {
if (dest_uname == NULL) {
dest_uname = get_node_name(0);
diff --git a/tools/crm_mon.upstart.in b/tools/crm_mon.upstart.in
new file mode 100644
index 0000000..ef0fe7a
--- /dev/null
+++ b/tools/crm_mon.upstart.in
@@ -0,0 +1,39 @@
+# crm_mon - Upstart job running the Pacemaker cluster status monitor as a daemon
+#
+# Options are read from the sysconfig/default file named below, if present.
+
+kill timeout 3600
+respawn
+respawn limit 10 3600
+
+expect fork
+
+env prog=crm_mon
+env rpm_sysconf=@sysconfdir@/sysconfig/crm_mon
+env rpm_lockfile=@localstatedir@/lock/subsys/crm_mon
+env deb_sysconf=@sysconfdir@/default/crm_mon
+env deb_lockfile=@localstatedir@/lock/crm_mon
+
+
+script
+ [ -f "$rpm_sysconf" ] && . $rpm_sysconf
+ [ -f "$deb_sysconf" ] && . $deb_sysconf
+ exec $prog $OPTIONS
+end script
+
+post-start script
+ [ -f "$rpm_sysconf" ] && . $rpm_sysconf
+ [ -f "$deb_sysconf" ] && . $deb_sysconf
+ [ -z "$LOCK_FILE" -a -d @sysconfdir@/sysconfig ] && LOCK_FILE="$rpm_lockfile"
+ [ -z "$LOCK_FILE" -a -d @sysconfdir@/default ] && LOCK_FILE="$deb_lockfile"
+ touch $LOCK_FILE
+end script
+
+post-stop script
+ [ -f "$rpm_sysconf" ] && . $rpm_sysconf
+ [ -f "$deb_sysconf" ] && . $deb_sysconf
+ [ -z "$LOCK_FILE" -a -d @sysconfdir@/sysconfig ] && LOCK_FILE="$rpm_lockfile"
+ [ -z "$LOCK_FILE" -a -d @sysconfdir@/default ] && LOCK_FILE="$deb_lockfile"
+ rm -f $LOCK_FILE
+end script
+
diff --git a/tools/crm_resource.c b/tools/crm_resource.c
index 6537520..56583e0 100644
--- a/tools/crm_resource.c
+++ b/tools/crm_resource.c
@@ -2214,11 +2214,15 @@ main(int argc, char **argv)
}
} else if (rsc_cmd == 'C') {
-#if 0
+#if HAVE_ATOMIC_ATTRD
xmlNode *cmd = create_request(CRM_OP_REPROBE, NULL, host_uname,
CRM_SYSTEM_CRMD, crm_system_name, our_pid);
- crm_debug("Re-checking the state of all resources on %s", host_uname);
+ crm_debug("Re-checking the state of all resources on %s", host_uname?host_uname:"all nodes");
+
+ rc = attrd_update_delegate(
+ NULL, 'u', host_uname, "fail-count-*", NULL, XML_CIB_TAG_STATUS, NULL, NULL, NULL, FALSE);
+
if (crm_ipc_send(crmd_channel, cmd, 0, 0, NULL) > 0) {
start_mainloop();
}