commit 0fa5ce2c14fa36610630469c14c07537eb4f4807 Author: Andrew Beekhof Date: Wed Oct 1 16:56:59 2014 +1000 Import: pacemaker-rollup-be1e835 diff --git a/attrd/Makefile.am b/attrd/Makefile.am index 802a3fa..9d5e223 100644 --- a/attrd/Makefile.am +++ b/attrd/Makefile.am @@ -32,25 +32,12 @@ attrd_LDADD = $(top_builddir)/lib/cluster/libcrmcluster.la \ $(top_builddir)/lib/cib/libcib.la \ $(CLUSTERLIBS) -if BUILD_HEARTBEAT_SUPPORT -attrd_SOURCES += legacy.c -else - -if BUILD_CS_SUPPORT - -if BUILD_CS_PLUGIN -attrd_SOURCES += legacy.c -else -# Only build the new version where CPG is exclusively used for communication +if BUILD_ATOMIC_ATTRD attrd_SOURCES += main.c commands.c -endif - else attrd_SOURCES += legacy.c endif -endif - clean-generic: rm -f *.log *.debug *.xml *~ diff --git a/attrd/commands.c b/attrd/commands.c index 038e7e4..c48ef1b 100644 --- a/attrd/commands.c +++ b/attrd/commands.c @@ -17,6 +17,8 @@ */ #include +#include +#include #include #include @@ -63,7 +65,7 @@ typedef struct attribute_value_s { void write_attribute(attribute_t *a); void write_or_elect_attribute(attribute_t *a); -void attrd_peer_update(crm_node_t *peer, xmlNode *xml, bool filter); +void attrd_peer_update(crm_node_t *peer, xmlNode *xml, const char *host, bool filter); void attrd_peer_sync(crm_node_t *peer, xmlNode *xml); void attrd_peer_remove(uint32_t nodeid, const char *host, gboolean uncache, const char *source); @@ -191,16 +193,41 @@ attrd_client_message(crm_client_t *client, xmlNode *xml) char *host = crm_element_value_copy(xml, F_ATTRD_HOST); const char *attr = crm_element_value(xml, F_ATTRD_ATTRIBUTE); const char *value = crm_element_value(xml, F_ATTRD_VALUE); + const char *regex = crm_element_value(xml, F_ATTRD_REGEX); - a = g_hash_table_lookup(attributes, attr); + if(attr == NULL && regex) { + GHashTableIter aIter; + regex_t *r_patt = calloc(1, sizeof(regex_t)); + + crm_debug("Setting %s to %s", regex, value); + if (regcomp(r_patt, regex, REG_EXTENDED)) { + crm_err("Bad regex '%s' 
for update", regex); + regfree(r_patt); + free(r_patt); + return; + } - if(host == NULL) { + g_hash_table_iter_init(&aIter, attributes); + while (g_hash_table_iter_next(&aIter, (gpointer *) & attr, NULL)) { + int status = regexec(r_patt, attr, 0, NULL, 0); + + if(status == 0) { + crm_trace("Matched %s with %s", attr, regex); + crm_xml_add(xml, F_ATTRD_ATTRIBUTE, attr); + send_attrd_message(NULL, xml); + } + } + return; + + } else if(host == NULL) { crm_trace("Inferring host"); host = strdup(attrd_cluster->uname); crm_xml_add(xml, F_ATTRD_HOST, host); crm_xml_add_int(xml, F_ATTRD_HOST_ID, attrd_cluster->nodeid); } + a = g_hash_table_lookup(attributes, attr); + if (value) { int offset = 1; int int_value = 0; @@ -254,6 +281,7 @@ attrd_client_message(crm_client_t *client, xmlNode *xml) } if(broadcast) { + /* Ends up at attrd_peer_message() */ send_attrd_message(NULL, xml); } } @@ -265,6 +293,7 @@ attrd_peer_message(crm_node_t *peer, xmlNode *xml) const char *v = crm_element_value(xml, F_ATTRD_VERSION); const char *op = crm_element_value(xml, F_ATTRD_TASK); const char *election_op = crm_element_value(xml, F_CRM_TASK); + const char *host = crm_element_value(xml, F_ATTRD_HOST); if(election_op) { enum election_result rc = 0; @@ -293,7 +322,7 @@ attrd_peer_message(crm_node_t *peer, xmlNode *xml) const char *name = crm_element_value(xml, F_ATTRD_ATTRIBUTE); crm_trace("Compatibility update of %s from %s", name, peer->uname); - attrd_peer_update(peer, xml, FALSE); + attrd_peer_update(peer, xml, host, FALSE); } else if(safe_str_eq(op, "flush")) { const char *name = crm_element_value(xml, F_ATTRD_ATTRIBUTE); @@ -336,13 +365,12 @@ attrd_peer_message(crm_node_t *peer, xmlNode *xml) } if(safe_str_eq(op, "update")) { - attrd_peer_update(peer, xml, FALSE); + attrd_peer_update(peer, xml, host, FALSE); } else if(safe_str_eq(op, "sync")) { attrd_peer_sync(peer, xml); } else if(safe_str_eq(op, "peer-remove")) { - const char *host = crm_element_value(xml, F_ATTRD_HOST); 
attrd_peer_remove(0, host, TRUE, peer->uname); } else if(safe_str_eq(op, "sync-response") @@ -351,7 +379,8 @@ attrd_peer_message(crm_node_t *peer, xmlNode *xml) crm_notice("Processing %s from %s", op, peer->uname); for (child = __xml_first_child(xml); child != NULL; child = __xml_next(child)) { - attrd_peer_update(peer, child, TRUE); + host = crm_element_value(child, F_ATTRD_HOST); + attrd_peer_update(peer, child, host, TRUE); } } } @@ -409,12 +438,11 @@ attrd_peer_remove(uint32_t nodeid, const char *host, gboolean uncache, const cha } void -attrd_peer_update(crm_node_t *peer, xmlNode *xml, bool filter) +attrd_peer_update(crm_node_t *peer, xmlNode *xml, const char *host, bool filter) { bool changed = FALSE; attribute_value_t *v = NULL; - const char *host = crm_element_value(xml, F_ATTRD_HOST); const char *attr = crm_element_value(xml, F_ATTRD_ATTRIBUTE); const char *value = crm_element_value(xml, F_ATTRD_VALUE); @@ -424,6 +452,19 @@ attrd_peer_update(crm_node_t *peer, xmlNode *xml, bool filter) a = create_attribute(xml); } + if(host == NULL) { + GHashTableIter vIter; + g_hash_table_iter_init(&vIter, a->values); + + crm_debug("Setting %s for all hosts to %s", attr, value); + + xml_remove_prop(xml, F_ATTRD_HOST_ID); + while (g_hash_table_iter_next(&vIter, (gpointer *) & host, NULL)) { + attrd_peer_update(peer, xml, host, filter); + } + return; + } + v = g_hash_table_lookup(a->values, host); if(v == NULL) { diff --git a/cib/messages.c b/cib/messages.c index 4b79912..9c66349 100644 --- a/cib/messages.c +++ b/cib/messages.c @@ -292,6 +292,11 @@ cib_process_upgrade_server(const char *op, int options, const char *section, xml crm_xml_add(up, F_TYPE, "cib"); crm_xml_add(up, F_CIB_OPERATION, CIB_OP_UPGRADE); crm_xml_add(up, F_CIB_SCHEMA_MAX, get_schema_name(new_version)); + crm_xml_add(up, F_CIB_DELEGATED, host); + crm_xml_add(up, F_CIB_CLIENTID, crm_element_value(req, F_CIB_CLIENTID)); + crm_xml_add(up, F_CIB_CALLOPTS, crm_element_value(req, F_CIB_CALLOPTS)); + 
crm_xml_add(up, F_CIB_CALLID, crm_element_value(req, F_CIB_CALLID)); + send_cluster_message(NULL, crm_msg_cib, up, FALSE); free_xml(up); diff --git a/configure.ac b/configure.ac index 40adffe..1edff40 100644 --- a/configure.ac +++ b/configure.ac @@ -75,6 +75,7 @@ CC_IN_CONFIGURE=yes export CC_IN_CONFIGURE LDD=ldd +BUILD_ATOMIC_ATTRD=1 dnl ======================================================================== dnl Compiler characteristics @@ -1260,6 +1261,7 @@ case $SUPPORT_HEARTBEAT in dnl objdump -x ${libdir}/libccmclient.so | grep SONAME | awk '{print $2}' AC_DEFINE_UNQUOTED(CCM_LIBRARY, "libccmclient.so.1", Library to load for ccm support) AC_DEFINE_UNQUOTED(HEARTBEAT_LIBRARY, "libhbclient.so.1", Library to load for heartbeat support) + BUILD_ATOMIC_ATTRD=0 else SUPPORT_HEARTBEAT=0 fi @@ -1341,6 +1343,7 @@ SUPPORT_PLUGIN=0 if test $SUPPORT_CS = 1 -a x$HAVE_confdb = x1; then dnl Need confdb to support cman and the plugins SUPPORT_PLUGIN=1 + BUILD_ATOMIC_ATTRD=0 LCRSODIR=`$PKGCONFIG corosync --variable=lcrsodir` STACKS="$STACKS corosync-plugin" COROSYNC_LIBS="$COROSYNC_LIBS $confdb_LIBS" @@ -1382,6 +1385,9 @@ AM_CONDITIONAL(BUILD_CS_SUPPORT, test $SUPPORT_CS = 1) AM_CONDITIONAL(BUILD_CS_PLUGIN, test $SUPPORT_PLUGIN = 1) AM_CONDITIONAL(BUILD_CMAN, test $SUPPORT_CMAN = 1) +AM_CONDITIONAL(BUILD_ATOMIC_ATTRD, test $BUILD_ATOMIC_ATTRD = 1) +AC_DEFINE_UNQUOTED(HAVE_ATOMIC_ATTRD, $BUILD_ATOMIC_ATTRD, Support the new atomic attrd) + AC_SUBST(SUPPORT_CMAN) AC_SUBST(SUPPORT_CS) @@ -1401,6 +1407,9 @@ else PCMK_FEATURES="$PCMK_FEATURES $STACKS" fi +if test ${BUILD_ATOMIC_ATTRD} = 1; then + PCMK_FEATURES="$PCMK_FEATURES atomic-attrd" +fi AC_SUBST(CLUSTERLIBS) AC_SUBST(LCRSODIR) @@ -1871,6 +1880,7 @@ tools/Makefile \ tools/crm_report \ tools/report.common \ tools/cibsecret \ + tools/crm_mon.upstart \ xml/Makefile \ lib/gnu/Makefile \ ) diff --git a/crmd/lrm.c b/crmd/lrm.c index db0bffb..44634fb 100644 --- a/crmd/lrm.c +++ b/crmd/lrm.c @@ -1162,7 +1162,7 @@ 
get_lrm_resource(lrm_state_t * lrm_state, xmlNode * resource, xmlNode * op_msg, if (!rsc) { fsa_data_t *msg_data = NULL; - crm_err("Could not add resource %s to LRM", id); + crm_err("Could not add resource %s to LRM %s", id, lrm_state->node_name); register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL); } } @@ -1175,13 +1175,17 @@ delete_resource(lrm_state_t * lrm_state, const char *id, lrmd_rsc_info_t * rsc, GHashTableIter * gIter, - const char *sys, const char *host, const char *user, ha_msg_input_t * request) + const char *sys, + const char *host, + const char *user, + ha_msg_input_t * request, + gboolean unregister) { int rc = pcmk_ok; crm_info("Removing resource %s for %s (%s) on %s", id, sys, user ? user : "internal", host); - if (rsc) { + if (rsc && unregister) { rc = lrm_state_unregister_rsc(lrm_state, id, 0); } @@ -1224,6 +1228,7 @@ do_lrm_invoke(long long action, const char *user_name = NULL; const char *target_node = NULL; gboolean is_remote_node = FALSE; + gboolean crm_rsc_delete = FALSE; if (input->xml != NULL) { /* Remote node operations are routed here to their remote connections */ @@ -1259,6 +1264,8 @@ do_lrm_invoke(long long action, crm_trace("LRM command from: %s", from_sys); if (safe_str_eq(crm_op, CRM_OP_LRM_DELETE)) { + /* remember this delete op came from crm_resource */ + crm_rsc_delete = TRUE; operation = CRMD_ACTION_DELETE; } else if (safe_str_eq(crm_op, CRM_OP_LRM_REFRESH)) { @@ -1370,13 +1377,17 @@ do_lrm_invoke(long long action, } else if (safe_str_eq(operation, CRM_OP_REPROBE) || safe_str_eq(crm_op, CRM_OP_REPROBE)) { GHashTableIter gIter; rsc_history_t *entry = NULL; + gboolean unregister = is_remote_lrmd_ra(NULL, NULL, entry->id) ? FALSE : TRUE; crm_notice("Forcing the status of all resources to be redetected"); g_hash_table_iter_init(&gIter, lrm_state->resource_history); while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) { + /* only unregister the resource during a reprobe if it is not a remote connection + * resource. 
otherwise unregistering the connection will terminate remote-node + * membership */ delete_resource(lrm_state, entry->id, &entry->rsc, &gIter, from_sys, from_host, - user_name, NULL); + user_name, NULL, unregister); } /* Now delete the copy in the CIB */ @@ -1499,6 +1510,7 @@ do_lrm_invoke(long long action, free(op_key); } else if (rsc != NULL && safe_str_eq(operation, CRMD_ACTION_DELETE)) { + gboolean unregister = TRUE; #if ENABLE_ACL int cib_rc = delete_rsc_status(lrm_state, rsc->id, cib_dryrun | cib_sync_call, user_name); @@ -1523,7 +1535,11 @@ do_lrm_invoke(long long action, return; } #endif - delete_resource(lrm_state, rsc->id, rsc, NULL, from_sys, from_host, user_name, input); + if (crm_rsc_delete == TRUE && is_remote_lrmd_ra(NULL, NULL, rsc->id)) { + unregister = FALSE; + } + + delete_resource(lrm_state, rsc->id, rsc, NULL, from_sys, from_host, user_name, input, unregister); } else if (rsc != NULL) { do_lrm_rsc_op(lrm_state, rsc, operation, input->xml, input->msg); diff --git a/crmd/remote_lrmd_ra.c b/crmd/remote_lrmd_ra.c index 98f59c8..f3dedeb 100644 --- a/crmd/remote_lrmd_ra.c +++ b/crmd/remote_lrmd_ra.c @@ -251,6 +251,8 @@ connection_takeover_timeout_cb(gpointer data) crm_debug("takeover event timed out for node %s", cmd->rsc_id); cmd->takeover_timeout_id = 0; + lrm_state = lrm_state_find(cmd->rsc_id); + handle_remote_ra_stop(lrm_state, cmd); free_cmd(cmd); @@ -379,6 +381,11 @@ remote_lrm_op_callback(lrmd_event_data_t * op) cmd->rc = PCMK_OCF_UNKNOWN_ERROR; } else { + + if (safe_str_eq(cmd->action, "start")) { + /* clear PROBED value if it happens to be set after start completes. 
*/ + update_attrd(lrm_state->node_name, CRM_OP_PROBED, NULL, NULL, TRUE); + } lrm_state_reset_tables(lrm_state); cmd->rc = PCMK_OCF_OK; cmd->op_status = PCMK_LRM_OP_DONE; diff --git a/crmd/te_actions.c b/crmd/te_actions.c index 926996b..a3aa78b 100644 --- a/crmd/te_actions.c +++ b/crmd/te_actions.c @@ -546,17 +546,26 @@ te_update_job_count(crm_action_t * action, int offset) return; } - if (safe_str_eq(task, CRMD_ACTION_MIGRATE) || safe_str_eq(task, CRMD_ACTION_MIGRATED)) { + /* if we have a router node, this means the action is being performed + * on a remote node. For now, we count all actions occurring on a + * remote node against the job list on the cluster node hosting + * the connection resources */ + target = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE); + + if ((target == NULL) && + (safe_str_eq(task, CRMD_ACTION_MIGRATE) || safe_str_eq(task, CRMD_ACTION_MIGRATED))) { + const char *t1 = crm_meta_value(action->params, XML_LRM_ATTR_MIGRATE_SOURCE); const char *t2 = crm_meta_value(action->params, XML_LRM_ATTR_MIGRATE_TARGET); te_update_job_count_on(t1, offset, TRUE); te_update_job_count_on(t2, offset, TRUE); - - } else { - - te_update_job_count_on(target, offset, FALSE); + return; + } else if (target == NULL) { + target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); } + + te_update_job_count_on(target, offset, FALSE); } static gboolean @@ -597,6 +606,8 @@ te_should_perform_action_on(crm_graph_t * graph, crm_action_t * action, const ch } } + crm_trace("Peer %s has not hit their limit yet. current jobs = %d limit= %d limit", target, r->jobs, limit); + return TRUE; } @@ -611,7 +622,15 @@ te_should_perform_action(crm_graph_t * graph, crm_action_t * action) return TRUE; } - if (safe_str_eq(task, CRMD_ACTION_MIGRATE) || safe_str_eq(task, CRMD_ACTION_MIGRATED)) { + /* if we have a router node, this means the action is being performed + * on a remote node. 
For now, we count all actions occurring on a + * remote node against the job list on the cluster node hosting + * the connection resources */ + target = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE); + + if ((target == NULL) && + (safe_str_eq(task, CRMD_ACTION_MIGRATE) || safe_str_eq(task, CRMD_ACTION_MIGRATED))) { + target = crm_meta_value(action->params, XML_LRM_ATTR_MIGRATE_SOURCE); if(te_should_perform_action_on(graph, action, target) == FALSE) { return FALSE; @@ -619,7 +638,7 @@ te_should_perform_action(crm_graph_t * graph, crm_action_t * action) target = crm_meta_value(action->params, XML_LRM_ATTR_MIGRATE_TARGET); - } else { + } else if (target == NULL) { target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); } diff --git a/crmd/te_events.c b/crmd/te_events.c index afe3072..b81a13e 100644 --- a/crmd/te_events.c +++ b/crmd/te_events.c @@ -161,10 +161,6 @@ update_failcount(xmlNode * event, const char *event_node_uuid, int rc, int targe do_update = TRUE; value = failed_stop_offset; - } else if (safe_str_eq(task, CRMD_ACTION_STOP)) { - do_update = TRUE; - value = failed_stop_offset; - } else if (safe_str_eq(task, CRMD_ACTION_PROMOTE)) { do_update = TRUE; diff --git a/crmd/throttle.c b/crmd/throttle.c index 04a3cf1..6e853ae 100644 --- a/crmd/throttle.c +++ b/crmd/throttle.c @@ -430,7 +430,7 @@ throttle_mode(void) unsigned int blocked = 0; enum throttle_state_e mode = throttle_none; -#ifndef ON_SOLARIS +#ifdef ON_SOLARIS return throttle_none; #endif @@ -508,44 +508,41 @@ static void throttle_send_command(enum throttle_state_e mode) { xmlNode *xml = NULL; + static enum throttle_state_e last = -1; - xml = create_request(CRM_OP_THROTTLE, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL); - crm_xml_add_int(xml, F_CRM_THROTTLE_MODE, mode); - crm_xml_add_int(xml, F_CRM_THROTTLE_MAX, throttle_job_max); + if(mode != last) { + crm_info("New throttle mode: %.4x (was %.4x)", mode, last); + last = mode; - send_cluster_message(NULL, crm_msg_crmd, xml, TRUE); - 
free_xml(xml); + xml = create_request(CRM_OP_THROTTLE, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL); + crm_xml_add_int(xml, F_CRM_THROTTLE_MODE, mode); + crm_xml_add_int(xml, F_CRM_THROTTLE_MAX, throttle_job_max); - crm_info("Updated throttle state to %.4x", mode); + send_cluster_message(NULL, crm_msg_crmd, xml, TRUE); + free_xml(xml); + } } static gboolean throttle_timer_cb(gpointer data) { static bool send_updates = FALSE; - static enum throttle_state_e last = -1; - enum throttle_state_e now = throttle_none; - if(send_updates == FALSE) { - /* Optimize for the true case */ - if(compare_version(fsa_our_dc_version, "3.0.8") < 0) { - crm_trace("DC version %s doesn't support throttling", fsa_our_dc_version); - - } else { - send_updates = TRUE; - } - } - if(send_updates) { now = throttle_mode(); - } + throttle_send_command(now); + + } else if(compare_version(fsa_our_dc_version, "3.0.8") < 0) { + /* Optimize for the true case */ + crm_trace("DC version %s doesn't support throttling", fsa_our_dc_version); - if(send_updates && now != last) { - crm_debug("New throttle mode: %.4x (was %.4x)", now, last); + } else { + send_updates = TRUE; + now = throttle_mode(); throttle_send_command(now); - last = now; } + return TRUE; } @@ -595,9 +592,11 @@ throttle_update_job_max(const char *preference) void throttle_init(void) { - throttle_records = g_hash_table_new_full( - crm_str_hash, g_str_equal, NULL, throttle_record_free); - throttle_timer = mainloop_timer_add("throttle", 30* 1000, TRUE, throttle_timer_cb, NULL); + if(throttle_records == NULL) { + throttle_records = g_hash_table_new_full( + crm_str_hash, g_str_equal, NULL, throttle_record_free); + throttle_timer = mainloop_timer_add("throttle", 30 * 1000, TRUE, throttle_timer_cb, NULL); + } throttle_update_job_max(NULL); mainloop_timer_start(throttle_timer); diff --git a/cts/CTS.py b/cts/CTS.py index 04189f2..f4198c4 100644 --- a/cts/CTS.py +++ b/cts/CTS.py @@ -225,10 +225,13 @@ class CtsLab: class NodeStatus: def 
__init__(self, env): - pass + self.Env = env def IsNodeBooted(self, node): '''Return TRUE if the given node is booted (responds to pings)''' + if self.Env["docker"]: + return RemoteFactory().getInstance()("localhost", "docker inspect --format {{.State.Running}} %s | grep -q true" % node, silent=True) == 0 + return RemoteFactory().getInstance()("localhost", "ping -nq -c1 -w1 %s" % node, silent=True) == 0 def IsSshdUp(self, node): @@ -442,6 +445,9 @@ class ClusterManager(UserDict): self.debug("Quorum: %d Len: %d" % (q, len(self.Env["nodes"]))) return peer_list + for n in self.Env["nodes"]: + peer_state[n] = "unknown" + # Now see if any states need to be updated self.debug("looking for: " + repr(stonith.regexes)) shot = stonith.look(0) @@ -457,7 +463,8 @@ class ClusterManager(UserDict): peer_state[peer] = "complete" self.__instance_errorstoignore.append(self.templates["Pat:Fencing_ok"] % peer) - elif re.search(self.templates["Pat:Fencing_start"] % n, shot): + elif peer_state[n] != "complete" and re.search(self.templates["Pat:Fencing_start"] % n, shot): + # TODO: Correctly detect multiple fencing operations for the same host peer = n peer_state[peer] = "in-progress" self.__instance_errorstoignore.append(self.templates["Pat:Fencing_start"] % peer) diff --git a/cts/CTSlab.py b/cts/CTSlab.py index 314c347..9b336a5 100755 --- a/cts/CTSlab.py +++ b/cts/CTSlab.py @@ -107,9 +107,9 @@ if __name__ == '__main__': if Environment["ListTests"] == 1: Tests = TestList(cm, Audits) - Environment.log("Total %d tests"%len(Tests)) + LogFactory().log("Total %d tests"%len(Tests)) for test in Tests : - Environment.log(str(test.name)); + LogFactory().log(str(test.name)); sys.exit(0) elif len(Environment["tests"]) == 0: diff --git a/cts/CTStests.py b/cts/CTStests.py index 918dff0..cd5b7ce 100644 --- a/cts/CTStests.py +++ b/cts/CTStests.py @@ -83,6 +83,7 @@ class CTSTest: self.passed = 1 self.is_loop = 0 self.is_unsafe = 0 + self.is_docker_unsafe = 0 self.is_experimental = 0 self.is_container = 
0 self.is_valgrind = 0 @@ -224,6 +225,8 @@ class CTSTest: return 0 elif self.is_experimental and not self.Env["experimental-tests"]: return 0 + elif self.is_docker_unsafe and self.Env["docker"]: + return 0 elif self.is_container and not self.Env["container-tests"]: return 0 elif self.Env["benchmark"] and self.benchmark == 0: @@ -1359,6 +1362,8 @@ class ComponentFail(CTSTest): def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "ComponentFail" + # TODO make this work correctly in docker. + self.is_docker_unsafe = 1 self.startall = SimulStartLite(cm) self.complist = cm.Components() self.patterns = [] @@ -1419,6 +1424,15 @@ class ComponentFail(CTSTest): self.okerrpatterns.append(self.templates["Pat:ChildRespawn"] %(node, chosen.name)) self.okerrpatterns.append(self.templates["Pat:ChildExit"]) + if chosen.name == "stonith": + # Ignore actions for STONITH resources + (rc, lines) = self.rsh(node, "crm_resource -c", None) + for line in lines: + if re.search("^Resource", line): + r = AuditResource(self.CM, line) + if r.rclass == "stonith": + self.okerrpatterns.append(self.templates["LogActions: Recover.*%s"] % r.id) + # supply a copy so self.patterns doesnt end up empty tmpPats = [] tmpPats.extend(self.patterns) @@ -2512,6 +2526,7 @@ class RemoteLXC(CTSTest): self.startall = SimulStartLite(cm) self.num_containers = 2 self.is_container = 1 + self.is_docker_unsafe = 1 self.failed = 0 self.fail_string = "" @@ -2624,6 +2639,7 @@ class RemoteBaremetal(CTSTest): def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "RemoteBaremetal" + self.is_docker_unsafe = 1 self.start = StartTest(cm) self.startall = SimulStartLite(cm) self.stop = StopTest(cm) diff --git a/cts/environment.py b/cts/environment.py index de1d099..d741452 100644 --- a/cts/environment.py +++ b/cts/environment.py @@ -71,6 +71,7 @@ class Environment: self["loop-tests"] = 1 self["scenario"] = "random" self["stats"] = 0 + self["docker"] = 0 self.RandomGen = random.Random() self.logger = LogFactory() 
@@ -143,7 +144,9 @@ class Environment: # GoodThing(tm). try: n = node.strip() - gethostbyname_ex(n) + if self.data["docker"] == 0: + gethostbyname_ex(n) + self.Nodes.append(n) except: self.logger.log(node+" not found in DNS... aborting") @@ -191,7 +194,10 @@ class Environment: return "crm-lha" elif self.data["Stack"] == "corosync 2.x": - return "crm-mcp" + if self["docker"]: + return "crm-mcp-docker" + else: + return "crm-mcp" elif self.data["Stack"] == "corosync (cman)": return "crm-cman" @@ -342,6 +348,10 @@ class Environment: elif args[i] == "--qarsh": RemoteFactory().enable_qarsh() + elif args[i] == "--docker": + self["docker"] = 1 + RemoteFactory().enable_docker() + elif args[i] == "--stonith" or args[i] == "--fencing": skipthis=1 if args[i+1] == "1" or args[i+1] == "yes": @@ -352,6 +362,9 @@ class Environment: self["DoStonith"]=1 self["stonith-type"] = "fence_xvm" self["stonith-params"] = "pcmk_arg_map=domain:uname,delay=0" + elif args[i+1] == "docker": + self["DoStonith"]=1 + self["stonith-type"] = "fence_docker_cts" elif args[i+1] == "scsi": self["DoStonith"]=1 self["stonith-type"] = "fence_scsi" @@ -644,6 +657,7 @@ class Environment: print "\t [--container-tests] include pacemaker_remote tests that run in lxc container resources" print "\t [--oprofile 'node list'] list of cluster nodes to run oprofile on]" print "\t [--qarsh] use the QARSH backdoor to access nodes instead of SSH" + print "\t [--docker] Indicates nodes are docker nodes." print "\t [--seed random_seed]" print "\t [--set option=value]" print "\t " diff --git a/cts/lxc_autogen.sh.in b/cts/lxc_autogen.sh.in index 6900b67..e11532b 100755 --- a/cts/lxc_autogen.sh.in +++ b/cts/lxc_autogen.sh.in @@ -72,6 +72,7 @@ if [ $verify -eq 1 ]; then virsh -c lxc:/// list --all > /dev/null 2>&1 if [ $? 
-ne 0 ]; then echo "Could not connect 'virsh -c lxc:///' check that libvirt lxc driver is installed" + # yum install -y libvirt-daemon-driver-lxc libvirt-daemon-lxc libvirt-login-shell exit 1 fi diff --git a/cts/patterns.py b/cts/patterns.py index f651965..8d34e1c 100644 --- a/cts/patterns.py +++ b/cts/patterns.py @@ -364,9 +364,12 @@ class crm_cs_v0(BasePatterns): self.components["stonith-ignore"] = [ "LogActions: Recover Fencing", "Updating failcount for Fencing", + "error: crm_ipc_read: Connection to stonith-ng failed", + "error: mainloop_gio_callback: Connection to stonith-ng.*closed (I/O condition=17)", + "crit: tengine_stonith_connection_destroy: Fencing daemon connection failed", "error: te_connect_stonith:.*Sign-in failed: triggered a retry", "STONITH connection failed, finalizing .* pending operations.", - "process_lrm_event:.*Operation Fencing.* Error" + "process_lrm_event:.*Operation Fencing.* Error", ] self.components["stonith-ignore"].extend(self.components["common-ignore"]) @@ -409,6 +412,20 @@ class crm_mcp(crm_cs_v0): # "Pat:We_stopped" : "%s.*Stopped Corosync Cluster Engine", # }) +class crm_mcp_docker(crm_mcp): + ''' + The crm version 4 cluster manager class. + It implements the things we need to talk to and manipulate + crm clusters running on top of native corosync (no plugins) + ''' + def __init__(self, name): + crm_mcp.__init__(self, name) + + self.commands.update({ + "StartCmd" : "pcmk_start", + "StopCmd" : "pcmk_stop", + }) + class crm_cman(crm_cs_v0): ''' The crm version 3 cluster manager class. 
@@ -454,6 +471,8 @@ class PatternSelector: crm_cman(name) elif name == "crm-mcp": crm_mcp(name) + elif name == "crm-mcp-docker": + crm_mcp_docker(name) def get_variant(self, variant): if patternvariants.has_key(variant): diff --git a/cts/remote.py b/cts/remote.py index c8253c3..7920fc9 100644 --- a/cts/remote.py +++ b/cts/remote.py @@ -261,6 +261,12 @@ class RemoteFactory: def new(self, silent=False): return RemoteExec(RemoteFactory.rsh, silent) + def enable_docker(self): + print "Using DOCKER backend for connections to cluster nodes" + + RemoteFactory.rsh.Command = "/usr/libexec/phd/docker/phd_docker_remote_cmd " + RemoteFactory.rsh.CpCommand = "/usr/libexec/phd/docker/phd_docker_cp" + def enable_qarsh(self): # http://nstraz.wordpress.com/2008/12/03/introducing-qarsh/ print "Using QARSH for connections to cluster nodes" diff --git a/cts/watcher.py b/cts/watcher.py index d33e580..5e6ee43 100644 --- a/cts/watcher.py +++ b/cts/watcher.py @@ -165,7 +165,11 @@ class FileObj(SearchObj): global log_watcher_bin self.debug("Installing %s on %s" % (log_watcher_bin, host)) - self.rsh(host, '''echo "%s" > %s''' % (log_watcher, log_watcher_bin), silent=True) + + os.system("cat << END >> %s\n%s\nEND" %(log_watcher_bin, log_watcher)) + os.system("chmod 755 %s" %(log_watcher_bin)) + + self.rsh.cp(log_watcher_bin, "root@%s:%s" % (host, log_watcher_bin)) has_log_watcher[host] = 1 self.harvest() @@ -176,7 +180,8 @@ class FileObj(SearchObj): if match: last_offset = self.offset self.offset = match.group(1) - #if last_offset == "EOF": self.debug("Got %d lines, new offset: %s" % (len(lines), self.offset)) + #if last_offset == "EOF": self.debug("Got %d lines, new offset: %s" % (len(outLines), self.offset)) + self.debug("Got %d lines, new offset: %s %s" % (len(outLines), self.offset, repr(self.delegate))) elif re.search("^CTSwatcher:.*truncated", line): self.log(line) @@ -199,7 +204,7 @@ class FileObj(SearchObj): global log_watcher_bin return self.rsh.call_async(self.host, - "python %s -t 
%s -p CTSwatcher: -l 200 -f %s -o %s" % (log_watcher_bin, self.name, self.filename, self.offset), + "python %s -t %s -p CTSwatcher: -l 200 -f %s -o %s -t %s" % (log_watcher_bin, self.name, self.filename, self.offset, self.name), completionDelegate=self) def setend(self): @@ -208,7 +213,7 @@ class FileObj(SearchObj): global log_watcher_bin (rc, lines) = self.rsh(self.host, - "python %s -t %s -p CTSwatcher: -l 2 -f %s -o %s" % (log_watcher_bin, self.name, self.filename, "EOF"), + "python %s -t %s -p CTSwatcher: -l 2 -f %s -o %s -t %s" % (log_watcher_bin, self.name, self.filename, "EOF", self.name), None, silent=True) for line in lines: @@ -386,7 +391,7 @@ class LogWatcher(RemoteExec): def async_complete(self, pid, returncode, outLines, errLines): # TODO: Probably need a lock for updating self.line_cache - self.logger.debug("%s: Got %d lines from %d" % (self.name, len(outLines), pid)) + self.logger.debug("%s: Got %d lines from %d (total %d)" % (self.name, len(outLines), pid, len(self.line_cache))) if len(outLines): self.cache_lock.acquire() self.line_cache.extend(outLines) @@ -407,7 +412,7 @@ class LogWatcher(RemoteExec): for t in pending: t.join(60.0) if t.isAlive(): - self.logger.log("%s: Aborting after 20s waiting for %d logging commands" % (self.name, repr(t))) + self.logger.log("%s: Aborting after 20s waiting for %s logging commands" % (self.name, repr(t))) return #print "Got %d lines" % len(self.line_cache) @@ -484,9 +489,6 @@ class LogWatcher(RemoteExec): if len(self.line_cache) == 0 and end < time.time(): self.debug("Single search terminated: start=%d, end=%d, now=%d, lines=%d" % (begin, end, time.time(), lines)) return None - elif len(self.line_cache) == 0: - self.debug("Single search timed out: start=%d, end=%d, now=%d, lines=%d" % (begin, end, time.time(), lines)) - return None else: self.debug("Waiting: start=%d, end=%d, now=%d, lines=%d" % (begin, end, time.time(), len(self.line_cache))) time.sleep(1) @@ -520,6 +522,7 @@ class LogWatcher(RemoteExec): 
self.unmatched = self.regexes self.matched = returnresult self.regexes = save_regexes + self.end() return None returnresult.append(oneresult) diff --git a/extra/resources/remote b/extra/resources/remote index 9e0482b..9f141a2 100644 --- a/extra/resources/remote +++ b/extra/resources/remote @@ -62,11 +62,11 @@ meta_data() { - - - - - + + + + + diff --git a/fencing/commands.c b/fencing/commands.c index a4e9f30..577ea95 100644 --- a/fencing/commands.c +++ b/fencing/commands.c @@ -1094,7 +1094,10 @@ stonith_device_action(xmlNode * msg, char **output) device = g_hash_table_lookup(device_list, id); } - if (device) { + if (device && device->api_registered == FALSE) { + rc = -ENODEV; + + } else if (device) { cmd = create_async_command(msg); if (cmd == NULL) { free_device(device); diff --git a/fencing/main.c b/fencing/main.c index 5ae36cf..b03659e 100644 --- a/fencing/main.c +++ b/fencing/main.c @@ -415,7 +415,7 @@ topology_remove_helper(const char *node, int level) xmlNode *data = create_xml_node(NULL, F_STONITH_LEVEL); xmlNode *notify_data = create_xml_node(NULL, STONITH_OP_LEVEL_DEL); - crm_xml_add(data, "origin", __FUNCTION__); + crm_xml_add(data, F_STONITH_ORIGIN, __FUNCTION__); crm_xml_add_int(data, XML_ATTR_ID, level); crm_xml_add(data, F_STONITH_TARGET, node); diff --git a/include/crm/services.h b/include/crm/services.h index e8bc172..5310709 100644 --- a/include/crm/services.h +++ b/include/crm/services.h @@ -152,6 +152,7 @@ enum nagios_exitcode { int status; int sequence; int expected_rc; + int synchronous; char *stderr_data; char *stdout_data; diff --git a/include/crm_internal.h b/include/crm_internal.h index ba78da2..3eb88de 100644 --- a/include/crm_internal.h +++ b/include/crm_internal.h @@ -220,7 +220,7 @@ gboolean crm_remote_recv(crm_remote_t * remote, int total_timeout /*ms */ , int xmlNode *crm_remote_parse_buffer(crm_remote_t * remote); int crm_remote_tcp_connect(const char *host, int port); int crm_remote_tcp_connect_async(const char *host, int port, int 
timeout, /*ms */ - void *userdata, void (*callback) (void *userdata, int sock)); + int *timer_id, void *userdata, void (*callback) (void *userdata, int sock)); # ifdef HAVE_GNUTLS_GNUTLS_H /*! @@ -276,6 +276,7 @@ int crm_read_pidfile(const char *filename); # define attrd_channel T_ATTRD # define F_ATTRD_KEY "attr_key" # define F_ATTRD_ATTRIBUTE "attr_name" +# define F_ATTRD_REGEX "attr_regex" # define F_ATTRD_TASK "task" # define F_ATTRD_VALUE "attr_value" # define F_ATTRD_SET "attr_set" diff --git a/lib/cluster/membership.c b/lib/cluster/membership.c index 9410506..24700e5 100644 --- a/lib/cluster/membership.c +++ b/lib/cluster/membership.c @@ -389,7 +389,9 @@ crm_find_peer(unsigned int id, const char *uname) } } else if(uname && by_id->uname) { - crm_warn("Node '%s' and '%s' share the same cluster nodeid: %u", by_id->uname, by_name->uname, id); + crm_dump_peer_hash(LOG_INFO, __FUNCTION__); + crm_warn("Node '%s' and '%s' share the same cluster nodeid: %u %s", by_id->uname, by_name->uname, id, uname); + crm_abort(__FILE__, __FUNCTION__, __LINE__, "member weirdness", TRUE, TRUE); } else if(id && by_name->id) { crm_warn("Node %u and %u share the same name: '%s'", by_id->id, by_name->id, uname); diff --git a/lib/common/ipc.c b/lib/common/ipc.c index c1801a4..f26225f 100644 --- a/lib/common/ipc.c +++ b/lib/common/ipc.c @@ -806,7 +806,7 @@ crm_ipc_connect(crm_ipc_t * client) #ifdef HAVE_IPCS_GET_BUFFER_SIZE client->max_buf_size = qb_ipcc_get_buffer_size(client->ipc); - if (client->max_buf_size < client->buf_size) { + if (client->max_buf_size > client->buf_size) { free(client->buffer); client->buffer = calloc(1, client->max_buf_size); client->buf_size = client->max_buf_size; diff --git a/lib/common/remote.c b/lib/common/remote.c index 0a7cd93..e2492b9 100644 --- a/lib/common/remote.c +++ b/lib/common/remote.c @@ -737,11 +737,12 @@ check_connect_finished(gpointer userdata) static int internal_tcp_connect_async(int sock, const struct sockaddr *addr, socklen_t addrlen, int 
timeout /* ms */ , - void *userdata, void (*callback) (void *userdata, int sock)) + int *timer_id, void *userdata, void (*callback) (void *userdata, int sock)) { int rc = 0; int flag = 0; int interval = 500; + int timer; struct tcp_async_cb_data *cb_data = NULL; if ((flag = fcntl(sock, F_GETFL)) >= 0) { @@ -782,7 +783,10 @@ internal_tcp_connect_async(int sock, * Something about the way mainloop is currently polling prevents this from working at the * moment though. */ crm_trace("fd %d: scheduling to check if connect finished in %dms second", sock, interval); - g_timeout_add(interval, check_connect_finished, cb_data); + timer = g_timeout_add(interval, check_connect_finished, cb_data); + if (timer_id) { + *timer_id = timer; + } return 0; } @@ -809,10 +813,11 @@ internal_tcp_connect(int sock, const struct sockaddr *addr, socklen_t addrlen) * \internal * \brief tcp connection to server at specified port * \retval negative, failed to connect. + * \retval positive, sock fd */ int -crm_remote_tcp_connect_async(const char *host, int port, int timeout, /*ms */ - void *userdata, void (*callback) (void *userdata, int sock)) +crm_remote_tcp_connect_async(const char *host, int port, int timeout, /*ms */ + int *timer_id, void *userdata, void (*callback) (void *userdata, int sock)) { char buffer[256]; struct addrinfo *res = NULL; @@ -877,8 +882,7 @@ crm_remote_tcp_connect_async(const char *host, int port, int timeout, /*ms */ if (callback) { if (internal_tcp_connect_async - (sock, rp->ai_addr, rp->ai_addrlen, timeout, userdata, callback) == 0) { - sock = 0; + (sock, rp->ai_addr, rp->ai_addrlen, timeout, timer_id, userdata, callback) == 0) { goto async_cleanup; /* Success for now, we'll hear back later in the callback */ } @@ -903,5 +907,5 @@ async_cleanup: int crm_remote_tcp_connect(const char *host, int port) { - return crm_remote_tcp_connect_async(host, port, -1, NULL, NULL); + return crm_remote_tcp_connect_async(host, port, -1, NULL, NULL, NULL); } diff --git 
a/lib/common/utils.c b/lib/common/utils.c index e559c51..dc54e6d 100644 --- a/lib/common/utils.c +++ b/lib/common/utils.c @@ -2005,6 +2005,9 @@ attrd_update_delegate(crm_ipc_t * ipc, char command, const char *host, const cha } switch (command) { + case 'u': + crm_xml_add(update, F_ATTRD_TASK, "update"); + crm_xml_add(update, F_ATTRD_REGEX, name); case 'D': case 'U': case 'v': diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c index 2837682..06b9492 100644 --- a/lib/fencing/st_client.c +++ b/lib/fencing/st_client.c @@ -192,7 +192,7 @@ create_device_registration_xml(const char *id, const char *namespace, const char #endif crm_xml_add(data, XML_ATTR_ID, id); - crm_xml_add(data, "origin", __FUNCTION__); + crm_xml_add(data, F_STONITH_ORIGIN, __FUNCTION__); crm_xml_add(data, "agent", agent); crm_xml_add(data, "namespace", namespace); if (rsc_provides) { @@ -229,7 +229,7 @@ stonith_api_remove_device(stonith_t * st, int call_options, const char *name) xmlNode *data = NULL; data = create_xml_node(NULL, F_STONITH_DEVICE); - crm_xml_add(data, "origin", __FUNCTION__); + crm_xml_add(data, F_STONITH_ORIGIN, __FUNCTION__); crm_xml_add(data, XML_ATTR_ID, name); rc = stonith_send_command(st, STONITH_OP_DEVICE_DEL, data, NULL, call_options, 0); free_xml(data); @@ -244,7 +244,7 @@ stonith_api_remove_level(stonith_t * st, int options, const char *node, int leve xmlNode *data = NULL; data = create_xml_node(NULL, F_STONITH_LEVEL); - crm_xml_add(data, "origin", __FUNCTION__); + crm_xml_add(data, F_STONITH_ORIGIN, __FUNCTION__); crm_xml_add(data, F_STONITH_TARGET, node); crm_xml_add_int(data, XML_ATTR_ID, level); rc = stonith_send_command(st, STONITH_OP_LEVEL_DEL, data, NULL, options, 0); @@ -260,7 +260,7 @@ create_level_registration_xml(const char *node, int level, stonith_key_value_t * crm_xml_add_int(data, XML_ATTR_ID, level); crm_xml_add(data, F_STONITH_TARGET, node); - crm_xml_add(data, "origin", __FUNCTION__); + crm_xml_add(data, F_STONITH_ORIGIN, __FUNCTION__); for (; 
device_list; device_list = device_list->next) { xmlNode *dev = create_xml_node(data, F_STONITH_DEVICE); @@ -1255,7 +1255,7 @@ stonith_api_query(stonith_t * stonith, int call_options, const char *target, CRM_CHECK(devices != NULL, return -EINVAL); data = create_xml_node(NULL, F_STONITH_DEVICE); - crm_xml_add(data, "origin", __FUNCTION__); + crm_xml_add(data, F_STONITH_ORIGIN, __FUNCTION__); crm_xml_add(data, F_STONITH_TARGET, target); crm_xml_add(data, F_STONITH_ACTION, "off"); rc = stonith_send_command(stonith, STONITH_OP_QUERY, data, &output, call_options, timeout); @@ -1296,7 +1296,7 @@ stonith_api_call(stonith_t * stonith, xmlNode *data = NULL; data = create_xml_node(NULL, F_STONITH_DEVICE); - crm_xml_add(data, "origin", __FUNCTION__); + crm_xml_add(data, F_STONITH_ORIGIN, __FUNCTION__); crm_xml_add(data, F_STONITH_DEVICE, id); crm_xml_add(data, F_STONITH_ACTION, action); crm_xml_add(data, F_STONITH_TARGET, victim); diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c index 3496098..b8c5d23 100644 --- a/lib/lrmd/lrmd_client.c +++ b/lib/lrmd/lrmd_client.c @@ -89,6 +89,9 @@ typedef struct lrmd_private_s { int port; gnutls_psk_client_credentials_t psk_cred_c; + /* while the async connection is occuring, this is the id + * of the connection timeout timer. 
*/ + int async_timer; int sock; /* since tls requires a round trip across the network for a * request/reply, there are times where we just want to be able @@ -1101,6 +1104,8 @@ lrmd_tcp_connect_cb(void *userdata, int sock) int rc = sock; gnutls_datum_t psk_key = { NULL, 0 }; + native->async_timer = 0; + if (rc < 0) { lrmd_tls_connection_destroy(lrmd); crm_info("remote lrmd connect to %s at port %d failed", native->server, native->port); @@ -1152,14 +1157,23 @@ lrmd_tcp_connect_cb(void *userdata, int sock) static int lrmd_tls_connect_async(lrmd_t * lrmd, int timeout /*ms */ ) { - int rc = 0; + int rc = -1; + int sock = 0; + int timer_id = 0; + lrmd_private_t *native = lrmd->private; lrmd_gnutls_global_init(); - rc = crm_remote_tcp_connect_async(native->server, native->port, timeout, lrmd, + sock = crm_remote_tcp_connect_async(native->server, native->port, timeout, &timer_id, lrmd, lrmd_tcp_connect_cb); + if (sock != -1) { + native->sock = sock; + rc = 0; + native->async_timer = timer_id; + } + return rc; } @@ -1319,6 +1333,11 @@ lrmd_tls_disconnect(lrmd_t * lrmd) native->remote->tls_session = 0; } + if (native->async_timer) { + g_source_remove(native->async_timer); + native->async_timer = 0; + } + if (native->source != NULL) { /* Attached to mainloop */ mainloop_del_ipc_client(native->source); diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c index b699201..7127c12 100644 --- a/lib/pengine/unpack.c +++ b/lib/pengine/unpack.c @@ -1756,6 +1756,7 @@ process_rsc_state(resource_t * rsc, node_t * node, if (rsc->role > RSC_ROLE_STOPPED && node->details->online == FALSE && is_set(rsc->flags, pe_rsc_managed)) { + char *reason = NULL; gboolean should_fence = FALSE; /* if this is a remote_node living in a container, fence the container @@ -1768,14 +1769,25 @@ process_rsc_state(resource_t * rsc, node_t * node, should_fence = TRUE; } else if (is_set(data_set->flags, pe_flag_stonith_enabled)) { + if (is_baremetal_remote_node(node) && 
is_not_set(node->details->remote_rsc->flags, pe_rsc_failed)) { + /* setting unceen = true means that fencing of the remote node will + * only occur if the connection resource is not going to start somewhere. + * This allows connection resources on a failed cluster-node to move to + * another node without requiring the baremetal remote nodes to be fenced + * as well. */ + node->details->unseen = TRUE; + reason = g_strdup_printf("because %s is active there. Fencing will be revoked if remote-node connection can be re-established on another cluster-node.", rsc->id); + } should_fence = TRUE; } if (should_fence) { - char *reason = g_strdup_printf("because %s is thought to be active there", rsc->id); + if (reason == NULL) { + reason = g_strdup_printf("because %s is thought to be active there", rsc->id); + } pe_fence_node(data_set, node, reason); - g_free(reason); } + g_free(reason); } if (node->details->unclean) { @@ -1840,6 +1852,17 @@ process_rsc_state(resource_t * rsc, node_t * node, break; } + /* ensure a remote-node connection failure forces an unclean remote-node + * to be fenced. By setting unseen = FALSE, the remote-node failure will + * result in a fencing operation regardless if we're going to attempt to + * reconnect to the remote-node in this transition or not. */ + if (is_set(rsc->flags, pe_rsc_failed) && rsc->is_remote_node) { + node_t *tmpnode = pe_find_node(data_set->nodes, rsc->id); + if (tmpnode->details->unclean) { + tmpnode->details->unseen = FALSE; + } + } + if (rsc->role != RSC_ROLE_STOPPED && rsc->role != RSC_ROLE_UNKNOWN) { if (is_set(rsc->flags, pe_rsc_orphan)) { if (is_set(rsc->flags, pe_rsc_managed)) { @@ -2160,7 +2183,7 @@ unpack_lrm_resources(node_t * node, xmlNode * lrm_rsc_list, pe_working_set_t * d for (gIter = unexpected_containers; gIter != NULL; gIter = gIter->next) { remote = (resource_t *) gIter->data; if (remote->role != RSC_ROLE_STARTED) { - crm_warn("Recovering container resource %s. 
Resource is unexpectedly running and involves a remote-node."); + crm_warn("Recovering container resource %s. Resource is unexpectedly running and involves a remote-node.", remote->container->id); set_bit(remote->container->flags, pe_rsc_failed); } } diff --git a/lib/services/dbus.c b/lib/services/dbus.c index 8b8aee1..587589c 100644 --- a/lib/services/dbus.c +++ b/lib/services/dbus.c @@ -6,6 +6,14 @@ #define BUS_PROPERTY_IFACE "org.freedesktop.DBus.Properties" +struct db_getall_data +{ + char *name; + char *target; + char *object; + void *userdata; + void (*callback)(const char *name, const char *value, void *userdata); +}; static bool pcmk_dbus_error_check(DBusError *err, const char *prefix, const char *function, int line) { @@ -107,8 +115,9 @@ DBusMessage *pcmk_dbus_send_recv(DBusMessage *msg, DBusConnection *connection, D method = dbus_message_get_member (msg); // send message and get a handle for a reply - if (!dbus_connection_send_with_reply (connection, msg, &pending, -1)) { // -1 is default timeout + if (!dbus_connection_send_with_reply (connection, msg, &pending, -1/* aka. 
DBUS_TIMEOUT_USE_DEFAULT */)) { if(error) { + dbus_error_init(error); error->message = "Call to dbus_connection_send_with_reply() failed"; error->name = "org.clusterlabs.pacemaker.SendFailed"; } @@ -126,13 +135,7 @@ DBusMessage *pcmk_dbus_send_recv(DBusMessage *msg, DBusConnection *connection, D reply = dbus_pending_call_steal_reply(pending); } - if(pcmk_dbus_find_error(method, pending, reply, error)) { - crm_trace("Was error: '%s' '%s'", error->name, error->message); - if(reply) { - dbus_message_unref(reply); - reply = NULL; - } - } + pcmk_dbus_find_error(method, pending, reply, error); if(pending) { /* free the pending message handle */ @@ -156,7 +159,7 @@ bool pcmk_dbus_send(DBusMessage *msg, DBusConnection *connection, method = dbus_message_get_member (msg); // send message and get a handle for a reply - if (!dbus_connection_send_with_reply (connection, msg, &pending, -1)) { // -1 is default timeout + if (!dbus_connection_send_with_reply (connection, msg, &pending, -1/* aka. DBUS_TIMEOUT_USE_DEFAULT */)) { // -1 is default timeout crm_err("Send with reply failed for %s", method); return FALSE; @@ -205,65 +208,38 @@ bool pcmk_dbus_type_check(DBusMessage *msg, DBusMessageIter *field, int expected dbus_message_iter_init(msg, &args); do_crm_log_alias(LOG_ERR, __FILE__, function, line, - "Unexepcted DBus type, expected %c instead of %c in '%s'", - expected, dtype, dbus_message_iter_get_signature(&args)); + "Unexepcted DBus type, expected %c in '%s' instead of %c", + expected, dbus_message_iter_get_signature(&args), dtype); return FALSE; } return TRUE; } -char * -pcmk_dbus_get_property( - DBusConnection *connection, const char *target, const char *obj, const gchar * iface, const char *name) +static char * +pcmk_dbus_lookup_result(DBusMessage *reply, struct db_getall_data *data) { - DBusMessage *msg; - DBusMessageIter args; - DBusMessageIter dict; - DBusMessage *reply = NULL; - /* DBusBasicValue value; */ - const char *method = "GetAll"; - char *output = NULL; 
DBusError error; + char *output = NULL; + DBusMessageIter dict; + DBusMessageIter args; - /* desc = systemd_unit_property(path, BUS_NAME ".Unit", "Description"); */ - - dbus_error_init(&error); - crm_info("Calling: %s on %s", method, target); - msg = dbus_message_new_method_call(target, // target for the method call - obj, // object to call on - BUS_PROPERTY_IFACE, // interface to call on - method); // method name - - if (NULL == msg) { - crm_err("Call to %s failed: No message", method); - return NULL; - } - - CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &iface, DBUS_TYPE_INVALID)); - - reply = pcmk_dbus_send_recv(msg, connection, &error); - dbus_message_unref(msg); - - if(error.name) { - crm_err("Call to %s for %s failed: No reply", method, iface); - return NULL; - - } else if (!dbus_message_iter_init(reply, &args)) { - crm_err("Cannot get properties for %s from %s", obj, iface); - return NULL; + if(pcmk_dbus_find_error("GetAll", (void*)&error, reply, &error)) { + crm_err("Cannot get properties from %s for %s", data->target, data->object); + goto cleanup; } + dbus_message_iter_init(reply, &args); if(!pcmk_dbus_type_check(reply, &args, DBUS_TYPE_ARRAY, __FUNCTION__, __LINE__)) { - crm_err("Call to %s failed: Message has invalid arguments", method); - dbus_message_unref(reply); - return NULL; + crm_err("Invalid reply from %s for %s", data->target, data->object); + goto cleanup; } dbus_message_iter_recurse(&args, &dict); while (dbus_message_iter_get_arg_type (&dict) != DBUS_TYPE_INVALID) { DBusMessageIter sv; DBusMessageIter v; + DBusBasicValue name; DBusBasicValue value; if(!pcmk_dbus_type_check(reply, &dict, DBUS_TYPE_DICT_ENTRY, __FUNCTION__, __LINE__)) { @@ -277,10 +253,9 @@ pcmk_dbus_get_property( switch(dtype) { case DBUS_TYPE_STRING: - dbus_message_iter_get_basic(&sv, &value); + dbus_message_iter_get_basic(&sv, &name); - crm_trace("Got: %s", value.str); - if(strcmp(value.str, name) != 0) { + if(data->name && strcmp(name.str, data->name) != 0) 
{ dbus_message_iter_next (&sv); /* Skip the value */ } break; @@ -289,8 +264,17 @@ pcmk_dbus_get_property( if(pcmk_dbus_type_check(reply, &v, DBUS_TYPE_STRING, __FUNCTION__, __LINE__)) { dbus_message_iter_get_basic(&v, &value); - crm_trace("Result: %s", value.str); - output = strdup(value.str); + crm_trace("Property %s[%s] is '%s'", data->object, name.str, value.str); + if(data->callback) { + data->callback(name.str, value.str, data->userdata); + + } else { + output = strdup(value.str); + } + + if(data->name) { + goto cleanup; + } } break; default: @@ -302,8 +286,82 @@ pcmk_dbus_get_property( dbus_message_iter_next (&dict); } + cleanup: + free(data->target); + free(data->object); + free(data->name); + free(data); + + return output; +} + +static void +pcmk_dbus_lookup_cb(DBusPendingCall *pending, void *user_data) +{ + DBusMessage *reply = NULL; + + if(pending) { + reply = dbus_pending_call_steal_reply(pending); + } + + pcmk_dbus_lookup_result(reply, user_data); + + if(reply) { + dbus_message_unref(reply); + } +} + +char * +pcmk_dbus_get_property( + DBusConnection *connection, const char *target, const char *obj, const gchar * iface, const char *name, + void (*callback)(const char *name, const char *value, void *userdata), void *userdata) +{ + DBusMessage *msg; + const char *method = "GetAll"; + char *output = NULL; + + struct db_getall_data *query_data = NULL; + + /* char *state = pcmk_dbus_get_property(systemd_proxy, BUS_NAME, unit, BUS_NAME ".Unit", "ActiveState"); */ + + crm_debug("Calling: %s on %s", method, target); + msg = dbus_message_new_method_call(target, // target for the method call + obj, // object to call on + BUS_PROPERTY_IFACE, // interface to call on + method); // method name + + if (NULL == msg) { + crm_err("Call to %s failed: No message", method); + return NULL; + } + + CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &iface, DBUS_TYPE_INVALID)); + + query_data = malloc(sizeof(struct db_getall_data)); + query_data->target = 
strdup(target); + query_data->object = strdup(obj); + query_data->callback = callback; + query_data->userdata = userdata; + query_data->name = NULL; + + if(name) { + query_data->name = strdup(name); + } + + if(query_data->callback) { + pcmk_dbus_send(msg, connection, pcmk_dbus_lookup_cb, query_data); + + } else { + DBusMessage *reply = pcmk_dbus_send_recv(msg, connection, NULL); + + output = pcmk_dbus_lookup_result(reply, query_data); + if(reply) { + dbus_message_unref(reply); + } + } + + dbus_message_unref(msg); - crm_trace("Property %s[%s] is '%s'", obj, name, output); return output; } @@ -354,6 +412,14 @@ pcmk_dbus_watch_add(DBusWatch *watch, void *data){ } static void +pcmk_dbus_watch_toggle(DBusWatch *watch, void *data) +{ + mainloop_io_t *client = dbus_watch_get_data(watch); + crm_notice("DBus client %p is now %s", client, dbus_watch_get_enabled(watch)?"enabled":"disabled"); +} + + +static void pcmk_dbus_watch_remove(DBusWatch *watch, void *data){ mainloop_io_t *client = dbus_watch_get_data(watch); @@ -404,7 +470,7 @@ pcmk_dbus_timeout_toggle(DBusTimeout *timeout, void *data){ void pcmk_dbus_connection_setup_with_select(DBusConnection *c){ dbus_connection_set_timeout_functions( c, pcmk_dbus_timeout_add, pcmk_dbus_timeout_remove, pcmk_dbus_timeout_toggle, NULL, NULL); - dbus_connection_set_watch_functions(c, pcmk_dbus_watch_add, pcmk_dbus_watch_remove, NULL, NULL, NULL); + dbus_connection_set_watch_functions(c, pcmk_dbus_watch_add, pcmk_dbus_watch_remove, pcmk_dbus_watch_toggle, NULL, NULL); dbus_connection_set_dispatch_status_function(c, pcmk_dbus_connection_dispatch, NULL, NULL); pcmk_dbus_connection_dispatch(c, dbus_connection_get_dispatch_status(c), NULL); diff --git a/lib/services/pcmk-dbus.h b/lib/services/pcmk-dbus.h index 3b7a598..ed80c5f 100644 --- a/lib/services/pcmk-dbus.h +++ b/lib/services/pcmk-dbus.h @@ -6,7 +6,9 @@ bool pcmk_dbus_send(DBusMessage *msg, DBusConnection *connection, void(*done)(DBusPendingCall *pending, void *user_data), void 
*user_data); DBusMessage *pcmk_dbus_send_recv(DBusMessage *msg, DBusConnection *connection, DBusError *error); bool pcmk_dbus_type_check(DBusMessage *msg, DBusMessageIter *field, int expected, const char *function, int line); -char *pcmk_dbus_get_property(DBusConnection *connection, const char *target, const char *obj, const gchar * iface, const char *name); +char *pcmk_dbus_get_property( + DBusConnection *connection, const char *target, const char *obj, const gchar * iface, const char *name, + void (*callback)(const char *name, const char *value, void *userdata), void *userdata); bool pcmk_dbus_find_error(const char *method, DBusPendingCall* pending, DBusMessage *reply, DBusError *error); diff --git a/lib/services/services.c b/lib/services/services.c index 7b32405..8590b56 100644 --- a/lib/services/services.c +++ b/lib/services/services.c @@ -473,6 +473,7 @@ handle_duplicate_recurring(svc_action_t * op, void (*action_callback) (svc_actio gboolean services_action_async(svc_action_t * op, void (*action_callback) (svc_action_t *)) { + op->synchronous = false; if (action_callback) { op->opaque->callback = action_callback; } @@ -491,7 +492,7 @@ services_action_async(svc_action_t * op, void (*action_callback) (svc_action_t * } if (op->standard && strcasecmp(op->standard, "systemd") == 0) { #if SUPPORT_SYSTEMD - return systemd_unit_exec(op, FALSE); + return systemd_unit_exec(op); #endif } return services_os_action_execute(op, FALSE); @@ -502,6 +503,7 @@ services_action_sync(svc_action_t * op) { gboolean rc = TRUE; + op->synchronous = true; if (op == NULL) { crm_trace("No operation to execute"); return FALSE; @@ -512,7 +514,7 @@ services_action_sync(svc_action_t * op) #endif } else if (op->standard && strcasecmp(op->standard, "systemd") == 0) { #if SUPPORT_SYSTEMD - rc = systemd_unit_exec(op, TRUE); + rc = systemd_unit_exec(op); #endif } else { rc = services_os_action_execute(op, TRUE); diff --git a/lib/services/systemd.c b/lib/services/systemd.c index e81d178..c967430 
100644 --- a/lib/services/systemd.c +++ b/lib/services/systemd.c @@ -35,6 +35,9 @@ /* /usr/share/dbus-1/interfaces/org.freedesktop.systemd1.Manager.xml */ +gboolean +systemd_unit_exec_with_unit(svc_action_t * op, const char *unit); + struct unit_info { const char *id; @@ -49,6 +52,15 @@ struct unit_info { const char *job_path; }; +struct pcmk_dbus_data +{ + char *name; + char *unit; + DBusError error; + svc_action_t *op; + void (*callback)(DBusMessage *reply, svc_action_t *op); +}; + static DBusMessage *systemd_new_method(const char *iface, const char *method) { crm_trace("Calling: %s on %s", method, iface); @@ -101,6 +113,7 @@ systemd_service_name(const char *name) static bool systemd_daemon_reload(void) { + /* TODO: Make this asynchronous */ const char *method = "Reload"; DBusMessage *reply = NULL; DBusMessage *msg = systemd_new_method(BUS_NAME".Manager", method); @@ -114,21 +127,55 @@ systemd_daemon_reload(void) return TRUE; } -static gboolean -systemd_unit_by_name(const gchar * arg_name, gchar ** out_unit) +static const char * +systemd_loadunit_result(DBusMessage *reply, svc_action_t * op) +{ + const char *path = NULL; + + if(pcmk_dbus_find_error("LoadUnit", (void*)&path, reply, NULL)) { + if(op) { + crm_warn("No unit found for %s", op->rsc); + } + + } else if(pcmk_dbus_type_check(reply, NULL, DBUS_TYPE_OBJECT_PATH, __FUNCTION__, __LINE__)) { + dbus_message_get_args (reply, NULL, + DBUS_TYPE_OBJECT_PATH, &path, + DBUS_TYPE_INVALID); + } + + if(op) { + systemd_unit_exec_with_unit(op, path); + } + + return path; +} + + +static void +systemd_loadunit_cb(DBusPendingCall *pending, void *user_data) +{ + DBusMessage *reply = NULL; + + if(pending) { + reply = dbus_pending_call_steal_reply(pending); + } + + systemd_loadunit_result(reply, user_data); + + if(reply) { + dbus_message_unref(reply); + } +} + +static char * +systemd_unit_by_name(const gchar * arg_name, svc_action_t *op) { DBusMessage *msg; DBusMessage *reply = NULL; - const char *method = "GetUnit"; char *name 
= NULL; - DBusError error; /* - - - - - + Equivalent to GetUnit if its already loaded @@ -139,51 +186,34 @@ systemd_unit_by_name(const gchar * arg_name, gchar ** out_unit) return FALSE; } - name = systemd_service_name(arg_name); + msg = systemd_new_method(BUS_NAME".Manager", "LoadUnit"); + CRM_ASSERT(msg != NULL); - while(TRUE) { - msg = systemd_new_method(BUS_NAME".Manager", method); - CRM_ASSERT(msg != NULL); + name = systemd_service_name(arg_name); + CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &name, DBUS_TYPE_INVALID)); + free(name); - CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &name, DBUS_TYPE_INVALID)); + if(op == NULL || op->synchronous) { + const char *unit = NULL; + char *munit = NULL; + DBusError error; dbus_error_init(&error); reply = pcmk_dbus_send_recv(msg, systemd_proxy, &error); dbus_message_unref(msg); - if(error.name) { - crm_info("Call to %s failed: %s", method, error.name); - - } else if(pcmk_dbus_type_check(reply, NULL, DBUS_TYPE_OBJECT_PATH, __FUNCTION__, __LINE__)) { - if(out_unit) { - char *path = NULL; - - dbus_message_get_args (reply, NULL, - DBUS_TYPE_OBJECT_PATH, &path, - DBUS_TYPE_INVALID); - - *out_unit = strdup(path); - } - dbus_message_unref(reply); - free(name); - return TRUE; + unit = systemd_loadunit_result(reply, op); + if(unit) { + munit = strdup(unit); } - - if(strcmp(method, "LoadUnit") != 0) { - method = "LoadUnit"; - crm_debug("Cannot find %s, reloading the systemd manager configuration", name); - systemd_daemon_reload(); - if(reply) { - dbus_message_unref(reply); - reply = NULL; - } - - } else { - free(name); - return FALSE; + if(reply) { + dbus_message_unref(reply); } + return munit; } - return FALSE; + + pcmk_dbus_send(msg, systemd_proxy, systemd_loadunit_cb, op); + return NULL; } GList * @@ -220,6 +250,10 @@ systemd_unit_listall(void) crm_err("Call to %s failed: %s", method, error.name); return NULL; + } else if (reply == NULL) { + crm_err("Call to %s failed: Message has no reply", 
method); + return NULL; + } else if (!dbus_message_iter_init(reply, &args)) { crm_err("Call to %s failed: Message has no arguments", method); dbus_message_unref(reply); @@ -269,21 +303,27 @@ systemd_unit_listall(void) gboolean systemd_unit_exists(const char *name) { - return systemd_unit_by_name(name, NULL); + /* Note: Makes a blocking dbus calls + * Used by resources_find_service_class() when resource class=service + */ + if(systemd_unit_by_name(name, NULL)) { + return TRUE; + } + return FALSE; } static char * systemd_unit_metadata(const char *name) { - char *path = NULL; char *meta = NULL; char *desc = NULL; + char *path = systemd_unit_by_name(name, NULL); - if (systemd_unit_by_name(name, &path)) { - CRM_ASSERT(path); - desc = pcmk_dbus_get_property(systemd_proxy, BUS_NAME, path, BUS_NAME ".Unit", "Description"); + if (path) { + /* TODO: Worth a making blocking call for? Probably not. Possibly if cached. */ + desc = pcmk_dbus_get_property(systemd_proxy, BUS_NAME, path, BUS_NAME ".Unit", "Description", NULL, NULL); } else { - desc = g_strdup_printf("systemd unit file for %s", name); + desc = g_strdup_printf("Systemd unit file for %s", name); } meta = g_strdup_printf("\n" @@ -335,24 +375,15 @@ systemd_mask_error(svc_action_t *op, const char *error) } static void -systemd_async_dispatch(DBusPendingCall *pending, void *user_data) +systemd_exec_result(DBusMessage *reply, svc_action_t *op) { DBusError error; - DBusMessage *reply = NULL; - svc_action_t *op = user_data; - dbus_error_init(&error); - if(pending) { - reply = dbus_pending_call_steal_reply(pending); - } - if(reply == NULL) { - crm_err("No reply for %s action on %s", op->action, op->rsc); - - } else if(pcmk_dbus_find_error(op->action, pending, reply, &error)) { + if(pcmk_dbus_find_error(op->action, (void*)&error, reply, &error)) { /* ignore "already started" or "not running" errors */ if (!systemd_mask_error(op, error.name)) { - crm_err("%s for %s: %s", op->action, op->rsc, error.message); + crm_err("Could not 
issue %s for %s: %s (%s)", op->action, op->rsc, error.message); } } else { @@ -372,6 +403,21 @@ systemd_async_dispatch(DBusPendingCall *pending, void *user_data) } operation_finalize(op); +} + +static void +systemd_async_dispatch(DBusPendingCall *pending, void *user_data) +{ + DBusError error; + DBusMessage *reply = NULL; + svc_action_t *op = user_data; + + dbus_error_init(&error); + if(pending) { + reply = dbus_pending_call_steal_reply(pending); + } + + systemd_exec_result(reply, op); if(pending) { dbus_pending_call_unref(pending); @@ -383,61 +429,56 @@ systemd_async_dispatch(DBusPendingCall *pending, void *user_data) #define SYSTEMD_OVERRIDE_ROOT "/run/systemd/system/" +static void +systemd_unit_check(const char *name, const char *state, void *userdata) +{ + svc_action_t * op = userdata; + + CRM_ASSERT(state != NULL); + + if (g_strcmp0(state, "active") == 0) { + op->rc = PCMK_OCF_OK; + } else if (g_strcmp0(state, "activating") == 0) { + op->rc = PCMK_OCF_PENDING; + } else { + op->rc = PCMK_OCF_NOT_RUNNING; + } + + if (op->synchronous == FALSE) { + operation_finalize(op); + } +} + gboolean -systemd_unit_exec(svc_action_t * op, gboolean synchronous) +systemd_unit_exec_with_unit(svc_action_t * op, const char *unit) { - DBusError error; - char *unit = NULL; - const char *replace_s = "replace"; - gboolean pass = FALSE; const char *method = op->action; - char *name = systemd_service_name(op->agent); DBusMessage *msg = NULL; DBusMessage *reply = NULL; - dbus_error_init(&error); - op->rc = PCMK_OCF_UNKNOWN_ERROR; - CRM_ASSERT(systemd_init()); - - crm_debug("Performing %ssynchronous %s op on systemd unit %s named '%s'", - synchronous ? 
"" : "a", op->action, op->agent, op->rsc); - - if (safe_str_eq(op->action, "meta-data")) { - op->stdout_data = systemd_unit_metadata(op->agent); - op->rc = PCMK_OCF_OK; - goto cleanup; - } + CRM_ASSERT(unit); - pass = systemd_unit_by_name(op->agent, &unit); - if (pass == FALSE) { + if (unit == NULL) { crm_debug("Could not obtain unit named '%s'", op->agent); -#if 0 - if (error && strstr(error->message, "systemd1.NoSuchUnit")) { - op->rc = PCMK_OCF_NOT_INSTALLED; - op->status = PCMK_LRM_OP_NOT_INSTALLED; - } -#endif + op->rc = PCMK_OCF_NOT_INSTALLED; + op->status = PCMK_LRM_OP_NOT_INSTALLED; goto cleanup; } if (safe_str_eq(op->action, "monitor") || safe_str_eq(method, "status")) { - char *state = pcmk_dbus_get_property(systemd_proxy, BUS_NAME, unit, BUS_NAME ".Unit", "ActiveState"); - - if (g_strcmp0(state, "active") == 0) { - op->rc = PCMK_OCF_OK; - } else if (g_strcmp0(state, "activating") == 0) { - op->rc = PCMK_OCF_PENDING; - } else { - op->rc = PCMK_OCF_NOT_RUNNING; + char *state = pcmk_dbus_get_property(systemd_proxy, BUS_NAME, unit, BUS_NAME ".Unit", "ActiveState", + op->synchronous?NULL:systemd_unit_check, op); + if (op->synchronous) { + systemd_unit_check("ActiveState", state, op); + free(state); + return op->rc == PCMK_OCF_OK; } - - free(state); - goto cleanup; + return TRUE; } else if (g_strcmp0(method, "start") == 0) { FILE *file_strm = NULL; char *override_dir = g_strdup_printf("%s/%s", SYSTEMD_OVERRIDE_ROOT, unit); - char *override_file = g_strdup_printf("%s/50-pacemaker.conf", override_dir); + char *override_file = g_strdup_printf("%s/%s/50-pacemaker.conf", SYSTEMD_OVERRIDE_ROOT, unit); method = "StartUnit"; crm_build_path(override_dir, 0755); @@ -446,11 +487,11 @@ systemd_unit_exec(svc_action_t * op, gboolean synchronous) if (file_strm != NULL) { int rc = fprintf(file_strm, "[Service]\nRestart=no"); if (rc < 0) { - crm_perror(LOG_ERR, "Cannot write to systemd override file %s: %s (%d)", override_file, pcmk_strerror(errno), errno); + 
crm_perror(LOG_ERR, "Cannot write to systemd override file %s", override_file); } } else { - crm_err("Cannot open systemd override file %s for writing: %s (%d)", override_file, pcmk_strerror(errno), errno); + crm_err("Cannot open systemd override file %s for writing", override_file); } if (file_strm != NULL) { @@ -471,6 +512,7 @@ systemd_unit_exec(svc_action_t * op, gboolean synchronous) } else if (g_strcmp0(method, "restart") == 0) { method = "RestartUnit"; + } else { op->rc = PCMK_OCF_UNIMPLEMENT_FEATURE; goto cleanup; @@ -482,54 +524,66 @@ systemd_unit_exec(svc_action_t * op, gboolean synchronous) CRM_ASSERT(msg != NULL); /* (ss) */ - CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &name, DBUS_TYPE_INVALID)); - CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &replace_s, DBUS_TYPE_INVALID)); + { + const char *replace_s = "replace"; + char *name = systemd_service_name(op->agent); + + CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &name, DBUS_TYPE_INVALID)); + CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &replace_s, DBUS_TYPE_INVALID)); - if (synchronous == FALSE) { - free(unit); free(name); - return pcmk_dbus_send(msg, systemd_proxy, systemd_async_dispatch, op); } - dbus_error_init(&error); - reply = pcmk_dbus_send_recv(msg, systemd_proxy, &error); - - if(error.name) { - /* ignore "already started" or "not running" errors */ - if(!systemd_mask_error(op, error.name)) { - crm_err("Could not issue %s for %s: %s (%s)", method, op->rsc, error.name, unit); - } - goto cleanup; - - } else if(!pcmk_dbus_type_check(reply, NULL, DBUS_TYPE_OBJECT_PATH, __FUNCTION__, __LINE__)) { - crm_warn("Call to %s passed but return type was unexpected", op->action); - op->rc = PCMK_OCF_OK; + if (op->synchronous == FALSE) { + return pcmk_dbus_send(msg, systemd_proxy, systemd_async_dispatch, op); } else { - const char *path = NULL; + DBusError error; - dbus_message_get_args (reply, NULL, - DBUS_TYPE_OBJECT_PATH, &path, - 
DBUS_TYPE_INVALID); - crm_info("Call to %s passed: %s", op->action, path); - op->rc = PCMK_OCF_OK; + reply = pcmk_dbus_send_recv(msg, systemd_proxy, &error); + systemd_exec_result(reply, op); + if(reply) { + dbus_message_unref(reply); + } } - cleanup: - free(unit); - free(name); - if(msg) { dbus_message_unref(msg); } - if(reply) { - dbus_message_unref(reply); + cleanup: + if (op->synchronous == FALSE) { + operation_finalize(op); + return TRUE; } - if (synchronous == FALSE) { - operation_finalize(op); + return op->rc == PCMK_OCF_OK; +} + +gboolean +systemd_unit_exec(svc_action_t * op) +{ + CRM_ASSERT(op); + CRM_ASSERT(systemd_init()); + op->rc = PCMK_OCF_UNKNOWN_ERROR; + crm_debug("Performing %ssynchronous %s op on systemd unit %s named '%s'", + op->synchronous ? "" : "a", op->action, op->agent, op->rsc); + + if (safe_str_eq(op->action, "meta-data")) { + /* TODO: See if we can teach the lrmd not to make these calls synchronously */ + op->stdout_data = systemd_unit_metadata(op->agent); + op->rc = PCMK_OCF_OK; + + if (op->synchronous == FALSE) { + operation_finalize(op); + } return TRUE; } + + systemd_unit_by_name(op->agent, op); + if (op->synchronous == FALSE) { + return TRUE; + } + return op->rc == PCMK_OCF_OK; } diff --git a/lib/services/systemd.h b/lib/services/systemd.h index 6e1b80b..c86bafe 100644 --- a/lib/services/systemd.h +++ b/lib/services/systemd.h @@ -17,7 +17,7 @@ */ G_GNUC_INTERNAL GList *systemd_unit_listall(void); -G_GNUC_INTERNAL int systemd_unit_exec(svc_action_t * op, gboolean synchronous); +G_GNUC_INTERNAL int systemd_unit_exec(svc_action_t * op); G_GNUC_INTERNAL gboolean systemd_unit_exists(const gchar * name); G_GNUC_INTERNAL gboolean systemd_unit_running(const gchar * name); G_GNUC_INTERNAL void systemd_cleanup(void); diff --git a/lib/services/upstart.c b/lib/services/upstart.c index f47e8ff..4c7211d 100644 --- a/lib/services/upstart.c +++ b/lib/services/upstart.c @@ -275,6 +275,10 @@ get_first_instance(const gchar * job) crm_err("Call to %s 
failed: %s", method, error.name); goto done; + } else if(reply == NULL) { + crm_err("Call to %s failed: no reply", method); + goto done; + } else if (!dbus_message_iter_init(reply, &args)) { crm_err("Call to %s failed: Message has no arguments", method); goto done; @@ -304,31 +308,22 @@ get_first_instance(const gchar * job) return instance; } -gboolean -upstart_job_running(const gchar * name) +static void +upstart_job_check(const char *name, const char *state, void *userdata) { - bool running = FALSE; - char *job = NULL; - - if(upstart_job_by_name(name, &job)) { - char *path = get_first_instance(job); + svc_action_t * op = userdata; - if (path) { - char *state = pcmk_dbus_get_property( - upstart_proxy, BUS_NAME, path, UPSTART_06_API ".Instance", "state"); - - crm_info("State of %s: %s", name, state); - if (state) { - running = !g_strcmp0(state, "running"); - } - free(state); - } - free(path); + if (state && g_strcmp0(state, "running") == 0) { + op->rc = PCMK_OCF_OK; + /* } else if (g_strcmp0(state, "activating") == 0) { */ + /* op->rc = PCMK_OCF_PENDING; */ + } else { + op->rc = PCMK_OCF_NOT_RUNNING; } - free(job); - crm_info("%s is%s running", name, running ? 
"" : " not"); - return running; + if (op->synchronous == FALSE) { + operation_finalize(op); + } } static char * @@ -465,10 +460,24 @@ upstart_job_exec(svc_action_t * op, gboolean synchronous) } if (safe_str_eq(op->action, "monitor") || safe_str_eq(action, "status")) { - if (upstart_job_running(op->agent)) { - op->rc = PCMK_OCF_OK; - } else { - op->rc = PCMK_OCF_NOT_RUNNING; + + char *path = get_first_instance(job); + + op->rc = PCMK_OCF_NOT_RUNNING; + if(path) { + char *state = pcmk_dbus_get_property( + upstart_proxy, BUS_NAME, path, UPSTART_06_API ".Instance", "state", + op->synchronous?NULL:upstart_job_check, op); + + free(job); + free(path); + + if(op->synchronous) { + upstart_job_check("state", state, op); + free(state); + return op->rc == PCMK_OCF_OK; + } + return TRUE; } goto cleanup; @@ -503,7 +512,7 @@ upstart_job_exec(svc_action_t * op, gboolean synchronous) CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_BOOLEAN, &arg_wait, DBUS_TYPE_INVALID)); - if (synchronous == FALSE) { + if (op->synchronous == FALSE) { free(job); return pcmk_dbus_send(msg, upstart_proxy, upstart_async_dispatch, op); } @@ -545,7 +554,7 @@ upstart_job_exec(svc_action_t * op, gboolean synchronous) dbus_message_unref(reply); } - if (synchronous == FALSE) { + if (op->synchronous == FALSE) { operation_finalize(op); return TRUE; } diff --git a/lrmd/lrmd.c b/lrmd/lrmd.c index f3abfdb..7075b9f 100644 --- a/lrmd/lrmd.c +++ b/lrmd/lrmd.c @@ -874,6 +874,12 @@ stonith_action_complete(lrmd_cmd_t * cmd, int rc) if (cmd->lrmd_op_status == PCMK_LRM_OP_CANCELLED) { recurring = 0; /* do nothing */ + + } else if (rc == -ENODEV && safe_str_eq(cmd->action, "monitor")) { + /* Not registered == inactive */ + cmd->lrmd_op_status = PCMK_LRM_OP_DONE; + cmd->exec_rc = PCMK_OCF_NOT_RUNNING; + } else if (rc) { /* Attempt to map return codes to op status if possible */ switch (rc) { @@ -884,6 +890,7 @@ stonith_action_complete(lrmd_cmd_t * cmd, int rc) cmd->lrmd_op_status = PCMK_LRM_OP_TIMEOUT; break; 
default: + /* TODO: This looks wrong. Status should be _DONE and exec_rc set to an error */ cmd->lrmd_op_status = PCMK_LRM_OP_ERROR; } } else { diff --git a/lrmd/regression.py.in b/lrmd/regression.py.in index b6b6718..a9a32ef 100755 --- a/lrmd/regression.py.in +++ b/lrmd/regression.py.in @@ -240,6 +240,13 @@ class Tests: self.action_timeout = " -t 5000 " if self.tls: self.rsc_classes.remove("stonith") + if "systemd" in self.rsc_classes: + # the lrmd_dummy_daemon requires this, we are importing it + # here just to guarantee it is installed before allowing this + # script to run. Otherwise, running without this import being + # available will make all the systemd tests look like they fail, + # which is really scary looking. I'd rather see the import fail. + import systemd.daemon print "Testing "+repr(self.rsc_classes) diff --git a/mcp/pacemaker.combined.upstart.in b/mcp/pacemaker.combined.upstart.in index 9540019..6301d10 100644 --- a/mcp/pacemaker.combined.upstart.in +++ b/mcp/pacemaker.combined.upstart.in @@ -30,6 +30,9 @@ pre-start script # give it time to fail. sleep 2 pidof corosync || { exit 1; } + + # if you use crm_mon, uncomment the line below. + #start crm_mon end script post-start script @@ -59,6 +62,9 @@ post-stop script # and invalidate above "respawn" stanza. #pidof crmd && killall -q -9 corosync + # if you use crm_mon, uncomment the line below. + #stop crm_mon + # if you use corosync-notifyd, uncomment the line below. 
#stop corosync-notifyd || true end script diff --git a/pacemaker.spec.in b/pacemaker.spec.in index bee6bfc..597fb3a 100644 --- a/pacemaker.spec.in +++ b/pacemaker.spec.in @@ -283,11 +283,13 @@ make DESTDIR=%{buildroot} docdir=%{pcmk_docdir} V=1 install mkdir -p ${RPM_BUILD_ROOT}%{_sysconfdir}/sysconfig mkdir -p ${RPM_BUILD_ROOT}%{_var}/lib/pacemaker/cores install -m 644 mcp/pacemaker.sysconfig ${RPM_BUILD_ROOT}%{_sysconfdir}/sysconfig/pacemaker +install -m 644 tools/crm_mon.sysconfig ${RPM_BUILD_ROOT}%{_sysconfdir}/sysconfig/crm_mon %if %{with upstart_job} mkdir -p ${RPM_BUILD_ROOT}%{_sysconfdir}/init install -m 644 mcp/pacemaker.upstart ${RPM_BUILD_ROOT}%{_sysconfdir}/init/pacemaker.conf install -m 644 mcp/pacemaker.combined.upstart ${RPM_BUILD_ROOT}%{_sysconfdir}/init/pacemaker.combined.conf +install -m 644 tools/crm_mon.upstart ${RPM_BUILD_ROOT}%{_sysconfdir}/init/crm_mon.conf %endif # Scripts that should be executable @@ -395,6 +397,7 @@ exit 0 %exclude %{_datadir}/pacemaker/tests %config(noreplace) %{_sysconfdir}/sysconfig/pacemaker +%config(noreplace) %{_sysconfdir}/sysconfig/crm_mon %config(noreplace) %{_sysconfdir}/logrotate.d/pacemaker %{_sbindir}/pacemakerd @@ -451,6 +454,7 @@ exit 0 %if %{with upstart_job} %config(noreplace) %{_sysconfdir}/init/pacemaker.conf %config(noreplace) %{_sysconfdir}/init/pacemaker.combined.conf +%config(noreplace) %{_sysconfdir}/init/crm_mon.conf %endif %files cli diff --git a/pengine/allocate.c b/pengine/allocate.c index f9f9f3c..8d02d9b 100644 --- a/pengine/allocate.c +++ b/pengine/allocate.c @@ -1680,16 +1680,41 @@ apply_remote_node_ordering(pe_working_set_t *data_set) action, pe_order_preserve | pe_order_implies_then | pe_order_runnable_left, data_set); - } else if (safe_str_eq(action->task, "stop")) { - custom_action_order(action->rsc, - NULL, - action, - remote_rsc, - generate_op_key(remote_rsc->id, RSC_STOP, 0), - NULL, - pe_order_preserve | pe_order_implies_first, - data_set); + gboolean after_start = FALSE; + + /* 
handle special case with baremetal remote where stop actions need to be + * ordered after the connection resource starts somewhere else. */ + if (is_baremetal_remote_node(action->node)) { + node_t *cluster_node = remote_rsc->running_on ? remote_rsc->running_on->data : NULL; + + /* if the current cluster node a baremetal connection resource + * is residing on is unclean, we can't process any operations on that + * remote node until after it starts somewhere else. */ + if (cluster_node && cluster_node->details->unclean == TRUE) { + after_start = TRUE; + } + } + + if (after_start) { + custom_action_order(remote_rsc, + generate_op_key(remote_rsc->id, RSC_START, 0), + NULL, + action->rsc, + NULL, + action, + pe_order_preserve | pe_order_implies_then | pe_order_runnable_left, + data_set); + } else { + custom_action_order(action->rsc, + NULL, + action, + remote_rsc, + generate_op_key(remote_rsc->id, RSC_STOP, 0), + NULL, + pe_order_preserve | pe_order_implies_first, + data_set); + } } } } diff --git a/pengine/regression.sh b/pengine/regression.sh index 5f98215..bdc7d3a 100755 --- a/pengine/regression.sh +++ b/pengine/regression.sh @@ -762,9 +762,11 @@ echo "" do_test remote-startup-probes "Baremetal remote-node startup probes" do_test remote-startup "Startup a newly discovered remote-nodes with no status." do_test remote-fence-unclean "Fence unclean baremetal remote-node" +do_test remote-fence-unclean2 "Fence baremetal remote-node after cluster node fails and connection can not be recovered" do_test remote-move "Move remote-node connection resource" do_test remote-disable "Disable a baremetal remote-node" do_test remote-orphaned "Properly shutdown orphaned connection resource" +do_test remote-recover "Recover connection resource after cluster-node fails." 
do_test remote-stale-node-entry "Make sure we properly handle leftover remote-node entries in the node section" echo "" test_results diff --git a/pengine/test10/remote-fence-unclean2.dot b/pengine/test10/remote-fence-unclean2.dot new file mode 100644 index 0000000..6cff564 --- /dev/null +++ b/pengine/test10/remote-fence-unclean2.dot @@ -0,0 +1,10 @@ +digraph "g" { +"all_stopped" [ style=bold color="green" fontcolor="orange"] +"fake_stop_0 rhel7-alt4" -> "all_stopped" [ style = bold] +"fake_stop_0 rhel7-alt4" [ style=bold color="green" fontcolor="orange"] +"stonith 'reboot' rhel7-alt4" -> "fake_stop_0 rhel7-alt4" [ style = bold] +"stonith 'reboot' rhel7-alt4" -> "stonith_complete" [ style = bold] +"stonith 'reboot' rhel7-alt4" [ style=bold color="green" fontcolor="black"] +"stonith_complete" -> "all_stopped" [ style = bold] +"stonith_complete" [ style=bold color="green" fontcolor="orange"] +} diff --git a/pengine/test10/remote-fence-unclean2.exp b/pengine/test10/remote-fence-unclean2.exp new file mode 100644 index 0000000..e58b617 --- /dev/null +++ b/pengine/test10/remote-fence-unclean2.exp @@ -0,0 +1,49 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/pengine/test10/remote-fence-unclean2.scores b/pengine/test10/remote-fence-unclean2.scores new file mode 100644 index 0000000..10fc7fd --- /dev/null +++ b/pengine/test10/remote-fence-unclean2.scores @@ -0,0 +1,13 @@ +Allocation scores: +native_color: fake allocation score on rhel7-alt1: 0 +native_color: fake allocation score on rhel7-alt2: 0 +native_color: fake allocation score on rhel7-alt3: 0 +native_color: fake allocation score on rhel7-alt4: INFINITY +native_color: rhel7-alt4 allocation score on rhel7-alt1: 0 +native_color: rhel7-alt4 allocation score on rhel7-alt2: 0 +native_color: rhel7-alt4 allocation score on rhel7-alt3: 0 +native_color: rhel7-alt4 allocation score on rhel7-alt4: -INFINITY +native_color: shooter allocation score on rhel7-alt1: 0 
+native_color: shooter allocation score on rhel7-alt2: 0 +native_color: shooter allocation score on rhel7-alt3: 0 +native_color: shooter allocation score on rhel7-alt4: -INFINITY diff --git a/pengine/test10/remote-fence-unclean2.summary b/pengine/test10/remote-fence-unclean2.summary new file mode 100644 index 0000000..bfaf77b --- /dev/null +++ b/pengine/test10/remote-fence-unclean2.summary @@ -0,0 +1,30 @@ + +Current cluster status: +Node rhel7-alt1 (1): standby +Node rhel7-alt2 (2): standby +RemoteNode rhel7-alt4: UNCLEAN (offline) +OFFLINE: [ rhel7-alt3 ] + + shooter (stonith:fence_xvm): Stopped + rhel7-alt4 (ocf::pacemaker:remote): Stopped + fake (ocf::heartbeat:Dummy): Started rhel7-alt4 + +Transition Summary: + * Stop fake (rhel7-alt4) + +Executing cluster transition: + * Fencing rhel7-alt4 (reboot) + * Pseudo action: stonith_complete + * Pseudo action: fake_stop_0 + * Pseudo action: all_stopped + +Revised cluster status: +Node rhel7-alt1 (1): standby +Node rhel7-alt2 (2): standby +OFFLINE: [ rhel7-alt3 ] +RemoteOFFLINE: [ rhel7-alt4 ] + + shooter (stonith:fence_xvm): Stopped + rhel7-alt4 (ocf::pacemaker:remote): Stopped + fake (ocf::heartbeat:Dummy): Stopped + diff --git a/pengine/test10/remote-fence-unclean2.xml b/pengine/test10/remote-fence-unclean2.xml new file mode 100644 index 0000000..78fc4f1 --- /dev/null +++ b/pengine/test10/remote-fence-unclean2.xml @@ -0,0 +1,115 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/pengine/test10/remote-recover.dot b/pengine/test10/remote-recover.dot new file mode 100644 index 0000000..1da6a7b --- /dev/null +++ b/pengine/test10/remote-recover.dot @@ -0,0 +1,17 @@ + digraph "g" { +"all_stopped" [ style=bold color="green" fontcolor="orange"] +"fake_monitor_10000 rhel7-alt4" [ style=bold color="green" fontcolor="black"] 
+"fake_start_0 rhel7-alt4" -> "fake_monitor_10000 rhel7-alt4" [ style = bold] +"fake_start_0 rhel7-alt4" [ style=bold color="green" fontcolor="black"] +"fake_stop_0 rhel7-alt4" -> "all_stopped" [ style = bold] +"fake_stop_0 rhel7-alt4" -> "fake_start_0 rhel7-alt4" [ style = bold] +"fake_stop_0 rhel7-alt4" [ style=bold color="green" fontcolor="black"] +"rhel7-alt4_monitor_60000 rhel7-alt1" [ style=bold color="green" fontcolor="black"] +"rhel7-alt4_start_0 rhel7-alt1" -> "fake_monitor_10000 rhel7-alt4" [ style = bold] +"rhel7-alt4_start_0 rhel7-alt1" -> "fake_start_0 rhel7-alt4" [ style = bold] +"rhel7-alt4_start_0 rhel7-alt1" -> "rhel7-alt4_monitor_60000 rhel7-alt1" [ style = bold] +"rhel7-alt4_start_0 rhel7-alt1" [ style=bold color="green" fontcolor="black"] +"shooter_monitor_60000 rhel7-alt1" [ style=bold color="green" fontcolor="black"] +"shooter_start_0 rhel7-alt1" -> "shooter_monitor_60000 rhel7-alt1" [ style = bold] +"shooter_start_0 rhel7-alt1" [ style=bold color="green" fontcolor="black"] +} diff --git a/pengine/test10/remote-recover.exp b/pengine/test10/remote-recover.exp new file mode 100644 index 0000000..37e4f71 --- /dev/null +++ b/pengine/test10/remote-recover.exp @@ -0,0 +1,99 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/pengine/test10/remote-recover.scores b/pengine/test10/remote-recover.scores new file mode 100644 index 0000000..10fc7fd --- /dev/null +++ b/pengine/test10/remote-recover.scores @@ -0,0 +1,13 @@ +Allocation scores: +native_color: fake allocation score on rhel7-alt1: 0 +native_color: fake allocation score on rhel7-alt2: 0 +native_color: fake allocation score on rhel7-alt3: 0 +native_color: fake allocation score on rhel7-alt4: INFINITY +native_color: rhel7-alt4 allocation score on rhel7-alt1: 0 +native_color: rhel7-alt4 allocation score on rhel7-alt2: 0 +native_color: 
rhel7-alt4 allocation score on rhel7-alt3: 0 +native_color: rhel7-alt4 allocation score on rhel7-alt4: -INFINITY +native_color: shooter allocation score on rhel7-alt1: 0 +native_color: shooter allocation score on rhel7-alt2: 0 +native_color: shooter allocation score on rhel7-alt3: 0 +native_color: shooter allocation score on rhel7-alt4: -INFINITY diff --git a/pengine/test10/remote-recover.summary b/pengine/test10/remote-recover.summary new file mode 100644 index 0000000..8fd7480 --- /dev/null +++ b/pengine/test10/remote-recover.summary @@ -0,0 +1,36 @@ + +Current cluster status: +Node rhel7-alt2 (2): standby +RemoteNode rhel7-alt4: UNCLEAN (offline) +Online: [ rhel7-alt1 ] +OFFLINE: [ rhel7-alt3 ] + + shooter (stonith:fence_xvm): Stopped + rhel7-alt4 (ocf::pacemaker:remote): Stopped + fake (ocf::heartbeat:Dummy): Started rhel7-alt4 + +Transition Summary: + * Start shooter (rhel7-alt1) + * Start rhel7-alt4 (rhel7-alt1) + * Restart fake (Started rhel7-alt4) + +Executing cluster transition: + * Resource action: shooter start on rhel7-alt1 + * Resource action: rhel7-alt4 start on rhel7-alt1 + * Resource action: fake stop on rhel7-alt4 + * Pseudo action: all_stopped + * Resource action: shooter monitor=60000 on rhel7-alt1 + * Resource action: rhel7-alt4 monitor=60000 on rhel7-alt1 + * Resource action: fake start on rhel7-alt4 + * Resource action: fake monitor=10000 on rhel7-alt4 + +Revised cluster status: +Node rhel7-alt2 (2): standby +Online: [ rhel7-alt1 ] +OFFLINE: [ rhel7-alt3 ] +RemoteOnline: [ rhel7-alt4 ] + + shooter (stonith:fence_xvm): Started rhel7-alt1 + rhel7-alt4 (ocf::pacemaker:remote): Started rhel7-alt1 + fake (ocf::heartbeat:Dummy): Started rhel7-alt4 + diff --git a/pengine/test10/remote-recover.xml b/pengine/test10/remote-recover.xml new file mode 100644 index 0000000..1a83dd9 --- /dev/null +++ b/pengine/test10/remote-recover.xml @@ -0,0 +1,114 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tools/crm_attribute.c b/tools/crm_attribute.c index 60d39b6..c37b096 100644 --- a/tools/crm_attribute.c +++ b/tools/crm_attribute.c @@ -235,6 +235,7 @@ main(int argc, char **argv) /* we're updating cluster options - dont populate dest_node */ type = XML_CIB_TAG_CRMCONFIG; + } else if (safe_str_eq(type, XML_CIB_TAG_CRMCONFIG)) { } else if (safe_str_neq(type, XML_CIB_TAG_TICKETS)) { if (dest_uname == NULL) { dest_uname = get_node_name(0); diff --git a/tools/crm_mon.upstart.in b/tools/crm_mon.upstart.in new file mode 100644 index 0000000..ef0fe7a --- /dev/null +++ b/tools/crm_mon.upstart.in @@ -0,0 +1,39 @@ +# crm_mon - Daemon for pacemaker monitor +# +# + +kill timeout 3600 +respawn +respawn limit 10 3600 + +expect fork + +env prog=crm_mon +env rpm_sysconf=@sysconfdir@/sysconfig/crm_mon +env rpm_lockfile=@localstatedir@/lock/subsys/crm_mon +env deb_sysconf=@sysconfdir@/default/crm_mon +env deb_lockfile=@localstatedir@/lock/crm_mon + + +script + [ -f "$rpm_sysconf" ] && . $rpm_sysconf + [ -f "$deb_sysconf" ] && . $deb_sysconf + exec $prog $OPTIONS +end script + +post-start script + [ -f "$rpm_sysconf" ] && . $rpm_sysconf + [ -f "$deb_sysconf" ] && . $deb_sysconf + [ -z "$LOCK_FILE" -a -d @sysconfdir@/sysconfig ] && LOCK_FILE="$rpm_lockfile" + [ -z "$LOCK_FILE" -a -d @sysconfdir@/default ] && LOCK_FILE="$deb_lockfile" + touch $LOCK_FILE +end script + +post-stop script + [ -f "$rpm_sysconf" ] && . $rpm_sysconf + [ -f "$deb_sysconf" ] && . 
$deb_sysconf + [ -z "$LOCK_FILE" -a -d @sysconfdir@/sysconfig ] && LOCK_FILE="$rpm_lockfile" + [ -z "$LOCK_FILE" -a -d @sysconfdir@/default ] && LOCK_FILE="$deb_lockfile" + rm -f $LOCK_FILE +end script + diff --git a/tools/crm_resource.c b/tools/crm_resource.c index 6537520..56583e0 100644 --- a/tools/crm_resource.c +++ b/tools/crm_resource.c @@ -2214,11 +2214,15 @@ main(int argc, char **argv) } } else if (rsc_cmd == 'C') { -#if 0 +#if HAVE_ATOMIC_ATTRD xmlNode *cmd = create_request(CRM_OP_REPROBE, NULL, host_uname, CRM_SYSTEM_CRMD, crm_system_name, our_pid); - crm_debug("Re-checking the state of all resources on %s", host_uname); + crm_debug("Re-checking the state of all resources on %s", host_uname?host_uname:"all nodes"); + + rc = attrd_update_delegate( + NULL, 'u', host_uname, "fail-count-*", NULL, XML_CIB_TAG_STATUS, NULL, NULL, NULL, FALSE); + if (crm_ipc_send(crmd_channel, cmd, 0, 0, NULL) > 0) { start_mainloop(); }