From 3d71c6c4123998ece0bc940efe3d57cacf982d1f Mon Sep 17 00:00:00 2001 From: CentOS Sources Date: Nov 19 2015 15:45:07 +0000 Subject: import pacemaker-1.1.13-10.el7 --- diff --git a/.gitignore b/.gitignore index 7fc59c1..ab4a36e 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -SOURCES/pacemaker-a14efad51ca8f1e3742fd8520e051cd7a0864f04.tar.gz +SOURCES/nagios-agents-metadata-105ab8a.tar.gz +SOURCES/pacemaker-44eb2ddf8d4f8fc05256aae2abc9fbf3ae4d1fbc.tar.gz diff --git a/.pacemaker.metadata b/.pacemaker.metadata index d60206d..db08ad9 100644 --- a/.pacemaker.metadata +++ b/.pacemaker.metadata @@ -1 +1,2 @@ -131ba58261066b1fd2cf874e41600262ecb584c6 SOURCES/pacemaker-a14efad51ca8f1e3742fd8520e051cd7a0864f04.tar.gz +ea6c0a27fd0ae8ce02f84a11f08a0d79377041c3 SOURCES/nagios-agents-metadata-105ab8a.tar.gz +116bb67b5d40329efa75d7c06a3360e2b7d51413 SOURCES/pacemaker-44eb2ddf8d4f8fc05256aae2abc9fbf3ae4d1fbc.tar.gz diff --git a/SOURCES/0004-Fix-crm_resource-Correctly-check-if-a-resource-is-un.patch b/SOURCES/0004-Fix-crm_resource-Correctly-check-if-a-resource-is-un.patch new file mode 100644 index 0000000..1ef6a11 --- /dev/null +++ b/SOURCES/0004-Fix-crm_resource-Correctly-check-if-a-resource-is-un.patch @@ -0,0 +1,82 @@ +From: Andrew Beekhof +Date: Fri, 14 Aug 2015 09:43:32 +1000 +Subject: [PATCH] Fix: crm_resource: Correctly check if a resource is unmanaged + or has a target-role + +(cherry picked from commit 3ff29dbe2cab872b452c4580736d23d1f69736fa) +--- + tools/crm_resource.c | 2 +- + tools/crm_resource_runtime.c | 31 ++++++++++++++++++------------- + 2 files changed, 19 insertions(+), 14 deletions(-) + +diff --git a/tools/crm_resource.c b/tools/crm_resource.c +index 2fce3b7..156bbea 100644 +--- a/tools/crm_resource.c ++++ b/tools/crm_resource.c +@@ -888,7 +888,7 @@ main(int argc, char **argv) + rsc = uber_parent(rsc); + } + +- crm_debug("Re-checking the state of %s on %s", rsc_id, host_uname); ++ crm_debug("Re-checking the state of %s for %s on %s", rsc->id, rsc_id, 
host_uname); + if(rsc) { + crmd_replies_needed = 0; + rc = cli_resource_delete(cib_conn, crmd_channel, host_uname, rsc, &data_set); +diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c +index a270cbf..f260e19 100644 +--- a/tools/crm_resource_runtime.c ++++ b/tools/crm_resource_runtime.c +@@ -616,35 +616,40 @@ cli_resource_delete(cib_t *cib_conn, crm_ipc_t * crmd_channel, const char *host_ + void + cli_resource_check(cib_t * cib_conn, resource_t *rsc) + { +- ++ int need_nl = 0; + char *role_s = NULL; + char *managed = NULL; + resource_t *parent = uber_parent(rsc); + +- find_resource_attr(cib_conn, XML_ATTR_ID, parent->id, +- XML_TAG_META_SETS, NULL, NULL, XML_RSC_ATTR_MANAGED, &managed); ++ find_resource_attr(cib_conn, XML_NVPAIR_ATTR_VALUE, parent->id, ++ NULL, NULL, NULL, XML_RSC_ATTR_MANAGED, &managed); + +- find_resource_attr(cib_conn, XML_ATTR_ID, parent->id, +- XML_TAG_META_SETS, NULL, NULL, XML_RSC_ATTR_TARGET_ROLE, &role_s); ++ find_resource_attr(cib_conn, XML_NVPAIR_ATTR_VALUE, parent->id, ++ NULL, NULL, NULL, XML_RSC_ATTR_TARGET_ROLE, &role_s); + +- if(managed == NULL) { +- managed = strdup("1"); +- } +- if(crm_is_true(managed) == FALSE) { +- printf("\n\t*Resource %s is configured to not be managed by the cluster\n", parent->id); +- } + if(role_s) { + enum rsc_role_e role = text2role(role_s); + if(role == RSC_ROLE_UNKNOWN) { + // Treated as if unset + + } else if(role == RSC_ROLE_STOPPED) { +- printf("\n\t* The configuration specifies that '%s' should remain stopped\n", parent->id); ++ printf("\n * The configuration specifies that '%s' should remain stopped\n", parent->id); ++ need_nl++; + + } else if(parent->variant > pe_clone && role != RSC_ROLE_MASTER) { +- printf("\n\t* The configuration specifies that '%s' should not be promoted\n", parent->id); ++ printf("\n * The configuration specifies that '%s' should not be promoted\n", parent->id); ++ need_nl++; + } + } ++ ++ if(managed && crm_is_true(managed) == FALSE) { ++ printf("%s * The 
configuration prevents the cluster from stopping or starting '%s' (unmanaged)\n", need_nl == 0?"\n":"", parent->id); ++ need_nl++; ++ } ++ ++ if(need_nl) { ++ printf("\n"); ++ } + } + + int diff --git a/SOURCES/0005-Fix-PE-Bug-cl-5247-Imply-resources-running-on-a-cont.patch b/SOURCES/0005-Fix-PE-Bug-cl-5247-Imply-resources-running-on-a-cont.patch new file mode 100644 index 0000000..cf19707 --- /dev/null +++ b/SOURCES/0005-Fix-PE-Bug-cl-5247-Imply-resources-running-on-a-cont.patch @@ -0,0 +1,328 @@ +From: Andrew Beekhof +Date: Tue, 18 Aug 2015 10:30:49 +1000 +Subject: [PATCH] Fix: PE: Bug cl#5247 - Imply resources running on a container + are stopped when the container is stopped + +(cherry picked from commit e10eff1902d5b451454e2d467ee337c964f536ab) +--- + lib/pengine/unpack.c | 29 ++++++++++++++++++++--------- + pengine/allocate.c | 17 +++++++++++++++++ + pengine/graph.c | 7 ++++++- + pengine/test10/bug-rh-1097457.dot | 2 ++ + pengine/test10/bug-rh-1097457.exp | 12 ++++++++++-- + pengine/test10/bug-rh-1097457.summary | 10 +++++----- + pengine/test10/whitebox-fail1.dot | 1 + + pengine/test10/whitebox-fail1.exp | 6 +++++- + pengine/test10/whitebox-fail1.summary | 8 ++++---- + pengine/test10/whitebox-fail2.dot | 1 + + pengine/test10/whitebox-fail2.exp | 6 +++++- + pengine/test10/whitebox-fail2.summary | 8 ++++---- + 12 files changed, 80 insertions(+), 27 deletions(-) + +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index 106c674..0f83be4 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -44,7 +44,7 @@ CRM_TRACE_INIT_DATA(pe_status); + + gboolean unpack_rsc_op(resource_t * rsc, node_t * node, xmlNode * xml_op, + enum action_fail_response *failed, pe_working_set_t * data_set); +-static gboolean determine_remote_online_status(node_t * this_node); ++static gboolean determine_remote_online_status(pe_working_set_t * data_set, node_t * this_node); + + static gboolean + is_dangling_container_remote_node(node_t *node) +@@ -73,6 +73,8 @@ 
pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason) + if (is_set(rsc->flags, pe_rsc_failed) == FALSE) { + crm_warn("Remote node %s will be fenced by recovering container resource %s", + node->details->uname, rsc->id, reason); ++ /* node->details->unclean = TRUE; */ ++ node->details->remote_requires_reset = TRUE; + set_bit(rsc->flags, pe_rsc_failed); + } + } else if (is_dangling_container_remote_node(node)) { +@@ -1157,7 +1159,7 @@ unpack_remote_status(xmlNode * status, pe_working_set_t * data_set) + if ((this_node == NULL) || (is_remote_node(this_node) == FALSE)) { + continue; + } +- determine_remote_online_status(this_node); ++ determine_remote_online_status(data_set, this_node); + } + + /* process attributes */ +@@ -1366,7 +1368,7 @@ determine_online_status_fencing(pe_working_set_t * data_set, xmlNode * node_stat + } + + static gboolean +-determine_remote_online_status(node_t * this_node) ++determine_remote_online_status(pe_working_set_t * data_set, node_t * this_node) + { + resource_t *rsc = this_node->details->remote_rsc; + resource_t *container = NULL; +@@ -1393,13 +1395,21 @@ determine_remote_online_status(node_t * this_node) + } + + /* Now check all the failure conditions. */ +- if (is_set(rsc->flags, pe_rsc_failed) || +- (rsc->role == RSC_ROLE_STOPPED) || +- (container && is_set(container->flags, pe_rsc_failed)) || +- (container && container->role == RSC_ROLE_STOPPED)) { ++ if(container && is_set(container->flags, pe_rsc_failed)) { ++ crm_trace("Remote node %s is set to UNCLEAN. rsc failed.", this_node->details->id); ++ this_node->details->online = FALSE; ++ this_node->details->remote_requires_reset = TRUE; + +- crm_trace("Remote node %s is set to OFFLINE. node is stopped or rsc failed.", this_node->details->id); ++ } else if(is_set(rsc->flags, pe_rsc_failed)) { ++ crm_trace("Remote node %s is set to OFFLINE. 
rsc failed.", this_node->details->id); + this_node->details->online = FALSE; ++ ++ } else if (rsc->role == RSC_ROLE_STOPPED ++ || (container && container->role == RSC_ROLE_STOPPED)) { ++ ++ crm_trace("Remote node %s is set to OFFLINE. node is stopped.", this_node->details->id); ++ this_node->details->online = FALSE; ++ this_node->details->remote_requires_reset = FALSE; + } + + remote_online_done: +@@ -3375,7 +3385,8 @@ find_operations(const char *rsc, const char *node, gboolean active_filter, + continue; + + } else if (is_remote_node(this_node)) { +- determine_remote_online_status(this_node); ++ determine_remote_online_status(data_set, this_node); ++ + } else { + determine_online_status(node_state, this_node, data_set); + } +diff --git a/pengine/allocate.c b/pengine/allocate.c +index c2e56f9..65ae05d 100644 +--- a/pengine/allocate.c ++++ b/pengine/allocate.c +@@ -1406,6 +1406,23 @@ stage6(pe_working_set_t * data_set) + + /* remote-nodes associated with a container resource (such as a vm) are not fenced */ + if (is_container_remote_node(node)) { ++ /* Guest */ ++ if (need_stonith ++ && node->details->remote_requires_reset ++ && pe_can_fence(data_set, node)) { ++ resource_t *container = node->details->remote_rsc->container; ++ char *key = stop_key(container); ++ GListPtr stop_list = find_actions(container->actions, key, NULL); ++ ++ crm_info("Impliying node %s is down when container %s is stopped (%p)", ++ node->details->uname, container->id, stop_list); ++ if(stop_list) { ++ stonith_constraints(node, stop_list->data, data_set); ++ } ++ ++ g_list_free(stop_list); ++ free(key); ++ } + continue; + } + +diff --git a/pengine/graph.c b/pengine/graph.c +index 3d832f0..a50f15b 100644 +--- a/pengine/graph.c ++++ b/pengine/graph.c +@@ -697,7 +697,12 @@ stonith_constraints(node_t * node, action_t * stonith_op, pe_working_set_t * dat + for (lpc = data_set->resources; lpc != NULL; lpc = lpc->next) { + resource_t *rsc = (resource_t *) lpc->data; + +- rsc_stonith_ordering(rsc, 
stonith_op, data_set); ++ if(stonith_op->rsc == NULL) { ++ rsc_stonith_ordering(rsc, stonith_op, data_set); ++ ++ } else if(stonith_op->rsc != rsc && stonith_op->rsc != rsc->container) { ++ rsc_stonith_ordering(rsc, stonith_op, data_set); ++ } + } + } + +diff --git a/pengine/test10/bug-rh-1097457.dot b/pengine/test10/bug-rh-1097457.dot +index 666099c..078d177 100644 +--- a/pengine/test10/bug-rh-1097457.dot ++++ b/pengine/test10/bug-rh-1097457.dot +@@ -49,10 +49,12 @@ digraph "g" { + "VM2_start_0 lama3" [ style=bold color="green" fontcolor="black"] + "VM2_stop_0 lama3" -> "FAKE4-IP_stop_0 lamaVM2" [ style = bold] + "VM2_stop_0 lama3" -> "FAKE4_stop_0 lamaVM2" [ style = bold] ++"VM2_stop_0 lama3" -> "FAKE6-clone_stop_0" [ style = bold] + "VM2_stop_0 lama3" -> "FAKE6_stop_0 lamaVM2" [ style = bold] + "VM2_stop_0 lama3" -> "FSlun3_stop_0 lamaVM2" [ style = bold] + "VM2_stop_0 lama3" -> "VM2_start_0 lama3" [ style = bold] + "VM2_stop_0 lama3" -> "all_stopped" [ style = bold] ++"VM2_stop_0 lama3" -> "lamaVM2-G4_stop_0" [ style = bold] + "VM2_stop_0 lama3" [ style=bold color="green" fontcolor="black"] + "all_stopped" [ style=bold color="green" fontcolor="orange"] + "lamaVM2-G4_running_0" [ style=bold color="green" fontcolor="orange"] +diff --git a/pengine/test10/bug-rh-1097457.exp b/pengine/test10/bug-rh-1097457.exp +index 36af9f3..175f413 100644 +--- a/pengine/test10/bug-rh-1097457.exp ++++ b/pengine/test10/bug-rh-1097457.exp +@@ -119,7 +119,11 @@ + + + +- ++ ++ ++ ++ ++ + + + +@@ -331,7 +335,11 @@ + + + +- ++ ++ ++ ++ ++ + + + +diff --git a/pengine/test10/bug-rh-1097457.summary b/pengine/test10/bug-rh-1097457.summary +index e2f235d..c8751ae 100644 +--- a/pengine/test10/bug-rh-1097457.summary ++++ b/pengine/test10/bug-rh-1097457.summary +@@ -39,17 +39,17 @@ Transition Summary: + * Restart lamaVM2 (Started lama3) + + Executing cluster transition: +- * Pseudo action: lamaVM2-G4_stop_0 +- * Pseudo action: FAKE6-clone_stop_0 + * Resource action: lamaVM2 stop on lama3 + * 
Resource action: VM2 stop on lama3 ++ * Pseudo action: lamaVM2-G4_stop_0 + * Pseudo action: FAKE4-IP_stop_0 +- * Pseudo action: FAKE6_stop_0 +- * Pseudo action: FAKE6-clone_stopped_0 +- * Pseudo action: FAKE6-clone_start_0 ++ * Pseudo action: FAKE6-clone_stop_0 + * Resource action: VM2 start on lama3 + * Resource action: VM2 monitor=10000 on lama3 + * Pseudo action: FAKE4_stop_0 ++ * Pseudo action: FAKE6_stop_0 ++ * Pseudo action: FAKE6-clone_stopped_0 ++ * Pseudo action: FAKE6-clone_start_0 + * Resource action: lamaVM2 start on lama3 + * Resource action: lamaVM2 monitor=30000 on lama3 + * Resource action: FSlun3 monitor=10000 on lamaVM2 +diff --git a/pengine/test10/whitebox-fail1.dot b/pengine/test10/whitebox-fail1.dot +index b595015..0f0fe26 100644 +--- a/pengine/test10/whitebox-fail1.dot ++++ b/pengine/test10/whitebox-fail1.dot +@@ -26,6 +26,7 @@ digraph "g" { + "container1_start_0 18node2" -> "lxc1_start_0 18node2" [ style = bold] + "container1_start_0 18node2" [ style=bold color="green" fontcolor="black"] + "container1_stop_0 18node2" -> "B_stop_0 lxc1" [ style = bold] ++"container1_stop_0 18node2" -> "M-clone_stop_0" [ style = bold] + "container1_stop_0 18node2" -> "M_stop_0 lxc1" [ style = bold] + "container1_stop_0 18node2" -> "all_stopped" [ style = bold] + "container1_stop_0 18node2" -> "container1_start_0 18node2" [ style = bold] +diff --git a/pengine/test10/whitebox-fail1.exp b/pengine/test10/whitebox-fail1.exp +index 834b231..01bb142 100644 +--- a/pengine/test10/whitebox-fail1.exp ++++ b/pengine/test10/whitebox-fail1.exp +@@ -96,7 +96,11 @@ + + + +- ++ ++ ++ ++ ++ + + + +diff --git a/pengine/test10/whitebox-fail1.summary b/pengine/test10/whitebox-fail1.summary +index 5e5887b..1586407 100644 +--- a/pengine/test10/whitebox-fail1.summary ++++ b/pengine/test10/whitebox-fail1.summary +@@ -20,17 +20,17 @@ Transition Summary: + * Restart lxc1 (Started 18node2) + + Executing cluster transition: +- * Pseudo action: M-clone_stop_0 + * Resource action: lxc1 stop 
on 18node2 + * Resource action: container1 stop on 18node2 ++ * Pseudo action: M-clone_stop_0 ++ * Pseudo action: B_stop_0 ++ * Resource action: container1 start on 18node2 + * Pseudo action: M_stop_0 + * Pseudo action: M-clone_stopped_0 + * Pseudo action: M-clone_start_0 +- * Pseudo action: B_stop_0 +- * Pseudo action: all_stopped +- * Resource action: container1 start on 18node2 + * Resource action: lxc1 start on 18node2 + * Resource action: lxc1 monitor=30000 on 18node2 ++ * Pseudo action: all_stopped + * Resource action: M start on lxc1 + * Pseudo action: M-clone_running_0 + * Resource action: B start on lxc1 +diff --git a/pengine/test10/whitebox-fail2.dot b/pengine/test10/whitebox-fail2.dot +index b595015..0f0fe26 100644 +--- a/pengine/test10/whitebox-fail2.dot ++++ b/pengine/test10/whitebox-fail2.dot +@@ -26,6 +26,7 @@ digraph "g" { + "container1_start_0 18node2" -> "lxc1_start_0 18node2" [ style = bold] + "container1_start_0 18node2" [ style=bold color="green" fontcolor="black"] + "container1_stop_0 18node2" -> "B_stop_0 lxc1" [ style = bold] ++"container1_stop_0 18node2" -> "M-clone_stop_0" [ style = bold] + "container1_stop_0 18node2" -> "M_stop_0 lxc1" [ style = bold] + "container1_stop_0 18node2" -> "all_stopped" [ style = bold] + "container1_stop_0 18node2" -> "container1_start_0 18node2" [ style = bold] +diff --git a/pengine/test10/whitebox-fail2.exp b/pengine/test10/whitebox-fail2.exp +index 834b231..01bb142 100644 +--- a/pengine/test10/whitebox-fail2.exp ++++ b/pengine/test10/whitebox-fail2.exp +@@ -96,7 +96,11 @@ + + + +- ++ ++ ++ ++ ++ + + + +diff --git a/pengine/test10/whitebox-fail2.summary b/pengine/test10/whitebox-fail2.summary +index 338173d..ab40d99 100644 +--- a/pengine/test10/whitebox-fail2.summary ++++ b/pengine/test10/whitebox-fail2.summary +@@ -20,17 +20,17 @@ Transition Summary: + * Recover lxc1 (Started 18node2) + + Executing cluster transition: +- * Pseudo action: M-clone_stop_0 + * Resource action: lxc1 stop on 18node2 + * Resource 
action: container1 stop on 18node2 ++ * Pseudo action: M-clone_stop_0 ++ * Pseudo action: B_stop_0 ++ * Resource action: container1 start on 18node2 + * Pseudo action: M_stop_0 + * Pseudo action: M-clone_stopped_0 + * Pseudo action: M-clone_start_0 +- * Pseudo action: B_stop_0 +- * Pseudo action: all_stopped +- * Resource action: container1 start on 18node2 + * Resource action: lxc1 start on 18node2 + * Resource action: lxc1 monitor=30000 on 18node2 ++ * Pseudo action: all_stopped + * Resource action: M start on lxc1 + * Pseudo action: M-clone_running_0 + * Resource action: B start on lxc1 diff --git a/SOURCES/0006-Fix-Date-Correctly-set-time-from-seconds-since-epoch.patch b/SOURCES/0006-Fix-Date-Correctly-set-time-from-seconds-since-epoch.patch new file mode 100644 index 0000000..ea40f7e --- /dev/null +++ b/SOURCES/0006-Fix-Date-Correctly-set-time-from-seconds-since-epoch.patch @@ -0,0 +1,21 @@ +From: Andrew Beekhof +Date: Tue, 18 Aug 2015 11:06:13 +1000 +Subject: [PATCH] Fix: Date: Correctly set time from seconds-since-epoch + +(cherry picked from commit efa318114d0b2124cc82fe143403e6de502e0134) +--- + lib/common/iso8601.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/lib/common/iso8601.c b/lib/common/iso8601.c +index 769e01b..5f4a73d 100644 +--- a/lib/common/iso8601.c ++++ b/lib/common/iso8601.c +@@ -1011,6 +1011,7 @@ ha_set_tm_time(crm_time_t * target, struct tm *source) + target->days = 1 + source->tm_yday; + } + ++ target->seconds = 0; + if (source->tm_hour >= 0) { + target->seconds += 60 * 60 * source->tm_hour; + } diff --git a/SOURCES/0007-Test-PE-Bug-cl-5247-Imply-resources-running-on-a-con.patch b/SOURCES/0007-Test-PE-Bug-cl-5247-Imply-resources-running-on-a-con.patch new file mode 100644 index 0000000..74aa4b1 --- /dev/null +++ b/SOURCES/0007-Test-PE-Bug-cl-5247-Imply-resources-running-on-a-con.patch @@ -0,0 +1,1419 @@ +From: Andrew Beekhof +Date: Tue, 18 Aug 2015 10:31:06 +1000 +Subject: [PATCH] Test: PE: Bug cl#5247 - Imply resources running 
on a + container are stopped when the container is stopped + +(cherry picked from commit 825e82a5098bde0412944c7d4f54c3d825ddff08) +--- + pengine/regression.sh | 29 +- + pengine/test10/bug-cl-5247.dot | 136 +++++++ + pengine/test10/bug-cl-5247.exp | 704 +++++++++++++++++++++++++++++++++++++ + pengine/test10/bug-cl-5247.scores | 84 +++++ + pengine/test10/bug-cl-5247.summary | 96 +++++ + pengine/test10/bug-cl-5247.xml | 295 ++++++++++++++++ + 6 files changed, 1331 insertions(+), 13 deletions(-) + create mode 100644 pengine/test10/bug-cl-5247.dot + create mode 100644 pengine/test10/bug-cl-5247.exp + create mode 100644 pengine/test10/bug-cl-5247.scores + create mode 100644 pengine/test10/bug-cl-5247.summary + create mode 100644 pengine/test10/bug-cl-5247.xml + +diff --git a/pengine/regression.sh b/pengine/regression.sh +index 7f73f92..1517e3d 100755 +--- a/pengine/regression.sh ++++ b/pengine/regression.sh +@@ -31,19 +31,6 @@ info Performing the following tests from $io_dir + create_mode="false" + + echo "" +-do_test cloned_start_one "order first clone then clone... first clone_min=2" +-do_test cloned_start_two "order first clone then clone... first clone_min=2" +-do_test cloned_stop_one "order first clone then clone... first clone_min=2" +-do_test cloned_stop_two "order first clone then clone... first clone_min=2" +-do_test clone_min_interleave_start_one "order first clone then clone... first clone_min=2 and then has interleave=true" +-do_test clone_min_interleave_start_two "order first clone then clone... first clone_min=2 and then has interleave=true" +-do_test clone_min_interleave_stop_one "order first clone then clone... first clone_min=2 and then has interleave=true" +-do_test clone_min_interleave_stop_two "order first clone then clone... first clone_min=2 and then has interleave=true" +-do_test clone_min_start_one "order first clone then primitive... first clone_min=2" +-do_test clone_min_start_two "order first clone then primitive... 
first clone_min=2" +-do_test clone_min_stop_all "order first clone then primitive... first clone_min=2" +-do_test clone_min_stop_one "order first clone then primitive... first clone_min=2" +-do_test clone_min_stop_two "order first clone then primitive... first clone_min=2" + + do_test simple1 "Offline " + do_test simple2 "Start " +@@ -373,6 +360,21 @@ do_test clone-interleave-2 "Clone-3 must stop on pcmk-1 due to interleaved order + do_test clone-interleave-3 "Clone-3 must be recovered on pcmk-1 due to interleaved ordering (no colocation)" + + echo "" ++do_test cloned_start_one "order first clone then clone... first clone_min=2" ++do_test cloned_start_two "order first clone then clone... first clone_min=2" ++do_test cloned_stop_one "order first clone then clone... first clone_min=2" ++do_test cloned_stop_two "order first clone then clone... first clone_min=2" ++do_test clone_min_interleave_start_one "order first clone then clone... first clone_min=2 and then has interleave=true" ++do_test clone_min_interleave_start_two "order first clone then clone... first clone_min=2 and then has interleave=true" ++do_test clone_min_interleave_stop_one "order first clone then clone... first clone_min=2 and then has interleave=true" ++do_test clone_min_interleave_stop_two "order first clone then clone... first clone_min=2 and then has interleave=true" ++do_test clone_min_start_one "order first clone then primitive... first clone_min=2" ++do_test clone_min_start_two "order first clone then primitive... first clone_min=2" ++do_test clone_min_stop_all "order first clone then primitive... first clone_min=2" ++do_test clone_min_stop_one "order first clone then primitive... first clone_min=2" ++do_test clone_min_stop_two "order first clone then primitive... 
first clone_min=2" ++ ++echo "" + do_test unfence-startup "Clean unfencing" + do_test unfence-definition "Unfencing when the agent changes" + do_test unfence-parameters "Unfencing when the agent parameters changes" +@@ -785,6 +787,7 @@ do_test container-group-3 "Container in group - stop failed" + do_test container-group-4 "Container in group - reached migration-threshold" + do_test container-is-remote-node "Place resource within container when container is remote-node" + do_test bug-rh-1097457 "Kill user defined container/contents ordering" ++do_test bug-cl-5247 "Graph loop when recovering m/s resource in a container" + + echo "" + do_test whitebox-fail1 "Fail whitebox container rsc." +diff --git a/pengine/test10/bug-cl-5247.dot b/pengine/test10/bug-cl-5247.dot +new file mode 100644 +index 0000000..ed728ac +--- /dev/null ++++ b/pengine/test10/bug-cl-5247.dot +@@ -0,0 +1,136 @@ ++digraph "g" { ++"all_stopped" [ style=bold color="green" fontcolor="orange"] ++"grpStonith1_running_0" [ style=bold color="green" fontcolor="orange"] ++"grpStonith1_start_0" -> "grpStonith1_running_0" [ style = bold] ++"grpStonith1_start_0" -> "prmStonith1-2_start_0 bl460g8n4" [ style = bold] ++"grpStonith1_start_0" [ style=bold color="green" fontcolor="orange"] ++"grpStonith1_stop_0" -> "grpStonith1_stopped_0" [ style = bold] ++"grpStonith1_stop_0" -> "prmStonith1-2_stop_0 bl460g8n4" [ style = bold] ++"grpStonith1_stop_0" [ style=bold color="green" fontcolor="orange"] ++"grpStonith1_stopped_0" -> "grpStonith1_start_0" [ style = bold] ++"grpStonith1_stopped_0" [ style=bold color="green" fontcolor="orange"] ++"grpStonith2_running_0" [ style=bold color="green" fontcolor="orange"] ++"grpStonith2_start_0" -> "grpStonith2_running_0" [ style = bold] ++"grpStonith2_start_0" -> "prmStonith2-2_start_0 bl460g8n3" [ style = bold] ++"grpStonith2_start_0" [ style=bold color="green" fontcolor="orange"] ++"grpStonith2_stop_0" -> "grpStonith2_stopped_0" [ style = bold] ++"grpStonith2_stop_0" -> 
"prmStonith2-2_stop_0 bl460g8n3" [ style = bold] ++"grpStonith2_stop_0" [ style=bold color="green" fontcolor="orange"] ++"grpStonith2_stopped_0" -> "grpStonith2_start_0" [ style = bold] ++"grpStonith2_stopped_0" [ style=bold color="green" fontcolor="orange"] ++"master-group_running_0" [ style=bold color="green" fontcolor="orange"] ++"master-group_start_0" -> "master-group_running_0" [ style = bold] ++"master-group_start_0" -> "vip-master_start_0 pgsr01" [ style = bold] ++"master-group_start_0" -> "vip-rep_start_0 pgsr01" [ style = bold] ++"master-group_start_0" [ style=bold color="green" fontcolor="orange"] ++"master-group_stop_0" -> "master-group_stopped_0" [ style = bold] ++"master-group_stop_0" -> "vip-master_stop_0 pgsr02" [ style = bold] ++"master-group_stop_0" -> "vip-rep_stop_0 pgsr02" [ style = bold] ++"master-group_stop_0" [ style=bold color="green" fontcolor="orange"] ++"master-group_stopped_0" -> "master-group_start_0" [ style = bold] ++"master-group_stopped_0" [ style=bold color="green" fontcolor="orange"] ++"msPostgresql_confirmed-post_notify_demoted_0" -> "master-group_stop_0" [ style = bold] ++"msPostgresql_confirmed-post_notify_demoted_0" -> "msPostgresql_pre_notify_stop_0" [ style = bold] ++"msPostgresql_confirmed-post_notify_demoted_0" -> "pgsql_monitor_9000 pgsr01" [ style = bold] ++"msPostgresql_confirmed-post_notify_demoted_0" [ style=bold color="green" fontcolor="orange"] ++"msPostgresql_confirmed-post_notify_stopped_0" -> "all_stopped" [ style = bold] ++"msPostgresql_confirmed-post_notify_stopped_0" -> "pgsql_monitor_9000 pgsr01" [ style = bold] ++"msPostgresql_confirmed-post_notify_stopped_0" [ style=bold color="green" fontcolor="orange"] ++"msPostgresql_confirmed-pre_notify_demote_0" -> "msPostgresql_demote_0" [ style = bold] ++"msPostgresql_confirmed-pre_notify_demote_0" -> "msPostgresql_post_notify_demoted_0" [ style = bold] ++"msPostgresql_confirmed-pre_notify_demote_0" [ style=bold color="green" fontcolor="orange"] 
++"msPostgresql_confirmed-pre_notify_stop_0" -> "msPostgresql_post_notify_stopped_0" [ style = bold] ++"msPostgresql_confirmed-pre_notify_stop_0" -> "msPostgresql_stop_0" [ style = bold] ++"msPostgresql_confirmed-pre_notify_stop_0" [ style=bold color="green" fontcolor="orange"] ++"msPostgresql_demote_0" -> "msPostgresql_demoted_0" [ style = bold] ++"msPostgresql_demote_0" -> "pgsql_demote_0 pgsr02" [ style = bold] ++"msPostgresql_demote_0" [ style=bold color="green" fontcolor="orange"] ++"msPostgresql_demoted_0" -> "msPostgresql_post_notify_demoted_0" [ style = bold] ++"msPostgresql_demoted_0" -> "msPostgresql_stop_0" [ style = bold] ++"msPostgresql_demoted_0" [ style=bold color="green" fontcolor="orange"] ++"msPostgresql_post_notify_demoted_0" -> "msPostgresql_confirmed-post_notify_demoted_0" [ style = bold] ++"msPostgresql_post_notify_demoted_0" -> "pgsql_post_notify_demoted_0 pgsr01" [ style = bold] ++"msPostgresql_post_notify_demoted_0" [ style=bold color="green" fontcolor="orange"] ++"msPostgresql_post_notify_stopped_0" -> "msPostgresql_confirmed-post_notify_stopped_0" [ style = bold] ++"msPostgresql_post_notify_stopped_0" -> "pgsql_post_notify_stop_0 pgsr01" [ style = bold] ++"msPostgresql_post_notify_stopped_0" [ style=bold color="green" fontcolor="orange"] ++"msPostgresql_pre_notify_demote_0" -> "msPostgresql_confirmed-pre_notify_demote_0" [ style = bold] ++"msPostgresql_pre_notify_demote_0" -> "pgsql_pre_notify_demote_0 pgsr01" [ style = bold] ++"msPostgresql_pre_notify_demote_0" [ style=bold color="green" fontcolor="orange"] ++"msPostgresql_pre_notify_stop_0" -> "msPostgresql_confirmed-pre_notify_stop_0" [ style = bold] ++"msPostgresql_pre_notify_stop_0" -> "pgsql_pre_notify_stop_0 pgsr01" [ style = bold] ++"msPostgresql_pre_notify_stop_0" [ style=bold color="green" fontcolor="orange"] ++"msPostgresql_stop_0" -> "msPostgresql_stopped_0" [ style = bold] ++"msPostgresql_stop_0" -> "pgsql_stop_0 pgsr02" [ style = bold] ++"msPostgresql_stop_0" [ style=bold 
color="green" fontcolor="orange"] ++"msPostgresql_stopped_0" -> "msPostgresql_post_notify_stopped_0" [ style = bold] ++"msPostgresql_stopped_0" [ style=bold color="green" fontcolor="orange"] ++"pgsql_confirmed-post_notify_stop_0" -> "all_stopped" [ style = bold] ++"pgsql_confirmed-post_notify_stop_0" -> "pgsql_monitor_9000 pgsr01" [ style = bold] ++"pgsql_confirmed-post_notify_stop_0" [ style=bold color="green" fontcolor="orange"] ++"pgsql_demote_0 pgsr02" -> "msPostgresql_demoted_0" [ style = bold] ++"pgsql_demote_0 pgsr02" -> "pgsql_stop_0 pgsr02" [ style = bold] ++"pgsql_demote_0 pgsr02" [ style=bold color="green" fontcolor="orange"] ++"pgsql_monitor_9000 pgsr01" [ style=bold color="green" fontcolor="black"] ++"pgsql_post_notify_demoted_0 pgsr01" -> "msPostgresql_confirmed-post_notify_demoted_0" [ style = bold] ++"pgsql_post_notify_demoted_0 pgsr01" [ style=bold color="green" fontcolor="black"] ++"pgsql_post_notify_stop_0 pgsr01" -> "msPostgresql_confirmed-post_notify_stopped_0" [ style = bold] ++"pgsql_post_notify_stop_0 pgsr01" -> "pgsql_confirmed-post_notify_stop_0" [ style = bold] ++"pgsql_post_notify_stop_0 pgsr01" [ style=bold color="green" fontcolor="black"] ++"pgsql_post_notify_stop_0" -> "pgsql_confirmed-post_notify_stop_0" [ style = bold] ++"pgsql_post_notify_stop_0" -> "pgsql_post_notify_stop_0 pgsr01" [ style = bold] ++"pgsql_post_notify_stop_0" [ style=bold color="green" fontcolor="orange"] ++"pgsql_pre_notify_demote_0 pgsr01" -> "msPostgresql_confirmed-pre_notify_demote_0" [ style = bold] ++"pgsql_pre_notify_demote_0 pgsr01" [ style=bold color="green" fontcolor="black"] ++"pgsql_pre_notify_stop_0 pgsr01" -> "msPostgresql_confirmed-pre_notify_stop_0" [ style = bold] ++"pgsql_pre_notify_stop_0 pgsr01" [ style=bold color="green" fontcolor="black"] ++"pgsql_stop_0 pgsr02" -> "all_stopped" [ style = bold] ++"pgsql_stop_0 pgsr02" -> "msPostgresql_stopped_0" [ style = bold] ++"pgsql_stop_0 pgsr02" [ style=bold color="green" fontcolor="orange"] 
++"pgsr02_stop_0 bl460g8n4" -> "all_stopped" [ style = bold] ++"pgsr02_stop_0 bl460g8n4" -> "prmDB2_stop_0 bl460g8n4" [ style = bold] ++"pgsr02_stop_0 bl460g8n4" [ style=bold color="green" fontcolor="black"] ++"prmDB2_stop_0 bl460g8n4" -> "all_stopped" [ style = bold] ++"prmDB2_stop_0 bl460g8n4" -> "master-group_stop_0" [ style = bold] ++"prmDB2_stop_0 bl460g8n4" -> "msPostgresql_stop_0" [ style = bold] ++"prmDB2_stop_0 bl460g8n4" -> "pgsql_demote_0 pgsr02" [ style = bold] ++"prmDB2_stop_0 bl460g8n4" -> "pgsql_post_notify_stop_0" [ style = bold] ++"prmDB2_stop_0 bl460g8n4" -> "pgsql_stop_0 pgsr02" [ style = bold] ++"prmDB2_stop_0 bl460g8n4" -> "vip-master_stop_0 pgsr02" [ style = bold] ++"prmDB2_stop_0 bl460g8n4" -> "vip-rep_stop_0 pgsr02" [ style = bold] ++"prmDB2_stop_0 bl460g8n4" [ style=bold color="green" fontcolor="black"] ++"prmStonith1-2_monitor_3600000 bl460g8n4" [ style=bold color="green" fontcolor="black"] ++"prmStonith1-2_start_0 bl460g8n4" -> "grpStonith1_running_0" [ style = bold] ++"prmStonith1-2_start_0 bl460g8n4" -> "prmStonith1-2_monitor_3600000 bl460g8n4" [ style = bold] ++"prmStonith1-2_start_0 bl460g8n4" [ style=bold color="green" fontcolor="black"] ++"prmStonith1-2_stop_0 bl460g8n4" -> "all_stopped" [ style = bold] ++"prmStonith1-2_stop_0 bl460g8n4" -> "grpStonith1_stopped_0" [ style = bold] ++"prmStonith1-2_stop_0 bl460g8n4" -> "prmStonith1-2_start_0 bl460g8n4" [ style = bold] ++"prmStonith1-2_stop_0 bl460g8n4" [ style=bold color="green" fontcolor="orange"] ++"prmStonith2-2_monitor_3600000 bl460g8n3" [ style=bold color="green" fontcolor="black"] ++"prmStonith2-2_start_0 bl460g8n3" -> "grpStonith2_running_0" [ style = bold] ++"prmStonith2-2_start_0 bl460g8n3" -> "prmStonith2-2_monitor_3600000 bl460g8n3" [ style = bold] ++"prmStonith2-2_start_0 bl460g8n3" [ style=bold color="green" fontcolor="black"] ++"prmStonith2-2_stop_0 bl460g8n3" -> "all_stopped" [ style = bold] ++"prmStonith2-2_stop_0 bl460g8n3" -> "grpStonith2_stopped_0" [ style = bold] 
++"prmStonith2-2_stop_0 bl460g8n3" -> "prmStonith2-2_start_0 bl460g8n3" [ style = bold] ++"prmStonith2-2_stop_0 bl460g8n3" [ style=bold color="green" fontcolor="black"] ++"vip-master_monitor_10000 pgsr01" [ style=bold color="green" fontcolor="black"] ++"vip-master_start_0 pgsr01" -> "master-group_running_0" [ style = bold] ++"vip-master_start_0 pgsr01" -> "vip-master_monitor_10000 pgsr01" [ style = bold] ++"vip-master_start_0 pgsr01" -> "vip-rep_start_0 pgsr01" [ style = bold] ++"vip-master_start_0 pgsr01" [ style=bold color="green" fontcolor="black"] ++"vip-master_stop_0 pgsr02" -> "all_stopped" [ style = bold] ++"vip-master_stop_0 pgsr02" -> "master-group_stopped_0" [ style = bold] ++"vip-master_stop_0 pgsr02" -> "vip-master_start_0 pgsr01" [ style = bold] ++"vip-master_stop_0 pgsr02" [ style=bold color="green" fontcolor="orange"] ++"vip-rep_monitor_10000 pgsr01" [ style=bold color="green" fontcolor="black"] ++"vip-rep_start_0 pgsr01" -> "master-group_running_0" [ style = bold] ++"vip-rep_start_0 pgsr01" -> "vip-rep_monitor_10000 pgsr01" [ style = bold] ++"vip-rep_start_0 pgsr01" [ style=bold color="green" fontcolor="black"] ++"vip-rep_stop_0 pgsr02" -> "all_stopped" [ style = bold] ++"vip-rep_stop_0 pgsr02" -> "master-group_stopped_0" [ style = bold] ++"vip-rep_stop_0 pgsr02" -> "vip-master_stop_0 pgsr02" [ style = bold] ++"vip-rep_stop_0 pgsr02" -> "vip-rep_start_0 pgsr01" [ style = bold] ++"vip-rep_stop_0 pgsr02" [ style=bold color="green" fontcolor="orange"] ++} +diff --git a/pengine/test10/bug-cl-5247.exp b/pengine/test10/bug-cl-5247.exp +new file mode 100644 +index 0000000..5e36e84 +--- /dev/null ++++ b/pengine/test10/bug-cl-5247.exp +@@ -0,0 +1,704 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ 
++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/pengine/test10/bug-cl-5247.scores b/pengine/test10/bug-cl-5247.scores +new file mode 100644 +index 0000000..e9e4709 +--- /dev/null ++++ b/pengine/test10/bug-cl-5247.scores +@@ -0,0 
+1,84 @@ ++Allocation scores: ++Using the original execution date of: 2015-08-12 02:53:40Z ++clone_color: msPostgresql allocation score on bl460g8n3: -INFINITY ++clone_color: msPostgresql allocation score on bl460g8n4: -INFINITY ++clone_color: msPostgresql allocation score on pgsr01: 0 ++clone_color: msPostgresql allocation score on pgsr02: 0 ++clone_color: pgsql:0 allocation score on bl460g8n3: -INFINITY ++clone_color: pgsql:0 allocation score on bl460g8n4: -INFINITY ++clone_color: pgsql:0 allocation score on pgsr01: 0 ++clone_color: pgsql:0 allocation score on pgsr02: INFINITY ++clone_color: pgsql:1 allocation score on bl460g8n3: -INFINITY ++clone_color: pgsql:1 allocation score on bl460g8n4: -INFINITY ++clone_color: pgsql:1 allocation score on pgsr01: INFINITY ++clone_color: pgsql:1 allocation score on pgsr02: 0 ++group_color: grpStonith1 allocation score on bl460g8n3: -INFINITY ++group_color: grpStonith1 allocation score on bl460g8n4: 0 ++group_color: grpStonith1 allocation score on pgsr01: -INFINITY ++group_color: grpStonith1 allocation score on pgsr02: -INFINITY ++group_color: grpStonith2 allocation score on bl460g8n3: 0 ++group_color: grpStonith2 allocation score on bl460g8n4: -INFINITY ++group_color: grpStonith2 allocation score on pgsr01: -INFINITY ++group_color: grpStonith2 allocation score on pgsr02: -INFINITY ++group_color: master-group allocation score on bl460g8n3: 0 ++group_color: master-group allocation score on bl460g8n4: 0 ++group_color: master-group allocation score on pgsr01: 0 ++group_color: master-group allocation score on pgsr02: 0 ++group_color: prmStonith1-2 allocation score on bl460g8n3: -INFINITY ++group_color: prmStonith1-2 allocation score on bl460g8n4: INFINITY ++group_color: prmStonith1-2 allocation score on pgsr01: -INFINITY ++group_color: prmStonith1-2 allocation score on pgsr02: -INFINITY ++group_color: prmStonith2-2 allocation score on bl460g8n3: INFINITY ++group_color: prmStonith2-2 allocation score on bl460g8n4: -INFINITY 
++group_color: prmStonith2-2 allocation score on pgsr01: -INFINITY ++group_color: prmStonith2-2 allocation score on pgsr02: -INFINITY ++group_color: vip-master allocation score on bl460g8n3: 0 ++group_color: vip-master allocation score on bl460g8n4: 0 ++group_color: vip-master allocation score on pgsr01: 0 ++group_color: vip-master allocation score on pgsr02: INFINITY ++group_color: vip-rep allocation score on bl460g8n3: 0 ++group_color: vip-rep allocation score on bl460g8n4: 0 ++group_color: vip-rep allocation score on pgsr01: 0 ++group_color: vip-rep allocation score on pgsr02: INFINITY ++native_color: pgsql:0 allocation score on bl460g8n3: -INFINITY ++native_color: pgsql:0 allocation score on bl460g8n4: -INFINITY ++native_color: pgsql:0 allocation score on pgsr01: -INFINITY ++native_color: pgsql:0 allocation score on pgsr02: -INFINITY ++native_color: pgsql:1 allocation score on bl460g8n3: -INFINITY ++native_color: pgsql:1 allocation score on bl460g8n4: -INFINITY ++native_color: pgsql:1 allocation score on pgsr01: INFINITY ++native_color: pgsql:1 allocation score on pgsr02: -INFINITY ++native_color: pgsr01 allocation score on bl460g8n3: INFINITY ++native_color: pgsr01 allocation score on bl460g8n4: -INFINITY ++native_color: pgsr01 allocation score on pgsr01: -INFINITY ++native_color: pgsr01 allocation score on pgsr02: -INFINITY ++native_color: pgsr02 allocation score on bl460g8n3: -INFINITY ++native_color: pgsr02 allocation score on bl460g8n4: -INFINITY ++native_color: pgsr02 allocation score on pgsr01: -INFINITY ++native_color: pgsr02 allocation score on pgsr02: -INFINITY ++native_color: prmDB1 allocation score on bl460g8n3: INFINITY ++native_color: prmDB1 allocation score on bl460g8n4: -INFINITY ++native_color: prmDB1 allocation score on pgsr01: -INFINITY ++native_color: prmDB1 allocation score on pgsr02: -INFINITY ++native_color: prmDB2 allocation score on bl460g8n3: -INFINITY ++native_color: prmDB2 allocation score on bl460g8n4: -INFINITY ++native_color: 
prmDB2 allocation score on pgsr01: -INFINITY ++native_color: prmDB2 allocation score on pgsr02: -INFINITY ++native_color: prmStonith1-2 allocation score on bl460g8n3: -INFINITY ++native_color: prmStonith1-2 allocation score on bl460g8n4: INFINITY ++native_color: prmStonith1-2 allocation score on pgsr01: -INFINITY ++native_color: prmStonith1-2 allocation score on pgsr02: -INFINITY ++native_color: prmStonith2-2 allocation score on bl460g8n3: INFINITY ++native_color: prmStonith2-2 allocation score on bl460g8n4: -INFINITY ++native_color: prmStonith2-2 allocation score on pgsr01: -INFINITY ++native_color: prmStonith2-2 allocation score on pgsr02: -INFINITY ++native_color: vip-master allocation score on bl460g8n3: -INFINITY ++native_color: vip-master allocation score on bl460g8n4: -INFINITY ++native_color: vip-master allocation score on pgsr01: INFINITY ++native_color: vip-master allocation score on pgsr02: -INFINITY ++native_color: vip-rep allocation score on bl460g8n3: -INFINITY ++native_color: vip-rep allocation score on bl460g8n4: -INFINITY ++native_color: vip-rep allocation score on pgsr01: 0 ++native_color: vip-rep allocation score on pgsr02: -INFINITY ++pgsql:0 promotion score on none: 0 ++pgsql:1 promotion score on pgsr01: 10 +diff --git a/pengine/test10/bug-cl-5247.summary b/pengine/test10/bug-cl-5247.summary +new file mode 100644 +index 0000000..5564286 +--- /dev/null ++++ b/pengine/test10/bug-cl-5247.summary +@@ -0,0 +1,96 @@ ++Using the original execution date of: 2015-08-12 02:53:40Z ++ ++Current cluster status: ++Online: [ bl460g8n3 bl460g8n4 ] ++Containers: [ pgsr01:prmDB1 ] ++ ++ prmDB1 (ocf::heartbeat:VirtualDomain): Started bl460g8n3 ++ prmDB2 (ocf::heartbeat:VirtualDomain): FAILED bl460g8n4 ++ Resource Group: grpStonith1 ++ prmStonith1-2 (stonith:external/ipmi): Started bl460g8n4 ++ Resource Group: grpStonith2 ++ prmStonith2-2 (stonith:external/ipmi): Started bl460g8n3 ++ Resource Group: master-group ++ vip-master (ocf::heartbeat:Dummy): FAILED pgsr02 
++ vip-rep (ocf::heartbeat:Dummy): FAILED pgsr02 ++ Master/Slave Set: msPostgresql [pgsql] ++ Masters: [ pgsr01 ] ++ Stopped: [ bl460g8n3 bl460g8n4 ] ++ ++Transition Summary: ++ * Stop prmDB2 (bl460g8n4) ++ * Restart prmStonith1-2 (Started bl460g8n4) ++ * Restart prmStonith2-2 (Started bl460g8n3) ++ * Recover vip-master (Started pgsr02 -> pgsr01) ++ * Recover vip-rep (Started pgsr02 -> pgsr01) ++ * Demote pgsql:0 (Master -> Stopped pgsr02) ++ * Stop pgsr02 (bl460g8n4) ++ ++Executing cluster transition: ++ * Pseudo action: grpStonith1_stop_0 ++ * Pseudo action: prmStonith1-2_stop_0 ++ * Pseudo action: grpStonith2_stop_0 ++ * Resource action: prmStonith2-2 stop on bl460g8n3 ++ * Pseudo action: msPostgresql_pre_notify_demote_0 ++ * Resource action: pgsr02 stop on bl460g8n4 ++ * Resource action: prmDB2 stop on bl460g8n4 ++ * Pseudo action: grpStonith1_stopped_0 ++ * Pseudo action: grpStonith1_start_0 ++ * Resource action: prmStonith1-2 start on bl460g8n4 ++ * Resource action: prmStonith1-2 monitor=3600000 on bl460g8n4 ++ * Pseudo action: grpStonith2_stopped_0 ++ * Pseudo action: grpStonith2_start_0 ++ * Resource action: prmStonith2-2 start on bl460g8n3 ++ * Resource action: prmStonith2-2 monitor=3600000 on bl460g8n3 ++ * Pseudo action: pgsql_post_notify_stop_0 ++ * Resource action: pgsql notify on pgsr01 ++ * Pseudo action: msPostgresql_confirmed-pre_notify_demote_0 ++ * Pseudo action: msPostgresql_demote_0 ++ * Pseudo action: grpStonith1_running_0 ++ * Pseudo action: grpStonith2_running_0 ++ * Pseudo action: pgsql_demote_0 ++ * Pseudo action: msPostgresql_demoted_0 ++ * Pseudo action: msPostgresql_post_notify_demoted_0 ++ * Resource action: pgsql notify on pgsr01 ++ * Pseudo action: msPostgresql_confirmed-post_notify_demoted_0 ++ * Pseudo action: msPostgresql_pre_notify_stop_0 ++ * Pseudo action: master-group_stop_0 ++ * Pseudo action: vip-rep_stop_0 ++ * Resource action: pgsql notify on pgsr01 ++ * Pseudo action: msPostgresql_confirmed-pre_notify_stop_0 ++ * Pseudo 
action: msPostgresql_stop_0 ++ * Pseudo action: vip-master_stop_0 ++ * Pseudo action: pgsql_stop_0 ++ * Pseudo action: msPostgresql_stopped_0 ++ * Pseudo action: master-group_stopped_0 ++ * Pseudo action: master-group_start_0 ++ * Resource action: vip-master start on pgsr01 ++ * Resource action: vip-rep start on pgsr01 ++ * Pseudo action: msPostgresql_post_notify_stopped_0 ++ * Pseudo action: master-group_running_0 ++ * Resource action: vip-master monitor=10000 on pgsr01 ++ * Resource action: vip-rep monitor=10000 on pgsr01 ++ * Resource action: pgsql notify on pgsr01 ++ * Pseudo action: msPostgresql_confirmed-post_notify_stopped_0 ++ * Pseudo action: pgsql_notified_0 ++ * Resource action: pgsql monitor=9000 on pgsr01 ++ * Pseudo action: all_stopped ++Using the original execution date of: 2015-08-12 02:53:40Z ++ ++Revised cluster status: ++Online: [ bl460g8n3 bl460g8n4 ] ++Containers: [ pgsr01:prmDB1 ] ++ ++ prmDB1 (ocf::heartbeat:VirtualDomain): Started bl460g8n3 ++ prmDB2 (ocf::heartbeat:VirtualDomain): FAILED ++ Resource Group: grpStonith1 ++ prmStonith1-2 (stonith:external/ipmi): Started bl460g8n4 ++ Resource Group: grpStonith2 ++ prmStonith2-2 (stonith:external/ipmi): Started bl460g8n3 ++ Resource Group: master-group ++ vip-master (ocf::heartbeat:Dummy): FAILED[ pgsr02 pgsr01 ] ++ vip-rep (ocf::heartbeat:Dummy): FAILED[ pgsr02 pgsr01 ] ++ Master/Slave Set: msPostgresql [pgsql] ++ Masters: [ pgsr01 ] ++ Stopped: [ bl460g8n3 bl460g8n4 ] ++ +diff --git a/pengine/test10/bug-cl-5247.xml b/pengine/test10/bug-cl-5247.xml +new file mode 100644 +index 0000000..c36ef40 +--- /dev/null ++++ b/pengine/test10/bug-cl-5247.xml +@@ -0,0 +1,295 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ 
++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ diff --git a/SOURCES/0008-Fix-tools-memory-leak-in-crm_resource.patch b/SOURCES/0008-Fix-tools-memory-leak-in-crm_resource.patch new file mode 100644 index 0000000..c29561f --- /dev/null +++ b/SOURCES/0008-Fix-tools-memory-leak-in-crm_resource.patch @@ -0,0 +1,33 @@ +From: Ken Gaillot +Date: Mon, 17 Aug 2015 10:28:19 -0500 +Subject: [PATCH] Fix: tools: memory leak in crm_resource + +(cherry picked from commit c11bc4b856b07d5ea5b8284a3d566dd782e6bb7c) +--- + tools/crm_resource_runtime.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c +index f260e19..b9427bc 100644 +--- a/tools/crm_resource_runtime.c ++++ b/tools/crm_resource_runtime.c +@@ -399,9 +399,11 @@ cli_resource_delete_attribute(const char *rsc_id, const char *attr_set, const ch + &local_attr_id); + + if (rc == -ENXIO) { ++ free(lookup_id); + return pcmk_ok; + + } else if (rc != pcmk_ok) { ++ free(lookup_id); + return rc; + } + +@@ -424,6 +426,7 @@ cli_resource_delete_attribute(const char *rsc_id, const char *attr_set, const ch + attr_name ? " name=" : "", attr_name ? 
attr_name : ""); + } + ++ free(lookup_id); + free_xml(xml_obj); + free(local_attr_id); + return rc; diff --git a/SOURCES/0009-Fix-pengine-The-failed-action-of-the-resource-that-o.patch b/SOURCES/0009-Fix-pengine-The-failed-action-of-the-resource-that-o.patch new file mode 100644 index 0000000..1ddba9f --- /dev/null +++ b/SOURCES/0009-Fix-pengine-The-failed-action-of-the-resource-that-o.patch @@ -0,0 +1,31 @@ +From: Hideo Yamauchi +Date: Fri, 21 Aug 2015 14:12:33 +0900 +Subject: [PATCH] Fix: pengine: The failed action of the resource that occurred + in shutdown is not displayed. + +It is like the problem that entered when you summarized an old judgment +in function (record_failed_op) by the next correction. + +* +https://github.com/ClusterLabs/pacemaker/commit/9cd666ac15a2998f4543e1dac33edea36bbcf930#diff-7dae505817fa61e544018e581ee45933 + +(cherry picked from commit 119df5c0bd8fac02bd36e45a28288dcf4624b89d) +--- + lib/pengine/unpack.c | 4 +--- + 1 file changed, 1 insertion(+), 3 deletions(-) + +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index 0f83be4..156a192 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -2546,9 +2546,7 @@ record_failed_op(xmlNode *op, node_t* node, pe_working_set_t * data_set) + xmlNode *xIter = NULL; + const char *op_key = crm_element_value(op, XML_LRM_ATTR_TASK_KEY); + +- if (node->details->shutdown) { +- return; +- } else if(node->details->online == FALSE) { ++ if ((node->details->shutdown) && (node->details->online == FALSE)) { + return; + } + diff --git a/SOURCES/0010-Log-services-Reduce-severity-of-noisy-log-messages.patch b/SOURCES/0010-Log-services-Reduce-severity-of-noisy-log-messages.patch new file mode 100644 index 0000000..40aeb8b --- /dev/null +++ b/SOURCES/0010-Log-services-Reduce-severity-of-noisy-log-messages.patch @@ -0,0 +1,34 @@ +From: "Gao,Yan" +Date: Wed, 26 Aug 2015 18:12:56 +0200 +Subject: [PATCH] Log: services: Reduce severity of noisy log messages + +They occurred for every monitor 
operation of systemd resources. + +(cherry picked from commit a77c401a3fcdedec165c05d27a75d75abcebf4a1) +--- + lib/services/services.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/lib/services/services.c b/lib/services/services.c +index 3f40078..abf1458 100644 +--- a/lib/services/services.c ++++ b/lib/services/services.c +@@ -366,15 +366,15 @@ services_set_op_pending(svc_action_t *op, DBusPendingCall *pending) + if (pending) { + crm_info("Lost pending %s DBus call (%p)", op->id, op->opaque->pending); + } else { +- crm_info("Done with pending %s DBus call (%p)", op->id, op->opaque->pending); ++ crm_trace("Done with pending %s DBus call (%p)", op->id, op->opaque->pending); + } + dbus_pending_call_unref(op->opaque->pending); + } + op->opaque->pending = pending; + if (pending) { +- crm_info("Updated pending %s DBus call (%p)", op->id, pending); ++ crm_trace("Updated pending %s DBus call (%p)", op->id, pending); + } else { +- crm_info("Cleared pending %s DBus call", op->id); ++ crm_trace("Cleared pending %s DBus call", op->id); + } + } + #endif diff --git a/SOURCES/0011-Fix-xml-Mark-xml-nodes-as-dirty-if-any-children-move.patch b/SOURCES/0011-Fix-xml-Mark-xml-nodes-as-dirty-if-any-children-move.patch new file mode 100644 index 0000000..c67a465 --- /dev/null +++ b/SOURCES/0011-Fix-xml-Mark-xml-nodes-as-dirty-if-any-children-move.patch @@ -0,0 +1,24 @@ +From: "Gao,Yan" +Date: Wed, 26 Aug 2015 16:28:38 +0200 +Subject: [PATCH] Fix: xml: Mark xml nodes as dirty if any children move + +Otherwise if nothing else changed in the new xml, even the versions +weren't bumped, crm_diff would output an empty xml diff. 
+ +(cherry picked from commit 1073786ec24f3bbf26a0f6a5b0614a65edac4301) +--- + lib/common/xml.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/lib/common/xml.c b/lib/common/xml.c +index 299c7bf..353eb4b 100644 +--- a/lib/common/xml.c ++++ b/lib/common/xml.c +@@ -4275,6 +4275,7 @@ __xml_diff_object(xmlNode * old, xmlNode * new) + if(p_old != p_new) { + crm_info("%s.%s moved from %d to %d - %d", + new_child->name, ID(new_child), p_old, p_new); ++ __xml_node_dirty(new); + p->flags |= xpf_moved; + + if(p_old > p_new) { diff --git a/SOURCES/0012-Feature-crmd-Implement-reliable-event-notifications.patch b/SOURCES/0012-Feature-crmd-Implement-reliable-event-notifications.patch new file mode 100644 index 0000000..94e3307 --- /dev/null +++ b/SOURCES/0012-Feature-crmd-Implement-reliable-event-notifications.patch @@ -0,0 +1,565 @@ +From: Andrew Beekhof +Date: Tue, 1 Sep 2015 13:17:45 +1000 +Subject: [PATCH] Feature: crmd: Implement reliable event notifications + +(cherry picked from commit 0cd1b8f02b403976afe106e0ca3a8a8a16864c6c) +--- + crmd/Makefile.am | 2 +- + crmd/callbacks.c | 4 + + crmd/control.c | 67 +++++++++++++--- + crmd/crmd_utils.h | 1 + + crmd/lrm.c | 2 + + crmd/notify.c | 188 ++++++++++++++++++++++++++++++++++++++++++++ + crmd/notify.h | 30 +++++++ + crmd/te_utils.c | 2 + + cts/CIB.py | 2 + + extra/pcmk_notify_sample.sh | 68 ++++++++++++++++ + include/crm_internal.h | 1 + + lib/common/utils.c | 27 +++++++ + 12 files changed, 380 insertions(+), 14 deletions(-) + create mode 100644 crmd/notify.c + create mode 100644 crmd/notify.h + create mode 100755 extra/pcmk_notify_sample.sh + +diff --git a/crmd/Makefile.am b/crmd/Makefile.am +index 8e5e1df..984f5d0 100644 +--- a/crmd/Makefile.am ++++ b/crmd/Makefile.am +@@ -28,7 +28,7 @@ noinst_HEADERS = crmd.h crmd_fsa.h crmd_messages.h fsa_defines.h \ + fsa_matrix.h fsa_proto.h crmd_utils.h crmd_callbacks.h \ + crmd_lrm.h te_callbacks.h tengine.h + +-crmd_SOURCES = main.c crmd.c corosync.c \ ++crmd_SOURCES = main.c 
crmd.c corosync.c notify.c \ + fsa.c control.c messages.c membership.c callbacks.c \ + election.c join_client.c join_dc.c subsystems.c throttle.c \ + cib.c pengine.c tengine.c lrm.c lrm_state.c remote_lrmd_ra.c \ +diff --git a/crmd/callbacks.c b/crmd/callbacks.c +index f646927..38fb30b 100644 +--- a/crmd/callbacks.c ++++ b/crmd/callbacks.c +@@ -126,6 +126,7 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d + case crm_status_nstate: + crm_info("%s is now %s (was %s)", + node->uname, state_text(node->state), state_text(data)); ++ + if (safe_str_eq(data, node->state)) { + /* State did not change */ + return; +@@ -147,7 +148,10 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d + } + } + } ++ ++ crmd_notify_node_event(node); + break; ++ + case crm_status_processes: + if (data) { + old = *(const uint32_t *)data; +diff --git a/crmd/control.c b/crmd/control.c +index f4add49..d92f46b 100644 +--- a/crmd/control.c ++++ b/crmd/control.c +@@ -873,28 +873,64 @@ do_recover(long long action, + + /* *INDENT-OFF* */ + pe_cluster_option crmd_opts[] = { +- /* name, old-name, validate, default, description */ +- { "dc-version", NULL, "string", NULL, "none", NULL, "Version of Pacemaker on the cluster's DC.", "Includes the hash which identifies the exact Mercurial changeset it was built from. Used for diagnostic purposes." }, +- { "cluster-infrastructure", NULL, "string", NULL, "heartbeat", NULL, "The messaging stack on which Pacemaker is currently running.", "Used for informational and diagnostic purposes." }, +- { XML_CONFIG_ATTR_DC_DEADTIME, "dc_deadtime", "time", NULL, "20s", &check_time, "How long to wait for a response from other nodes during startup.", "The \"correct\" value will depend on the speed/load of your network and the type of switches used." 
}, ++ /* name, old-name, validate, values, default, short description, long description */ ++ { "dc-version", NULL, "string", NULL, "none", NULL, ++ "Version of Pacemaker on the cluster's DC.", ++ "Includes the hash which identifies the exact changeset it was built from. Used for diagnostic purposes." ++ }, ++ { "cluster-infrastructure", NULL, "string", NULL, "heartbeat", NULL, ++ "The messaging stack on which Pacemaker is currently running.", ++ "Used for informational and diagnostic purposes." }, ++ { XML_CONFIG_ATTR_DC_DEADTIME, "dc_deadtime", "time", NULL, "20s", &check_time, ++ "How long to wait for a response from other nodes during startup.", ++ "The \"correct\" value will depend on the speed/load of your network and the type of switches used." ++ }, + { XML_CONFIG_ATTR_RECHECK, "cluster_recheck_interval", "time", +- "Zero disables polling. Positive values are an interval in seconds (unless other SI units are specified. eg. 5min)", "15min", &check_timer, ++ "Zero disables polling. Positive values are an interval in seconds (unless other SI units are specified. eg. 5min)", ++ "15min", &check_timer, + "Polling interval for time based changes to options, resource parameters and constraints.", + "The Cluster is primarily event driven, however the configuration can have elements that change based on time." +- " To ensure these changes take effect, we can optionally poll the cluster's status for changes." }, ++ " To ensure these changes take effect, we can optionally poll the cluster's status for changes." ++ }, ++ ++ { "notification-script", NULL, "string", NULL, "/dev/null", &check_script, ++ "Notification script to be called after significant cluster events", ++ "Full path to a script that will be invoked when resources start/stop/fail, fencing occurs or nodes join/leave the cluster.\n" ++ "Must exist on all nodes in the cluster." 
++ }, ++ { "notification-target", NULL, "string", NULL, "", NULL, ++ "Destination for notifications (Optional)", ++ "Where should the supplied script send notifications to. Useful to avoid hard-coding this in the script." ++ }, ++ + { "load-threshold", NULL, "percentage", NULL, "80%", &check_utilization, + "The maximum amount of system resources that should be used by nodes in the cluster", + "The cluster will slow down its recovery process when the amount of system resources used" +- " (currently CPU) approaches this limit", }, ++ " (currently CPU) approaches this limit", ++ }, + { "node-action-limit", NULL, "integer", NULL, "0", &check_number, + "The maximum number of jobs that can be scheduled per node. Defaults to 2x cores"}, +- { XML_CONFIG_ATTR_ELECTION_FAIL, "election_timeout", "time", NULL, "2min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." }, +- { XML_CONFIG_ATTR_FORCE_QUIT, "shutdown_escalation", "time", NULL, "20min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." }, +- { "crmd-integration-timeout", NULL, "time", NULL, "3min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." }, +- { "crmd-finalization-timeout", NULL, "time", NULL, "30min", &check_timer, "*** Advanced Use Only ***.", "If you need to adjust this value, it probably indicates the presence of a bug." }, +- { "crmd-transition-delay", NULL, "time", NULL, "0s", &check_timer, "*** Advanced Use Only ***\nEnabling this option will slow down cluster recovery under all conditions", "Delay cluster recovery for the configured interval to allow for additional/related events to occur.\nUseful if your configuration is sensitive to the order in which ping updates arrive." 
}, ++ { XML_CONFIG_ATTR_ELECTION_FAIL, "election_timeout", "time", NULL, "2min", &check_timer, ++ "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." ++ }, ++ { XML_CONFIG_ATTR_FORCE_QUIT, "shutdown_escalation", "time", NULL, "20min", &check_timer, ++ "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." ++ }, ++ { "crmd-integration-timeout", NULL, "time", NULL, "3min", &check_timer, ++ "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." ++ }, ++ { "crmd-finalization-timeout", NULL, "time", NULL, "30min", &check_timer, ++ "*** Advanced Use Only ***.", "If you need to adjust this value, it probably indicates the presence of a bug." ++ }, ++ { "crmd-transition-delay", NULL, "time", NULL, "0s", &check_timer, ++ "*** Advanced Use Only ***\n" ++ "Enabling this option will slow down cluster recovery under all conditions", ++ "Delay cluster recovery for the configured interval to allow for additional/related events to occur.\n" ++ "Useful if your configuration is sensitive to the order in which ping updates arrive." 
++ }, + { "stonith-watchdog-timeout", NULL, "time", NULL, NULL, &check_timer, +- "How long to wait before we can assume nodes are safely down", NULL }, ++ "How long to wait before we can assume nodes are safely down", NULL ++ }, + { "no-quorum-policy", "no_quorum_policy", "enum", "stop, freeze, ignore, suicide", "stop", &check_quorum, NULL, NULL }, + + #if SUPPORT_PLUGIN +@@ -927,6 +963,7 @@ crmd_pref(GHashTable * options, const char *name) + static void + config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) + { ++ const char *script = NULL; + const char *value = NULL; + GHashTable *config_hash = NULL; + crm_time_t *now = crm_time_new(NULL); +@@ -955,6 +992,10 @@ config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void + + verify_crmd_options(config_hash); + ++ script = crmd_pref(config_hash, "notification-script"); ++ value = crmd_pref(config_hash, "notification-target"); ++ crmd_enable_notifications(script, value); ++ + value = crmd_pref(config_hash, XML_CONFIG_ATTR_DC_DEADTIME); + election_trigger->period_ms = crm_get_msec(value); + +diff --git a/crmd/crmd_utils.h b/crmd/crmd_utils.h +index 78214bf..7e8c3e6 100644 +--- a/crmd/crmd_utils.h ++++ b/crmd/crmd_utils.h +@@ -21,6 +21,7 @@ + # include + # include + # include /* For CIB_OP_MODIFY */ ++# include "notify.h" + + # define CLIENT_EXIT_WAIT 30 + # define FAKE_TE_ID "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" +diff --git a/crmd/lrm.c b/crmd/lrm.c +index 418e7cf..48195e8 100644 +--- a/crmd/lrm.c ++++ b/crmd/lrm.c +@@ -2415,6 +2415,8 @@ process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurr + free(prefix); + } + ++ crmd_notify_resource_op(lrm_state->node_name, op); ++ + if (op->rsc_deleted) { + crm_info("Deletion of resource '%s' complete after %s", op->rsc_id, op_key); + delete_rsc_entry(lrm_state, NULL, op->rsc_id, NULL, pcmk_ok, NULL); +diff --git a/crmd/notify.c b/crmd/notify.c +new file mode 100644 +index 0000000..980bfa6 
+--- /dev/null ++++ b/crmd/notify.c +@@ -0,0 +1,188 @@ ++/* ++ * Copyright (C) 2015 Andrew Beekhof ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public ++ * License as published by the Free Software Foundation; either ++ * version 2 of the License, or (at your option) any later version. ++ * ++ * This software is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public ++ * License along with this library; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include ++#include ++#include ++#include "notify.h" ++ ++char *notify_script = NULL; ++char *notify_target = NULL; ++ ++ ++static const char *notify_keys[] = ++{ ++ "CRM_notify_recipient", ++ "CRM_notify_node", ++ "CRM_notify_rsc", ++ "CRM_notify_task", ++ "CRM_notify_interval", ++ "CRM_notify_desc", ++ "CRM_notify_status", ++ "CRM_notify_target_rc", ++ "CRM_notify_rc", ++ "CRM_notify_kind", ++ "CRM_notify_version", ++}; ++ ++ ++void ++crmd_enable_notifications(const char *script, const char *target) ++{ ++ free(notify_script); ++ notify_script = NULL; ++ ++ free(notify_target); ++ notify_target = NULL; ++ ++ if(safe_str_eq(script, "/dev/null")) { ++ crm_notice("Notifications disabled"); ++ return; ++ } ++ ++ notify_script = strdup(script); ++ notify_target = strdup(target); ++ crm_notice("Notifications enabled"); ++} ++ ++static void ++set_notify_key(const char *name, const char *cvalue, char *value) ++{ ++ int lpc; ++ bool found = 0; ++ ++ if(cvalue == NULL) { ++ cvalue = value; ++ } ++ ++ for(lpc = 0; lpc < DIMOF(notify_keys); lpc++) { ++ if(safe_str_eq(name, notify_keys[lpc])) { ++ found = 1; ++ crm_trace("Setting notify 
key %s = '%s'", name, cvalue); ++ setenv(name, cvalue, 1); ++ break; ++ } ++ } ++ ++ CRM_ASSERT(found != 0); ++ free(value); ++} ++ ++ ++static void ++send_notification(const char *kind) ++{ ++ int lpc; ++ pid_t pid; ++ ++ crm_debug("Sending '%s' notification to '%s' via '%s'", kind, notify_target, notify_script); ++ ++ set_notify_key("CRM_notify_recipient", notify_target, NULL); ++ set_notify_key("CRM_notify_kind", kind, NULL); ++ set_notify_key("CRM_notify_version", VERSION, NULL); ++ ++ pid = fork(); ++ if (pid == -1) { ++ crm_perror(LOG_ERR, "notification failed"); ++ } ++ ++ if (pid == 0) { ++ /* crm_debug("notification: I am the child. Executing the notification program."); */ ++ execl(notify_script, notify_script, NULL); ++ exit(EXIT_FAILURE); ++ ++ } else { ++ for(lpc = 0; lpc < DIMOF(notify_keys); lpc++) { ++ unsetenv(notify_keys[lpc]); ++ } ++ } ++} ++ ++void crmd_notify_node_event(crm_node_t *node) ++{ ++ if(notify_script == NULL) { ++ return; ++ } ++ ++ set_notify_key("CRM_notify_node", node->uname, NULL); ++ set_notify_key("CRM_notify_desc", node->state, NULL); ++ ++ send_notification("node"); ++} ++ ++void ++crmd_notify_fencing_op(stonith_event_t * e) ++{ ++ char *desc = NULL; ++ ++ if(notify_script == NULL) { /* no script configured: nothing to notify */ ++ return; ++ } ++ ++ desc = crm_strdup_printf("Operation %s requested by %s for peer %s: %s (ref=%s)", ++ e->operation, e->origin, e->target, pcmk_strerror(e->result), ++ e->id); ++ ++ set_notify_key("CRM_notify_node", e->target, NULL); ++ set_notify_key("CRM_notify_task", e->operation, NULL); ++ set_notify_key("CRM_notify_desc", NULL, desc); ++ set_notify_key("CRM_notify_rc", NULL, crm_itoa(e->result)); ++ ++ send_notification("fencing"); ++} ++ ++void ++crmd_notify_resource_op(const char *node, lrmd_event_data_t * op) ++{ ++ int target_rc = 0; ++ ++ if(notify_script == NULL) { ++ return; ++ } ++ ++ target_rc = rsc_op_expected_rc(op); ++ if(op->interval == 0 && target_rc == op->rc && safe_str_eq(op->op_type, RSC_STATUS)) { ++ /* Leave it up to the
script if they want to notify for ++ * 'failed' probes, only swallow ones for which the result was ++ * unexpected. ++ * ++ * Even if we find a resource running, it was probably because ++ * someone erased the status section. ++ */ ++ return; ++ } ++ ++ set_notify_key("CRM_notify_node", node, NULL); ++ ++ set_notify_key("CRM_notify_rsc", op->rsc_id, NULL); ++ set_notify_key("CRM_notify_task", op->op_type, NULL); ++ set_notify_key("CRM_notify_interval", NULL, crm_itoa(op->interval)); ++ ++ set_notify_key("CRM_notify_target_rc", NULL, crm_itoa(target_rc)); ++ set_notify_key("CRM_notify_status", NULL, crm_itoa(op->op_status)); ++ set_notify_key("CRM_notify_rc", NULL, crm_itoa(op->rc)); ++ ++ if(op->op_status == PCMK_LRM_OP_DONE) { ++ set_notify_key("CRM_notify_desc", services_ocf_exitcode_str(op->rc), NULL); ++ } else { ++ set_notify_key("CRM_notify_desc", services_lrm_status_str(op->op_status), NULL); ++ } ++ ++ send_notification("resource"); ++} ++ +diff --git a/crmd/notify.h b/crmd/notify.h +new file mode 100644 +index 0000000..4b138ea +--- /dev/null ++++ b/crmd/notify.h +@@ -0,0 +1,30 @@ ++/* ++ * Copyright (C) 2015 Andrew Beekhof ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public ++ * License as published by the Free Software Foundation; either ++ * version 2 of the License, or (at your option) any later version. ++ * ++ * This software is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public ++ * License along with this library; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++#ifndef CRMD_NOTIFY__H ++# define CRMD_NOTIFY__H ++ ++# include ++# include ++# include ++ ++void crmd_enable_notifications(const char *script, const char *target); ++void crmd_notify_node_event(crm_node_t *node); ++void crmd_notify_fencing_op(stonith_event_t * e); ++void crmd_notify_resource_op(const char *node, lrmd_event_data_t * op); ++ ++#endif +diff --git a/crmd/te_utils.c b/crmd/te_utils.c +index a1d29f6..22551ba 100644 +--- a/crmd/te_utils.c ++++ b/crmd/te_utils.c +@@ -124,6 +124,8 @@ tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event) + return; + } + ++ crmd_notify_fencing_op(st_event); ++ + if (st_event->result == pcmk_ok && safe_str_eq("on", st_event->action)) { + crm_notice("%s was successfully unfenced by %s (at the request of %s)", + st_event->target, st_event->executioner ? st_event->executioner : "", st_event->origin); +diff --git a/cts/CIB.py b/cts/CIB.py +index 8fbba6c..cd3a6a1 100644 +--- a/cts/CIB.py ++++ b/cts/CIB.py +@@ -219,6 +219,8 @@ class CIB11(ConfigBase): + o["dc-deadtime"] = "5s" + o["no-quorum-policy"] = no_quorum + o["expected-quorum-votes"] = self.num_nodes ++ o["notification-script"] = "/var/lib/pacemaker/notify.sh" ++ o["notification-target"] = "/var/lib/pacemaker/notify.log" + + if self.CM.Env["DoBSC"] == 1: + o["ident-string"] = "Linux-HA TEST configuration file - REMOVEME!!" 
+diff --git a/extra/pcmk_notify_sample.sh b/extra/pcmk_notify_sample.sh +new file mode 100755 +index 0000000..83cf8e9 +--- /dev/null ++++ b/extra/pcmk_notify_sample.sh +@@ -0,0 +1,68 @@ ++#!/bin/bash ++# ++# Copyright (C) 2015 Andrew Beekhof ++# ++# This program is free software; you can redistribute it and/or ++# modify it under the terms of the GNU General Public ++# License as published by the Free Software Foundation; either ++# version 2 of the License, or (at your option) any later version. ++# ++# This software is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++# General Public License for more details. ++# ++# You should have received a copy of the GNU General Public ++# License along with this library; if not, write to the Free Software ++# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ ++if [ -z $CRM_notify_version ]; then ++ echo "Pacemaker version 1.1.14 is required" >> ${CRM_notify_recipient} ++ exit 0 ++fi ++ ++case $CRM_notify_kind in ++ node) ++ echo "Node '${CRM_notify_node}' is now '${CRM_notify_desc}'" >> ${CRM_notify_recipient} ++ ;; ++ fencing) ++ # Other keys: ++ # ++ # CRM_notify_node ++ # CRM_notify_task ++ # CRM_notify_rc ++ # ++ echo "Fencing ${CRM_notify_desc}" >> ${CRM_notify_recipient} ++ ;; ++ resource) ++ # Other keys: ++ # ++ # CRM_notify_target_rc ++ # CRM_notify_status ++ # CRM_notify_rc ++ # ++ if [ ${CRM_notify_interval} = "0" ]; then ++ CRM_notify_interval="" ++ else ++ CRM_notify_interval=" (${CRM_notify_interval})" ++ fi ++ ++ if [ ${CRM_notify_target_rc} = "0" ]; then ++ CRM_notify_target_rc="" ++ else ++ CRM_notify_target_rc=" (target: ${CRM_notify_target_rc})" ++ fi ++ ++ case ${CRM_notify_desc} in ++ Cancelled) ;; ++ *) ++ echo "Resource operation '${CRM_notify_task}${CRM_notify_interval}' for '${CRM_notify_rsc}' on '${CRM_notify_node}': 
${CRM_notify_desc}${CRM_notify_target_rc}" >> ${CRM_notify_recipient} ++ ;; ++ esac ++ ;; ++ *) ++ echo "Unhandled $CRM_notify_kind notification" >> ${CRM_notify_recipient} ++ env | grep CRM_notify >> ${CRM_notify_recipient} ++ ;; ++ ++esac +diff --git a/include/crm_internal.h b/include/crm_internal.h +index c13bc7b..fb03537 100644 +--- a/include/crm_internal.h ++++ b/include/crm_internal.h +@@ -127,6 +127,7 @@ gboolean check_timer(const char *value); + gboolean check_boolean(const char *value); + gboolean check_number(const char *value); + gboolean check_quorum(const char *value); ++gboolean check_script(const char *value); + gboolean check_utilization(const char *value); + + /* Shared PE/crmd functionality */ +diff --git a/lib/common/utils.c b/lib/common/utils.c +index 6a234dc..628cf2f 100644 +--- a/lib/common/utils.c ++++ b/lib/common/utils.c +@@ -180,6 +180,33 @@ check_quorum(const char *value) + } + + gboolean ++check_script(const char *value) ++{ ++ struct stat st; ++ ++ if(safe_str_eq(value, "/dev/null")) { ++ return TRUE; ++ } ++ ++ if(stat(value, &st) != 0) { ++ crm_err("Script %s does not exist", value); ++ return FALSE; ++ } ++ ++ if(S_ISREG(st.st_mode) == 0) { ++ crm_err("Script %s is not a regular file", value); ++ return FALSE; ++ } ++ ++ if( (st.st_mode & (S_IXUSR | S_IXGRP )) == 0) { ++ crm_err("Script %s is not executable", value); ++ return FALSE; ++ } ++ ++ return TRUE; ++} ++ ++gboolean + check_utilization(const char *value) + { + char *end = NULL; diff --git a/SOURCES/0013-Fix-cman-Suppress-implied-node-names.patch b/SOURCES/0013-Fix-cman-Suppress-implied-node-names.patch new file mode 100644 index 0000000..eb14b0d --- /dev/null +++ b/SOURCES/0013-Fix-cman-Suppress-implied-node-names.patch @@ -0,0 +1,47 @@ +From: Andrew Beekhof +Date: Wed, 2 Sep 2015 12:08:52 +1000 +Subject: [PATCH] Fix: cman: Suppress implied node names + +(cherry picked from commit e94fbcd0c49db9d3c69b7c0e478ba89a4d360dde) +--- + tools/crm_node.c | 20 +++++++++++++++++++- + 1 
file changed, 19 insertions(+), 1 deletion(-) + +diff --git a/tools/crm_node.c b/tools/crm_node.c +index d0195e3..24cc4d7 100644 +--- a/tools/crm_node.c ++++ b/tools/crm_node.c +@@ -434,6 +434,21 @@ try_heartbeat(int command, enum cluster_type_e stack) + #if SUPPORT_CMAN + # include + # define MAX_NODES 256 ++static bool valid_cman_name(const char *name, uint32_t nodeid) ++{ ++ bool rc = TRUE; ++ ++ /* Yes, %d, because that's what CMAN does */ ++ char *fakename = crm_strdup_printf("Node%d", nodeid); ++ ++ if(crm_str_eq(fakename, name, TRUE)) { ++ rc = FALSE; ++ crm_notice("Ignoring inferred name from cman: %s", fakename); ++ } ++ free(fakename); ++ return rc; ++} ++ + static gboolean + try_cman(int command, enum cluster_type_e stack) + { +@@ -478,7 +493,10 @@ try_cman(int command, enum cluster_type_e stack) + } + + for (lpc = 0; lpc < node_count; lpc++) { +- if (command == 'l') { ++ if(valid_cman_name(cman_nodes[lpc].cn_name, cman_nodes[lpc].cn_nodeid) == FALSE) { ++ /* Do not print */ ++ ++ } if (command == 'l') { + printf("%s ", cman_nodes[lpc].cn_name); + + } else if (cman_nodes[lpc].cn_nodeid != 0 && cman_nodes[lpc].cn_member) { diff --git a/SOURCES/0014-Fix-crmd-Choose-more-appropriate-names-for-notificat.patch b/SOURCES/0014-Fix-crmd-Choose-more-appropriate-names-for-notificat.patch new file mode 100644 index 0000000..2a12849 --- /dev/null +++ b/SOURCES/0014-Fix-crmd-Choose-more-appropriate-names-for-notificat.patch @@ -0,0 +1,58 @@ +From: Andrew Beekhof +Date: Wed, 2 Sep 2015 14:32:40 +1000 +Subject: [PATCH] Fix: crmd: Choose more appropriate names for notification + options + +(cherry picked from commit 8971ef024ffebf3d0240b30e620697a7b58232c4) +--- + crmd/control.c | 12 ++++++------ + cts/CIB.py | 4 ++-- + 2 files changed, 8 insertions(+), 8 deletions(-) + +diff --git a/crmd/control.c b/crmd/control.c +index d92f46b..d1f9acd 100644 +--- a/crmd/control.c ++++ b/crmd/control.c +@@ -893,12 +893,12 @@ pe_cluster_option crmd_opts[] = { + " To ensure these 
changes take effect, we can optionally poll the cluster's status for changes." + }, + +- { "notification-script", NULL, "string", NULL, "/dev/null", &check_script, +- "Notification script to be called after significant cluster events", +- "Full path to a script that will be invoked when resources start/stop/fail, fencing occurs or nodes join/leave the cluster.\n" ++ { "notification-agent", NULL, "string", NULL, "/dev/null", &check_script, ++ "Notification script or tool to be called after significant cluster events", ++ "Full path to a script or binary that will be invoked when resources start/stop/fail, fencing occurs or nodes join/leave the cluster.\n" + "Must exist on all nodes in the cluster." + }, +- { "notification-target", NULL, "string", NULL, "", NULL, ++ { "notification-recipient", NULL, "string", NULL, "", NULL, + "Destination for notifications (Optional)", + "Where should the supplied script send notifications to. Useful to avoid hard-coding this in the script." + }, +@@ -992,8 +992,8 @@ config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void + + verify_crmd_options(config_hash); + +- script = crmd_pref(config_hash, "notification-script"); +- value = crmd_pref(config_hash, "notification-target"); ++ script = crmd_pref(config_hash, "notification-agent"); ++ value = crmd_pref(config_hash, "notification-recipient"); + crmd_enable_notifications(script, value); + + value = crmd_pref(config_hash, XML_CONFIG_ATTR_DC_DEADTIME); +diff --git a/cts/CIB.py b/cts/CIB.py +index cd3a6a1..0933ccd 100644 +--- a/cts/CIB.py ++++ b/cts/CIB.py +@@ -219,8 +219,8 @@ class CIB11(ConfigBase): + o["dc-deadtime"] = "5s" + o["no-quorum-policy"] = no_quorum + o["expected-quorum-votes"] = self.num_nodes +- o["notification-script"] = "/var/lib/pacemaker/notify.sh" +- o["notification-target"] = "/var/lib/pacemaker/notify.log" ++ o["notification-agent"] = "/var/lib/pacemaker/notify.sh" ++ o["notification-recipient"] = "/var/lib/pacemaker/notify.log" + + if 
self.CM.Env["DoBSC"] == 1: + o["ident-string"] = "Linux-HA TEST configuration file - REMOVEME!!" diff --git a/SOURCES/0015-Fix-crmd-Correctly-enable-disable-notifications.patch b/SOURCES/0015-Fix-crmd-Correctly-enable-disable-notifications.patch new file mode 100644 index 0000000..575f6ea --- /dev/null +++ b/SOURCES/0015-Fix-crmd-Correctly-enable-disable-notifications.patch @@ -0,0 +1,22 @@ +From: Andrew Beekhof +Date: Wed, 2 Sep 2015 14:48:17 +1000 +Subject: [PATCH] Fix: crmd: Correctly enable/disable notifications + +(cherry picked from commit 7368cf120cd5ee848d2bdcd788497a3b89616b05) +--- + crmd/notify.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/crmd/notify.c b/crmd/notify.c +index 980bfa6..ccf5ea8 100644 +--- a/crmd/notify.c ++++ b/crmd/notify.c +@@ -50,7 +50,7 @@ crmd_enable_notifications(const char *script, const char *target) + free(notify_target); + notify_target = NULL; + +- if(safe_str_eq(script, "/dev/null")) { ++ if(script == NULL || safe_str_eq(script, "/dev/null")) { + crm_notice("Notifications disabled"); + return; + } diff --git a/SOURCES/0016-Fix-crmd-Report-the-completion-status-and-output-of-.patch b/SOURCES/0016-Fix-crmd-Report-the-completion-status-and-output-of-.patch new file mode 100644 index 0000000..e7bc0e3 --- /dev/null +++ b/SOURCES/0016-Fix-crmd-Report-the-completion-status-and-output-of-.patch @@ -0,0 +1,109 @@ +From: Andrew Beekhof +Date: Wed, 2 Sep 2015 14:34:04 +1000 +Subject: [PATCH] Fix: crmd: Report the completion status and output of + notifications + +(cherry picked from commit 0c303d8a6f9f9a9dbec9f6d2e9e799fe335f8eaa) +--- + crmd/notify.c | 37 ++++++++++++++++++++++++------------- + lib/services/services.c | 4 ++-- + 2 files changed, 26 insertions(+), 15 deletions(-) + +diff --git a/crmd/notify.c b/crmd/notify.c +index ccf5ea8..ca2be0f 100644 +--- a/crmd/notify.c ++++ b/crmd/notify.c +@@ -29,6 +29,7 @@ static const char *notify_keys[] = + { + "CRM_notify_recipient", + "CRM_notify_node", ++ 
"CRM_notify_nodeid", + "CRM_notify_rsc", + "CRM_notify_task", + "CRM_notify_interval", +@@ -83,12 +84,21 @@ set_notify_key(const char *name, const char *cvalue, char *value) + free(value); + } + ++static void crmd_notify_complete(svc_action_t *op) ++{ ++ if(op->rc == 0) { ++ crm_info("Notification %d (%s) complete", op->sequence, op->agent); ++ } else { ++ crm_warn("Notification %d (%s) failed: %d", op->sequence, op->agent, op->rc); ++ } ++} + + static void + send_notification(const char *kind) + { + int lpc; +- pid_t pid; ++ svc_action_t *notify = NULL; ++ static int operations = 0; + + crm_debug("Sending '%s' notification to '%s' via '%s'", kind, notify_target, notify_script); + +@@ -96,20 +106,20 @@ send_notification(const char *kind) + set_notify_key("CRM_notify_kind", kind, NULL); + set_notify_key("CRM_notify_version", VERSION, NULL); + +- pid = fork(); +- if (pid == -1) { +- crm_perror(LOG_ERR, "notification failed"); +- } ++ notify = services_action_create_generic(notify_script, NULL); + +- if (pid == 0) { +- /* crm_debug("notification: I am the child. 
Executing the nofitication program."); */ +- execl(notify_script, notify_script, NULL); +- exit(EXIT_FAILURE); ++ notify->timeout = 300; ++ notify->standard = strdup("event"); ++ notify->id = strdup(notify_script); ++ notify->agent = strdup(notify_script); ++ notify->sequence = ++operations; + +- } else { +- for(lpc = 0; lpc < DIMOF(notify_keys); lpc++) { +- unsetenv(notify_keys[lpc]); +- } ++ if(services_action_async(notify, &crmd_notify_complete) == FALSE) { ++ services_action_free(notify); ++ } ++ ++ for(lpc = 0; lpc < DIMOF(notify_keys); lpc++) { ++ unsetenv(notify_keys[lpc]); + } + } + +@@ -120,6 +130,7 @@ void crmd_notify_node_event(crm_node_t *node) + } + + set_notify_key("CRM_notify_node", node->uname, NULL); ++ set_notify_key("CRM_notify_nodeid", NULL, crm_itoa(node->id)); + set_notify_key("CRM_notify_desc", node->state, NULL); + + send_notification("node"); +diff --git a/lib/services/services.c b/lib/services/services.c +index abf1458..4609a7d 100644 +--- a/lib/services/services.c ++++ b/lib/services/services.c +@@ -598,7 +598,7 @@ action_async_helper(svc_action_t * op) { + } + + /* keep track of ops that are in-flight to avoid collisions in the same namespace */ +- if (res) { ++ if (op->rsc && res) { + inflight_ops = g_list_append(inflight_ops, op); + } + +@@ -622,7 +622,7 @@ services_action_async(svc_action_t * op, void (*action_callback) (svc_action_t * + g_hash_table_replace(recurring_actions, op->id, op); + } + +- if (is_op_blocked(op->rsc)) { ++ if (op->rsc && is_op_blocked(op->rsc)) { + blocked_ops = g_list_append(blocked_ops, op); + return TRUE; + } diff --git a/SOURCES/0017-Fix-cman-Print-the-nodeid-of-nodes-with-fake-names.patch b/SOURCES/0017-Fix-cman-Print-the-nodeid-of-nodes-with-fake-names.patch new file mode 100644 index 0000000..b627349 --- /dev/null +++ b/SOURCES/0017-Fix-cman-Print-the-nodeid-of-nodes-with-fake-names.patch @@ -0,0 +1,23 @@ +From: Andrew Beekhof +Date: Thu, 3 Sep 2015 10:58:59 +1000 +Subject: [PATCH] Fix: cman: Print the 
nodeid of nodes with fake names + +(cherry picked from commit dd9a379408aa43b89c81d31ce7efa60b2e77f593) +--- + tools/crm_node.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/tools/crm_node.c b/tools/crm_node.c +index 24cc4d7..ed02ee7 100644 +--- a/tools/crm_node.c ++++ b/tools/crm_node.c +@@ -494,7 +494,8 @@ try_cman(int command, enum cluster_type_e stack) + + for (lpc = 0; lpc < node_count; lpc++) { + if(valid_cman_name(cman_nodes[lpc].cn_name, cman_nodes[lpc].cn_nodeid) == FALSE) { +- /* Do not print */ ++ /* The name was invented, but we need to print something, make it the id instead */ ++ printf("%u ", cman_nodes[lpc].cn_nodeid); + + } if (command == 'l') { + printf("%s ", cman_nodes[lpc].cn_name); diff --git a/SOURCES/0018-Refactor-Tools-Isolate-the-paths-which-truely-requir.patch b/SOURCES/0018-Refactor-Tools-Isolate-the-paths-which-truely-requir.patch new file mode 100644 index 0000000..2fbd35e --- /dev/null +++ b/SOURCES/0018-Refactor-Tools-Isolate-the-paths-which-truely-requir.patch @@ -0,0 +1,299 @@ +From: Andrew Beekhof +Date: Thu, 3 Sep 2015 11:36:21 +1000 +Subject: [PATCH] Refactor: Tools: Isolate the paths which truely require + corosync-2.x + +(cherry picked from commit 32c05b99f6a3e953668dcda71ce24e03927d83cb) +--- + tools/crm_node.c | 243 +++++++++++++++++++++++++++++++------------------------ + 1 file changed, 139 insertions(+), 104 deletions(-) + +diff --git a/tools/crm_node.c b/tools/crm_node.c +index ed02ee7..308d4f9 100644 +--- a/tools/crm_node.c ++++ b/tools/crm_node.c +@@ -60,6 +60,9 @@ static struct crm_option long_options[] = { + #if SUPPORT_COROSYNC + {"openais", 0, 0, 'A', "\tOnly try connecting to an OpenAIS-based cluster"}, + #endif ++#ifdef SUPPORT_CS_QUORUM ++ {"corosync", 0, 0, 'C', "\tOnly try connecting to an Corosync-based cluster"}, ++#endif + #ifdef SUPPORT_HEARTBEAT + {"heartbeat", 0, 0, 'H', "Only try connecting to a Heartbeat-based cluster"}, + #endif +@@ -223,6 +226,138 @@ int 
tools_remove_node_cache(const char *node, const char *target) + return rc > 0 ? 0 : rc; + } + ++static gint ++compare_node_uname(gconstpointer a, gconstpointer b) ++{ ++ const crm_node_t *a_node = a; ++ const crm_node_t *b_node = b; ++ return strcmp(a_node->uname?a_node->uname:"", b_node->uname?b_node->uname:""); ++} ++ ++static int ++node_mcp_dispatch(const char *buffer, ssize_t length, gpointer userdata) ++{ ++ xmlNode *msg = string2xml(buffer); ++ ++ if (msg) { ++ xmlNode *node = NULL; ++ GListPtr nodes = NULL; ++ GListPtr iter = NULL; ++ ++ crm_log_xml_trace(msg, "message"); ++ ++ for (node = __xml_first_child(msg); node != NULL; node = __xml_next(node)) { ++ crm_node_t *peer = calloc(1, sizeof(crm_node_t)); ++ ++ nodes = g_list_insert_sorted(nodes, peer, compare_node_uname); ++ peer->uname = (char*)crm_element_value_copy(node, "uname"); ++ peer->state = (char*)crm_element_value_copy(node, "state"); ++ crm_element_value_int(node, "id", (int*)&peer->id); ++ } ++ ++ for(iter = nodes; iter; iter = iter->next) { ++ crm_node_t *peer = iter->data; ++ if (command == 'l') { ++ fprintf(stdout, "%u %s %s\n", peer->id, peer->uname, peer->state); ++ ++ } else if (command == 'p') { ++ if(safe_str_eq(peer->state, CRM_NODE_MEMBER)) { ++ fprintf(stdout, "%s ", peer->uname); ++ } ++ ++ } else if (command == 'i') { ++ if(safe_str_eq(peer->state, CRM_NODE_MEMBER)) { ++ fprintf(stdout, "%u ", peer->id); ++ } ++ } ++ } ++ ++ g_list_free_full(nodes, free); ++ free_xml(msg); ++ ++ if (command == 'p') { ++ fprintf(stdout, "\n"); ++ } ++ ++ crm_exit(pcmk_ok); ++ } ++ ++ return 0; ++} ++ ++static void ++node_mcp_destroy(gpointer user_data) ++{ ++ crm_exit(ENOTCONN); ++} ++ ++static gboolean ++try_pacemaker(int command, enum cluster_type_e stack) ++{ ++ struct ipc_client_callbacks node_callbacks = { ++ .dispatch = node_mcp_dispatch, ++ .destroy = node_mcp_destroy ++ }; ++ ++ if (stack == pcmk_cluster_heartbeat) { ++ /* Nothing to do for them */ ++ return FALSE; ++ } ++ ++ switch 
(command) { ++ case 'e': ++ /* Age only applies to heartbeat clusters */ ++ fprintf(stdout, "1\n"); ++ crm_exit(pcmk_ok); ++ ++ case 'q': ++ /* Implement one day? ++ * Wouldn't be much for pacemakerd to track it and include in the poke reply ++ */ ++ return FALSE; ++ ++ case 'R': ++ { ++ int lpc = 0; ++ const char *daemons[] = { ++ CRM_SYSTEM_CRMD, ++ "stonith-ng", ++ T_ATTRD, ++ CRM_SYSTEM_MCP, ++ }; ++ ++ for(lpc = 0; lpc < DIMOF(daemons); lpc++) { ++ if (tools_remove_node_cache(target_uname, daemons[lpc])) { ++ crm_err("Failed to connect to %s to remove node '%s'", daemons[lpc], target_uname); ++ crm_exit(pcmk_err_generic); ++ } ++ } ++ crm_exit(pcmk_ok); ++ } ++ break; ++ ++ case 'i': ++ case 'l': ++ case 'p': ++ /* Go to pacemakerd */ ++ { ++ GMainLoop *amainloop = g_main_new(FALSE); ++ mainloop_io_t *ipc = ++ mainloop_add_ipc_client(CRM_SYSTEM_MCP, G_PRIORITY_DEFAULT, 0, NULL, &node_callbacks); ++ if (ipc != NULL) { ++ /* Sending anything will get us a list of nodes */ ++ xmlNode *poke = create_xml_node(NULL, "poke"); ++ ++ crm_ipc_send(mainloop_get_ipc_client(ipc), poke, 0, 0, NULL); ++ free_xml(poke); ++ g_main_run(amainloop); ++ } ++ } ++ break; ++ } ++ return FALSE; ++} ++ + #if SUPPORT_HEARTBEAT + # include + # include +@@ -626,66 +761,6 @@ ais_membership_dispatch(cpg_handle_t handle, + # include + # include + +-static gint +-compare_node_uname(gconstpointer a, gconstpointer b) +-{ +- const crm_node_t *a_node = a; +- const crm_node_t *b_node = b; +- return strcmp(a_node->uname?a_node->uname:"", b_node->uname?b_node->uname:""); +-} +- +-static int +-node_mcp_dispatch(const char *buffer, ssize_t length, gpointer userdata) +-{ +- xmlNode *msg = string2xml(buffer); +- +- if (msg) { +- xmlNode *node = NULL; +- GListPtr nodes = NULL; +- GListPtr iter = NULL; +- +- crm_log_xml_trace(msg, "message"); +- +- for (node = __xml_first_child(msg); node != NULL; node = __xml_next(node)) { +- crm_node_t *peer = calloc(1, sizeof(crm_node_t)); +- +- nodes = 
g_list_insert_sorted(nodes, peer, compare_node_uname); +- peer->uname = (char*)crm_element_value_copy(node, "uname"); +- peer->state = (char*)crm_element_value_copy(node, "state"); +- crm_element_value_int(node, "id", (int*)&peer->id); +- } +- +- for(iter = nodes; iter; iter = iter->next) { +- crm_node_t *peer = iter->data; +- if (command == 'l') { +- fprintf(stdout, "%u %s\n", peer->id, peer->uname); +- +- } else if (command == 'p') { +- if(safe_str_eq(peer->state, CRM_NODE_MEMBER)) { +- fprintf(stdout, "%s ", peer->uname); +- } +- } +- } +- +- g_list_free_full(nodes, free); +- free_xml(msg); +- +- if (command == 'p') { +- fprintf(stdout, "\n"); +- } +- +- crm_exit(pcmk_ok); +- } +- +- return 0; +-} +- +-static void +-node_mcp_destroy(gpointer user_data) +-{ +- crm_exit(ENOTCONN); +-} +- + static gboolean + try_corosync(int command, enum cluster_type_e stack) + { +@@ -696,36 +771,7 @@ try_corosync(int command, enum cluster_type_e stack) + cpg_handle_t c_handle = 0; + quorum_handle_t q_handle = 0; + +- mainloop_io_t *ipc = NULL; +- GMainLoop *amainloop = NULL; +- const char *daemons[] = { +- CRM_SYSTEM_CRMD, +- "stonith-ng", +- T_ATTRD, +- CRM_SYSTEM_MCP, +- }; +- +- struct ipc_client_callbacks node_callbacks = { +- .dispatch = node_mcp_dispatch, +- .destroy = node_mcp_destroy +- }; +- + switch (command) { +- case 'R': +- for(rc = 0; rc < DIMOF(daemons); rc++) { +- if (tools_remove_node_cache(target_uname, daemons[rc])) { +- crm_err("Failed to connect to %s to remove node '%s'", daemons[rc], target_uname); +- crm_exit(pcmk_err_generic); +- } +- } +- crm_exit(pcmk_ok); +- break; +- +- case 'e': +- /* Age makes no sense (yet) in an AIS cluster */ +- fprintf(stdout, "1\n"); +- crm_exit(pcmk_ok); +- + case 'q': + /* Go direct to the Quorum API */ + rc = quorum_initialize(&q_handle, NULL, &quorum_type); +@@ -766,21 +812,8 @@ try_corosync(int command, enum cluster_type_e stack) + cpg_finalize(c_handle); + crm_exit(pcmk_ok); + +- case 'l': +- case 'p': +- /* Go to 
pacemakerd */ +- amainloop = g_main_new(FALSE); +- ipc = +- mainloop_add_ipc_client(CRM_SYSTEM_MCP, G_PRIORITY_DEFAULT, 0, NULL, +- &node_callbacks); +- if (ipc != NULL) { +- /* Sending anything will get us a list of nodes */ +- xmlNode *poke = create_xml_node(NULL, "poke"); +- +- crm_ipc_send(mainloop_get_ipc_client(ipc), poke, 0, 0, NULL); +- free_xml(poke); +- g_main_run(amainloop); +- } ++ default: ++ try_pacemaker(command, stack); + break; + } + return FALSE; +@@ -963,5 +996,7 @@ main(int argc, char **argv) + } + #endif + ++ try_pacemaker(command, try_stack); ++ + return (1); + } diff --git a/SOURCES/0019-Fix-corosync-Display-node-state-and-quorum-data-if-a.patch b/SOURCES/0019-Fix-corosync-Display-node-state-and-quorum-data-if-a.patch new file mode 100644 index 0000000..b7822e3 --- /dev/null +++ b/SOURCES/0019-Fix-corosync-Display-node-state-and-quorum-data-if-a.patch @@ -0,0 +1,94 @@ +From: Andrew Beekhof +Date: Thu, 3 Sep 2015 12:27:59 +1000 +Subject: [PATCH] Fix: corosync: Display node state and quorum data if + available + +(cherry picked from commit 4d4c92e515bbaf74917a311e19d5995b30c29430) +--- + mcp/pacemaker.c | 7 +++++++ + tools/crm_node.c | 17 ++++++++++------- + 2 files changed, 17 insertions(+), 7 deletions(-) + +diff --git a/mcp/pacemaker.c b/mcp/pacemaker.c +index f9fc015..9c3195e 100644 +--- a/mcp/pacemaker.c ++++ b/mcp/pacemaker.c +@@ -35,6 +35,8 @@ + + #include + #include ++ ++gboolean pcmk_quorate = FALSE; + gboolean fatal_error = FALSE; + GMainLoop *mainloop = NULL; + +@@ -560,6 +562,10 @@ update_process_clients(crm_client_t *client) + crm_node_t *node = NULL; + xmlNode *update = create_xml_node(NULL, "nodes"); + ++ if (is_corosync_cluster()) { ++ crm_xml_add_int(update, "quorate", pcmk_quorate); ++ } ++ + g_hash_table_iter_init(&iter, crm_peer_cache); + while (g_hash_table_iter_next(&iter, NULL, (gpointer *) & node)) { + xmlNode *xml = create_xml_node(update, "node"); +@@ -896,6 +902,7 @@ static gboolean + mcp_quorum_callback(unsigned long 
long seq, gboolean quorate) + { + /* Nothing to do */ ++ pcmk_quorate = quorate; + return TRUE; + } + +diff --git a/tools/crm_node.c b/tools/crm_node.c +index 308d4f9..9626120 100644 +--- a/tools/crm_node.c ++++ b/tools/crm_node.c +@@ -243,8 +243,16 @@ node_mcp_dispatch(const char *buffer, ssize_t length, gpointer userdata) + xmlNode *node = NULL; + GListPtr nodes = NULL; + GListPtr iter = NULL; ++ const char *quorate = crm_element_value(msg, "quorate"); + + crm_log_xml_trace(msg, "message"); ++ if (command == 'q' && quorate != NULL) { ++ fprintf(stdout, "%s\n", quorate); ++ crm_exit(pcmk_ok); ++ ++ } else if(command == 'q') { ++ crm_exit(1); ++ } + + for (node = __xml_first_child(msg); node != NULL; node = __xml_next(node)) { + crm_node_t *peer = calloc(1, sizeof(crm_node_t)); +@@ -258,7 +266,7 @@ node_mcp_dispatch(const char *buffer, ssize_t length, gpointer userdata) + for(iter = nodes; iter; iter = iter->next) { + crm_node_t *peer = iter->data; + if (command == 'l') { +- fprintf(stdout, "%u %s %s\n", peer->id, peer->uname, peer->state); ++ fprintf(stdout, "%u %s %s\n", peer->id, peer->uname, peer->state?peer->state:""); + + } else if (command == 'p') { + if(safe_str_eq(peer->state, CRM_NODE_MEMBER)) { +@@ -310,12 +318,6 @@ try_pacemaker(int command, enum cluster_type_e stack) + fprintf(stdout, "1\n"); + crm_exit(pcmk_ok); + +- case 'q': +- /* Implement one day? 
+- * Wouldn't be much for pacemakerd to track it and include in the poke reply +- */ +- return FALSE; +- + case 'R': + { + int lpc = 0; +@@ -338,6 +340,7 @@ try_pacemaker(int command, enum cluster_type_e stack) + + case 'i': + case 'l': ++ case 'q': + case 'p': + /* Go to pacemakerd */ + { diff --git a/SOURCES/0020-Fix-pacemakerd-Do-not-forget-about-nodes-that-leave-.patch b/SOURCES/0020-Fix-pacemakerd-Do-not-forget-about-nodes-that-leave-.patch new file mode 100644 index 0000000..e2da8a5 --- /dev/null +++ b/SOURCES/0020-Fix-pacemakerd-Do-not-forget-about-nodes-that-leave-.patch @@ -0,0 +1,23 @@ +From: Andrew Beekhof +Date: Thu, 3 Sep 2015 13:27:57 +1000 +Subject: [PATCH] Fix: pacemakerd: Do not forget about nodes that leave the + cluster + +(cherry picked from commit 2ac396ae6f54c9437bcf786eeccf94d4e2fdd77a) +--- + mcp/pacemaker.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/mcp/pacemaker.c b/mcp/pacemaker.c +index 9c3195e..88a6a1f 100644 +--- a/mcp/pacemaker.c ++++ b/mcp/pacemaker.c +@@ -1108,6 +1108,8 @@ main(int argc, char **argv) + cluster.cpg.cpg_deliver_fn = mcp_cpg_deliver; + cluster.cpg.cpg_confchg_fn = mcp_cpg_membership; + ++ crm_set_autoreap(FALSE); ++ + if(cluster_connect_cpg(&cluster) == FALSE) { + crm_err("Couldn't connect to Corosync's CPG service"); + rc = -ENOPROTOOPT; diff --git a/SOURCES/0021-Fix-pacemakerd-Track-node-state-in-pacemakerd.patch b/SOURCES/0021-Fix-pacemakerd-Track-node-state-in-pacemakerd.patch new file mode 100644 index 0000000..b2814a8 --- /dev/null +++ b/SOURCES/0021-Fix-pacemakerd-Track-node-state-in-pacemakerd.patch @@ -0,0 +1,58 @@ +From: Andrew Beekhof +Date: Thu, 3 Sep 2015 14:29:27 +1000 +Subject: [PATCH] Fix: pacemakerd: Track node state in pacemakerd + +(cherry picked from commit c186f54241c49bf20b1620767933b006063d613c) +--- + mcp/pacemaker.c | 22 +++++++++++++++++++++- + 1 file changed, 21 insertions(+), 1 deletion(-) + +diff --git a/mcp/pacemaker.c b/mcp/pacemaker.c +index 88a6a1f..9f00a21 100644 +--- 
a/mcp/pacemaker.c ++++ b/mcp/pacemaker.c +@@ -901,7 +901,6 @@ mcp_cpg_membership(cpg_handle_t handle, + static gboolean + mcp_quorum_callback(unsigned long long seq, gboolean quorate) + { +- /* Nothing to do */ + pcmk_quorate = quorate; + return TRUE; + } +@@ -909,8 +908,23 @@ mcp_quorum_callback(unsigned long long seq, gboolean quorate) + static void + mcp_quorum_destroy(gpointer user_data) + { ++ crm_info("connection lost"); ++} ++ ++#if SUPPORT_CMAN ++static gboolean ++mcp_cman_dispatch(unsigned long long seq, gboolean quorate) ++{ ++ pcmk_quorate = quorate; ++ return TRUE; ++} ++ ++static void ++mcp_cman_destroy(gpointer user_data) ++{ + crm_info("connection closed"); + } ++#endif + + int + main(int argc, char **argv) +@@ -1122,6 +1136,12 @@ main(int argc, char **argv) + } + } + ++#if SUPPORT_CMAN ++ if (rc == pcmk_ok && is_cman_cluster()) { ++ init_cman_connection(mcp_cman_dispatch, mcp_cman_destroy); ++ } ++#endif ++ + if(rc == pcmk_ok) { + local_name = get_local_node_name(); + update_node_processes(local_nodeid, local_name, get_process_list()); diff --git a/SOURCES/0022-Fix-PE-Resolve-memory-leak.patch b/SOURCES/0022-Fix-PE-Resolve-memory-leak.patch new file mode 100644 index 0000000..e7cd5b1 --- /dev/null +++ b/SOURCES/0022-Fix-PE-Resolve-memory-leak.patch @@ -0,0 +1,27 @@ +From: Andrew Beekhof +Date: Tue, 8 Sep 2015 12:02:54 +1000 +Subject: [PATCH] Fix: PE: Resolve memory leak + +(cherry picked from commit 4f48a79fd19be0e614716f0900e31985d4714ace) +--- + lib/pengine/unpack.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index 156a192..c4f3134 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -276,9 +276,13 @@ destroy_digest_cache(gpointer ptr) + op_digest_cache_t *data = ptr; + + free_xml(data->params_all); ++ free_xml(data->params_secure); + free_xml(data->params_restart); ++ + free(data->digest_all_calc); + free(data->digest_restart_calc); ++ free(data->digest_secure_calc); ++ + 
free(data); + } + diff --git a/SOURCES/0023-Fix-cman-Purge-all-node-caches-for-crm_node-R.patch b/SOURCES/0023-Fix-cman-Purge-all-node-caches-for-crm_node-R.patch new file mode 100644 index 0000000..5ff7c08 --- /dev/null +++ b/SOURCES/0023-Fix-cman-Purge-all-node-caches-for-crm_node-R.patch @@ -0,0 +1,24 @@ +From: Andrew Beekhof +Date: Tue, 8 Sep 2015 12:03:56 +1000 +Subject: [PATCH] Fix: cman: Purge all node caches for crm_node -R + +(cherry picked from commit c445e135b6d52b1a5f3cfdacfa54a63b313c00d2) +--- + tools/crm_node.c | 4 +--- + 1 file changed, 1 insertion(+), 3 deletions(-) + +diff --git a/tools/crm_node.c b/tools/crm_node.c +index 9626120..48ee7c4 100644 +--- a/tools/crm_node.c ++++ b/tools/crm_node.c +@@ -607,9 +607,7 @@ try_cman(int command, enum cluster_type_e stack) + + switch (command) { + case 'R': +- if (tools_remove_node_cache(target_uname, CRM_SYSTEM_CRMD)) { +- crm_err("Failed to connect to "CRM_SYSTEM_CRMD" to remove node '%s'", target_uname); +- } ++ try_pacemaker(command, stack); + break; + + case 'e': diff --git a/SOURCES/0024-Refactor-membership-Safely-autoreap-nodes-without-co.patch b/SOURCES/0024-Refactor-membership-Safely-autoreap-nodes-without-co.patch new file mode 100644 index 0000000..35617cc --- /dev/null +++ b/SOURCES/0024-Refactor-membership-Safely-autoreap-nodes-without-co.patch @@ -0,0 +1,92 @@ +From: Andrew Beekhof +Date: Tue, 8 Sep 2015 12:05:04 +1000 +Subject: [PATCH] Refactor: membership: Safely autoreap nodes without code + duplication + +(cherry picked from commit acd660a1bdf40ada599041cb14d2128632d2e7a5) +--- + lib/cluster/membership.c | 43 +++++++++++++++++++++---------------------- + 1 file changed, 21 insertions(+), 22 deletions(-) + +diff --git a/lib/cluster/membership.c b/lib/cluster/membership.c +index b7958eb..3081e54 100644 +--- a/lib/cluster/membership.c ++++ b/lib/cluster/membership.c +@@ -795,8 +795,8 @@ crm_update_peer_expected(const char *source, crm_node_t * node, const char *expe + * called within a cache 
iteration if reaping is possible, + * otherwise reaping could invalidate the iterator. + */ +-crm_node_t * +-crm_update_peer_state(const char *source, crm_node_t * node, const char *state, int membership) ++static crm_node_t * ++crm_update_peer_state_iter(const char *source, crm_node_t * node, const char *state, int membership, GHashTableIter *iter) + { + gboolean is_member; + +@@ -822,13 +822,19 @@ crm_update_peer_state(const char *source, crm_node_t * node, const char *state, + free(last); + + if (!is_member && crm_autoreap) { +- if (status_type == crm_status_rstate) { ++ if(iter) { ++ crm_notice("Purged 1 peer with id=%u and/or uname=%s from the membership cache", node->id, node->uname); ++ g_hash_table_iter_remove(iter); ++ ++ } else if (status_type == crm_status_rstate) { + crm_remote_peer_cache_remove(node->uname); ++ + } else { + reap_crm_member(node->id, node->uname); + } + node = NULL; + } ++ + } else { + crm_trace("%s: Node %s[%u] - state is unchanged (%s)", source, node->uname, node->id, + state); +@@ -836,6 +842,12 @@ crm_update_peer_state(const char *source, crm_node_t * node, const char *state, + return node; + } + ++crm_node_t * ++crm_update_peer_state(const char *source, crm_node_t * node, const char *state, int membership) ++{ ++ return crm_update_peer_state_iter(source, node, state, membership, NULL); ++} ++ + /*! 
+ * \internal + * \brief Reap all nodes from cache whose membership information does not match +@@ -853,26 +865,13 @@ crm_reap_unseen_nodes(uint64_t membership) + while (g_hash_table_iter_next(&iter, NULL, (gpointer *)&node)) { + if (node->last_seen != membership) { + if (node->state) { +- /* crm_update_peer_state() cannot be called here, because that +- * might modify the peer cache, invalidating our iterator ++ /* ++ * Calling crm_update_peer_state_iter() allows us to ++ * remove the node from crm_peer_cache without ++ * invalidating our iterator + */ +- if (safe_str_eq(node->state, CRM_NODE_LOST)) { +- crm_trace("Node %s[%u] - state is unchanged (%s)", +- node->uname, node->id, CRM_NODE_LOST); +- } else { +- char *last = node->state; +- +- node->state = strdup(CRM_NODE_LOST); +- crm_notice("Node %s[%u] - state is now %s (was %s)", +- node->uname, node->id, CRM_NODE_LOST, last); +- if (crm_status_callback) { +- crm_status_callback(crm_status_nstate, node, last); +- } +- if (crm_autoreap) { +- g_hash_table_iter_remove(&iter); +- } +- free(last); +- } ++ crm_update_peer_state_iter(__FUNCTION__, node, CRM_NODE_LOST, membership, &iter); ++ + } else { + crm_info("State of node %s[%u] is still unknown", + node->uname, node->id); diff --git a/SOURCES/0025-Fix-crmd-Prevent-segfault-by-correctly-detecting-whe.patch b/SOURCES/0025-Fix-crmd-Prevent-segfault-by-correctly-detecting-whe.patch new file mode 100644 index 0000000..a1797e9 --- /dev/null +++ b/SOURCES/0025-Fix-crmd-Prevent-segfault-by-correctly-detecting-whe.patch @@ -0,0 +1,23 @@ +From: Andrew Beekhof +Date: Wed, 9 Sep 2015 14:46:49 +1000 +Subject: [PATCH] Fix: crmd: Prevent segfault by correctly detecting when + notifications are not required + +(cherry picked from commit 5eb9f93ef666c75e5f32827a92b0a57ada063803) +--- + crmd/notify.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/crmd/notify.c b/crmd/notify.c +index ca2be0f..179af18 100644 +--- a/crmd/notify.c ++++ b/crmd/notify.c +@@ 
-141,7 +141,7 @@ crmd_notify_fencing_op(stonith_event_t * e) + { + char *desc = NULL; + +- if(notify_script) { ++ if(notify_script == NULL) { + return; + } + diff --git a/SOURCES/0026-Fix-crmd-don-t-add-node-ID-to-proxied-remote-node-re.patch b/SOURCES/0026-Fix-crmd-don-t-add-node-ID-to-proxied-remote-node-re.patch new file mode 100644 index 0000000..ba29678 --- /dev/null +++ b/SOURCES/0026-Fix-crmd-don-t-add-node-ID-to-proxied-remote-node-re.patch @@ -0,0 +1,29 @@ +From: Ken Gaillot +Date: Thu, 27 Aug 2015 11:00:02 -0500 +Subject: [PATCH] Fix: crmd: don't add node ID to proxied remote node requests + for attrd + +446a1005 incorrectly set F_ATTRD_HOST_ID for proxied remote node requests to +attrd. Since attrd only uses F_ATTRD_HOST_ID to associate a cluster node name +with an ID, it doesn't ever need to be set for remote nodes. + +Additionally, that revision used the proxying cluster node's node ID, which can +lead to node ID conflicts in attrd. + +(cherry picked from commit 6af6da534646dbadf3d8d1d63d0edb2844c72073) +--- + crmd/lrm_state.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/crmd/lrm_state.c b/crmd/lrm_state.c +index c03fa0b..bea1027 100644 +--- a/crmd/lrm_state.c ++++ b/crmd/lrm_state.c +@@ -540,7 +540,6 @@ remote_proxy_cb(lrmd_t *lrmd, void *userdata, xmlNode *msg) + if (safe_str_eq(type, T_ATTRD) + && crm_element_value(request, F_ATTRD_HOST) == NULL) { + crm_xml_add(request, F_ATTRD_HOST, proxy->node_name); +- crm_xml_add_int(request, F_ATTRD_HOST_ID, get_local_nodeid(0)); + } + + rc = crm_ipc_send(proxy->ipc, request, flags, 5000, NULL); diff --git a/SOURCES/0027-Fix-pacemaker_remote-memory-leak-in-ipc_proxy_dispat.patch b/SOURCES/0027-Fix-pacemaker_remote-memory-leak-in-ipc_proxy_dispat.patch new file mode 100644 index 0000000..9dad48e --- /dev/null +++ b/SOURCES/0027-Fix-pacemaker_remote-memory-leak-in-ipc_proxy_dispat.patch @@ -0,0 +1,35 @@ +From: Ken Gaillot +Date: Mon, 14 Sep 2015 15:00:13 -0500 +Subject: [PATCH] Fix: pacemaker_remote: 
memory leak in ipc_proxy_dispatch() + +Detected via routine valgrind testing + +(cherry picked from commit 3bb439d1554cb5567b886c52107bd3bb6f27b696) +--- + lrmd/ipc_proxy.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/lrmd/ipc_proxy.c b/lrmd/ipc_proxy.c +index 9427393..2a5ad78 100644 +--- a/lrmd/ipc_proxy.c ++++ b/lrmd/ipc_proxy.c +@@ -223,9 +223,9 @@ ipc_proxy_dispatch(qb_ipcs_connection_t * c, void *data, size_t size) + } + + CRM_CHECK(client != NULL, crm_err("Invalid client"); +- return FALSE); ++ free_xml(request); return FALSE); + CRM_CHECK(client->id != NULL, crm_err("Invalid client: %p", client); +- return FALSE); ++ free_xml(request); return FALSE); + + /* this ensures that synced request/responses happen over the event channel + * in the crmd, allowing the crmd to process the messages async */ +@@ -241,6 +241,7 @@ ipc_proxy_dispatch(qb_ipcs_connection_t * c, void *data, size_t size) + crm_xml_add_int(msg, F_LRMD_IPC_MSG_FLAGS, flags); + add_message_xml(msg, F_LRMD_IPC_MSG, request); + lrmd_server_send_notify(ipc_proxy, msg); ++ free_xml(request); + free_xml(msg); + + return 0; diff --git a/SOURCES/0028-Log-The-package-version-is-more-informative.patch b/SOURCES/0028-Log-The-package-version-is-more-informative.patch new file mode 100644 index 0000000..543d9ab --- /dev/null +++ b/SOURCES/0028-Log-The-package-version-is-more-informative.patch @@ -0,0 +1,115 @@ +From: Andrew Beekhof +Date: Wed, 16 Sep 2015 09:14:39 +1000 +Subject: [PATCH] Log: The package version is more informative + +(cherry picked from commit 2b4d195e9e94777fc1953832fcce3637ffa2f449) +--- + crmd/cib.c | 2 +- + crmd/election.c | 2 +- + crmd/main.c | 5 ++--- + lib/ais/plugin.c | 2 +- + lib/common/utils.c | 4 ++-- + mcp/pacemaker.c | 4 ++-- + 6 files changed, 9 insertions(+), 10 deletions(-) + +diff --git a/crmd/cib.c b/crmd/cib.c +index 7ec5eda..41e9efb 100644 +--- a/crmd/cib.c ++++ b/crmd/cib.c +@@ -113,7 +113,7 @@ revision_check_callback(xmlNode * msg, int 
call_id, int rc, xmlNode * output, vo + cmp = compare_version(revision, CRM_FEATURE_SET); + + if (cmp > 0) { +- crm_err("This build (%s) does not support the current resource configuration", VERSION); ++ crm_err("This build (%s) does not support the current resource configuration", PACEMAKER_VERSION); + crm_err("We can only support up to CRM feature set %s (current=%s)", + CRM_FEATURE_SET, revision); + crm_err("Shutting down the CRM"); +diff --git a/crmd/election.c b/crmd/election.c +index b542a66..adab4e3 100644 +--- a/crmd/election.c ++++ b/crmd/election.c +@@ -215,7 +215,7 @@ do_dc_takeover(long long action, + } + + update_attr_delegate(fsa_cib_conn, cib_none, XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, NULL, +- "dc-version", VERSION "-" BUILD_VERSION, FALSE, NULL, NULL); ++ "dc-version", PACEMAKER_VERSION "-" BUILD_VERSION, FALSE, NULL, NULL); + + update_attr_delegate(fsa_cib_conn, cib_none, XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, NULL, + "cluster-infrastructure", cluster_type, FALSE, NULL, NULL); +diff --git a/crmd/main.c b/crmd/main.c +index e9a69b4..75ed91c 100644 +--- a/crmd/main.c ++++ b/crmd/main.c +@@ -89,13 +89,12 @@ main(int argc, char **argv) + crmd_metadata(); + return 0; + } else if (argc - optind == 1 && safe_str_eq("version", argv[optind])) { +- fprintf(stdout, "CRM Version: "); +- fprintf(stdout, "%s (%s)\n", VERSION, BUILD_VERSION); ++ fprintf(stdout, "CRM Version: %s (%s)\n", PACEMAKER_VERSION, BUILD_VERSION); + return 0; + } + + crm_log_init(NULL, LOG_INFO, TRUE, FALSE, argc, argv, FALSE); +- crm_notice("CRM Git Version: %s\n", BUILD_VERSION); ++ crm_notice("CRM Git Version: %s (%s)\n", PACEMAKER_VERSION, BUILD_VERSION); + + if (optind > argc) { + ++argerr; +diff --git a/lib/ais/plugin.c b/lib/ais/plugin.c +index ab534fa..cf2a131 100644 +--- a/lib/ais/plugin.c ++++ b/lib/ais/plugin.c +@@ -201,7 +201,7 @@ static struct corosync_exec_handler pcmk_exec_service[] = { + */ + /* *INDENT-OFF* */ + struct corosync_service_engine pcmk_service_handler = { 
+- .name = (char *)"Pacemaker Cluster Manager "PACKAGE_VERSION, ++ .name = (char *)"Pacemaker Cluster Manager "PACEMAKER_VERSION, + .id = PCMK_SERVICE_ID, + .private_data_size = 0, + .flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED, +diff --git a/lib/common/utils.c b/lib/common/utils.c +index 628cf2f..2364f5c 100644 +--- a/lib/common/utils.c ++++ b/lib/common/utils.c +@@ -1603,13 +1603,13 @@ crm_help(char cmd, int exit_code) + FILE *stream = (exit_code ? stderr : stdout); + + if (cmd == 'v' || cmd == '$') { +- fprintf(stream, "Pacemaker %s\n", VERSION); ++ fprintf(stream, "Pacemaker %s\n", PACEMAKER_VERSION); + fprintf(stream, "Written by Andrew Beekhof\n"); + goto out; + } + + if (cmd == '!') { +- fprintf(stream, "Pacemaker %s (Build: %s): %s\n", VERSION, BUILD_VERSION, CRM_FEATURES); ++ fprintf(stream, "Pacemaker %s (Build: %s): %s\n", PACEMAKER_VERSION, BUILD_VERSION, CRM_FEATURES); + goto out; + } + +diff --git a/mcp/pacemaker.c b/mcp/pacemaker.c +index 9f00a21..910d154 100644 +--- a/mcp/pacemaker.c ++++ b/mcp/pacemaker.c +@@ -972,7 +972,7 @@ main(int argc, char **argv) + shutdown = TRUE; + break; + case 'F': +- printf("Pacemaker %s (Build: %s)\n Supporting v%s: %s\n", VERSION, BUILD_VERSION, ++ printf("Pacemaker %s (Build: %s)\n Supporting v%s: %s\n", PACEMAKER_VERSION, BUILD_VERSION, + CRM_FEATURE_SET, CRM_FEATURES); + crm_exit(pcmk_ok); + default: +@@ -1039,7 +1039,7 @@ main(int argc, char **argv) + crm_exit(ENODATA); + } + +- crm_notice("Starting Pacemaker %s (Build: %s): %s", VERSION, BUILD_VERSION, CRM_FEATURES); ++ crm_notice("Starting Pacemaker %s (Build: %s): %s", PACEMAKER_VERSION, BUILD_VERSION, CRM_FEATURES); + mainloop = g_main_new(FALSE); + sysrq_init(); + diff --git a/SOURCES/0029-Fix-crm_resource-Allow-the-resource-configuration-to.patch b/SOURCES/0029-Fix-crm_resource-Allow-the-resource-configuration-to.patch new file mode 100644 index 0000000..942b464 --- /dev/null +++ 
b/SOURCES/0029-Fix-crm_resource-Allow-the-resource-configuration-to.patch @@ -0,0 +1,127 @@ +From: Andrew Beekhof +Date: Thu, 17 Sep 2015 09:46:38 +1000 +Subject: [PATCH] Fix: crm_resource: Allow the resource configuration to be + modified for --force-{check,start,..} calls + +(cherry picked from commit 1206f735a8ddb33c77152c736828e823e7755c34) +--- + tools/crm_resource.c | 36 +++++++++++++++++++++++++++++++----- + tools/crm_resource.h | 2 +- + tools/crm_resource_runtime.c | 14 +++++++++++++- + 3 files changed, 45 insertions(+), 7 deletions(-) + +diff --git a/tools/crm_resource.c b/tools/crm_resource.c +index 156bbea..2a94362 100644 +--- a/tools/crm_resource.c ++++ b/tools/crm_resource.c +@@ -247,6 +247,7 @@ main(int argc, char **argv) + const char *prop_set = NULL; + const char *rsc_long_cmd = NULL; + const char *longname = NULL; ++ GHashTable *override_params = NULL; + + char *xml_file = NULL; + crm_ipc_t *crmd_channel = NULL; +@@ -503,11 +504,35 @@ main(int argc, char **argv) + } + } + +- if (optind < argc && argv[optind] != NULL) { ++ if (optind < argc ++ && argv[optind] != NULL ++ && rsc_cmd == 0 ++ && rsc_long_cmd) { ++ ++ override_params = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); ++ while (optind < argc && argv[optind] != NULL) { ++ char *name = calloc(1, strlen(argv[optind])); ++ char *value = calloc(1, strlen(argv[optind])); ++ int rc = sscanf(argv[optind], "%[^=]=%s", name, value); ++ ++ if(rc == 2) { ++ g_hash_table_replace(override_params, name, value); ++ ++ } else { ++ CMD_ERR("Error parsing '%s' as a name=value pair for --%s", argv[optind], rsc_long_cmd); ++ free(value); ++ free(name); ++ argerr++; ++ } ++ optind++; ++ } ++ ++ } else if (optind < argc && argv[optind] != NULL && rsc_cmd == 0) { + CMD_ERR("non-option ARGV-elements: "); + while (optind < argc && argv[optind] != NULL) { +- CMD_ERR("%s ", argv[optind++]); +- ++argerr; ++ CMD_ERR("[%d of %d] %s ", optind, argc, argv[optind]); ++ optind++; ++ 
argerr++; + } + } + +@@ -516,7 +541,8 @@ main(int argc, char **argv) + } + + if (argerr) { +- crm_help('?', EX_USAGE); ++ CMD_ERR("Invalid option(s) supplied, use --help for valid usage"); ++ return crm_exit(EX_USAGE); + } + + our_pid = calloc(1, 11); +@@ -631,7 +657,7 @@ main(int argc, char **argv) + rc = wait_till_stable(timeout_ms, cib_conn); + + } else if (rsc_cmd == 0 && rsc_long_cmd) { /* force-(stop|start|check) */ +- rc = cli_resource_execute(rsc_id, rsc_long_cmd, cib_conn, &data_set); ++ rc = cli_resource_execute(rsc_id, rsc_long_cmd, override_params, cib_conn, &data_set); + + } else if (rsc_cmd == 'A' || rsc_cmd == 'a') { + GListPtr lpc = NULL; +diff --git a/tools/crm_resource.h b/tools/crm_resource.h +index 5a206e0..d4c3b05 100644 +--- a/tools/crm_resource.h ++++ b/tools/crm_resource.h +@@ -74,7 +74,7 @@ int cli_resource_search(const char *rsc, pe_working_set_t * data_set); + int cli_resource_delete(cib_t *cib_conn, crm_ipc_t * crmd_channel, const char *host_uname, resource_t * rsc, pe_working_set_t * data_set); + int cli_resource_restart(resource_t * rsc, const char *host, int timeout_ms, cib_t * cib); + int cli_resource_move(const char *rsc_id, const char *host_name, cib_t * cib, pe_working_set_t *data_set); +-int cli_resource_execute(const char *rsc_id, const char *rsc_action, cib_t * cib, pe_working_set_t *data_set); ++int cli_resource_execute(const char *rsc_id, const char *rsc_action, GHashTable *override_hash, cib_t * cib, pe_working_set_t *data_set); + + int cli_resource_update_attribute(const char *rsc_id, const char *attr_set, const char *attr_id, + const char *attr_name, const char *attr_value, bool recursive, +diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c +index b9427bc..ce9db01 100644 +--- a/tools/crm_resource_runtime.c ++++ b/tools/crm_resource_runtime.c +@@ -1297,7 +1297,7 @@ wait_till_stable(int timeout_ms, cib_t * cib) + } + + int +-cli_resource_execute(const char *rsc_id, const char *rsc_action, cib_t * cib, 
pe_working_set_t *data_set) ++cli_resource_execute(const char *rsc_id, const char *rsc_action, GHashTable *override_hash, cib_t * cib, pe_working_set_t *data_set) + { + int rc = pcmk_ok; + svc_action_t *op = NULL; +@@ -1360,6 +1360,18 @@ cli_resource_execute(const char *rsc_id, const char *rsc_action, cib_t * cib, pe + setenv("OCF_TRACE_RA", "1", 1); + } + ++ if(op && override_hash) { ++ GHashTableIter iter; ++ char *name = NULL; ++ char *value = NULL; ++ ++ g_hash_table_iter_init(&iter, override_hash); ++ while (g_hash_table_iter_next(&iter, (gpointer *) & name, (gpointer *) & value)) { ++ printf("Overriding the cluser configuration for '%s' with '%s' = '%s'\n", rsc->id, name, value); ++ g_hash_table_replace(op->params, strdup(name), strdup(value)); ++ } ++ } ++ + if(op == NULL) { + /* Re-run but with stderr enabled so we can display a sane error message */ + crm_enable_stderr(TRUE); diff --git a/SOURCES/0030-Log-lrmd-Improved-logging-when-no-pacemaker-remote-a.patch b/SOURCES/0030-Log-lrmd-Improved-logging-when-no-pacemaker-remote-a.patch new file mode 100644 index 0000000..6bff962 --- /dev/null +++ b/SOURCES/0030-Log-lrmd-Improved-logging-when-no-pacemaker-remote-a.patch @@ -0,0 +1,34 @@ +From: Andrew Beekhof +Date: Thu, 17 Sep 2015 14:43:15 +1000 +Subject: [PATCH] Log: lrmd: Improved logging when no pacemaker remote authkey + is available + +(cherry picked from commit 20c2178f076ff32fdf9ba9a467c193b8dac2f9e5) +--- + lib/lrmd/lrmd_client.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c +index 42bdf2b..1f1ffde 100644 +--- a/lib/lrmd/lrmd_client.c ++++ b/lib/lrmd/lrmd_client.c +@@ -1061,13 +1061,17 @@ lrmd_tls_set_key(gnutls_datum_t * key) + if (set_key(key, specific_location) == 0) { + crm_debug("Using custom authkey location %s", specific_location); + return 0; ++ ++ } else { ++ crm_err("No lrmd remote key found at %s, trying default locations", specific_location); + } + +- if 
(set_key(key, DEFAULT_REMOTE_KEY_LOCATION)) { ++ if (set_key(key, DEFAULT_REMOTE_KEY_LOCATION) != 0) { + rc = set_key(key, ALT_REMOTE_KEY_LOCATION); + } ++ + if (rc) { +- crm_err("No lrmd remote key found"); ++ crm_err("No lrmd remote key found at %s", DEFAULT_REMOTE_KEY_LOCATION); + return -1; + } + diff --git a/SOURCES/0031-Fix-liblrmd-don-t-print-error-if-remote-key-environm.patch b/SOURCES/0031-Fix-liblrmd-don-t-print-error-if-remote-key-environm.patch new file mode 100644 index 0000000..0210482 --- /dev/null +++ b/SOURCES/0031-Fix-liblrmd-don-t-print-error-if-remote-key-environm.patch @@ -0,0 +1,38 @@ +From: Ken Gaillot +Date: Wed, 23 Sep 2015 10:45:39 -0500 +Subject: [PATCH] Fix: liblrmd: don't print error if remote key environment + variable unset + +20c2178 added error logging if the remote key was unable to be read, +however it would also log an error in the usual case where the +environment variable was simply unset. + +(cherry picked from commit dec3349f1252e2c2c18ed110b8cc4a2b2212b613) +--- + lib/lrmd/lrmd_client.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c +index 1f1ffde..f365e59 100644 +--- a/lib/lrmd/lrmd_client.c ++++ b/lib/lrmd/lrmd_client.c +@@ -1062,8 +1062,8 @@ lrmd_tls_set_key(gnutls_datum_t * key) + crm_debug("Using custom authkey location %s", specific_location); + return 0; + +- } else { +- crm_err("No lrmd remote key found at %s, trying default locations", specific_location); ++ } else if (specific_location) { ++ crm_err("No valid lrmd remote key found at %s, trying default location", specific_location); + } + + if (set_key(key, DEFAULT_REMOTE_KEY_LOCATION) != 0) { +@@ -1071,7 +1071,7 @@ lrmd_tls_set_key(gnutls_datum_t * key) + } + + if (rc) { +- crm_err("No lrmd remote key found at %s", DEFAULT_REMOTE_KEY_LOCATION); ++ crm_err("No valid lrmd remote key found at %s", DEFAULT_REMOTE_KEY_LOCATION); + return -1; + } + diff --git 
a/SOURCES/0032-Fix-Tools-Repair-the-logging-of-interesting-command-.patch b/SOURCES/0032-Fix-Tools-Repair-the-logging-of-interesting-command-.patch new file mode 100644 index 0000000..fda67b2 --- /dev/null +++ b/SOURCES/0032-Fix-Tools-Repair-the-logging-of-interesting-command-.patch @@ -0,0 +1,182 @@ +From: Andrew Beekhof +Date: Mon, 28 Sep 2015 14:54:28 +1000 +Subject: [PATCH] Fix: Tools: Repair the logging of 'interesting' command-lines + +(cherry picked from commit b7d6608d8b33b4e9580e04f25446176bac832fb7) +--- + tools/attrd_updater.c | 1 + + tools/cibadmin.c | 8 ++++++-- + tools/crm_attribute.c | 6 +++++- + tools/crm_resource.c | 30 +++++++++++++++++++++++------- + 4 files changed, 35 insertions(+), 10 deletions(-) + +diff --git a/tools/attrd_updater.c b/tools/attrd_updater.c +index 878dab5..11462ee 100644 +--- a/tools/attrd_updater.c ++++ b/tools/attrd_updater.c +@@ -150,6 +150,7 @@ main(int argc, char **argv) + case 'v': + command = flag; + attr_value = optarg; ++ crm_log_args(argc, argv); /* Too much? */ + break; + default: + ++argerr; +diff --git a/tools/cibadmin.c b/tools/cibadmin.c +index 6b90536..c16d3c7 100644 +--- a/tools/cibadmin.c ++++ b/tools/cibadmin.c +@@ -213,7 +213,7 @@ main(int argc, char **argv) + int option_index = 0; + + crm_xml_init(); /* Sets buffer allocation strategy */ +- crm_log_preinit(NULL, argc, argv); ++ crm_log_cli_init("cibadmin"); + crm_set_options(NULL, "command [options] [data]", long_options, + "Provides direct access to the cluster configuration." + "\n\nAllows the configuration, or sections of it, to be queried, modified, replaced and deleted." 
+@@ -286,6 +286,7 @@ main(int argc, char **argv) + break; + case 'B': + cib_action = CIB_OP_BUMP; ++ crm_log_args(argc, argv); + break; + case 'V': + command_options = command_options | cib_verbose; +@@ -303,13 +304,16 @@ main(int argc, char **argv) + case 'X': + crm_trace("Option %c => %s", flag, optarg); + admin_input_xml = optarg; ++ crm_log_args(argc, argv); + break; + case 'x': + crm_trace("Option %c => %s", flag, optarg); + admin_input_file = optarg; ++ crm_log_args(argc, argv); + break; + case 'p': + admin_input_stdin = TRUE; ++ crm_log_args(argc, argv); + break; + case 'N': + case 'h': +@@ -334,6 +338,7 @@ main(int argc, char **argv) + case 'f': + force_flag = TRUE; + command_options |= cib_quorum_override; ++ crm_log_args(argc, argv); + break; + case 'a': + output = createEmptyCib(1); +@@ -355,7 +360,6 @@ main(int argc, char **argv) + quiet = FALSE; + } + +- crm_log_init(NULL, LOG_CRIT, FALSE, FALSE, argc, argv, quiet); + while (bump_log_num > 0) { + crm_bump_log_level(argc, argv); + bump_log_num--; +diff --git a/tools/crm_attribute.c b/tools/crm_attribute.c +index c37b096..fc2f7c7 100644 +--- a/tools/crm_attribute.c ++++ b/tools/crm_attribute.c +@@ -146,11 +146,15 @@ main(int argc, char **argv) + case '?': + crm_help(flag, EX_OK); + break; +- case 'D': + case 'G': ++ command = flag; ++ attr_value = optarg; ++ break; ++ case 'D': + case 'v': + command = flag; + attr_value = optarg; ++ crm_log_args(argc, argv); + break; + case 'q': + case 'Q': +diff --git a/tools/crm_resource.c b/tools/crm_resource.c +index 2a94362..1b2976b 100644 +--- a/tools/crm_resource.c ++++ b/tools/crm_resource.c +@@ -304,6 +304,7 @@ main(int argc, char **argv) + || safe_str_eq("force-check", longname)) { + rsc_cmd = flag; + rsc_long_cmd = longname; ++ crm_log_args(argc, argv); + + } else if (safe_str_eq("list-ocf-providers", longname) + || safe_str_eq("list-ocf-alternatives", longname) +@@ -433,6 +434,7 @@ main(int argc, char **argv) + break; + case 'f': + do_force = TRUE; ++ 
crm_log_args(argc, argv); + break; + case 'i': + prop_id = optarg; +@@ -452,41 +454,55 @@ main(int argc, char **argv) + case 'T': + timeout_ms = crm_get_msec(optarg); + break; ++ + case 'C': + case 'R': + case 'P': +- rsc_cmd = 'C'; ++ crm_log_args(argc, argv); + require_resource = FALSE; + require_crmd = TRUE; ++ rsc_cmd = 'C'; + break; ++ + case 'F': +- rsc_cmd = flag; ++ crm_log_args(argc, argv); + require_crmd = TRUE; ++ rsc_cmd = flag; ++ break; ++ ++ case 'U': ++ case 'B': ++ case 'M': ++ case 'D': ++ crm_log_args(argc, argv); ++ rsc_cmd = flag; + break; ++ + case 'L': + case 'c': + case 'l': + case 'q': + case 'w': +- case 'D': + case 'W': +- case 'M': +- case 'U': +- case 'B': + case 'O': + case 'o': + case 'A': + case 'a': + rsc_cmd = flag; + break; ++ + case 'j': + print_pending = TRUE; + break; + case 'p': +- case 'g': + case 'd': + case 'S': ++ crm_log_args(argc, argv); ++ prop_name = optarg; ++ rsc_cmd = flag; ++ break; + case 'G': ++ case 'g': + prop_name = optarg; + rsc_cmd = flag; + break; diff --git a/SOURCES/0033-Feature-Tools-Do-not-send-command-lines-to-syslog.patch b/SOURCES/0033-Feature-Tools-Do-not-send-command-lines-to-syslog.patch new file mode 100644 index 0000000..c01d782 --- /dev/null +++ b/SOURCES/0033-Feature-Tools-Do-not-send-command-lines-to-syslog.patch @@ -0,0 +1,46 @@ +From: Andrew Beekhof +Date: Mon, 28 Sep 2015 15:02:10 +1000 +Subject: [PATCH] Feature: Tools: Do not send command lines to syslog + +(cherry picked from commit 8dae6838312c6a60c2e4b7ffa73a100fd5d0dce3) +--- + lib/common/logging.c | 8 -------- + 1 file changed, 8 deletions(-) + +diff --git a/lib/common/logging.c b/lib/common/logging.c +index b18b841..6879023 100644 +--- a/lib/common/logging.c ++++ b/lib/common/logging.c +@@ -928,24 +928,17 @@ crm_log_args(int argc, char **argv) + { + int lpc = 0; + int len = 0; +- int restore = FALSE; + int existing_len = 0; + int line = __LINE__; + static int logged = 0; + + char *arg_string = NULL; +- struct qb_log_callsite 
*args_cs = +- qb_log_callsite_get(__func__, __FILE__, ARGS_FMT, LOG_NOTICE, line, 0); + + if (argc == 0 || argv == NULL || logged) { + return; + } + + logged = 1; +- qb_bit_set(args_cs->targets, QB_LOG_SYSLOG); /* Turn on syslog too */ +- +- restore = qb_log_ctl(QB_LOG_SYSLOG, QB_LOG_CONF_STATE_GET, 0); +- qb_log_ctl(QB_LOG_SYSLOG, QB_LOG_CONF_ENABLED, QB_TRUE); + + for (; lpc < argc; lpc++) { + if (argv[lpc] == NULL) { +@@ -958,7 +951,6 @@ crm_log_args(int argc, char **argv) + } + + qb_log_from_external_source(__func__, __FILE__, ARGS_FMT, LOG_NOTICE, line, 0, arg_string); +- qb_log_ctl(QB_LOG_SYSLOG, QB_LOG_CONF_ENABLED, restore); + + free(arg_string); + } diff --git a/SOURCES/0034-Log-cibadmin-Default-once-again-to-LOG_CRIT.patch b/SOURCES/0034-Log-cibadmin-Default-once-again-to-LOG_CRIT.patch new file mode 100644 index 0000000..ccc3f1e --- /dev/null +++ b/SOURCES/0034-Log-cibadmin-Default-once-again-to-LOG_CRIT.patch @@ -0,0 +1,21 @@ +From: Andrew Beekhof +Date: Mon, 28 Sep 2015 18:45:32 +1000 +Subject: [PATCH] Log: cibadmin: Default once again to LOG_CRIT + +(cherry picked from commit d0d6118cbee3eccb3467058eadd91e08d3f4a42f) +--- + tools/cibadmin.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/tools/cibadmin.c b/tools/cibadmin.c +index c16d3c7..84531f8 100644 +--- a/tools/cibadmin.c ++++ b/tools/cibadmin.c +@@ -214,6 +214,7 @@ main(int argc, char **argv) + + crm_xml_init(); /* Sets buffer allocation strategy */ + crm_log_cli_init("cibadmin"); ++ set_crm_log_level(LOG_CRIT); + crm_set_options(NULL, "command [options] [data]", long_options, + "Provides direct access to the cluster configuration." + "\n\nAllows the configuration, or sections of it, to be queried, modified, replaced and deleted." 
diff --git a/SOURCES/0035-Fix-crm_resource-Correctly-update-existing-meta-attr.patch b/SOURCES/0035-Fix-crm_resource-Correctly-update-existing-meta-attr.patch new file mode 100644 index 0000000..33670ac --- /dev/null +++ b/SOURCES/0035-Fix-crm_resource-Correctly-update-existing-meta-attr.patch @@ -0,0 +1,87 @@ +From: Andrew Beekhof +Date: Wed, 30 Sep 2015 17:33:00 +1000 +Subject: [PATCH] Fix: crm_resource: Correctly update existing meta attributes + regardless of their position in the heirarchy + +(cherry picked from commit f367348c832c64e2dc480dc96d2e0c2aa88639ba) + +Conflicts: + tools/crm_resource_runtime.c +--- + tools/crm_resource_runtime.c | 44 ++++++++++++++++++++++++++++++++++++-------- + 1 file changed, 36 insertions(+), 8 deletions(-) + +diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c +index ce9db01..a04adb9 100644 +--- a/tools/crm_resource_runtime.c ++++ b/tools/crm_resource_runtime.c +@@ -213,10 +213,11 @@ cli_resource_update_attribute(const char *rsc_id, const char *attr_set, const ch + } + + if (safe_str_eq(attr_set_type, XML_TAG_ATTR_SETS)) { +- rc = find_resource_attr(cib, XML_ATTR_ID, uber_parent(rsc)->id, XML_TAG_META_SETS, attr_set, attr_id, +- attr_name, &local_attr_id); +- if(rc == pcmk_ok && do_force == FALSE) { +- if (BE_QUIET == FALSE) { ++ if (do_force == FALSE) { ++ rc = find_resource_attr(cib, XML_ATTR_ID, uber_parent(rsc)->id, ++ XML_TAG_META_SETS, attr_set, attr_id, ++ attr_name, &local_attr_id); ++ if (rc == pcmk_ok && BE_QUIET == FALSE) { + printf("WARNING: There is already a meta attribute for '%s' called '%s' (id=%s)\n", + uber_parent(rsc)->id, attr_name, local_attr_id); + printf(" Delete '%s' first or use --force to override\n", local_attr_id); +@@ -224,7 +225,7 @@ cli_resource_update_attribute(const char *rsc_id, const char *attr_set, const ch + return -ENOTUNIQ; + } + +- } else if(rsc->parent) { ++ } else if(rsc->parent && do_force == FALSE) { + + switch(rsc->parent->variant) { + case pe_group: +@@ -234,14 
+235,41 @@ cli_resource_update_attribute(const char *rsc_id, const char *attr_set, const ch + break; + case pe_master: + case pe_clone: +- rsc = rsc->parent; +- if (BE_QUIET == FALSE) { +- printf("Updating '%s' for '%s'...\n", rsc->id, rsc_id); ++ ++ rc = find_resource_attr(cib, XML_ATTR_ID, rsc_id, attr_set_type, attr_set, attr_id, attr_name, &local_attr_id); ++ free(local_attr_id); ++ ++ if(rc != pcmk_ok) { ++ rsc = rsc->parent; ++ if (BE_QUIET == FALSE) { ++ printf("Updating '%s' on '%s', the parent of '%s'\n", attr_name, rsc->id, rsc_id); ++ } + } + break; + default: + break; + } ++ ++ } else if (rsc->parent && BE_QUIET == FALSE) { ++ printf("Forcing update of '%s' for '%s' instead of '%s'\n", attr_name, rsc_id, rsc->parent->id); ++ ++ } else if(rsc->parent == NULL && rsc->children) { ++ resource_t *child = rsc->children->data; ++ ++ if(child->variant == pe_native) { ++ lookup_id = clone_strip(child->id); /* Could be a cloned group! */ ++ rc = find_resource_attr(cib, XML_ATTR_ID, lookup_id, attr_set_type, attr_set, attr_id, attr_name, &local_attr_id); ++ ++ if(rc == pcmk_ok) { ++ rsc = child; ++ if (BE_QUIET == FALSE) { ++ printf("A value for '%s' already exists in child '%s', updating that instead of '%s'\n", attr_name, lookup_id, rsc_id); ++ } ++ } ++ ++ free(local_attr_id); ++ free(lookup_id); ++ } + } + + lookup_id = clone_strip(rsc->id); /* Could be a cloned group! 
*/ diff --git a/SOURCES/0036-Log-crm_resource-restart-Improved-user-feedback-on-f.patch b/SOURCES/0036-Log-crm_resource-restart-Improved-user-feedback-on-f.patch new file mode 100644 index 0000000..4dded82 --- /dev/null +++ b/SOURCES/0036-Log-crm_resource-restart-Improved-user-feedback-on-f.patch @@ -0,0 +1,27 @@ +From: Andrew Beekhof +Date: Mon, 5 Oct 2015 12:27:59 +1100 +Subject: [PATCH] Log: crm_resource --restart: Improved user feedback on + failure + +(cherry picked from commit b557a39973a1fb85b2791be67dc03cfd32c22d89) +--- + tools/crm_resource_runtime.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c +index a04adb9..878fd0b 100644 +--- a/tools/crm_resource_runtime.c ++++ b/tools/crm_resource_runtime.c +@@ -1040,6 +1040,12 @@ cli_resource_restart(resource_t * rsc, const char *host, int timeout_ms, cib_t * + pe_working_set_t data_set; + + if(resource_is_running_on(rsc, host) == FALSE) { ++ const char *id = rsc->clone_name?rsc->clone_name:rsc->id; ++ if(host) { ++ printf("%s is not running on %s and so cannot be restarted\n", id, host); ++ } else { ++ printf("%s is not running anywhere and so cannot be restarted\n", id); ++ } + return -ENXIO; + } + diff --git a/SOURCES/0037-Fix-crm_resource-Correctly-delete-existing-meta-attr.patch b/SOURCES/0037-Fix-crm_resource-Correctly-delete-existing-meta-attr.patch new file mode 100644 index 0000000..5699706 --- /dev/null +++ b/SOURCES/0037-Fix-crm_resource-Correctly-delete-existing-meta-attr.patch @@ -0,0 +1,179 @@ +From: "Gao,Yan" +Date: Wed, 30 Sep 2015 16:59:43 +0200 +Subject: [PATCH] Fix: crm_resource: Correctly delete existing meta attributes + regardless of their position in the heirarchy + +Use the same logics as "--set-parameter" for "--delete-parameter". 
+ +(cherry picked from commit cdee10c7310ab433b006126bc087f6b8dff3843e) + +Conflicts: + tools/crm_resource_runtime.c +--- + tools/crm_resource_runtime.c | 109 ++++++++++++++++++++++--------------------- + 1 file changed, 55 insertions(+), 54 deletions(-) + +diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c +index 878fd0b..2d51e88 100644 +--- a/tools/crm_resource_runtime.c ++++ b/tools/crm_resource_runtime.c +@@ -190,47 +190,20 @@ find_resource_attr(cib_t * the_cib, const char *attr, const char *rsc, const cha + return rc; + } + +-int +-cli_resource_update_attribute(const char *rsc_id, const char *attr_set, const char *attr_id, +- const char *attr_name, const char *attr_value, bool recursive, +- cib_t * cib, pe_working_set_t * data_set) ++static resource_t * ++find_matching_attr_resource(resource_t * rsc, const char * rsc_id, const char * attr_set, const char * attr_id, ++ const char * attr_name, cib_t * cib, const char * cmd) + { + int rc = pcmk_ok; +- static bool need_init = TRUE; +- + char *lookup_id = NULL; + char *local_attr_id = NULL; +- char *local_attr_set = NULL; +- +- xmlNode *xml_top = NULL; +- xmlNode *xml_obj = NULL; +- +- bool use_attributes_tag = FALSE; +- resource_t *rsc = find_rsc_or_clone(rsc_id, data_set); +- +- if (rsc == NULL) { +- return -ENXIO; +- } +- +- if (safe_str_eq(attr_set_type, XML_TAG_ATTR_SETS)) { +- if (do_force == FALSE) { +- rc = find_resource_attr(cib, XML_ATTR_ID, uber_parent(rsc)->id, +- XML_TAG_META_SETS, attr_set, attr_id, +- attr_name, &local_attr_id); +- if (rc == pcmk_ok && BE_QUIET == FALSE) { +- printf("WARNING: There is already a meta attribute for '%s' called '%s' (id=%s)\n", +- uber_parent(rsc)->id, attr_name, local_attr_id); +- printf(" Delete '%s' first or use --force to override\n", local_attr_id); +- } +- return -ENOTUNIQ; +- } + +- } else if(rsc->parent && do_force == FALSE) { ++ if(rsc->parent && do_force == FALSE) { + + switch(rsc->parent->variant) { + case pe_group: + if (BE_QUIET == 
FALSE) { +- printf("Updating '%s' for '%s' will not apply to its peers in '%s'\n", attr_name, rsc_id, rsc->parent->id); ++ printf("Performing %s of '%s' for '%s' will not apply to its peers in '%s'\n", cmd, attr_name, rsc_id, rsc->parent->id); + } + break; + case pe_master: +@@ -242,7 +215,7 @@ cli_resource_update_attribute(const char *rsc_id, const char *attr_set, const ch + if(rc != pcmk_ok) { + rsc = rsc->parent; + if (BE_QUIET == FALSE) { +- printf("Updating '%s' on '%s', the parent of '%s'\n", attr_name, rsc->id, rsc_id); ++ printf("Performing %s of '%s' on '%s', the parent of '%s'\n", cmd, attr_name, rsc->id, rsc_id); + } + } + break; +@@ -251,7 +224,7 @@ cli_resource_update_attribute(const char *rsc_id, const char *attr_set, const ch + } + + } else if (rsc->parent && BE_QUIET == FALSE) { +- printf("Forcing update of '%s' for '%s' instead of '%s'\n", attr_name, rsc_id, rsc->parent->id); ++ printf("Forcing %s of '%s' for '%s' instead of '%s'\n", cmd, attr_name, rsc_id, rsc->parent->id); + + } else if(rsc->parent == NULL && rsc->children) { + resource_t *child = rsc->children->data; +@@ -263,7 +236,7 @@ cli_resource_update_attribute(const char *rsc_id, const char *attr_set, const ch + if(rc == pcmk_ok) { + rsc = child; + if (BE_QUIET == FALSE) { +- printf("A value for '%s' already exists in child '%s', updating that instead of '%s'\n", attr_name, lookup_id, rsc_id); ++ printf("A value for '%s' already exists in child '%s', performing %s on that instead of '%s'\n", attr_name, lookup_id, cmd, rsc_id); + } + } + +@@ -272,6 +245,51 @@ cli_resource_update_attribute(const char *rsc_id, const char *attr_set, const ch + } + } + ++ return rsc; ++} ++ ++int ++cli_resource_update_attribute(const char *rsc_id, const char *attr_set, const char *attr_id, ++ const char *attr_name, const char *attr_value, bool recursive, ++ cib_t * cib, pe_working_set_t * data_set) ++{ ++ int rc = pcmk_ok; ++ static bool need_init = TRUE; ++ ++ char *lookup_id = NULL; ++ char *local_attr_id = 
NULL; ++ char *local_attr_set = NULL; ++ ++ xmlNode *xml_top = NULL; ++ xmlNode *xml_obj = NULL; ++ ++ bool use_attributes_tag = FALSE; ++ resource_t *rsc = find_rsc_or_clone(rsc_id, data_set); ++ ++ if (rsc == NULL) { ++ return -ENXIO; ++ } ++ ++ if (safe_str_eq(attr_set_type, XML_TAG_ATTR_SETS)) { ++ if (do_force == FALSE) { ++ rc = find_resource_attr(cib, XML_ATTR_ID, uber_parent(rsc)->id, ++ XML_TAG_META_SETS, attr_set, attr_id, ++ attr_name, &local_attr_id); ++ if (rc == pcmk_ok && BE_QUIET == FALSE) { ++ printf("WARNING: There is already a meta attribute for '%s' called '%s' (id=%s)\n", ++ uber_parent(rsc)->id, attr_name, local_attr_id); ++ printf(" Delete '%s' first or use --force to override\n", local_attr_id); ++ } ++ free(local_attr_id); ++ if (rc == pcmk_ok) { ++ return -ENOTUNIQ; ++ } ++ } ++ ++ } else { ++ rsc = find_matching_attr_resource(rsc, rsc_id, attr_set, attr_id, attr_name, cib, "update"); ++ } ++ + lookup_id = clone_strip(rsc->id); /* Could be a cloned group! */ + rc = find_resource_attr(cib, XML_ATTR_ID, lookup_id, attr_set_type, attr_set, attr_id, attr_name, + &local_attr_id); +@@ -401,25 +419,8 @@ cli_resource_delete_attribute(const char *rsc_id, const char *attr_set, const ch + return -ENXIO; + } + +- if(rsc->parent && safe_str_eq(attr_set_type, XML_TAG_META_SETS)) { +- +- switch(rsc->parent->variant) { +- case pe_group: +- if (BE_QUIET == FALSE) { +- printf("Removing '%s' for '%s' will not apply to its peers in '%s'\n", attr_name, rsc_id, rsc->parent->id); +- } +- break; +- case pe_master: +- case pe_clone: +- rsc = rsc->parent; +- if (BE_QUIET == FALSE) { +- printf("Removing '%s' from '%s' for '%s'...\n", attr_name, rsc->id, rsc_id); +- } +- break; +- default: +- break; +- } +- ++ if(safe_str_eq(attr_set_type, XML_TAG_META_SETS)) { ++ rsc = find_matching_attr_resource(rsc, rsc_id, attr_set, attr_id, attr_name, cib, "delete"); + } + + lookup_id = clone_strip(rsc->id); diff --git 
a/SOURCES/0038-Fix-crm_resource-Correctly-observe-force-when-deleti.patch b/SOURCES/0038-Fix-crm_resource-Correctly-observe-force-when-deleti.patch new file mode 100644 index 0000000..f5aaaea --- /dev/null +++ b/SOURCES/0038-Fix-crm_resource-Correctly-observe-force-when-deleti.patch @@ -0,0 +1,75 @@ +From: Andrew Beekhof +Date: Thu, 8 Oct 2015 13:38:07 +1100 +Subject: [PATCH] Fix: crm_resource: Correctly observe --force when deleting + and updating attributes + +(cherry picked from commit bd232e36403ea807635cabd336d8bb3101710891) +--- + tools/crm_resource_runtime.c | 25 +++++++++++++++++++++---- + 1 file changed, 21 insertions(+), 4 deletions(-) + +diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c +index 2d51e88..c3f5275 100644 +--- a/tools/crm_resource_runtime.c ++++ b/tools/crm_resource_runtime.c +@@ -123,8 +123,9 @@ find_resource_attr(cib_t * the_cib, const char *attr, const char *rsc, const cha + xmlNode *xml_search = NULL; + char *xpath_string = NULL; + +- CRM_ASSERT(value != NULL); +- *value = NULL; ++ if(value) { ++ *value = NULL; ++ } + + if(the_cib == NULL) { + return -ENOTCONN; +@@ -176,7 +177,7 @@ find_resource_attr(cib_t * the_cib, const char *attr, const char *rsc, const cha + crm_element_value(child, XML_NVPAIR_ATTR_VALUE), ID(child)); + } + +- } else { ++ } else if(value) { + const char *tmp = crm_element_value(xml_search, attr); + + if (tmp) { +@@ -198,8 +199,10 @@ find_matching_attr_resource(resource_t * rsc, const char * rsc_id, const char * + char *lookup_id = NULL; + char *local_attr_id = NULL; + +- if(rsc->parent && do_force == FALSE) { ++ if(do_force == TRUE) { ++ return rsc; + ++ } else if(rsc->parent) { + switch(rsc->parent->variant) { + case pe_group: + if (BE_QUIET == FALSE) { +@@ -270,6 +273,13 @@ cli_resource_update_attribute(const char *rsc_id, const char *attr_set, const ch + return -ENXIO; + } + ++ if(attr_id == NULL ++ && do_force == FALSE ++ && pcmk_ok != find_resource_attr( ++ cib, XML_ATTR_ID, 
uber_parent(rsc)->id, NULL, NULL, NULL, attr_name, NULL)) { ++ printf("\n"); ++ } ++ + if (safe_str_eq(attr_set_type, XML_TAG_ATTR_SETS)) { + if (do_force == FALSE) { + rc = find_resource_attr(cib, XML_ATTR_ID, uber_parent(rsc)->id, +@@ -419,6 +429,13 @@ cli_resource_delete_attribute(const char *rsc_id, const char *attr_set, const ch + return -ENXIO; + } + ++ if(attr_id == NULL ++ && do_force == FALSE ++ && find_resource_attr( ++ cib, XML_ATTR_ID, uber_parent(rsc)->id, NULL, NULL, NULL, attr_name, NULL) != pcmk_ok) { ++ printf("\n"); ++ } ++ + if(safe_str_eq(attr_set_type, XML_TAG_META_SETS)) { + rsc = find_matching_attr_resource(rsc, rsc_id, attr_set, attr_id, attr_name, cib, "delete"); + } diff --git a/SOURCES/pacemaker-63f8e9a-rollup.patch b/SOURCES/pacemaker-63f8e9a-rollup.patch new file mode 100644 index 0000000..ef14d87 --- /dev/null +++ b/SOURCES/pacemaker-63f8e9a-rollup.patch @@ -0,0 +1,5904 @@ +diff --git a/ChangeLog b/ChangeLog +index d70edbd..e445890 100644 +--- a/ChangeLog ++++ b/ChangeLog +@@ -1,4 +1,218 @@ + ++* Wed Jun 24 2015 Andrew Beekhof Pacemaker-1.1.13-1 ++- Update source tarball to revision: 2a1847e ++- Changesets: 750 ++- Diff: 156 files changed, 11323 insertions(+), 3725 deletions(-) ++ ++- Features added since Pacemaker-1.1.12 ++ + Allow fail-counts to be removed en masse when the new attrd is in operation ++ + attrd supports private attributes (not written to CIB) ++ + crmd: Ensure a watchdog device is in use if stonith-watchdog-timeout is configured ++ + crmd: If configured, trigger the watchdog immediately if we lose quorum and no-quorum-policy=suicide ++ + crm_diff: Support generating a difference without versions details if --no-version/-u is supplied ++ + crm_resource: Implement an intelligent restart capability ++ + Fencing: Advertise the watchdog device for fencing operations ++ + Fencing: Allow the cluster to recover resources if the watchdog is in use ++ + fencing: cl#5134 - Support random fencing delay to avoid double fencing ++ 
+ mcp: Allow orphan children to initiate node panic via SIGQUIT ++ + mcp: Turn on sbd integration if pacemakerd finds it running ++ + mcp: Two new error codes that result in machine reset or power off ++ + Officially support the resource-discovery attribute for location constraints ++ + PE: Allow natural ordering of colocation sets ++ + PE: Support non-actionable degraded mode for OCF ++ + pengine: cl#5207 - Display "UNCLEAN" for resources running on unclean offline nodes ++ + remote: pcmk remote client tool for use with container wrapper script ++ + Support machine panics for some kinds of errors (via sbd if available) ++ + tools: add crm_resource --wait option ++ + tools: attrd_updater supports --query and --all options ++ + tools: attrd_updater: Allow attributes to be set for other nodes ++ ++- Changes since Pacemaker-1.1.12 ++ + pengine: exclusive discovery implies rsc is only allowed on exclusive subset of nodes ++ + acl: Correctly implement the 'reference' acl directive ++ + acl: Do not delay evaluation of added nodes in some situations ++ + attrd: b22b1fe did uuid test too early ++ + attrd: Clean out the node cache when requested by the admin ++ + attrd: fixes double free in attrd legacy ++ + attrd: properly write attributes for peers once uuid is discovered ++ + attrd: refresh should force an immediate write-out of all attributes ++ + attrd: Simplify how node deletions happen ++ + Bug rhbz#1067544 - Tools: Correctly handle --ban, --move and --locate for master/slave groups ++ + Bug rhbz#1181824 - Ensure the DC can be reliably fenced ++ + cib: Ability to upgrade cib validation schema in legacy mode ++ + cib: Always generate digests for cib diffs in legacy mode ++ + cib: assignment where comparison intended ++ + cib: Avoid nodeid conflicts we don't care about ++ + cib: Correctly add "update-origin", "update-client" and "update-user" attributes for cib ++ + cib: Correctly set up signal handlers ++ + cib: Correctly track node state ++ + cib: Do not update on 
disk backups if we're just querying them ++ + cib: Enable cib legacy mode for plugin-based clusters ++ + cib: Ensure file-based backends treat '-o section' consistently with the native backend ++ + cib: Ensure upgrade operations from a non-DC get an acknowledgement ++ + cib: No need to enforce cib digests for v2 diffs in legacy mode ++ + cib: Revert d153b86 to instantly get cib synchronized in legacy mode ++ + cib: tls sock cleanup for remote cib connections ++ + cli: Ensure subsequent unknown long options are correctly detected ++ + cluster: Invoke crm_remove_conflicting_peer() only when the new node's uname is being assigned in the node cache ++ + common: Increment current and age for lib common as a result of APIs being added ++ + corosync: Bug cl#5232 - Somewhat gracefully handle nodes with invalid UUIDs ++ + corosync: Avoid unnecessary repeated CMAP API calls ++ + crmd/pengine: handle on-fail=ignore properly ++ + crmd: Add "on_node" attribute for *_last_failure_0 lrm resource operations ++ + crmd: All peers need to track node shutdown requests ++ + crmd: Cached copies of transient attributes cease to be valid once a node leaves the membership ++ + crmd: Correctly add the local option that validates against schema for pengine to calculate ++ + crmd: Disable debug logging that results in significant overhead ++ + crmd: do not remove connection resources during re-probe ++ + crmd: don't update fail count twice for same failure ++ + crmd: Ensure remote connection resources timeout properly during 'migrate_from' action ++ + crmd: Ensure throttle_mode() does something on Linux ++ + crmd: Fixes crash when remote connection migration fails ++ + crmd: gracefully handle remote node disconnects during op execution ++ + crmd: Handle remote connection failures while executing ops on remote connection ++ + crmd: include remote nodes when forcing cluster wide resource reprobe ++ + crmd: never stop recurring monitor ops for pcmk remote during incomplete migration ++ + crmd: 
Prevent the old version of DC from being fenced when it shuts down for rolling-upgrade ++ + crmd: Prevent use-of-NULL during reprobe ++ + crmd: properly update job limit for baremetal remote-nodes ++ + crmd: Remote-node throttle jobs count towards cluster-node hosting connection rsc ++ + crmd: Reset stonith failcount to recover transitioner when the node rejoins ++ + crmd: resolves memory leak in crmd. ++ + crmd: respect start-failure-is-fatal even for artificially injected events ++ + crmd: Wait for all pending operations to complete before poking the policy engine ++ + crmd: When container's host is fenced, cancel in-flight operations ++ + crm_attribute: Correctly update config options when -o crm_config is specified ++ + crm_failcount: Better error reporting when no resource is specified ++ + crm_mon: add exit reason to resource failure output ++ + crm_mon: Fill CRM_notify_node in traps with node's uname rather than node's id if possible ++ + crm_mon: Repair notification delivery when the v2 patch format is in use ++ + crm_node: Correctly remove nodes from the CIB by nodeid ++ + crm_report: More patterns for finding logs on non-DC nodes ++ + crm_resource: Allow resource restart operations to be node specific ++ + crm_resource: avoid deletion of lrm cache on node with resource discovery disabled. 
++ + crm_resource: Calculate how long to wait for a restart based on the resource timeouts ++ + crm_resource: Clean up memory in --restart error paths ++ + crm_resource: Display the locations of all anonymous clone children when supplying the children's common ID ++ + crm_resource: Ensure --restart sets/clears meta attributes ++ + crm_resource: Ensure fail-counts are purged when we redetect the state of all resources ++ + crm_resource: Implement --timeout for resource restart operations ++ + crm_resource: Include group members when calculating the next timeout ++ + crm_resource: Memory leak in error paths ++ + crm_resource: Prevent use-after-free ++ + crm_resource: Repair regression test outputs ++ + crm_resource: Use-after-free when restarting a resource ++ + dbus: ref count leaks ++ + dbus: Ensure both the read and write queues get dispatched ++ + dbus: Fail gracefully if malloc fails ++ + dbus: handle dispatch queue when multiple replies need to be processed ++ + dbus: Notice when dbus connections get disabled ++ + dbus: Remove double-free introduced while trying to make coverity shut up ++ + ensure if B is colocated with A, B can never run without A ++ + fence_legacy: Avoid passing 'port' to cluster-glue agents ++ + fencing: Allow nodes to be purged from the member cache ++ + fencing: Correctly make args for fencing agents ++ + fencing: Correctly wait for self-fencing to occur when the watchdog is in use ++ + fencing: Ensure the hostlist parameter is set for watchdog agents ++ + fencing: Force 'stonith-ng' as the system name ++ + fencing: Gracefully handle invalid metadata from agents ++ + fencing: If configured, wait stonith-watchdog-timer seconds for self-fencing to complete ++ + fencing: Reject actions for devices that haven't been explicitly registered yet ++ + ipc: properly allocate server enforced buffer size on client ++ + ipc: use server enforced buffer during ipc client send ++ + lrmd, services: interpret LSB status codes properly ++ + lrmd: add back 
support for class heartbeat agents ++ + lrmd: cancel pending async connection during disconnect ++ + lrmd: enable ipc proxy for docker-wrapper privileged mode ++ + lrmd: fix rescheduling of systemd monitor op during start ++ + lrmd: Handle systemd reporting 'done' before a resource is actually stopped ++ + lrmd: Hint to child processes that using sd_notify is not required ++ + lrmd: Log with the correct personality ++ + lrmd: Prevent glib assert triggered by timers being removed from mainloop more than once ++ + lrmd: report original timeout when systemd operation completes ++ + lrmd: store failed operation exit reason in cib ++ + mainloop: resolves race condition mainloop poll involving modification of ipc connections ++ + make targeted reprobe for remote node work, crm_resource -C -N ++ + mcp: Allow a configurable delay when debugging shutdown issues ++ + mcp: Avoid requiring 'export' for SYS-V sysconfig options ++ + Membership: Detect and resolve nodes that change their ID ++ + pacemakerd: resolves memory leak of xml structure in pacemakerd ++ + pengine: ability to launch resources in isolated containers ++ + pengine: add #kind=remote for baremetal remote-nodes ++ + pengine: allow baremetal remote-nodes to recover without requiring fencing when cluster-node fails ++ + pengine: allow remote-nodes to be placed in maintenance mode ++ + pengine: Avoid trailing whitespaces when printing resource state ++ + pengine: cl#5130 - Choose nodes capable of running all the colocated utilization resources ++ + pengine: cl#5130 - Only check the capacities of the nodes that are allowed to run the resource ++ + pengine: Correctly compare feature set to determine how to unpack meta attributes ++ + pengine: disable migrations for resources with isolation containers ++ + pengine: disable reloading of resources within isolated container wrappers ++ + pengine: Do not aggregate children in a pending state into the started/stopped/etc lists ++ + pengine: Do not record duplicate copies 
of the failed actions ++ + pengine: Do not reschedule monitors that are no longer needed while resource definitions have changed ++ + pengine: Fence baremetal remote when recurring monitor op fails ++ + pengine: Fix colocation with unmanaged resources ++ + pengine: Fix the behaviors of multi-state resources with asymmetrical ordering ++ + pengine: fixes pengine crash with orphaned remote node connection resource ++ + pengine: fixes segfault caused by malformed log warning ++ + pengine: handle cloned isolated resources in a sane way ++ + pengine: handle isolated resource scenario, cloned group of isolated resources ++ + pengine: Handle ordering between stateful and migratable resources ++ + pengine: imply stop in container node resources when host node is fenced ++ + pengine: only fence baremetal remote when connection can fails or can not be recovered ++ + pengine: only kill process group on timeout when on-fail does not equal block. ++ + pengine: per-node control over resource discovery ++ + pengine: prefer migration target for remote node connections ++ + pengine: prevent disabling rsc discovery per node in certain situations ++ + pengine: Prevent use-after-free in sort_rsc_process_order() ++ + pengine: properly handle ordering during remote connection partial migration ++ + pengine: properly recover remote-nodes when cluster-node proxy goes offline ++ + pengine: remove unnecessary whitespace from notify environment variables ++ + pengine: require-all feature for ordered clones ++ + pengine: Resolve memory leaks ++ + pengine: resource discovery mode for location constraints ++ + pengine: restart master instances on instance attribute changes ++ + pengine: Turn off legacy unpacking of resource options into the meta hashtable ++ + pengine: Watchdog integration is sufficient for fencing ++ + Perform systemd reloads asynchronously ++ + ping: Correctly advertise multiplier default ++ + Prefer to inherit the watchdog timeout from SBD ++ + properly record stop args 
after reload ++ + provide fake meta data for ra class heartbeat ++ + remote: report timestamps for remote connection resource operations ++ + remote: Treat recv msg timeout as a disconnect ++ + service: Prevent potential use-of-NULL in metadata lookups ++ + solaris: Allow compilation when dirent.d_type is not available ++ + solaris: Correctly replace the linux swab functions ++ + solaris: Disable throttling since /proc doesn't exist ++ + stonith-ng: Correctly observe the watchdog completion timeout ++ + stonith-ng: Correctly track node state ++ + stonith-ng: Reset mainloop source IDs after removing them ++ + systemd: Correctly handle long running stop actions ++ + systemd: Ensure failed monitor operations always return ++ + systemd: Ensure we don't call dbus_message_unref() with NULL ++ + systemd: fix crash caused when canceling in-flight operation ++ + systemd: Kindly ask dbus NOT to kill the process if the dbus connection fails ++ + systemd: Perform actions asynchronously ++ + systemd: Perform monitor operations without blocking ++ + systemd: Tell systemd not to take DBus down from underneath us ++ + systemd: Trick systemd into not stopping our services before us during shutdown ++ + tools: Improve crm_mon output with certain option combinations ++ + upstart: Monitor actions always return 'ok' or 'not running' ++ + upstart: Perform more parts of monitor operations without blocking ++ + xml: add 'require-all' to xml schema for constraints ++ + xml: cl#5231 - Unset the deleted attributes in the resulting diffs ++ + xml: Clone the latest constraint schema in preparation for changes" ++ + xml: Correctly create v1 patchsets when deleting attributes ++ + xml: Do not change the ordering of properties when applying v1 cib diffs ++ + xml: Do not dump deleted attributes ++ + xml: Do not prune leaves from v1 cib diffs that are being created with digests ++ + xml: Ensure ACLs are reapplied before calculating what a replace operation changed ++ + xml: Fix upgrade-1.3.xsl to 
correctly transform ACL rules with "attribute" ++ + xml: Prevent assert errors in crm_element_value() on applying a patch without version information ++ + xml: Prevent potential use-of-NULL ++ ++ + * Tue Jul 22 2014 Andrew Beekhof Pacemaker-1.1.12-1 + - Update source tarball to revision: 93a037d + - Changesets: 795 +diff --git a/attrd/commands.c b/attrd/commands.c +index 442c5f8..18c0523 100644 +--- a/attrd/commands.c ++++ b/attrd/commands.c +@@ -289,6 +289,9 @@ attrd_client_update(xmlNode *xml) + + crm_info("Expanded %s=%s to %d", attr, value, int_value); + crm_xml_add_int(xml, F_ATTRD_VALUE, int_value); ++ ++ /* Replacing the value frees the previous memory, so re-query it */ ++ value = crm_element_value(xml, F_ATTRD_VALUE); + } + } + +diff --git a/cib/callbacks.c b/cib/callbacks.c +index 71c487e..1452ded 100644 +--- a/cib/callbacks.c ++++ b/cib/callbacks.c +@@ -40,6 +40,8 @@ + #include + #include "common.h" + ++static unsigned long cib_local_bcast_num = 0; ++ + typedef struct cib_local_notify_s { + xmlNode *notify_src; + char *client_id; +@@ -48,7 +50,13 @@ typedef struct cib_local_notify_s { + } cib_local_notify_t; + + int next_client_id = 0; ++ ++#if SUPPORT_PLUGIN ++gboolean legacy_mode = TRUE; ++#else + gboolean legacy_mode = FALSE; ++#endif ++ + qb_ipcs_service_t *ipcs_ro = NULL; + qb_ipcs_service_t *ipcs_rw = NULL; + qb_ipcs_service_t *ipcs_shm = NULL; +@@ -82,8 +90,12 @@ static gboolean cib_read_legacy_mode(void) + return legacy; + } + +-static gboolean cib_legacy_mode(void) ++gboolean cib_legacy_mode(void) + { ++#if SUPPORT_PLUGIN ++ return TRUE; ++#endif ++ + if(cib_read_legacy_mode()) { + return TRUE; + } +@@ -442,6 +454,54 @@ do_local_notify(xmlNode * notify_src, const char *client_id, + } + + static void ++local_notify_destroy_callback(gpointer data) ++{ ++ cib_local_notify_t *notify = data; ++ ++ free_xml(notify->notify_src); ++ free(notify->client_id); ++ free(notify); ++} ++ ++static void ++check_local_notify(int bcast_id) ++{ ++ 
cib_local_notify_t *notify = NULL; ++ ++ if (!local_notify_queue) { ++ return; ++ } ++ ++ notify = g_hash_table_lookup(local_notify_queue, GINT_TO_POINTER(bcast_id)); ++ ++ if (notify) { ++ do_local_notify(notify->notify_src, notify->client_id, notify->sync_reply, ++ notify->from_peer); ++ g_hash_table_remove(local_notify_queue, GINT_TO_POINTER(bcast_id)); ++ } ++} ++ ++static void ++queue_local_notify(xmlNode * notify_src, const char *client_id, gboolean sync_reply, ++ gboolean from_peer) ++{ ++ cib_local_notify_t *notify = calloc(1, sizeof(cib_local_notify_t)); ++ ++ notify->notify_src = notify_src; ++ notify->client_id = strdup(client_id); ++ notify->sync_reply = sync_reply; ++ notify->from_peer = from_peer; ++ ++ if (!local_notify_queue) { ++ local_notify_queue = g_hash_table_new_full(g_direct_hash, ++ g_direct_equal, NULL, ++ local_notify_destroy_callback); ++ } ++ ++ g_hash_table_insert(local_notify_queue, GINT_TO_POINTER(cib_local_bcast_num), notify); ++} ++ ++static void + parse_local_options_v1(crm_client_t * cib_client, int call_type, int call_options, const char *host, + const char *op, gboolean * local_notify, gboolean * needs_reply, + gboolean * process, gboolean * needs_forward) +@@ -814,9 +874,12 @@ send_peer_reply(xmlNode * msg, xmlNode * result_diff, const char *originator, gb + int diff_del_admin_epoch = 0; + + const char *digest = NULL; ++ int format = 1; + + CRM_LOG_ASSERT(result_diff != NULL); + digest = crm_element_value(result_diff, XML_ATTR_DIGEST); ++ crm_element_value_int(result_diff, "format", &format); ++ + cib_diff_version_details(result_diff, + &diff_add_admin_epoch, &diff_add_epoch, &diff_add_updates, + &diff_del_admin_epoch, &diff_del_epoch, &diff_del_updates); +@@ -829,7 +892,9 @@ send_peer_reply(xmlNode * msg, xmlNode * result_diff, const char *originator, gb + crm_xml_add(msg, F_CIB_GLOBAL_UPDATE, XML_BOOLEAN_TRUE); + crm_xml_add(msg, F_CIB_OPERATION, CIB_OP_APPLY_DIFF); + +- CRM_ASSERT(digest != NULL); ++ if (format == 1) { ++ 
CRM_ASSERT(digest != NULL); ++ } + + add_message_xml(msg, F_CIB_UPDATE_DIFF, result_diff); + crm_log_xml_explicit(msg, "copy"); +@@ -1039,6 +1104,27 @@ cib_process_request(xmlNode * request, gboolean force_synchronous, gboolean priv + */ + crm_trace("Completed slave update"); + ++ } else if (cib_legacy_mode() && ++ rc == pcmk_ok && result_diff != NULL && !(call_options & cib_inhibit_bcast)) { ++ gboolean broadcast = FALSE; ++ ++ cib_local_bcast_num++; ++ crm_xml_add_int(request, F_CIB_LOCAL_NOTIFY_ID, cib_local_bcast_num); ++ broadcast = send_peer_reply(request, result_diff, originator, TRUE); ++ ++ if (broadcast && client_id && local_notify && op_reply) { ++ ++ /* If we have been asked to sync the reply, ++ * and a bcast msg has gone out, we queue the local notify ++ * until we know the bcast message has been received */ ++ local_notify = FALSE; ++ crm_trace("Queuing local %ssync notification for %s", ++ (call_options & cib_sync_call) ? "" : "a-", client_id); ++ ++ queue_local_notify(op_reply, client_id, (call_options & cib_sync_call), from_peer); ++ op_reply = NULL; /* the reply is queued, so don't free here */ ++ } ++ + } else if (call_options & cib_discard_reply) { + crm_trace("Caller isn't interested in reply"); + +@@ -1322,6 +1408,11 @@ cib_peer_callback(xmlNode * msg, void *private_data) + + if (cib_legacy_mode() && (originator == NULL || crm_str_eq(originator, cib_our_uname, TRUE))) { + /* message is from ourselves */ ++ int bcast_id = 0; ++ ++ if (!(crm_element_value_int(msg, F_CIB_LOCAL_NOTIFY_ID, &bcast_id))) { ++ check_local_notify(bcast_id); ++ } + return; + + } else if (crm_peer_cache == NULL) { +diff --git a/cib/callbacks.h b/cib/callbacks.h +index 7549a6c..bca9992 100644 +--- a/cib/callbacks.h ++++ b/cib/callbacks.h +@@ -73,6 +73,8 @@ void cib_shutdown(int nsig); + void initiate_exit(void); + void terminate_cib(const char *caller, gboolean fast); + ++extern gboolean cib_legacy_mode(void); ++ + #if SUPPORT_HEARTBEAT + extern void 
cib_ha_peer_callback(HA_Message * msg, void *private_data); + extern int cib_ccm_dispatch(gpointer user_data); +diff --git a/cib/main.c b/cib/main.c +index 2a48054..e20a2b6 100644 +--- a/cib/main.c ++++ b/cib/main.c +@@ -438,6 +438,13 @@ cib_peer_update_callback(enum crm_status_type type, crm_node_t * node, const voi + + if (cib_shutdown_flag && crm_active_peers() < 2 && crm_hash_table_size(client_connections) == 0) { + crm_info("No more peers"); ++ /* @TODO ++ * terminate_cib() calls crm_cluster_disconnect() which calls ++ * crm_peer_destroy() which destroys the peer caches, which a peer ++ * status callback shouldn't do. For now, there is a workaround in ++ * crm_update_peer_proc(), but CIB should be refactored to avoid ++ * destroying the peer caches here. ++ */ + terminate_cib(__FUNCTION__, FALSE); + } + } +diff --git a/cib/messages.c b/cib/messages.c +index 9c66349..363562c 100644 +--- a/cib/messages.c ++++ b/cib/messages.c +@@ -297,7 +297,14 @@ cib_process_upgrade_server(const char *op, int options, const char *section, xml + crm_xml_add(up, F_CIB_CALLOPTS, crm_element_value(req, F_CIB_CALLOPTS)); + crm_xml_add(up, F_CIB_CALLID, crm_element_value(req, F_CIB_CALLID)); + +- send_cluster_message(NULL, crm_msg_cib, up, FALSE); ++ if (cib_legacy_mode() && cib_is_master) { ++ rc = cib_process_upgrade( ++ op, options, section, up, input, existing_cib, result_cib, answer); ++ ++ } else { ++ send_cluster_message(NULL, crm_msg_cib, up, FALSE); ++ } ++ + free_xml(up); + + } else if(rc == pcmk_ok) { +diff --git a/crmd/lrm.c b/crmd/lrm.c +index 74fede4..062f769 100644 +--- a/crmd/lrm.c ++++ b/crmd/lrm.c +@@ -454,8 +454,6 @@ get_rsc_metadata(const char *type, const char *rclass, const char *provider, boo + + snprintf(key, len, "%s::%s:%s", type, rclass, provider); + if(force == FALSE) { +- snprintf(key, len, "%s::%s:%s", type, rclass, provider); +- + crm_trace("Retreiving cached metadata for %s", key); + metadata = g_hash_table_lookup(metadata_hash, key); + } +@@ -581,7 
+579,7 @@ resource_supports_action(xmlNode *metadata, const char *name) + for (action = __xml_first_child(actions); action != NULL; action = __xml_next(action)) { + if (crm_str_eq((const char *)action->name, "action", TRUE)) { + value = crm_element_value(action, "name"); +- if (safe_str_eq("reload", value)) { ++ if (safe_str_eq(name, value)) { + return TRUE; + } + } +@@ -606,16 +604,18 @@ append_restart_list(lrmd_event_data_t *op, xmlNode *metadata, xmlNode * update, + + if(resource_supports_action(metadata, "reload")) { + restart = create_xml_node(NULL, XML_TAG_PARAMS); +- list = build_parameter_list(op, metadata, restart, "unique", FALSE, FALSE); +- } ++ /* Any parameters with unique="1" should be added into the "op-force-restart" list. */ ++ list = build_parameter_list(op, metadata, restart, "unique", TRUE, FALSE); + +- if (list == NULL) { ++ } else { + /* Resource does not support reloads */ + return; + } + + digest = calculate_operation_digest(restart, version); +- crm_xml_add(update, XML_LRM_ATTR_OP_RESTART, list); ++ /* Add "op-force-restart" and "op-restart-digest" to indicate the resource supports reload, ++ * no matter if it actually supports any parameters with unique="1"). */ ++ crm_xml_add(update, XML_LRM_ATTR_OP_RESTART, list? list: ""); + crm_xml_add(update, XML_LRM_ATTR_RESTART_DIGEST, digest); + + crm_trace("%s: %s, %s", op->rsc_id, digest, list); +diff --git a/crmd/throttle.c b/crmd/throttle.c +index 165050c..169594b 100644 +--- a/crmd/throttle.c ++++ b/crmd/throttle.c +@@ -92,41 +92,60 @@ int throttle_num_cores(void) + return cores; + } + ++/* ++ * \internal ++ * \brief Return name of /proc file containing the CIB deamon's load statistics ++ * ++ * \return Newly allocated memory with file name on success, NULL otherwise ++ * ++ * \note It is the caller's responsibility to free the return value. ++ * This will return NULL if the daemon is being run via valgrind. ++ * This should be called only on Linux systems. 
++ */ + static char *find_cib_loadfile(void) + { + DIR *dp; + struct dirent *entry; + struct stat statbuf; + char *match = NULL; ++ char procpath[128]; ++ char value[64]; ++ char key[16]; + + dp = opendir("/proc"); + if (!dp) { + /* no proc directory to search through */ + crm_notice("Can not read /proc directory to track existing components"); +- return FALSE; ++ return NULL; + } + ++ /* Iterate through contents of /proc */ + while ((entry = readdir(dp)) != NULL) { +- char procpath[128]; +- char value[64]; +- char key[16]; + FILE *file; + int pid; + +- strcpy(procpath, "/proc/"); +- /* strlen("/proc/") + strlen("/status") + 1 = 14 +- * 128 - 14 = 114 */ +- strncat(procpath, entry->d_name, 114); +- +- if (lstat(procpath, &statbuf)) { ++ /* We're only interested in entries whose name is a PID, ++ * so skip anything non-numeric or that is too long. ++ * ++ * 114 = 128 - strlen("/proc/") - strlen("/status") - 1 ++ */ ++ pid = atoi(entry->d_name); ++ if ((pid <= 0) || (strlen(entry->d_name) > 114)) { + continue; + } +- if (!S_ISDIR(statbuf.st_mode) || !isdigit(entry->d_name[0])) { ++ ++ /* We're only interested in subdirectories */ ++ strcpy(procpath, "/proc/"); ++ strcat(procpath, entry->d_name); ++ if (lstat(procpath, &statbuf) || !S_ISDIR(statbuf.st_mode)) { + continue; + } + ++ /* Read the first entry ("Name:") from the process's status file. ++ * We could handle the valgrind case if we parsed the cmdline file ++ * instead, but that's more of a pain than it's worth. ++ */ + strcat(procpath, "/status"); +- + file = fopen(procpath, "r"); + if (!file) { + continue; +@@ -137,17 +156,11 @@ static char *find_cib_loadfile(void) + } + fclose(file); + +- if (safe_str_neq("cib", value)) { +- continue; +- } +- +- pid = atoi(entry->d_name); +- if (pid <= 0) { +- continue; ++ if (safe_str_eq("cib", value)) { ++ /* We found the CIB! 
*/ ++ match = crm_strdup_printf("/proc/%d/stat", pid); ++ break; + } +- +- match = crm_strdup_printf("/proc/%d/stat", pid); +- break; + } + + closedir(dp); +@@ -214,6 +227,10 @@ static bool throttle_cib_load(float *load) + last_utime = 0; + last_stime = 0; + loadfile = find_cib_loadfile(); ++ if (loadfile == NULL) { ++ crm_warn("Couldn't find CIB load file"); ++ return FALSE; ++ } + ticks_per_s = sysconf(_SC_CLK_TCK); + crm_trace("Found %s", loadfile); + } +diff --git a/cts/CIB.py b/cts/CIB.py +index cdfc7ca..82d02d7 100644 +--- a/cts/CIB.py ++++ b/cts/CIB.py +@@ -312,7 +312,7 @@ Description=Dummy resource that takes a while to start + Type=notify + ExecStart=/usr/bin/python -c 'import time, systemd.daemon; time.sleep(10); systemd.daemon.notify("READY=1"); time.sleep(86400)' + ExecStop=/bin/sleep 10 +-ExecStop=/bin/kill -s KILL $MAINPID ++ExecStop=/bin/kill -s KILL \$MAINPID + """ + + os.system("cat <<-END >/tmp/DummySD.service\n%s\nEND" % (dummy_service_file)) +diff --git a/cts/CTStests.py b/cts/CTStests.py +index 14ab4bf..f817004 100644 +--- a/cts/CTStests.py ++++ b/cts/CTStests.py +@@ -1105,7 +1105,7 @@ class MaintenanceMode(CTSTest): + # fail the resource right after turning Maintenance mode on + # verify it is not recovered until maintenance mode is turned off + if action == "On": +- pats.append("pengine.*: warning: Processing failed op %s for %s on" % (self.action, self.rid)) ++ pats.append("pengine.*: warning:.* Processing failed op %s for %s on" % (self.action, self.rid)) + else: + pats.append(self.templates["Pat:RscOpOK"] % (self.rid, "stop_0")) + pats.append(self.templates["Pat:RscOpOK"] % (self.rid, "start_0")) +@@ -1314,7 +1314,8 @@ class ResourceRecover(CTSTest): + self.debug("Shooting %s aka. 
%s" % (rsc.clone_id, rsc.id)) + + pats = [] +- pats.append("pengine.*: warning: Processing failed op %s for %s on" % (self.action, self.rid)) ++ pats.append(r"pengine.*: warning:.* Processing failed op %s for (%s|%s) on" % (self.action, ++ rsc.id, rsc.clone_id)) + + if rsc.managed(): + pats.append(self.templates["Pat:RscOpOK"] % (self.rid, "stop_0")) +@@ -2647,32 +2648,31 @@ class RemoteDriver(CTSTest): + self.remote_node_added = 0 + self.remote_rsc_added = 0 + self.remote_rsc = "remote-rsc" ++ self.remote_use_reconnect_interval = self.Env.RandomGen.choice(["true","false"]) + self.cib_cmd = """cibadmin -C -o %s -X '%s' """ + +- def del_rsc(self, node, rsc): +- ++ def get_othernode(self, node): + for othernode in self.Env["nodes"]: + if othernode == node: + # we don't want to try and use the cib that we just shutdown. + # find a cluster node that is not our soon to be remote-node. + continue +- rc = self.rsh(othernode, "crm_resource -D -r %s -t primitive" % (rsc)) +- if rc != 0: +- self.fail_string = ("Removal of resource '%s' failed" % (rsc)) +- self.failed = 1 +- return ++ else: ++ return othernode ++ ++ def del_rsc(self, node, rsc): ++ othernode = self.get_othernode(node) ++ rc = self.rsh(othernode, "crm_resource -D -r %s -t primitive" % (rsc)) ++ if rc != 0: ++ self.fail_string = ("Removal of resource '%s' failed" % (rsc)) ++ self.failed = 1 + + def add_rsc(self, node, rsc_xml): +- for othernode in self.CM.Env["nodes"]: +- if othernode == node: +- # we don't want to try and use the cib that we just shutdown. +- # find a cluster node that is not our soon to be remote-node. 
+- continue +- rc = self.rsh(othernode, self.cib_cmd % ("resources", rsc_xml)) +- if rc != 0: +- self.fail_string = "resource creation failed" +- self.failed = 1 +- return ++ othernode = self.get_othernode(node) ++ rc = self.rsh(othernode, self.cib_cmd % ("resources", rsc_xml)) ++ if rc != 0: ++ self.fail_string = "resource creation failed" ++ self.failed = 1 + + def add_primitive_rsc(self, node): + rsc_xml = """ +@@ -2687,7 +2687,24 @@ class RemoteDriver(CTSTest): + self.remote_rsc_added = 1 + + def add_connection_rsc(self, node): +- rsc_xml = """ ++ if self.remote_use_reconnect_interval == "true": ++ # use reconnect interval and make sure to set cluster-recheck-interval as well. ++ rsc_xml = """ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++""" % (self.remote_node, node) ++ self.rsh(self.get_othernode(node), self.templates["SetCheckInterval"] % ("45s")) ++ else: ++ # not using reconnect interval ++ rsc_xml = """ + + + +@@ -2698,6 +2715,7 @@ class RemoteDriver(CTSTest): + + + """ % (self.remote_node, node) ++ + self.add_rsc(node, rsc_xml) + if self.failed == 0: + self.remote_node_added = 1 +@@ -2836,7 +2854,7 @@ class RemoteDriver(CTSTest): + self.CM.ns.WaitForNodeToComeUp(node, 120); + + pats = [ ] +- watch = self.create_watch(pats, 120) ++ watch = self.create_watch(pats, 200) + watch.setwatch() + pats.append(self.templates["Pat:RscOpOK"] % (self.remote_node, "start")) + if self.remote_rsc_added == 1: +@@ -2927,12 +2945,19 @@ class RemoteDriver(CTSTest): + pats.append(self.templates["Pat:RscOpOK"] % (self.remote_node, "stop")) + + self.set_timer("remoteMetalCleanup") ++ ++ if self.remote_use_reconnect_interval == "true": ++ self.debug("Cleaning up re-check interval") ++ self.rsh(self.get_othernode(node), self.templates["ClearCheckInterval"]) + if self.remote_rsc_added == 1: ++ self.debug("Cleaning up dummy rsc put on remote node") + self.rsh(node, "crm_resource -U -r %s -N %s" % (self.remote_rsc, self.remote_node)) + self.del_rsc(node, self.remote_rsc) + if 
self.remote_node_added == 1: ++ self.debug("Cleaning up remote node connection resource") + self.rsh(node, "crm_resource -U -r %s" % (self.remote_node)) + self.del_rsc(node, self.remote_node) ++ + watch.lookforall() + self.log_timer("remoteMetalCleanup") + +diff --git a/cts/environment.py b/cts/environment.py +index 6edf331..a3399c3 100644 +--- a/cts/environment.py ++++ b/cts/environment.py +@@ -160,7 +160,7 @@ class Environment: + self.data["Stack"] = "heartbeat" + + elif name == "openais" or name == "ais" or name == "whitetank": +- self.data["Stack"] = "openais (whitetank)" ++ self.data["Stack"] = "corosync (plugin v0)" + + elif name == "corosync" or name == "cs" or name == "mcp": + self.data["Stack"] = "corosync 2.x" +@@ -351,6 +351,10 @@ class Environment: + self["DoFencing"]=1 + elif args[i+1] == "0" or args[i+1] == "no": + self["DoFencing"]=0 ++ elif args[i+1] == "phd": ++ self["DoStonith"]=1 ++ self["stonith-type"] = "fence_phd_kvm" ++ self["stonith-params"] = "pcmk_arg_map=domain:uname,delay=0" + elif args[i+1] == "rhcs" or args[i+1] == "xvm" or args[i+1] == "virt": + self["DoStonith"]=1 + self["stonith-type"] = "fence_xvm" +diff --git a/cts/patterns.py b/cts/patterns.py +index 8398c7e..1bc05a6 100644 +--- a/cts/patterns.py ++++ b/cts/patterns.py +@@ -32,6 +32,9 @@ class BasePatterns: + + "UUIDQueryCmd" : "crmadmin -N", + ++ "SetCheckInterval" : "cibadmin --modify -c --xml-text ''", ++ "ClearCheckInterval" : "cibadmin --delete --xpath \"//nvpair[@name='cluster-recheck-interval']\"", ++ + "MaintenanceModeOn" : "cibadmin --modify -c --xml-text ''", + "MaintenanceModeOff" : "cibadmin --delete --xpath \"//nvpair[@name='maintenance-mode']\"", + +@@ -291,6 +294,9 @@ class crm_cs_v0(BasePatterns): + r"error:.*Connection to cib_shm failed", + r"error:.*Connection to cib_shm.* closed", + r"error:.*STONITH connection failed", ++ r"error: Connection to stonith-ng failed", ++ r"crit: Fencing daemon connection failed", ++ r"error: Connection to stonith-ng.* closed", + ] 
+ + self.components["corosync"] = [ +diff --git a/doc/Pacemaker_Explained/en-US/Ch-Stonith.txt b/doc/Pacemaker_Explained/en-US/Ch-Stonith.txt +index 02525d6..a3c02cb 100644 +--- a/doc/Pacemaker_Explained/en-US/Ch-Stonith.txt ++++ b/doc/Pacemaker_Explained/en-US/Ch-Stonith.txt +@@ -343,7 +343,7 @@ http://www.clusterlabs.org/doc/[Clusters from Scratch] guide for those details. + # cibadmin -C -o resources --xml-file stonith.xml + ---- + +-. Set stonith-enabled to true: ++. Set +stonith-enabled+ to true: + + + ---- + # crm_attribute -t crm_config -n stonith-enabled -v true +@@ -831,3 +831,29 @@ Put together, the configuration looks like this: + + + ---- ++ ++== Remapping Reboots == ++ ++When the cluster needs to reboot a node, whether because +stonith-action+ is +reboot+ or because ++a reboot was manually requested (such as by `stonith_admin --reboot`), it will remap that to ++other commands in two cases: ++ ++. If the chosen fencing device does not support the +reboot+ command, the cluster ++ will ask it to perform +off+ instead. ++ ++. If a fencing topology level with multiple devices must be executed, the cluster ++ will ask all the devices to perform +off+, then ask the devices to perform +on+. ++ ++To understand the second case, consider the example of a node with redundant ++power supplies connected to intelligent power switches. Rebooting one switch ++and then the other would have no effect on the node. Turning both switches off, ++and then on, actually reboots the node. ++ ++In such a case, the fencing operation will be treated as successful as long as ++the +off+ commands succeed, because then it is safe for the cluster to recover ++any resources that were on the node. Timeouts and errors in the +on+ phase will ++be logged but ignored. ++ ++When a reboot operation is remapped, any action-specific timeout for the ++remapped action will be used (for example, +pcmk_off_timeout+ will be used when ++executing the +off+ command, not +pcmk_reboot_timeout+). 
+diff --git a/doc/asciidoc.reference b/doc/asciidoc.reference +index a9a171b..9323864 100644 +--- a/doc/asciidoc.reference ++++ b/doc/asciidoc.reference +@@ -1,31 +1,49 @@ ++= Single-chapter part of the documentation = ++ ++== Go-to reference chapter for how we use AsciiDoc on this project == ++ ++[NOTE] ++====== ++This is *not* an attempt for fully self-hosted AsciiDoc document, ++consider it a plaintext full of AsciiDoc samples (it's up to the reader ++to recognize the borderline) at documentation writers' disposal ++to somewhat standardize the style{empty}footnote:[ ++ style of both source notation and final visual appearance ++]. ++ + See also: + http://powerman.name/doc/asciidoc ++====== + +-Commands: `some-tool --with option` +-Files: '/tmp/file.name' +-Italic: _some text_ ++Emphasis: _some test_ + Mono: +some text+ +-Bold: *some text* +-Super: ^some text^ +-Sub: ~some text~ ++Strong: *some text* ++Super: ^some text^ ++Sub: ~some text~ + Quotes: + ``double quoted'' + `single quoted' + +-Tool: command ++Command: `some-tool --with option` ++Newly introduced term: ++ 'some text' (another form of emphasis as of this edit) ++ ++File: mono + Literal: mono ++Tool: command ++Option: mono ++Replaceable: emphasis mono + Varname: mono +-Option: italic +-Emphasis: italic bold +-Replaceable: italic mono ++Term encountered on system (e.g., menu choice, hostname): ++ strong + + +-.Title for Eaxmple ++.Title for Example + ===== + Some text + ===== + +-.Title for Eaxmple with XML Listing ++.Title for Example with XML Listing + ===== + [source,XML] + ----- +@@ -49,4 +67,4 @@ Section anchors: + + References to section anchors: + +-<> or <> +\ No newline at end of file ++<> or <> +diff --git a/doc/shared/en-US/pacemaker-intro.txt b/doc/shared/en-US/pacemaker-intro.txt +index bf432fc..6b898c9 100644 +--- a/doc/shared/en-US/pacemaker-intro.txt ++++ b/doc/shared/en-US/pacemaker-intro.txt +@@ -1,41 +1,62 @@ + +-== What Is Pacemaker? == ++== What Is 'Pacemaker'? 
== + +-Pacemaker is a cluster resource manager. ++Pacemaker is a 'cluster resource manager', that is, a logic responsible ++for a life-cycle of deployed software -- indirectly perhaps even whole ++systems or their interconnections -- under its control within a set of ++computers (a.k.a. 'cluster nodes', 'nodes' for short) and driven by ++prescribed rules. + + It achieves maximum availability for your cluster services +-(aka. resources) by detecting and recovering from node- and ++(a.k.a. 'resources') by detecting and recovering from node- and + resource-level failures by making use of the messaging and membership + capabilities provided by your preferred cluster infrastructure (either + http://www.corosync.org/[Corosync] or +-http://linux-ha.org/wiki/Heartbeat[Heartbeat]). ++http://linux-ha.org/wiki/Heartbeat[Heartbeat]), and possibly by ++utilizing other parts of the overall cluster stack. ++ ++.High Availability Clusters ++[NOTE] ++For *the goal of minimal downtime* a term 'high availability' was coined ++and together with its acronym, 'HA', is well-established in the sector. ++To differentiate this sort of clusters from high performance computing ++('HPC') ones, should a context require it (apparently, not the case in ++this document), using 'HA cluster' is an option. + + Pacemaker's key features include: + + * Detection and recovery of node and service-level failures + * Storage agnostic, no requirement for shared storage + * Resource agnostic, anything that can be scripted can be clustered +- * Supports fencing (aka. 
STONITH) for ensuring data integrity ++ * Supports 'fencing' (also referred to as the 'STONITH' acronym, ++ <<s-intro-stonith,deciphered>> later on) for ensuring data integrity + * Supports large and small clusters + * Supports both quorate and resource-driven clusters + * Supports practically any redundancy configuration +- * Automatically replicated configuration that can be updated from any node +- * Ability to specify cluster-wide service ordering, colocation and anti-colocation ++ * Automatically replicated configuration that can be updated ++ from any node ++ * Ability to specify cluster-wide service ordering, ++ colocation and anti-colocation + * Support for advanced service types + ** Clones: for services which need to be active on multiple nodes +- ** Multi-state: for services with multiple modes (eg. master/slave, primary/secondary) +- * Unified, scriptable, cluster management tools. ++ ** Multi-state: for services with multiple modes ++ (e.g. master/slave, primary/secondary) ++ * Unified, scriptable cluster management tools + + == Pacemaker Architecture == + + At the highest level, the cluster is made up of three pieces: + +- * Non-cluster-aware components. These pieces ++ * *Non-cluster-aware components*. These pieces + include the resources themselves; scripts that start, stop and + monitor them; and a local daemon that masks the differences + between the different standards these scripts implement. ++ Even though interactions of these resources when run as multiple ++ instances can resemble a distributed system, they still lack ++ the proper HA mechanisms and/or autonomous cluster-wide governance ++ as subsumed in the following item. + +- * Resource management. Pacemaker provides the brain that processes ++ * *Resource management*. Pacemaker provides the brain that processes + and reacts to events regarding the cluster.
These events include + nodes joining or leaving the cluster; resource events caused by + failures, maintenance and scheduled activities; and other +@@ -44,21 +65,24 @@ At the highest level, the cluster is made up of three pieces: + events. This may include moving resources, stopping nodes and even + forcing them offline with remote power switches. + +- * Low-level infrastructure. Projects like Corosync, CMAN and +- Heartbeat provide reliable messaging, membership and quorum ++ * *Low-level infrastructure*. Projects like 'Corosync', 'CMAN' and ++ 'Heartbeat' provide reliable messaging, membership and quorum + information about the cluster. + + When combined with Corosync, Pacemaker also supports popular open +-source cluster filesystems. +-footnote:[Even though Pacemaker also supports Heartbeat, the filesystems need +-to use the stack for messaging and membership, and Corosync seems to be +-what they're standardizing on. Technically, it would be possible for them to +-support Heartbeat as well, but there seems little interest in this.] ++source cluster filesystems.{empty}footnote:[ ++ Even though Pacemaker also supports Heartbeat, the filesystems need to ++ use the stack for messaging and membership, and Corosync seems to be ++ what they're standardizing on. Technically, it would be possible for ++ them to support Heartbeat as well, but there seems little interest ++ in this. ++] + + Due to past standardization within the cluster filesystem community, +-cluster filesystems make use of a common distributed lock manager, which makes +-use of Corosync for its messaging and membership capabilities (which nodes +-are up/down) and Pacemaker for fencing services. ++cluster filesystems make use of a common 'distributed lock manager', ++which makes use of Corosync for its messaging and membership ++capabilities (which nodes are up/down) and Pacemaker for fencing ++services. 
+ + .The Pacemaker Stack + image::images/pcmk-stack.png["The Pacemaker stack",width="10cm",height="7.5cm",align="center"] +@@ -67,75 +91,79 @@ image::images/pcmk-stack.png["The Pacemaker stack",width="10cm",height="7.5cm",a + + Pacemaker itself is composed of five key components: + +- * Cluster Information Base (CIB) +- * Cluster Resource Management daemon (CRMd) +- * Local Resource Management daemon (LRMd) +- * Policy Engine (PEngine or PE) +- * Fencing daemon (STONITHd) ++ * 'Cluster Information Base' ('CIB') ++ * 'Cluster Resource Management daemon' ('CRMd') ++ * 'Local Resource Management daemon' ('LRMd') ++ * 'Policy Engine' ('PEngine' or 'PE') ++ * Fencing daemon ('STONITHd') + + .Internal Components + image::images/pcmk-internals.png["Subsystems of a Pacemaker cluster",align="center",scaledwidth="65%"] + + The CIB uses XML to represent both the cluster's configuration and + current state of all resources in the cluster. The contents of the CIB +-are automatically kept in sync across the entire cluster and are used +-by the PEngine to compute the ideal state of the cluster and how it +-should be achieved. ++are automatically kept in sync across the entire cluster and are used by ++the PEngine to compute the ideal state of the cluster and how it should ++be achieved. + +-This list of instructions is then fed to the Designated +-Controller (DC). Pacemaker centralizes all cluster decision making by +-electing one of the CRMd instances to act as a master. Should the +-elected CRMd process (or the node it is on) fail, a new one is +-quickly established. ++This list of instructions is then fed to the 'Designated Controller' ++('DC'). Pacemaker centralizes all cluster decision making by electing ++one of the CRMd instances to act as a master. Should the elected CRMd ++process (or the node it is on) fail, a new one is quickly established. 
+ + The DC carries out the PEngine's instructions in the required order by + passing them to either the Local Resource Management daemon (LRMd) or + CRMd peers on other nodes via the cluster messaging infrastructure + (which in turn passes them on to their LRMd process). + +-The peer nodes all report the results of their operations back to the +-DC and, based on the expected and actual results, will either execute +-any actions that needed to wait for the previous one to complete, or +-abort processing and ask the PEngine to recalculate the ideal cluster +-state based on the unexpected results. ++The peer nodes all report the results of their operations back to the DC ++and, based on the expected and actual results, will either execute any ++actions that needed to wait for the previous one to complete, or abort ++processing and ask the PEngine to recalculate the ideal cluster state ++based on the unexpected results. + + In some cases, it may be necessary to power off nodes in order to + protect shared data or complete resource recovery. For this, Pacemaker + comes with STONITHd. + +-STONITH is an acronym for Shoot-The-Other-Node-In-The-Head and is +-usually implemented with a remote power switch. ++[[s-intro-stonith]] ++.STONITH ++[NOTE] ++*STONITH* is an acronym for 'Shoot-The-Other-Node-In-The-Head', ++a recommended practice that misbehaving node is best to be promptly ++'fenced' (shut off, cut from shared resources or otherwise immobilized), ++and is usually implemented with a remote power switch. + + In Pacemaker, STONITH devices are modeled as resources (and configured + in the CIB) to enable them to be easily monitored for failure, however +-STONITHd takes care of understanding the STONITH topology such that +-its clients simply request a node be fenced, and it does the rest. ++STONITHd takes care of understanding the STONITH topology such that its ++clients simply request a node be fenced, and it does the rest. 
+ + == Types of Pacemaker Clusters == + + Pacemaker makes no assumptions about your environment. This allows it + to support practically any + http://en.wikipedia.org/wiki/High-availability_cluster#Node_configurations[redundancy +-configuration] including Active/Active, Active/Passive, N+1, N+M, +-N-to-1 and N-to-N. ++configuration] including 'Active/Active', 'Active/Passive', 'N+1', ++'N+M', 'N-to-1' and 'N-to-N'. + + .Active/Passive Redundancy + image::images/pcmk-active-passive.png["Active/Passive Redundancy",width="10cm",height="7.5cm",align="center"] + +-Two-node Active/Passive clusters using Pacemaker and DRBD are a +-cost-effective solution for many High Availability situations. ++Two-node Active/Passive clusters using Pacemaker and 'DRBD' are ++a cost-effective solution for many High Availability situations. + + .Shared Failover + image::images/pcmk-shared-failover.png["Shared Failover",width="10cm",height="7.5cm",align="center"] + + By supporting many nodes, Pacemaker can dramatically reduce hardware + costs by allowing several active/passive clusters to be combined and +-share a common backup node ++share a common backup node. + + .N to N Redundancy + image::images/pcmk-active-active.png["N to N Redundancy",width="10cm",height="7.5cm",align="center"] + +-When shared storage is available, every node can potentially be used +-for failover. Pacemaker can even run multiple copies of services to +-spread out the workload. ++When shared storage is available, every node can potentially be used for ++failover. Pacemaker can even run multiple copies of services to spread ++out the workload. + +diff --git a/extra/resources/Dummy b/extra/resources/Dummy +index aec2a0c..8a38ef5 100644 +--- a/extra/resources/Dummy ++++ b/extra/resources/Dummy +@@ -137,7 +137,7 @@ dummy_stop() { + if [ $? 
= $OCF_SUCCESS ]; then + rm ${OCF_RESKEY_state} + fi +- rm ${VERIFY_SERIALIZED_FILE} ++ rm -f ${VERIFY_SERIALIZED_FILE} + return $OCF_SUCCESS + } + +diff --git a/extra/resources/ping b/extra/resources/ping +index e7b9973..ca9db75 100755 +--- a/extra/resources/ping ++++ b/extra/resources/ping +@@ -43,8 +43,7 @@ meta_data() { + 1.0 + + +-Every time the monitor action is run, this resource agent records (in the CIB) the current number of ping nodes the host can connect to. +-It is essentially the same as pingd except that it uses the system ping tool to obtain the results. ++Every time the monitor action is run, this resource agent records (in the CIB) the current number of nodes the host can connect to using the system fping (preferred) or ping tool. + + node connectivity + +diff --git a/fencing/README.md b/fencing/README.md +new file mode 100644 +index 0000000..a50c69b +--- /dev/null ++++ b/fencing/README.md +@@ -0,0 +1,145 @@ ++# Directory contents ++ ++* `admin.c`, `stonith_admin.8`: `stonith_admin` command-line tool and its man ++ page ++* `commands.c`, `internal.h`, `main.c`, `remote.c`, `stonithd.7`: stonithd and ++ its man page ++* `fence_dummy`, `fence_legacy`, `fence_legacy.8`, `fence_pcmk`, ++ `fence_pcmk.8`: Pacemaker-supplied fence agents and their man pages ++* `regression.py(.in)`: regression tests for `stonithd` ++* `standalone_config.c`, `standalone_config.h`: abandoned project ++* `test.c`: `stonith-test` command-line tool ++ ++# How fencing requests are handled ++ ++## Bird's eye view ++ ++In the broadest terms, stonith works like this: ++ ++1. The initiator (an external program such as `stonith_admin`, or the cluster ++ itself via the `crmd`) asks the local `stonithd`, "Hey, can you fence this ++ node?" ++1. The local `stonithd` asks all the `stonithd's` in the cluster (including ++ itself), "Hey, what fencing devices do you have access to that can fence ++ this node?" ++1. 
Each `stonithd` in the cluster replies with a list of available devices that ++ it knows about. ++1. Once the original `stonithd` gets all the replies, it asks the most ++ appropriate `stonithd` peer to actually carry out the fencing. It may send ++ out more than one such request if the target node must be fenced with ++ multiple devices. ++1. The chosen `stonithd(s)` call the appropriate fencing resource agent(s) to ++ do the fencing, then reply to the original `stonithd` with the result. ++1. The original `stonithd` broadcasts the result to all `stonithd's`. ++1. Each `stonithd` sends the result to each of its local clients (including, at ++ some point, the initiator). ++ ++## Detailed view ++ ++### Initiating a fencing request ++ ++A fencing request can be initiated by the cluster or externally, using the ++libfencing API. ++ ++* The cluster always initiates fencing via `crmd/te_actions.c:te_fence_node()` ++ (which calls the `fence()` API). This occurs when a graph synapse contains a ++ `CRM_OP_FENCE` XML operation. ++* The main external clients are `stonith_admin` and `stonith-test`. ++ ++Highlights of the fencing API: ++* `stonith_api_new()` creates and returns a new `stonith_t` object, whose ++ `cmds` member has methods for connect, disconnect, fence, etc. ++* the `fence()` method creates and sends a `STONITH_OP_FENCE` XML request with ++ the desired action and target node. Callers do not have to choose or even ++ have any knowledge about particular fencing devices. ++ ++### Fencing queries ++ ++The function calls for a stonith request go something like this as of this writing: ++ ++The local `stonithd` receives the client's request via an IPC or messaging ++layer callback, which calls ++* `stonith_command()`, which (for requests) calls ++ * `handle_request()`, which (for `STONITH_OP_FENCE` from a client) calls ++ * `initiate_remote_stonith_op()`, which creates a `STONITH_OP_QUERY` XML ++ request with the target, desired action, timeout, etc.,
then broadcasts ++ the operation to the cluster group (i.e. all `stonithd` instances) and ++ starts a timer. The query is broadcast because (1) location constraints ++ might prevent the local node from accessing the stonith device directly, ++ and (2) even if the local node does have direct access, another node ++ might be preferred to carry out the fencing. ++ ++Each `stonithd` receives the original `stonithd's STONITH_OP_QUERY` broadcast ++request via IPC or messaging layer callback, which calls: ++* `stonith_command()`, which (for requests) calls ++ * `handle_request()`, which (for `STONITH_OP_QUERY` from a peer) calls ++ * `stonith_query()`, which calls ++ * `get_capable_devices()` with `stonith_query_capable_device_db()` to add ++ device information to an XML reply and send it. (A message is ++ considered a reply if it contains `T_STONITH_REPLY`, which is only set ++ by `stonithd` peers, not clients.) ++ ++The original `stonithd` receives all peers' `STONITH_OP_QUERY` replies via IPC ++or messaging layer callback, which calls: ++* `stonith_command()`, which (for replies) calls ++ * `handle_reply()` which (for `STONITH_OP_QUERY`) calls ++ * `process_remote_stonith_query()`, which allocates a new query result ++ structure, parses device information into it, and adds it to operation ++ object. It increments the number of replies received for this operation, ++ and compares it against the expected number of replies (i.e. the number ++ of active peers), and if this is the last expected reply, calls ++ * `call_remote_stonith()`, which calculates the timeout and sends ++ `STONITH_OP_FENCE` request(s) to carry out the fencing. If the target ++ node has a fencing "topology" (which allows specifications such as ++ "this node can be fenced either with device A, or devices B and C in ++ combination"), it will choose the device(s), and send out as many ++ requests as needed. 
If it chooses a device, it will choose the peer; a ++ peer is preferred if it has "verified" access to the desired device, ++ meaning that it has the device "running" on it and thus has a monitor ++ operation ensuring reachability. ++ ++### Fencing operations ++ ++Each `STONITH_OP_FENCE` request goes something like this as of this writing: ++ ++The chosen peer `stonithd` receives the `STONITH_OP_FENCE` request via IPC or ++messaging layer callback, which calls: ++* `stonith_command()`, which (for requests) calls ++ * `handle_request()`, which (for `STONITH_OP_FENCE` from a peer) calls ++ * `stonith_fence()`, which calls ++ * `schedule_stonith_command()` (using supplied device if ++ `F_STONITH_DEVICE` was set, otherwise the highest-priority capable ++ device obtained via `get_capable_devices()` with ++ `stonith_fence_get_devices_cb()`), which adds the operation to the ++ device's pending operations list and triggers processing. ++ ++The chosen peer `stonithd's` mainloop is triggered and calls ++* `stonith_device_dispatch()`, which calls ++ * `stonith_device_execute()`, which pops off the next item from the device's ++ pending operations list. If acting as the (internally implemented) watchdog ++ agent, it panics the node, otherwise it calls ++ * `stonith_action_create()` and `stonith_action_execute_async()` to call the fencing agent. ++ ++The chosen peer stonithd's mainloop is triggered again once the fencing agent returns, and calls ++* `stonith_action_async_done()` which adds the results to an action object then calls its ++ * done callback (`st_child_done()`), which calls `schedule_stonith_command()` ++ for a new device if there are further required actions to execute or if the ++ original action failed, then builds and sends an XML reply to the original ++ `stonithd` (via `stonith_send_async_reply()`), then checks whether any ++ pending actions are the same as the one just executed and merges them if so. 
++ ++### Fencing replies ++ ++The original `stonithd` receives the `STONITH_OP_FENCE` reply via IPC or ++messaging layer callback, which calls: ++* `stonith_command()`, which (for replies) calls ++ * `handle_reply()`, which calls ++ * `process_remote_stonith_exec()`, which calls either ++ `call_remote_stonith()` (to retry a failed operation, or try the next ++ device in a topology if appropriate, which issues a new ++ `STONITH_OP_FENCE` request, proceeding as before) or `remote_op_done()` ++ (if the operation is definitively failed or successful). ++ * `remote_op_done()` broadcasts the result to all peers. ++ ++Finally, all peers receive the broadcast result and call ++* `remote_op_done()`, which sends the result to all local clients. +diff --git a/fencing/commands.c b/fencing/commands.c +index c9975d3..0d2d614 100644 +--- a/fencing/commands.c ++++ b/fencing/commands.c +@@ -53,15 +53,24 @@ GHashTable *topology = NULL; + GList *cmd_list = NULL; + + struct device_search_s { ++ /* target of fence action */ + char *host; ++ /* requested fence action */ + char *action; ++ /* timeout to use if a device is queried dynamically for possible targets */ + int per_device_timeout; ++ /* number of registered fencing devices at time of request */ + int replies_needed; ++ /* number of device replies received so far */ + int replies_received; ++ /* whether the target is eligible to perform requested action (or off) */ + bool allow_suicide; + ++ /* private data to pass to search callback function */ + void *user_data; ++ /* function to call when all replies have been received */ + void (*callback) (GList * devices, void *user_data); ++ /* devices capable of performing requested action (or off if remapping) */ + GListPtr capable; + }; + +@@ -173,6 +182,17 @@ get_action_timeout(stonith_device_t * device, const char *action, int default_ti + char buffer[64] = { 0, }; + const char *value = NULL; + ++ /* If "reboot" was requested but the device does not support it, ++ * we will remap to
"off", so check timeout for "off" instead ++ */ ++ if (safe_str_eq(action, "reboot") ++ && is_not_set(device->flags, st_device_supports_reboot)) { ++ crm_trace("%s doesn't support reboot, using timeout for off instead", ++ device->id); ++ action = "off"; ++ } ++ ++ /* If the device config specified an action-specific timeout, use it */ + snprintf(buffer, sizeof(buffer) - 1, "pcmk_%s_timeout", action); + value = g_hash_table_lookup(device->params, buffer); + if (value) { +@@ -1241,6 +1261,38 @@ search_devices_record_result(struct device_search_s *search, const char *device, + } + } + ++/* ++ * \internal ++ * \brief Check whether the local host is allowed to execute a fencing action ++ * ++ * \param[in] device Fence device to check ++ * \param[in] action Fence action to check ++ * \param[in] target Hostname of fence target ++ * \param[in] allow_suicide Whether self-fencing is allowed for this operation ++ * ++ * \return TRUE if local host is allowed to execute action, FALSE otherwise ++ */ ++static gboolean ++localhost_is_eligible(const stonith_device_t *device, const char *action, ++ const char *target, gboolean allow_suicide) ++{ ++ gboolean localhost_is_target = safe_str_eq(target, stonith_our_uname); ++ ++ if (device && action && device->on_target_actions ++ && strstr(device->on_target_actions, action)) { ++ if (!localhost_is_target) { ++ crm_trace("%s operation with %s can only be executed for localhost not %s", ++ action, device->id, target); ++ return FALSE; ++ } ++ ++ } else if (localhost_is_target && !allow_suicide) { ++ crm_trace("%s operation does not support self-fencing", action); ++ return FALSE; ++ } ++ return TRUE; ++} ++ + static void + can_fence_host_with_device(stonith_device_t * dev, struct device_search_s *search) + { +@@ -1258,19 +1310,20 @@ can_fence_host_with_device(stonith_device_t * dev, struct device_search_s *searc + goto search_report_results; + } + +- if (dev->on_target_actions && +- search->action && +- strstr(dev->on_target_actions, 
search->action)) { +- /* this device can only execute this action on the target node */ +- +- if(safe_str_neq(host, stonith_our_uname)) { +- crm_trace("%s operation with %s can only be executed for localhost not %s", +- search->action, dev->id, host); ++ /* Short-circuit query if this host is not allowed to perform the action */ ++ if (safe_str_eq(search->action, "reboot")) { ++ /* A "reboot" *might* get remapped to "off" then "on", so short-circuit ++ * only if all three are disallowed. If only one or two are disallowed, ++ * we'll report that with the results. We never allow suicide for ++ * remapped "on" operations because the host is off at that point. ++ */ ++ if (!localhost_is_eligible(dev, "reboot", host, search->allow_suicide) ++ && !localhost_is_eligible(dev, "off", host, search->allow_suicide) ++ && !localhost_is_eligible(dev, "on", host, FALSE)) { + goto search_report_results; + } +- +- } else if(safe_str_eq(host, stonith_our_uname) && search->allow_suicide == FALSE) { +- crm_trace("%s operation does not support self-fencing", search->action); ++ } else if (!localhost_is_eligible(dev, search->action, host, ++ search->allow_suicide)) { + goto search_report_results; + } + +@@ -1423,6 +1476,85 @@ struct st_query_data { + int call_options; + }; + ++/* ++ * \internal ++ * \brief Add action-specific attributes to query reply XML ++ * ++ * \param[in,out] xml XML to add attributes to ++ * \param[in] action Fence action ++ * \param[in] device Fence device ++ */ ++static void ++add_action_specific_attributes(xmlNode *xml, const char *action, ++ stonith_device_t *device) ++{ ++ int action_specific_timeout; ++ int delay_max; ++ ++ CRM_CHECK(xml && action && device, return); ++ ++ if (is_action_required(action, device)) { ++ crm_trace("Action %s is required on %s", action, device->id); ++ crm_xml_add_int(xml, F_STONITH_DEVICE_REQUIRED, 1); ++ } ++ ++ action_specific_timeout = get_action_timeout(device, action, 0); ++ if (action_specific_timeout) { ++ 
crm_trace("Action %s has timeout %dms on %s", ++ action, action_specific_timeout, device->id); ++ crm_xml_add_int(xml, F_STONITH_ACTION_TIMEOUT, action_specific_timeout); ++ } ++ ++ delay_max = get_action_delay_max(device, action); ++ if (delay_max > 0) { ++ crm_trace("Action %s has maximum random delay %dms on %s", ++ action, delay_max, device->id); ++ crm_xml_add_int(xml, F_STONITH_DELAY_MAX, delay_max / 1000); ++ } ++} ++ ++/* ++ * \internal ++ * \brief Add "disallowed" attribute to query reply XML if appropriate ++ * ++ * \param[in,out] xml XML to add attribute to ++ * \param[in] action Fence action ++ * \param[in] device Fence device ++ * \param[in] target Fence target ++ * \param[in] allow_suicide Whether self-fencing is allowed ++ */ ++static void ++add_disallowed(xmlNode *xml, const char *action, stonith_device_t *device, ++ const char *target, gboolean allow_suicide) ++{ ++ if (!localhost_is_eligible(device, action, target, allow_suicide)) { ++ crm_trace("Action %s on %s is disallowed for local host", ++ action, device->id); ++ crm_xml_add(xml, F_STONITH_ACTION_DISALLOWED, XML_BOOLEAN_TRUE); ++ } ++} ++ ++/* ++ * \internal ++ * \brief Add child element with action-specific values to query reply XML ++ * ++ * \param[in,out] xml XML to add attribute to ++ * \param[in] action Fence action ++ * \param[in] device Fence device ++ * \param[in] target Fence target ++ * \param[in] allow_suicide Whether self-fencing is allowed ++ */ ++static void ++add_action_reply(xmlNode *xml, const char *action, stonith_device_t *device, ++ const char *target, gboolean allow_suicide) ++{ ++ xmlNode *child = create_xml_node(xml, F_STONITH_ACTION); ++ ++ crm_xml_add(child, XML_ATTR_ID, action); ++ add_action_specific_attributes(child, action, device); ++ add_disallowed(child, action, device, target, allow_suicide); ++} ++ + static void + stonith_query_capable_device_cb(GList * devices, void *user_data) + { +@@ -1432,13 +1564,12 @@ stonith_query_capable_device_cb(GList * devices, 
void *user_data) + xmlNode *list = NULL; + GListPtr lpc = NULL; + +- /* Pack the results into data */ ++ /* Pack the results into XML */ + list = create_xml_node(NULL, __FUNCTION__); + crm_xml_add(list, F_STONITH_TARGET, query->target); + for (lpc = devices; lpc != NULL; lpc = lpc->next) { + stonith_device_t *device = g_hash_table_lookup(device_list, lpc->data); +- int action_specific_timeout; +- int delay_max; ++ const char *action = query->action; + + if (!device) { + /* It is possible the device got unregistered while +@@ -1448,24 +1579,44 @@ stonith_query_capable_device_cb(GList * devices, void *user_data) + + available_devices++; + +- action_specific_timeout = get_action_timeout(device, query->action, 0); + dev = create_xml_node(list, F_STONITH_DEVICE); + crm_xml_add(dev, XML_ATTR_ID, device->id); + crm_xml_add(dev, "namespace", device->namespace); + crm_xml_add(dev, "agent", device->agent); + crm_xml_add_int(dev, F_STONITH_DEVICE_VERIFIED, device->verified); +- if (is_action_required(query->action, device)) { +- crm_xml_add_int(dev, F_STONITH_DEVICE_REQUIRED, 1); +- } +- if (action_specific_timeout) { +- crm_xml_add_int(dev, F_STONITH_ACTION_TIMEOUT, action_specific_timeout); ++ ++ /* If the originating stonithd wants to reboot the node, and we have a ++ * capable device that doesn't support "reboot", remap to "off" instead. 
++ */ ++ if (is_not_set(device->flags, st_device_supports_reboot) ++ && safe_str_eq(query->action, "reboot")) { ++ crm_trace("%s doesn't support reboot, using values for off instead", ++ device->id); ++ action = "off"; + } + +- delay_max = get_action_delay_max(device, query->action); +- if (delay_max > 0) { +- crm_xml_add_int(dev, F_STONITH_DELAY_MAX, delay_max / 1000); ++ /* Add action-specific values if available */ ++ add_action_specific_attributes(dev, action, device); ++ if (safe_str_eq(query->action, "reboot")) { ++ /* A "reboot" *might* get remapped to "off" then "on", so after ++ * sending the "reboot"-specific values in the main element, we add ++ * sub-elements for "off" and "on" values. ++ * ++ * We short-circuited earlier if "reboot", "off" and "on" are all ++ * disallowed for the local host. However if only one or two are ++ * disallowed, we send back the results and mark which ones are ++ * disallowed. If "reboot" is disallowed, this might cause problems ++ * with older stonithd versions, which won't check for it. Older ++ * versions will ignore "off" and "on", so they are not a problem. 
++ */ ++ add_disallowed(dev, action, device, query->target, ++ is_set(query->call_options, st_opt_allow_suicide)); ++ add_action_reply(dev, "off", device, query->target, ++ is_set(query->call_options, st_opt_allow_suicide)); ++ add_action_reply(dev, "on", device, query->target, FALSE); + } + ++ /* A query without a target wants device parameters */ + if (query->target == NULL) { + xmlNode *attrs = create_xml_node(dev, XML_TAG_ATTRS); + +@@ -1481,7 +1632,7 @@ stonith_query_capable_device_cb(GList * devices, void *user_data) + } + + if (list != NULL) { +- crm_trace("Attaching query list output"); ++ crm_log_xml_trace(list, "Add query results"); + add_message_xml(query->reply, F_STONITH_CALLDATA, list); + } + stonith_send_reply(query->reply, query->call_options, query->remote_peer, query->client_id); +@@ -1766,6 +1917,14 @@ st_child_done(GPid pid, int rc, const char *output, gpointer user_data) + continue; + } + ++ /* Duplicate merging will do the right thing for either type of remapped ++ * reboot. If the executing stonithd remapped an unsupported reboot to ++ * off, then cmd->action will be reboot and will be merged with any ++ * other reboot requests. If the originating stonithd remapped a ++ * topology reboot to off then on, we will get here once with ++ * cmd->action "off" and once with "on", and they will be merged ++ * separately with similar requests. ++ */ + crm_notice + ("Merging stonith action %s for node %s originating from client %s with identical stonith request from client %s", + cmd_other->action, cmd_other->victim, cmd_other->client_name, cmd->client_name); +diff --git a/fencing/internal.h b/fencing/internal.h +index 46bd3bf..5fb8f9c 100644 +--- a/fencing/internal.h ++++ b/fencing/internal.h +@@ -51,6 +51,17 @@ typedef struct stonith_device_s { + gboolean api_registered; + } stonith_device_t; + ++/* These values are used to index certain arrays by "phase". Usually an ++ * operation has only one "phase", so phase is always zero. 
However, some ++ * reboots are remapped to "off" then "on", in which case "reboot" will be ++ * phase 0, "off" will be phase 1 and "on" will be phase 2. ++ */ ++enum st_remap_phase { ++ st_phase_requested = 0, ++ st_phase_off = 1, ++ st_phase_on = 2 ++}; ++ + typedef struct remote_fencing_op_s { + /* The unique id associated with this operation */ + char *id; +@@ -97,7 +108,7 @@ typedef struct remote_fencing_op_s { + long long call_options; + + /*! The current state of the remote operation. This indicates +- * what phase the op is in, query, exec, done, duplicate, failed. */ ++ * what stage the op is in, query, exec, done, duplicate, failed. */ + enum op_state state; + /*! The node that owns the remote operation */ + char *originator; +@@ -114,10 +125,17 @@ typedef struct remote_fencing_op_s { + + /*! The current topology level being executed */ + guint level; +- +- /*! List of required devices the topology must execute regardless of what +- * topology level they exist at. */ +- GListPtr required_list; ++ /*! The current operation phase being executed */ ++ enum st_remap_phase phase; ++ ++ /* For phase 0 or 1 (requested action or a remapped "off"), required devices ++ * will be executed regardless of what topology level is being executed ++ * currently. For phase 2 (remapped "on"), required devices will not be ++ * attempted, because the cluster will execute them automatically when the ++ * node next joins the cluster. ++ */ ++ /*! Lists of devices marked as required for each phase */ ++ GListPtr required_list[3]; + /*! The device list of all the devices at the current executing topology level. */ + GListPtr devices_list; + /*! Current entry in the topology device list */ +@@ -129,6 +147,20 @@ typedef struct remote_fencing_op_s { + + } remote_fencing_op_t; + ++/* ++ * Complex fencing requirements are specified via fencing topologies. ++ * A topology consists of levels; each level is a list of fencing devices. ++ * Topologies are stored in a hash table by node name.
When a node needs to be ++ * fenced, if it has an entry in the topology table, the levels are tried ++ * sequentially, and the devices in each level are tried sequentially. ++ * Fencing is considered successful as soon as any level succeeds; ++ * a level is considered successful if all its devices succeed. ++ * Essentially, all devices at a given level are "and-ed" and the ++ * levels are "or-ed". ++ * ++ * This structure is used for the topology table entries. ++ * Topology levels start from 1, so levels[0] is unused and always NULL. ++ */ + typedef struct stonith_topology_s { + char *node; + GListPtr levels[ST_LEVEL_MAX]; +diff --git a/fencing/main.c b/fencing/main.c +index a499175..46d7352 100644 +--- a/fencing/main.c ++++ b/fencing/main.c +@@ -1234,7 +1234,7 @@ struct qb_ipcs_service_handlers ipc_callbacks = { + static void + st_peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data) + { +- if (type == crm_status_uname) { ++ if (type != crm_status_processes) { + /* + * This is a hack until we can send to a nodeid and/or we fix node name lookups + * These messages are ignored in stonith_peer_callback() +diff --git a/fencing/regression.py.in b/fencing/regression.py.in +index fe6d418..b4e6f08 100644 +--- a/fencing/regression.py.in ++++ b/fencing/regression.py.in +@@ -23,861 +23,937 @@ import shlex + import time + + def output_from_command(command): +- test = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.PIPE) +- test.wait() ++ test = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.PIPE) ++ test.wait() + +- return test.communicate()[0].split("\n") ++ return test.communicate()[0].split("\n") + + class Test: +- def __init__(self, name, description, verbose = 0, with_cpg = 0): +- self.name = name +- self.description = description +- self.cmds = [] +- self.verbose = verbose ++ def __init__(self, name, description, verbose = 0, with_cpg = 0): ++ self.name = name ++ 
self.description = description ++ self.cmds = [] ++ self.verbose = verbose + +- self.result_txt = "" +- self.cmd_tool_output = "" +- self.result_exitcode = 0; ++ self.result_txt = "" ++ self.cmd_tool_output = "" ++ self.result_exitcode = 0; + +- self.stonith_options = "-s" +- self.enable_corosync = 0 ++ self.stonith_options = "-s" ++ self.enable_corosync = 0 + +- if with_cpg: +- self.stonith_options = "-c" +- self.enable_corosync = 1 ++ if with_cpg: ++ self.stonith_options = "-c" ++ self.enable_corosync = 1 + +- self.stonith_process = None +- self.stonith_output = "" +- self.stonith_patterns = [] +- self.negative_stonith_patterns = [] ++ self.stonith_process = None ++ self.stonith_output = "" ++ self.stonith_patterns = [] ++ self.negative_stonith_patterns = [] + +- self.executed = 0 ++ self.executed = 0 + +- rsc_classes = output_from_command("crm_resource --list-standards") ++ rsc_classes = output_from_command("crm_resource --list-standards") + +- def __new_cmd(self, cmd, args, exitcode, stdout_match = "", no_wait = 0, stdout_negative_match = "", kill=None): +- self.cmds.append( +- { +- "cmd" : cmd, +- "kill" : kill, +- "args" : args, +- "expected_exitcode" : exitcode, +- "stdout_match" : stdout_match, +- "stdout_negative_match" : stdout_negative_match, +- "no_wait" : no_wait, +- } +- ) ++ def __new_cmd(self, cmd, args, exitcode, stdout_match = "", no_wait = 0, stdout_negative_match = "", kill=None): ++ self.cmds.append( ++ { ++ "cmd" : cmd, ++ "kill" : kill, ++ "args" : args, ++ "expected_exitcode" : exitcode, ++ "stdout_match" : stdout_match, ++ "stdout_negative_match" : stdout_negative_match, ++ "no_wait" : no_wait, ++ } ++ ) + +- def stop_pacemaker(self): +- cmd = shlex.split("killall -9 -q pacemakerd") +- test = subprocess.Popen(cmd, stdout=subprocess.PIPE) +- test.wait() ++ def stop_pacemaker(self): ++ cmd = shlex.split("killall -9 -q pacemakerd") ++ test = subprocess.Popen(cmd, stdout=subprocess.PIPE) ++ test.wait() + +- def start_environment(self): +- ### 
make sure we are in full control here ### +- self.stop_pacemaker() ++ def start_environment(self): ++ ### make sure we are in full control here ### ++ self.stop_pacemaker() + +- cmd = shlex.split("killall -9 -q stonithd") +- test = subprocess.Popen(cmd, stdout=subprocess.PIPE) +- test.wait() ++ cmd = shlex.split("killall -9 -q stonithd") ++ test = subprocess.Popen(cmd, stdout=subprocess.PIPE) ++ test.wait() + +- if self.verbose: +- self.stonith_options = self.stonith_options + " -V" +- print "Starting stonithd with %s" % self.stonith_options ++ if self.verbose: ++ self.stonith_options = self.stonith_options + " -V" ++ print "Starting stonithd with %s" % self.stonith_options + +- if os.path.exists("/tmp/stonith-regression.log"): +- os.remove('/tmp/stonith-regression.log') ++ if os.path.exists("/tmp/stonith-regression.log"): ++ os.remove('/tmp/stonith-regression.log') + +- self.stonith_process = subprocess.Popen( +- shlex.split("@CRM_DAEMON_DIR@/stonithd %s -l /tmp/stonith-regression.log" % self.stonith_options)) ++ self.stonith_process = subprocess.Popen( ++ shlex.split("@CRM_DAEMON_DIR@/stonithd %s -l /tmp/stonith-regression.log" % self.stonith_options)) + +- time.sleep(1) +- +- def clean_environment(self): +- if self.stonith_process: +- self.stonith_process.terminate() +- self.stonith_process.wait() +- +- self.stonith_output = "" +- self.stonith_process = None +- +- f = open('/tmp/stonith-regression.log', 'r') +- for line in f.readlines(): +- self.stonith_output = self.stonith_output + line +- +- if self.verbose: +- print "Daemon Output Start" +- print self.stonith_output +- print "Daemon Output End" +- os.remove('/tmp/stonith-regression.log') +- +- def add_stonith_log_pattern(self, pattern): +- self.stonith_patterns.append(pattern) +- +- def add_stonith_negative_log_pattern(self, pattern): +- self.negative_stonith_patterns.append(pattern) +- +- def add_cmd(self, cmd, args): +- self.__new_cmd(cmd, args, 0, "") +- +- def add_cmd_no_wait(self, cmd, args): +- 
self.__new_cmd(cmd, args, 0, "", 1) +- +- def add_cmd_check_stdout(self, cmd, args, match, no_match = ""): +- self.__new_cmd(cmd, args, 0, match, 0, no_match) +- +- def add_expected_fail_cmd(self, cmd, args, exitcode = 255): +- self.__new_cmd(cmd, args, exitcode, "") +- +- def get_exitcode(self): +- return self.result_exitcode +- +- def print_result(self, filler): +- print "%s%s" % (filler, self.result_txt) +- +- def run_cmd(self, args): +- cmd = shlex.split(args['args']) +- cmd.insert(0, args['cmd']) +- +- if self.verbose: +- print "\n\nRunning: "+" ".join(cmd) +- test = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) +- +- if args['kill']: +- if self.verbose: +- print "Also running: "+args['kill'] +- subprocess.Popen(shlex.split(args['kill'])) +- +- if args['no_wait'] == 0: +- test.wait() +- else: +- return 0 +- +- output_res = test.communicate() +- output = output_res[0] + output_res[1] +- +- if self.verbose: +- print output +- +- if args['stdout_match'] != "" and output.count(args['stdout_match']) == 0: +- test.returncode = -2 +- print "STDOUT string '%s' was not found in cmd output: %s" % (args['stdout_match'], output) +- +- if args['stdout_negative_match'] != "" and output.count(args['stdout_negative_match']) != 0: +- test.returncode = -2 +- print "STDOUT string '%s' was found in cmd output: %s" % (args['stdout_negative_match'], output) +- +- return test.returncode; +- +- +- def count_negative_matches(self, outline): +- count = 0 +- for line in self.negative_stonith_patterns: +- if outline.count(line): +- count = 1 +- if self.verbose: +- print "This pattern should not have matched = '%s" % (line) +- return count +- +- def match_stonith_patterns(self): +- negative_matches = 0 +- cur = 0 +- pats = self.stonith_patterns +- total_patterns = len(self.stonith_patterns) +- +- if len(self.stonith_patterns) == 0: +- return +- +- for line in self.stonith_output.split("\n"): +- negative_matches = negative_matches + self.count_negative_matches(line) 
+- if len(pats) == 0: +- continue +- cur = -1 +- for p in pats: +- cur = cur + 1 +- if line.count(pats[cur]): +- del pats[cur] +- break +- +- if len(pats) > 0 or negative_matches: +- if self.verbose: +- for p in pats: +- print "Pattern Not Matched = '%s'" % p +- +- self.result_txt = "FAILURE - '%s' failed. %d patterns out of %d not matched. %d negative matches." % (self.name, len(pats), total_patterns, negative_matches) +- self.result_exitcode = -1 +- +- def run(self): +- res = 0 +- i = 1 +- self.start_environment() +- +- if self.verbose: +- print "\n--- START TEST - %s" % self.name +- +- self.result_txt = "SUCCESS - '%s'" % (self.name) +- self.result_exitcode = 0 +- for cmd in self.cmds: +- res = self.run_cmd(cmd) +- if res != cmd['expected_exitcode']: +- print "Step %d FAILED - command returned %d, expected %d" % (i, res, cmd['expected_exitcode']) +- self.result_txt = "FAILURE - '%s' failed at step %d. Command: %s %s" % (self.name, i, cmd['cmd'], cmd['args']) +- self.result_exitcode = -1 +- break +- else: +- if self.verbose: +- print "Step %d SUCCESS" % (i) +- i = i + 1 +- self.clean_environment() +- +- if self.result_exitcode == 0: +- self.match_stonith_patterns() +- +- print self.result_txt +- if self.verbose: +- print "--- END TEST - %s\n" % self.name +- +- self.executed = 1 +- return res ++ time.sleep(1) ++ ++ def clean_environment(self): ++ if self.stonith_process: ++ self.stonith_process.terminate() ++ self.stonith_process.wait() ++ ++ self.stonith_output = "" ++ self.stonith_process = None ++ ++ f = open('/tmp/stonith-regression.log', 'r') ++ for line in f.readlines(): ++ self.stonith_output = self.stonith_output + line ++ ++ if self.verbose: ++ print "Daemon Output Start" ++ print self.stonith_output ++ print "Daemon Output End" ++ os.remove('/tmp/stonith-regression.log') ++ ++ def add_stonith_log_pattern(self, pattern): ++ self.stonith_patterns.append(pattern) ++ ++ def add_stonith_negative_log_pattern(self, pattern): ++ 
self.negative_stonith_patterns.append(pattern) ++ ++ def add_cmd(self, cmd, args): ++ self.__new_cmd(cmd, args, 0, "") ++ ++ def add_cmd_no_wait(self, cmd, args): ++ self.__new_cmd(cmd, args, 0, "", 1) ++ ++ def add_cmd_check_stdout(self, cmd, args, match, no_match = ""): ++ self.__new_cmd(cmd, args, 0, match, 0, no_match) ++ ++ def add_expected_fail_cmd(self, cmd, args, exitcode = 255): ++ self.__new_cmd(cmd, args, exitcode, "") ++ ++ def get_exitcode(self): ++ return self.result_exitcode ++ ++ def print_result(self, filler): ++ print "%s%s" % (filler, self.result_txt) ++ ++ def run_cmd(self, args): ++ cmd = shlex.split(args['args']) ++ cmd.insert(0, args['cmd']) ++ ++ if self.verbose: ++ print "\n\nRunning: "+" ".join(cmd) ++ test = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) ++ ++ if args['kill']: ++ if self.verbose: ++ print "Also running: "+args['kill'] ++ subprocess.Popen(shlex.split(args['kill'])) ++ ++ if args['no_wait'] == 0: ++ test.wait() ++ else: ++ return 0 ++ ++ output_res = test.communicate() ++ output = output_res[0] + output_res[1] ++ ++ if self.verbose: ++ print output ++ ++ if args['stdout_match'] != "" and output.count(args['stdout_match']) == 0: ++ test.returncode = -2 ++ print "STDOUT string '%s' was not found in cmd output: %s" % (args['stdout_match'], output) ++ ++ if args['stdout_negative_match'] != "" and output.count(args['stdout_negative_match']) != 0: ++ test.returncode = -2 ++ print "STDOUT string '%s' was found in cmd output: %s" % (args['stdout_negative_match'], output) ++ ++ return test.returncode; ++ ++ ++ def count_negative_matches(self, outline): ++ count = 0 ++ for line in self.negative_stonith_patterns: ++ if outline.count(line): ++ count = 1 ++ if self.verbose: ++ print "This pattern should not have matched = '%s" % (line) ++ return count ++ ++ def match_stonith_patterns(self): ++ negative_matches = 0 ++ cur = 0 ++ pats = self.stonith_patterns ++ total_patterns = len(self.stonith_patterns) ++ ++ if 
len(self.stonith_patterns) == 0: ++ return ++ ++ for line in self.stonith_output.split("\n"): ++ negative_matches = negative_matches + self.count_negative_matches(line) ++ if len(pats) == 0: ++ continue ++ cur = -1 ++ for p in pats: ++ cur = cur + 1 ++ if line.count(pats[cur]): ++ del pats[cur] ++ break ++ ++ if len(pats) > 0 or negative_matches: ++ if self.verbose: ++ for p in pats: ++ print "Pattern Not Matched = '%s'" % p ++ ++ self.result_txt = "FAILURE - '%s' failed. %d patterns out of %d not matched. %d negative matches." % (self.name, len(pats), total_patterns, negative_matches) ++ self.result_exitcode = -1 ++ ++ def run(self): ++ res = 0 ++ i = 1 ++ self.start_environment() ++ ++ if self.verbose: ++ print "\n--- START TEST - %s" % self.name ++ ++ self.result_txt = "SUCCESS - '%s'" % (self.name) ++ self.result_exitcode = 0 ++ for cmd in self.cmds: ++ res = self.run_cmd(cmd) ++ if res != cmd['expected_exitcode']: ++ print "Step %d FAILED - command returned %d, expected %d" % (i, res, cmd['expected_exitcode']) ++ self.result_txt = "FAILURE - '%s' failed at step %d. 
Command: %s %s" % (self.name, i, cmd['cmd'], cmd['args']) ++ self.result_exitcode = -1 ++ break ++ else: ++ if self.verbose: ++ print "Step %d SUCCESS" % (i) ++ i = i + 1 ++ self.clean_environment() ++ ++ if self.result_exitcode == 0: ++ self.match_stonith_patterns() ++ ++ print self.result_txt ++ if self.verbose: ++ print "--- END TEST - %s\n" % self.name ++ ++ self.executed = 1 ++ return res + + class Tests: +- def __init__(self, verbose = 0): +- self.tests = [] +- self.verbose = verbose +- self.autogen_corosync_cfg = 0 +- if not os.path.exists("/etc/corosync/corosync.conf"): +- self.autogen_corosync_cfg = 1 +- +- def new_test(self, name, description, with_cpg = 0): +- test = Test(name, description, self.verbose, with_cpg) +- self.tests.append(test) +- return test +- +- def print_list(self): +- print "\n==== %d TESTS FOUND ====" % (len(self.tests)) +- print "%35s - %s" % ("TEST NAME", "TEST DESCRIPTION") +- print "%35s - %s" % ("--------------------", "--------------------") +- for test in self.tests: +- print "%35s - %s" % (test.name, test.description) +- print "==== END OF LIST ====\n" +- +- +- def start_corosync(self): +- if self.verbose: +- print "Starting corosync" +- +- test = subprocess.Popen("corosync", stdout=subprocess.PIPE) +- test.wait() +- time.sleep(10) +- +- def stop_corosync(self): +- cmd = shlex.split("killall -9 -q corosync") +- test = subprocess.Popen(cmd, stdout=subprocess.PIPE) +- test.wait() +- +- def run_single(self, name): +- for test in self.tests: +- if test.name == name: +- test.run() +- break; +- +- def run_tests_matching(self, pattern): +- for test in self.tests: +- if test.name.count(pattern) != 0: +- test.run() +- +- def run_cpg_only(self): +- for test in self.tests: +- if test.enable_corosync: +- test.run() +- +- def run_no_cpg(self): +- for test in self.tests: +- if not test.enable_corosync: +- test.run() +- +- def run_tests(self): +- for test in self.tests: +- test.run() +- +- def exit(self): +- for test in self.tests: +- if 
test.executed == 0: +- continue +- +- if test.get_exitcode() != 0: +- sys.exit(-1) +- +- sys.exit(0) +- +- def print_results(self): +- failures = 0; +- success = 0; +- print "\n\n======= FINAL RESULTS ==========" +- print "\n--- FAILURE RESULTS:" +- for test in self.tests: +- if test.executed == 0: +- continue +- +- if test.get_exitcode() != 0: +- failures = failures + 1 +- test.print_result(" ") +- else: +- success = success + 1 +- +- if failures == 0: +- print " None" +- +- print "\n--- TOTALS\n Pass:%d\n Fail:%d\n" % (success, failures) +- def build_api_sanity_tests(self): +- verbose_arg = "" +- if self.verbose: +- verbose_arg = "-V" +- +- test = self.new_test("standalone_low_level_api_test", "Sanity test client api in standalone mode.") +- test.add_cmd("@CRM_DAEMON_DIR@/stonith-test", "-t %s" % (verbose_arg)) +- +- test = self.new_test("cpg_low_level_api_test", "Sanity test client api using mainloop and cpg.", 1) +- test.add_cmd("@CRM_DAEMON_DIR@/stonith-test", "-m %s" % (verbose_arg)) +- +- def build_custom_timeout_tests(self): +- # custom timeout without topology +- test = self.new_test("cpg_custom_timeout_1", +- "Verify per device timeouts work as expected without using topology.", 1) +- test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\" -o \"pcmk_off_timeout=1\"") +- test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\" -o \"pcmk_off_timeout=4\"") +- test.add_cmd("stonith_admin", "-F node3 -t 2") +- # timeout is 2+1+4 = 7 +- test.add_stonith_log_pattern("remote op timeout set to 7") +- +- # custom timeout _WITH_ topology +- test = self.new_test("cpg_custom_timeout_2", +- "Verify per device timeouts work as expected _WITH_ topology.", 1) +- test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 
node3\"") +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\" -o \"pcmk_off_timeout=1\"") +- test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\" -o \"pcmk_off_timeout=4000\"") +- test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1") +- test.add_cmd("stonith_admin", "-r node3 -i 2 -v true1") +- test.add_cmd("stonith_admin", "-r node3 -i 3 -v false2") +- test.add_cmd("stonith_admin", "-F node3 -t 2") +- # timeout is 2+1+4000 = 4003 +- test.add_stonith_log_pattern("remote op timeout set to 4003") +- +- def build_fence_merge_tests(self): +- +- ### Simple test that overlapping fencing operations get merged +- test = self.new_test("cpg_custom_merge_single", +- "Verify overlapping identical fencing operations are merged, no fencing levels used.", 1) +- test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"") +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\" ") +- test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"") +- test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") +- test.add_cmd("stonith_admin", "-F node3 -t 10") +- ### one merger will happen +- test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") +- ### the pattern below signifies that both the original and duplicate operation completed +- test.add_stonith_log_pattern("Operation off of node3 by") +- test.add_stonith_log_pattern("Operation off of node3 by") +- +- ### Test that multiple mergers occur +- test = self.new_test("cpg_custom_merge_multiple", +- "Verify multiple overlapping identical fencing operations are merged", 1) +- test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"") +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o 
\"pcmk_host_list=node3\" ") +- test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"") +- test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") +- test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") +- test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") +- test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") +- test.add_cmd("stonith_admin", "-F node3 -t 10") +- ### 4 mergers should occur +- test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") +- test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") +- test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") +- test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") +- ### the pattern below signifies that both the original and duplicate operation completed +- test.add_stonith_log_pattern("Operation off of node3 by") +- test.add_stonith_log_pattern("Operation off of node3 by") +- test.add_stonith_log_pattern("Operation off of node3 by") +- test.add_stonith_log_pattern("Operation off of node3 by") +- test.add_stonith_log_pattern("Operation off of node3 by") +- +- ### Test that multiple mergers occur with topologies used +- test = self.new_test("cpg_custom_merge_with_topology", +- "Verify multiple overlapping identical fencing operations are merged with fencing levels.", 1) +- test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"") +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\" ") +- test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"") +- test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1") +- test.add_cmd("stonith_admin", "-r node3 -i 1 -v false2") +- test.add_cmd("stonith_admin", "-r node3 -i 2 -v true1") +- 
test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") +- test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") +- test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") +- test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") +- test.add_cmd("stonith_admin", "-F node3 -t 10") +- ### 4 mergers should occur +- test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") +- test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") +- test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") +- test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") +- ### the pattern below signifies that both the original and duplicate operation completed +- test.add_stonith_log_pattern("Operation off of node3 by") +- test.add_stonith_log_pattern("Operation off of node3 by") +- test.add_stonith_log_pattern("Operation off of node3 by") +- test.add_stonith_log_pattern("Operation off of node3 by") +- test.add_stonith_log_pattern("Operation off of node3 by") +- +- +- test = self.new_test("cpg_custom_no_merge", +- "Verify differing fencing operations are not merged", 1) +- test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3 node2\"") +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3 node2\" ") +- test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3 node2\"") +- test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1") +- test.add_cmd("stonith_admin", "-r node3 -i 1 -v false2") +- test.add_cmd("stonith_admin", "-r node3 -i 2 -v true1") +- test.add_cmd_no_wait("stonith_admin", "-F node2 -t 10") +- test.add_cmd("stonith_admin", "-F node3 -t 10") +- test.add_stonith_negative_log_pattern("Merging stonith action off for node node3 originating from client") +- +- def 
build_standalone_tests(self): +- test_types = [ +- { +- "prefix" : "standalone" , +- "use_cpg" : 0, +- }, +- { +- "prefix" : "cpg" , +- "use_cpg" : 1, +- }, +- ] +- +- # test what happens when all devices timeout +- for test_type in test_types: +- test = self.new_test("%s_fence_multi_device_failure" % test_type["prefix"], +- "Verify that all devices timeout, a fencing failure is returned.", test_type["use_cpg"]) +- test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") +- test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") +- test.add_cmd("stonith_admin", "-R false3 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") +- if test_type["use_cpg"] == 1: +- test.add_expected_fail_cmd("stonith_admin", "-F node3 -t 2", 194) +- test.add_stonith_log_pattern("remote op timeout set to 6") +- else: +- test.add_expected_fail_cmd("stonith_admin", "-F node3 -t 2", 55) +- +- test.add_stonith_log_pattern("for host 'node3' with device 'false1' returned: ") +- test.add_stonith_log_pattern("for host 'node3' with device 'false2' returned: ") +- test.add_stonith_log_pattern("for host 'node3' with device 'false3' returned: ") +- +- # test what happens when multiple devices can fence a node, but the first device fails. 
+- for test_type in test_types: +- test = self.new_test("%s_fence_device_failure_rollover" % test_type["prefix"], +- "Verify that when one fence device fails for a node, the others are tried.", test_type["use_cpg"]) +- test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") +- test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") +- test.add_cmd("stonith_admin", "-F node3 -t 2") +- +- if test_type["use_cpg"] == 1: +- test.add_stonith_log_pattern("remote op timeout set to 6") +- +- # simple topology test for one device +- for test_type in test_types: +- if test_type["use_cpg"] == 0: +- continue +- +- test = self.new_test("%s_topology_simple" % test_type["prefix"], +- "Verify all fencing devices at a level are used.", test_type["use_cpg"]) +- test.add_cmd("stonith_admin", "-R true -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") +- +- test.add_cmd("stonith_admin", "-r node3 -i 1 -v true") +- test.add_cmd("stonith_admin", "-F node3 -t 2") +- +- test.add_stonith_log_pattern("remote op timeout set to 2") +- test.add_stonith_log_pattern("for host 'node3' with device 'true' returned: 0") +- +- +- # add topology, delete topology, verify fencing still works +- for test_type in test_types: +- if test_type["use_cpg"] == 0: +- continue +- +- test = self.new_test("%s_topology_add_remove" % test_type["prefix"], +- "Verify fencing occurrs after all topology levels are removed", test_type["use_cpg"]) +- test.add_cmd("stonith_admin", "-R true -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") +- +- test.add_cmd("stonith_admin", "-r node3 -i 1 -v true") +- test.add_cmd("stonith_admin", "-d node3 -i 1") +- test.add_cmd("stonith_admin", "-F node3 -t 2") +- +- test.add_stonith_log_pattern("remote op 
timeout set to 2") +- test.add_stonith_log_pattern("for host 'node3' with device 'true' returned: 0") +- +- # test what happens when the first fencing level has multiple devices. +- for test_type in test_types: +- if test_type["use_cpg"] == 0: +- continue +- +- test = self.new_test("%s_topology_device_fails" % test_type["prefix"], +- "Verify if one device in a level fails, the other is tried.", test_type["use_cpg"]) +- test.add_cmd("stonith_admin", "-R false -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") +- test.add_cmd("stonith_admin", "-R true -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") +- +- test.add_cmd("stonith_admin", "-r node3 -i 1 -v false") +- test.add_cmd("stonith_admin", "-r node3 -i 2 -v true") +- test.add_cmd("stonith_admin", "-F node3 -t 20") +- +- test.add_stonith_log_pattern("remote op timeout set to 40") +- test.add_stonith_log_pattern("for host 'node3' with device 'false' returned: -201") +- test.add_stonith_log_pattern("for host 'node3' with device 'true' returned: 0") +- +- # test what happens when the first fencing level fails. 
+- for test_type in test_types: +- if test_type["use_cpg"] == 0: +- continue +- +- test = self.new_test("%s_topology_multi_level_fails" % test_type["prefix"], +- "Verify if one level fails, the next leve is tried.", test_type["use_cpg"]) +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") +- test.add_cmd("stonith_admin", "-R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") +- test.add_cmd("stonith_admin", "-R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") +- test.add_cmd("stonith_admin", "-R true4 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") +- test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") +- test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") +- +- test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1") +- test.add_cmd("stonith_admin", "-r node3 -i 1 -v true1") +- test.add_cmd("stonith_admin", "-r node3 -i 2 -v true2") +- test.add_cmd("stonith_admin", "-r node3 -i 2 -v false2") +- test.add_cmd("stonith_admin", "-r node3 -i 3 -v true3") +- test.add_cmd("stonith_admin", "-r node3 -i 3 -v true4") +- +- test.add_cmd("stonith_admin", "-F node3 -t 2") +- +- test.add_stonith_log_pattern("remote op timeout set to 12") +- test.add_stonith_log_pattern("for host 'node3' with device 'false1' returned: -201") +- test.add_stonith_log_pattern("for host 'node3' with device 'false2' returned: -201") +- test.add_stonith_log_pattern("for host 'node3' with device 'true3' returned: 0") +- test.add_stonith_log_pattern("for host 'node3' with device 'true4' returned: 0") +- +- +- # test what happens when the first fencing level had devices that no one has registered +- for test_type in test_types: +- if test_type["use_cpg"] == 0: +- continue +- +- test = self.new_test("%s_topology_missing_devices" % 
test_type["prefix"], +- "Verify topology can continue with missing devices.", test_type["use_cpg"]) +- test.add_cmd("stonith_admin", "-R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") +- test.add_cmd("stonith_admin", "-R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") +- test.add_cmd("stonith_admin", "-R true4 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") +- test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") +- +- test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1") +- test.add_cmd("stonith_admin", "-r node3 -i 1 -v true1") +- test.add_cmd("stonith_admin", "-r node3 -i 2 -v true2") +- test.add_cmd("stonith_admin", "-r node3 -i 2 -v false2") +- test.add_cmd("stonith_admin", "-r node3 -i 3 -v true3") +- test.add_cmd("stonith_admin", "-r node3 -i 3 -v true4") +- +- test.add_cmd("stonith_admin", "-F node3 -t 2") +- +- # Test what happens if multiple fencing levels are defined, and then the first one is removed. 
+- for test_type in test_types: +- if test_type["use_cpg"] == 0: +- continue +- +- test = self.new_test("%s_topology_level_removal" % test_type["prefix"], +- "Verify level removal works.", test_type["use_cpg"]) +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") +- test.add_cmd("stonith_admin", "-R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") +- test.add_cmd("stonith_admin", "-R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") +- test.add_cmd("stonith_admin", "-R true4 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") +- test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") +- test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") +- +- test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1") +- test.add_cmd("stonith_admin", "-r node3 -i 1 -v true1") +- +- test.add_cmd("stonith_admin", "-r node3 -i 2 -v true2") +- test.add_cmd("stonith_admin", "-r node3 -i 2 -v false2") +- +- test.add_cmd("stonith_admin", "-r node3 -i 3 -v true3") +- test.add_cmd("stonith_admin", "-r node3 -i 3 -v true4") +- +- # Now remove level 2, verify none of the devices in level two are hit. +- test.add_cmd("stonith_admin", "-d node3 -i 2") +- +- test.add_cmd("stonith_admin", "-F node3 -t 20") +- +- test.add_stonith_log_pattern("remote op timeout set to 8") +- test.add_stonith_log_pattern("for host 'node3' with device 'false1' returned: -201") +- test.add_stonith_negative_log_pattern("for host 'node3' with device 'false2' returned: ") +- test.add_stonith_log_pattern("for host 'node3' with device 'true3' returned: 0") +- test.add_stonith_log_pattern("for host 'node3' with device 'true4' returned: 0") +- +- # test the stonith builds the correct list of devices that can fence a node. 
+- for test_type in test_types: +- test = self.new_test("%s_list_devices" % test_type["prefix"], +- "Verify list of devices that can fence a node is correct", test_type["use_cpg"]) +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"") +- test.add_cmd("stonith_admin", "-R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") +- test.add_cmd("stonith_admin", "-R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") +- +- test.add_cmd_check_stdout("stonith_admin", "-l node1 -V", "true2", "true1") +- test.add_cmd_check_stdout("stonith_admin", "-l node1 -V", "true3", "true1") +- +- # simple test of device monitor +- for test_type in test_types: +- test = self.new_test("%s_monitor" % test_type["prefix"], +- "Verify device is reachable", test_type["use_cpg"]) +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"") +- test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"") +- +- test.add_cmd("stonith_admin", "-Q true1") +- test.add_cmd("stonith_admin", "-Q false1") +- test.add_expected_fail_cmd("stonith_admin", "-Q true2", 237) +- +- # Verify monitor occurs for duration of timeout period on failure +- for test_type in test_types: +- test = self.new_test("%s_monitor_timeout" % test_type["prefix"], +- "Verify monitor uses duration of timeout period given.", test_type["use_cpg"]) +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_monitor_fail -o \"pcmk_host_list=node3\"") +- test.add_expected_fail_cmd("stonith_admin", "-Q true1 -t 5", 195) +- test.add_stonith_log_pattern("Attempt 2 to execute") +- +- # Verify monitor occurs for duration of timeout period on failure, but stops at max retries +- for test_type in test_types: +- test = self.new_test("%s_monitor_timeout_max_retries" % test_type["prefix"], +- "Verify monitor retries until max retry value or timeout is hit.", 
test_type["use_cpg"]) +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_monitor_fail -o \"pcmk_host_list=node3\"") +- test.add_expected_fail_cmd("stonith_admin", "-Q true1 -t 15",195) +- test.add_stonith_log_pattern("Attempted to execute agent fence_dummy_monitor_fail (list) the maximum number of times") +- +- # simple register test +- for test_type in test_types: +- test = self.new_test("%s_register" % test_type["prefix"], +- "Verify devices can be registered and un-registered", test_type["use_cpg"]) +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"") +- +- test.add_cmd("stonith_admin", "-Q true1") +- +- test.add_cmd("stonith_admin", "-D true1") +- +- test.add_expected_fail_cmd("stonith_admin", "-Q true1", 237) +- +- +- # simple reboot test +- for test_type in test_types: +- test = self.new_test("%s_reboot" % test_type["prefix"], +- "Verify devices can be rebooted", test_type["use_cpg"]) +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"") +- +- test.add_cmd("stonith_admin", "-B node3 -t 2") +- +- test.add_cmd("stonith_admin", "-D true1") +- +- test.add_expected_fail_cmd("stonith_admin", "-Q true1", 237) +- +- # test fencing history. 
+- for test_type in test_types: +- if test_type["use_cpg"] == 0: +- continue +- test = self.new_test("%s_fence_history" % test_type["prefix"], +- "Verify last fencing operation is returned.", test_type["use_cpg"]) +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"") +- +- test.add_cmd("stonith_admin", "-F node3 -t 2 -V") +- +- test.add_cmd_check_stdout("stonith_admin", "-H node3", "was able to turn off node node3", "") +- +- # simple test of dynamic list query +- for test_type in test_types: +- test = self.new_test("%s_dynamic_list_query" % test_type["prefix"], +- "Verify dynamic list of fencing devices can be retrieved.", test_type["use_cpg"]) +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_list") +- test.add_cmd("stonith_admin", "-R true2 -a fence_dummy_list") +- test.add_cmd("stonith_admin", "-R true3 -a fence_dummy_list") +- +- test.add_cmd_check_stdout("stonith_admin", "-l fake_port_1", "3 devices found") +- +- +- # fence using dynamic list query +- for test_type in test_types: +- test = self.new_test("%s_fence_dynamic_list_query" % test_type["prefix"], +- "Verify dynamic list of fencing devices can be retrieved.", test_type["use_cpg"]) +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_list") +- test.add_cmd("stonith_admin", "-R true2 -a fence_dummy_list") +- test.add_cmd("stonith_admin", "-R true3 -a fence_dummy_list") +- +- test.add_cmd("stonith_admin", "-F fake_port_1 -t 5 -V"); +- +- # simple test of query using status action +- for test_type in test_types: +- test = self.new_test("%s_status_query" % test_type["prefix"], +- "Verify dynamic list of fencing devices can be retrieved.", test_type["use_cpg"]) +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_check=status\"") +- test.add_cmd("stonith_admin", "-R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_check=status\"") +- test.add_cmd("stonith_admin", "-R true3 -a fence_dummy -o \"mode=pass\" 
-o \"pcmk_host_check=status\"") +- +- test.add_cmd_check_stdout("stonith_admin", "-l fake_port_1", "3 devices found") +- +- # test what happens when no reboot action is advertised +- for test_type in test_types: +- test = self.new_test("%s_no_reboot_support" % test_type["prefix"], +- "Verify reboot action defaults to off when no reboot action is advertised by agent.", test_type["use_cpg"]) +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_no_reboot -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") +- test.add_cmd("stonith_admin", "-B node1 -t 5 -V"); +- test.add_stonith_log_pattern("does not advertise support for 'reboot', performing 'off'") +- test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)"); +- +- # make sure reboot is used when reboot action is advertised +- for test_type in test_types: +- test = self.new_test("%s_with_reboot_support" % test_type["prefix"], +- "Verify reboot action can be used when metadata advertises it.", test_type["use_cpg"]) +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") +- test.add_cmd("stonith_admin", "-B node1 -t 5 -V"); +- test.add_stonith_negative_log_pattern("does not advertise support for 'reboot', performing 'off'") +- test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)"); +- +- def build_nodeid_tests(self): +- our_uname = output_from_command("uname -n") +- if our_uname: +- our_uname = our_uname[0] +- +- ### verify nodeid is supplied when nodeid is in the metadata parameters +- test = self.new_test("cpg_supply_nodeid", +- "Verify nodeid is given when fence agent has nodeid as parameter", 1) +- +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname)) +- test.add_cmd("stonith_admin", "-F %s -t 3" % (our_uname)) +- test.add_stonith_log_pattern("For stonith action (off) for victim %s, adding nodeid" % (our_uname)) +- +- ### verify nodeid is _NOT_ supplied when 
nodeid is not in the metadata parameters +- test = self.new_test("cpg_do_not_supply_nodeid", +- "Verify nodeid is _NOT_ given when fence agent does not have nodeid as parameter", 1) +- +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname)) +- test.add_cmd("stonith_admin", "-F %s -t 3" % (our_uname)) +- test.add_stonith_negative_log_pattern("For stonith action (off) for victim %s, adding nodeid" % (our_uname)) +- +- ### verify nodeid use doesn't explode standalone mode +- test = self.new_test("standalone_do_not_supply_nodeid", +- "Verify nodeid in metadata parameter list doesn't kill standalone mode", 0) +- +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname)) +- test.add_cmd("stonith_admin", "-F %s -t 3" % (our_uname)) +- test.add_stonith_negative_log_pattern("For stonith action (off) for victim %s, adding nodeid" % (our_uname)) +- +- +- def build_unfence_tests(self): +- our_uname = output_from_command("uname -n") +- if our_uname: +- our_uname = our_uname[0] +- +- ### verify unfencing using automatic unfencing +- test = self.new_test("cpg_unfence_required_1", +- "Verify require unfencing on all devices when automatic=true in agent's metadata", 1) +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname)) +- test.add_cmd("stonith_admin", "-R true2 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname)) +- test.add_cmd("stonith_admin", "-U %s -t 3" % (our_uname)) +- # both devices should be executed +- test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)"); +- test.add_stonith_log_pattern("with device 'true2' returned: 0 (OK)"); +- +- +- ### verify unfencing using automatic unfencing fails if any of the required agents fail +- test = self.new_test("cpg_unfence_required_2", +- "Verify require unfencing on all devices when 
automatic=true in agent's metadata", 1) +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname)) +- test.add_cmd("stonith_admin", "-R true2 -a fence_dummy_automatic_unfence -o \"mode=fail\" -o \"pcmk_host_list=%s\"" % (our_uname)) +- test.add_expected_fail_cmd("stonith_admin", "-U %s -t 6" % (our_uname), 143) +- +- ### verify unfencing using automatic devices with topology +- test = self.new_test("cpg_unfence_required_3", +- "Verify require unfencing on all devices even when required devices are at different topology levels", 1) +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) +- test.add_cmd("stonith_admin", "-R true2 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) +- test.add_cmd("stonith_admin", "-r %s -i 1 -v true1" % (our_uname)) +- test.add_cmd("stonith_admin", "-r %s -i 2 -v true2" % (our_uname)) +- test.add_cmd("stonith_admin", "-U %s -t 3" % (our_uname)) +- test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)"); +- test.add_stonith_log_pattern("with device 'true2' returned: 0 (OK)"); +- +- +- ### verify unfencing using automatic devices with topology +- test = self.new_test("cpg_unfence_required_4", +- "Verify all required devices are executed even with topology levels fail.", 1) +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) +- test.add_cmd("stonith_admin", "-R true2 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) +- test.add_cmd("stonith_admin", "-R true3 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) +- test.add_cmd("stonith_admin", "-R true4 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) 
+- test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) +- test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) +- test.add_cmd("stonith_admin", "-R false3 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) +- test.add_cmd("stonith_admin", "-R false4 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) +- test.add_cmd("stonith_admin", "-r %s -i 1 -v true1" % (our_uname)) +- test.add_cmd("stonith_admin", "-r %s -i 1 -v false1" % (our_uname)) +- test.add_cmd("stonith_admin", "-r %s -i 2 -v false2" % (our_uname)) +- test.add_cmd("stonith_admin", "-r %s -i 2 -v true2" % (our_uname)) +- test.add_cmd("stonith_admin", "-r %s -i 2 -v false3" % (our_uname)) +- test.add_cmd("stonith_admin", "-r %s -i 2 -v true3" % (our_uname)) +- test.add_cmd("stonith_admin", "-r %s -i 3 -v false4" % (our_uname)) +- test.add_cmd("stonith_admin", "-r %s -i 4 -v true4" % (our_uname)) +- test.add_cmd("stonith_admin", "-U %s -t 3" % (our_uname)) +- test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)"); +- test.add_stonith_log_pattern("with device 'true2' returned: 0 (OK)"); +- test.add_stonith_log_pattern("with device 'true3' returned: 0 (OK)"); +- test.add_stonith_log_pattern("with device 'true4' returned: 0 (OK)"); +- +- ### verify unfencing using on_target device +- test = self.new_test("cpg_unfence_on_target_1", +- "Verify unfencing with on_target = true", 1) +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname)) +- test.add_cmd("stonith_admin", "-U %s -t 3" % (our_uname)) +- test.add_stonith_log_pattern("(on) to be executed on the target node") +- +- +- ### verify failure of unfencing using on_target device +- test = self.new_test("cpg_unfence_on_target_2", +- "Verify failure unfencing with on_target = true", 1) +- 
test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node_fake_1234\"" % (our_uname)) +- test.add_expected_fail_cmd("stonith_admin", "-U node_fake_1234 -t 3", 237) +- test.add_stonith_log_pattern("(on) to be executed on the target node") +- +- +- ### verify unfencing using on_target device with topology +- test = self.new_test("cpg_unfence_on_target_3", +- "Verify unfencing with on_target = true using topology", 1) +- +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) +- test.add_cmd("stonith_admin", "-R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) +- +- test.add_cmd("stonith_admin", "-r %s -i 1 -v true1" % (our_uname)) +- test.add_cmd("stonith_admin", "-r %s -i 2 -v true2" % (our_uname)) +- +- test.add_cmd("stonith_admin", "-U %s -t 3" % (our_uname)) +- test.add_stonith_log_pattern("(on) to be executed on the target node") +- +- ### verify unfencing using on_target device with topology fails when victim node doesn't exist +- test = self.new_test("cpg_unfence_on_target_4", +- "Verify unfencing failure with on_target = true using topology", 1) +- +- test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node_fake\"" % (our_uname)) +- test.add_cmd("stonith_admin", "-R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node_fake\"" % (our_uname)) +- +- test.add_cmd("stonith_admin", "-r node_fake -i 1 -v true1") +- test.add_cmd("stonith_admin", "-r node_fake -i 2 -v true2") +- +- test.add_expected_fail_cmd("stonith_admin", "-U node_fake -t 3", 237) +- test.add_stonith_log_pattern("(on) to be executed on the target node") +- +- +- def setup_environment(self, use_corosync): +- if self.autogen_corosync_cfg and use_corosync: +- corosync_conf = (""" ++ def __init__(self, verbose = 0): ++ self.tests = [] ++ self.verbose = verbose ++ self.autogen_corosync_cfg = 0 ++ if not 
os.path.exists("/etc/corosync/corosync.conf"): ++ self.autogen_corosync_cfg = 1 ++ ++ def new_test(self, name, description, with_cpg = 0): ++ test = Test(name, description, self.verbose, with_cpg) ++ self.tests.append(test) ++ return test ++ ++ def print_list(self): ++ print "\n==== %d TESTS FOUND ====" % (len(self.tests)) ++ print "%35s - %s" % ("TEST NAME", "TEST DESCRIPTION") ++ print "%35s - %s" % ("--------------------", "--------------------") ++ for test in self.tests: ++ print "%35s - %s" % (test.name, test.description) ++ print "==== END OF LIST ====\n" ++ ++ ++ def start_corosync(self): ++ if self.verbose: ++ print "Starting corosync" ++ ++ test = subprocess.Popen("corosync", stdout=subprocess.PIPE) ++ test.wait() ++ time.sleep(10) ++ ++ def stop_corosync(self): ++ cmd = shlex.split("killall -9 -q corosync") ++ test = subprocess.Popen(cmd, stdout=subprocess.PIPE) ++ test.wait() ++ ++ def run_single(self, name): ++ for test in self.tests: ++ if test.name == name: ++ test.run() ++ break; ++ ++ def run_tests_matching(self, pattern): ++ for test in self.tests: ++ if test.name.count(pattern) != 0: ++ test.run() ++ ++ def run_cpg_only(self): ++ for test in self.tests: ++ if test.enable_corosync: ++ test.run() ++ ++ def run_no_cpg(self): ++ for test in self.tests: ++ if not test.enable_corosync: ++ test.run() ++ ++ def run_tests(self): ++ for test in self.tests: ++ test.run() ++ ++ def exit(self): ++ for test in self.tests: ++ if test.executed == 0: ++ continue ++ ++ if test.get_exitcode() != 0: ++ sys.exit(-1) ++ ++ sys.exit(0) ++ ++ def print_results(self): ++ failures = 0; ++ success = 0; ++ print "\n\n======= FINAL RESULTS ==========" ++ print "\n--- FAILURE RESULTS:" ++ for test in self.tests: ++ if test.executed == 0: ++ continue ++ ++ if test.get_exitcode() != 0: ++ failures = failures + 1 ++ test.print_result(" ") ++ else: ++ success = success + 1 ++ ++ if failures == 0: ++ print " None" ++ ++ print "\n--- TOTALS\n Pass:%d\n Fail:%d\n" % (success, 
failures) ++ def build_api_sanity_tests(self): ++ verbose_arg = "" ++ if self.verbose: ++ verbose_arg = "-V" ++ ++ test = self.new_test("standalone_low_level_api_test", "Sanity test client api in standalone mode.") ++ test.add_cmd("@CRM_DAEMON_DIR@/stonith-test", "-t %s" % (verbose_arg)) ++ ++ test = self.new_test("cpg_low_level_api_test", "Sanity test client api using mainloop and cpg.", 1) ++ test.add_cmd("@CRM_DAEMON_DIR@/stonith-test", "-m %s" % (verbose_arg)) ++ ++ def build_custom_timeout_tests(self): ++ # custom timeout without topology ++ test = self.new_test("cpg_custom_timeout_1", ++ "Verify per device timeouts work as expected without using topology.", 1) ++ test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\" -o \"pcmk_off_timeout=1\"") ++ test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\" -o \"pcmk_off_timeout=4\"") ++ test.add_cmd("stonith_admin", "-F node3 -t 2") ++ # timeout is 2+1+4 = 7 ++ test.add_stonith_log_pattern("remote op timeout set to 7") ++ ++ # custom timeout _WITH_ topology ++ test = self.new_test("cpg_custom_timeout_2", ++ "Verify per device timeouts work as expected _WITH_ topology.", 1) ++ test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\" -o \"pcmk_off_timeout=1\"") ++ test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\" -o \"pcmk_off_timeout=4000\"") ++ test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1") ++ test.add_cmd("stonith_admin", "-r node3 -i 2 -v true1") ++ test.add_cmd("stonith_admin", "-r node3 -i 3 -v false2") ++ test.add_cmd("stonith_admin", "-F node3 -t 2") ++ # timeout is 2+1+4000 = 4003 ++ 
test.add_stonith_log_pattern("remote op timeout set to 4003") ++ ++ def build_fence_merge_tests(self): ++ ++ ### Simple test that overlapping fencing operations get merged ++ test = self.new_test("cpg_custom_merge_single", ++ "Verify overlapping identical fencing operations are merged, no fencing levels used.", 1) ++ test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"") ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\" ") ++ test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"") ++ test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") ++ test.add_cmd("stonith_admin", "-F node3 -t 10") ++ ### one merger will happen ++ test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") ++ ### the pattern below signifies that both the original and duplicate operation completed ++ test.add_stonith_log_pattern("Operation off of node3 by") ++ test.add_stonith_log_pattern("Operation off of node3 by") ++ ++ ### Test that multiple mergers occur ++ test = self.new_test("cpg_custom_merge_multiple", ++ "Verify multiple overlapping identical fencing operations are merged", 1) ++ test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"") ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"delay=2\" -o \"pcmk_host_list=node3\" ") ++ test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"") ++ test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") ++ test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") ++ test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") ++ test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") ++ test.add_cmd("stonith_admin", "-F node3 -t 10") ++ ### 4 mergers should occur ++ test.add_stonith_log_pattern("Merging stonith action off for node node3 originating 
from client") ++ test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") ++ test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") ++ test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") ++ ### the pattern below signifies that both the original and duplicate operation completed ++ test.add_stonith_log_pattern("Operation off of node3 by") ++ test.add_stonith_log_pattern("Operation off of node3 by") ++ test.add_stonith_log_pattern("Operation off of node3 by") ++ test.add_stonith_log_pattern("Operation off of node3 by") ++ test.add_stonith_log_pattern("Operation off of node3 by") ++ ++ ### Test that multiple mergers occur with topologies used ++ test = self.new_test("cpg_custom_merge_with_topology", ++ "Verify multiple overlapping identical fencing operations are merged with fencing levels.", 1) ++ test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"") ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\" ") ++ test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"") ++ test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1") ++ test.add_cmd("stonith_admin", "-r node3 -i 1 -v false2") ++ test.add_cmd("stonith_admin", "-r node3 -i 2 -v true1") ++ test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") ++ test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") ++ test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") ++ test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") ++ test.add_cmd("stonith_admin", "-F node3 -t 10") ++ ### 4 mergers should occur ++ test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") ++ test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") ++ test.add_stonith_log_pattern("Merging stonith 
action off for node node3 originating from client") ++ test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") ++ ### the pattern below signifies that both the original and duplicate operation completed ++ test.add_stonith_log_pattern("Operation off of node3 by") ++ test.add_stonith_log_pattern("Operation off of node3 by") ++ test.add_stonith_log_pattern("Operation off of node3 by") ++ test.add_stonith_log_pattern("Operation off of node3 by") ++ test.add_stonith_log_pattern("Operation off of node3 by") ++ ++ ++ test = self.new_test("cpg_custom_no_merge", ++ "Verify differing fencing operations are not merged", 1) ++ test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3 node2\"") ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3 node2\" ") ++ test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3 node2\"") ++ test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1") ++ test.add_cmd("stonith_admin", "-r node3 -i 1 -v false2") ++ test.add_cmd("stonith_admin", "-r node3 -i 2 -v true1") ++ test.add_cmd_no_wait("stonith_admin", "-F node2 -t 10") ++ test.add_cmd("stonith_admin", "-F node3 -t 10") ++ test.add_stonith_negative_log_pattern("Merging stonith action off for node node3 originating from client") ++ ++ def build_standalone_tests(self): ++ test_types = [ ++ { ++ "prefix" : "standalone" , ++ "use_cpg" : 0, ++ }, ++ { ++ "prefix" : "cpg" , ++ "use_cpg" : 1, ++ }, ++ ] ++ ++ # test what happens when all devices timeout ++ for test_type in test_types: ++ test = self.new_test("%s_fence_multi_device_failure" % test_type["prefix"], ++ "Verify that all devices timeout, a fencing failure is returned.", test_type["use_cpg"]) ++ test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") ++ test.add_cmd("stonith_admin", "-R false2 -a 
fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") ++ test.add_cmd("stonith_admin", "-R false3 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") ++ if test_type["use_cpg"] == 1: ++ test.add_expected_fail_cmd("stonith_admin", "-F node3 -t 2", 194) ++ test.add_stonith_log_pattern("remote op timeout set to 6") ++ else: ++ test.add_expected_fail_cmd("stonith_admin", "-F node3 -t 2", 55) ++ ++ test.add_stonith_log_pattern("for host 'node3' with device 'false1' returned: ") ++ test.add_stonith_log_pattern("for host 'node3' with device 'false2' returned: ") ++ test.add_stonith_log_pattern("for host 'node3' with device 'false3' returned: ") ++ ++ # test what happens when multiple devices can fence a node, but the first device fails. ++ for test_type in test_types: ++ test = self.new_test("%s_fence_device_failure_rollover" % test_type["prefix"], ++ "Verify that when one fence device fails for a node, the others are tried.", test_type["use_cpg"]) ++ test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") ++ test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") ++ test.add_cmd("stonith_admin", "-F node3 -t 2") ++ ++ if test_type["use_cpg"] == 1: ++ test.add_stonith_log_pattern("remote op timeout set to 6") ++ ++ # simple topology test for one device ++ for test_type in test_types: ++ if test_type["use_cpg"] == 0: ++ continue ++ ++ test = self.new_test("%s_topology_simple" % test_type["prefix"], ++ "Verify all fencing devices at a level are used.", test_type["use_cpg"]) ++ test.add_cmd("stonith_admin", "-R true -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") ++ ++ test.add_cmd("stonith_admin", "-r node3 -i 1 -v true") ++ test.add_cmd("stonith_admin", "-F node3 -t 2") ++ ++ 
test.add_stonith_log_pattern("remote op timeout set to 2") ++ test.add_stonith_log_pattern("for host 'node3' with device 'true' returned: 0") ++ ++ ++ # add topology, delete topology, verify fencing still works ++ for test_type in test_types: ++ if test_type["use_cpg"] == 0: ++ continue ++ ++ test = self.new_test("%s_topology_add_remove" % test_type["prefix"], ++ "Verify fencing occurrs after all topology levels are removed", test_type["use_cpg"]) ++ test.add_cmd("stonith_admin", "-R true -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") ++ ++ test.add_cmd("stonith_admin", "-r node3 -i 1 -v true") ++ test.add_cmd("stonith_admin", "-d node3 -i 1") ++ test.add_cmd("stonith_admin", "-F node3 -t 2") ++ ++ test.add_stonith_log_pattern("remote op timeout set to 2") ++ test.add_stonith_log_pattern("for host 'node3' with device 'true' returned: 0") ++ ++ # test what happens when the first fencing level has multiple devices. ++ for test_type in test_types: ++ if test_type["use_cpg"] == 0: ++ continue ++ ++ test = self.new_test("%s_topology_device_fails" % test_type["prefix"], ++ "Verify if one device in a level fails, the other is tried.", test_type["use_cpg"]) ++ test.add_cmd("stonith_admin", "-R false -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") ++ test.add_cmd("stonith_admin", "-R true -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") ++ ++ test.add_cmd("stonith_admin", "-r node3 -i 1 -v false") ++ test.add_cmd("stonith_admin", "-r node3 -i 2 -v true") ++ test.add_cmd("stonith_admin", "-F node3 -t 20") ++ ++ test.add_stonith_log_pattern("remote op timeout set to 40") ++ test.add_stonith_log_pattern("for host 'node3' with device 'false' returned: -201") ++ test.add_stonith_log_pattern("for host 'node3' with device 'true' returned: 0") ++ ++ # test what happens when the first fencing level fails. 
++ for test_type in test_types: ++ if test_type["use_cpg"] == 0: ++ continue ++ ++ test = self.new_test("%s_topology_multi_level_fails" % test_type["prefix"], ++ "Verify if one level fails, the next leve is tried.", test_type["use_cpg"]) ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") ++ test.add_cmd("stonith_admin", "-R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") ++ test.add_cmd("stonith_admin", "-R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") ++ test.add_cmd("stonith_admin", "-R true4 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") ++ test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") ++ test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") ++ ++ test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1") ++ test.add_cmd("stonith_admin", "-r node3 -i 1 -v true1") ++ test.add_cmd("stonith_admin", "-r node3 -i 2 -v true2") ++ test.add_cmd("stonith_admin", "-r node3 -i 2 -v false2") ++ test.add_cmd("stonith_admin", "-r node3 -i 3 -v true3") ++ test.add_cmd("stonith_admin", "-r node3 -i 3 -v true4") ++ ++ test.add_cmd("stonith_admin", "-F node3 -t 3") ++ ++ test.add_stonith_log_pattern("remote op timeout set to 18") ++ test.add_stonith_log_pattern("for host 'node3' with device 'false1' returned: -201") ++ test.add_stonith_log_pattern("for host 'node3' with device 'false2' returned: -201") ++ test.add_stonith_log_pattern("for host 'node3' with device 'true3' returned: 0") ++ test.add_stonith_log_pattern("for host 'node3' with device 'true4' returned: 0") ++ ++ ++ # test what happens when the first fencing level had devices that no one has registered ++ for test_type in test_types: ++ if test_type["use_cpg"] == 0: ++ continue ++ ++ test = self.new_test("%s_topology_missing_devices" % 
test_type["prefix"], ++ "Verify topology can continue with missing devices.", test_type["use_cpg"]) ++ test.add_cmd("stonith_admin", "-R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") ++ test.add_cmd("stonith_admin", "-R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") ++ test.add_cmd("stonith_admin", "-R true4 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") ++ test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") ++ ++ test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1") ++ test.add_cmd("stonith_admin", "-r node3 -i 1 -v true1") ++ test.add_cmd("stonith_admin", "-r node3 -i 2 -v true2") ++ test.add_cmd("stonith_admin", "-r node3 -i 2 -v false2") ++ test.add_cmd("stonith_admin", "-r node3 -i 3 -v true3") ++ test.add_cmd("stonith_admin", "-r node3 -i 3 -v true4") ++ ++ test.add_cmd("stonith_admin", "-F node3 -t 2") ++ ++ # Test what happens if multiple fencing levels are defined, and then the first one is removed. 
++ for test_type in test_types: ++ if test_type["use_cpg"] == 0: ++ continue ++ ++ test = self.new_test("%s_topology_level_removal" % test_type["prefix"], ++ "Verify level removal works.", test_type["use_cpg"]) ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") ++ test.add_cmd("stonith_admin", "-R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") ++ test.add_cmd("stonith_admin", "-R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") ++ test.add_cmd("stonith_admin", "-R true4 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") ++ test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") ++ test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"") ++ ++ test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1") ++ test.add_cmd("stonith_admin", "-r node3 -i 1 -v true1") ++ ++ test.add_cmd("stonith_admin", "-r node3 -i 2 -v true2") ++ test.add_cmd("stonith_admin", "-r node3 -i 2 -v false2") ++ ++ test.add_cmd("stonith_admin", "-r node3 -i 3 -v true3") ++ test.add_cmd("stonith_admin", "-r node3 -i 3 -v true4") ++ ++ # Now remove level 2, verify none of the devices in level two are hit. ++ test.add_cmd("stonith_admin", "-d node3 -i 2") ++ ++ test.add_cmd("stonith_admin", "-F node3 -t 20") ++ ++ test.add_stonith_log_pattern("remote op timeout set to 8") ++ test.add_stonith_log_pattern("for host 'node3' with device 'false1' returned: -201") ++ test.add_stonith_negative_log_pattern("for host 'node3' with device 'false2' returned: ") ++ test.add_stonith_log_pattern("for host 'node3' with device 'true3' returned: 0") ++ test.add_stonith_log_pattern("for host 'node3' with device 'true4' returned: 0") ++ ++ # test the stonith builds the correct list of devices that can fence a node. 
++ for test_type in test_types: ++ test = self.new_test("%s_list_devices" % test_type["prefix"], ++ "Verify list of devices that can fence a node is correct", test_type["use_cpg"]) ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"") ++ test.add_cmd("stonith_admin", "-R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") ++ test.add_cmd("stonith_admin", "-R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") ++ ++ test.add_cmd_check_stdout("stonith_admin", "-l node1 -V", "true2", "true1") ++ test.add_cmd_check_stdout("stonith_admin", "-l node1 -V", "true3", "true1") ++ ++ # simple test of device monitor ++ for test_type in test_types: ++ test = self.new_test("%s_monitor" % test_type["prefix"], ++ "Verify device is reachable", test_type["use_cpg"]) ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"") ++ test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"") ++ ++ test.add_cmd("stonith_admin", "-Q true1") ++ test.add_cmd("stonith_admin", "-Q false1") ++ test.add_expected_fail_cmd("stonith_admin", "-Q true2", 237) ++ ++ # Verify monitor occurs for duration of timeout period on failure ++ for test_type in test_types: ++ test = self.new_test("%s_monitor_timeout" % test_type["prefix"], ++ "Verify monitor uses duration of timeout period given.", test_type["use_cpg"]) ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_monitor_fail -o \"pcmk_host_list=node3\"") ++ test.add_expected_fail_cmd("stonith_admin", "-Q true1 -t 5", 195) ++ test.add_stonith_log_pattern("Attempt 2 to execute") ++ ++ # Verify monitor occurs for duration of timeout period on failure, but stops at max retries ++ for test_type in test_types: ++ test = self.new_test("%s_monitor_timeout_max_retries" % test_type["prefix"], ++ "Verify monitor retries until max retry value or timeout is hit.", 
test_type["use_cpg"]) ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_monitor_fail -o \"pcmk_host_list=node3\"") ++ test.add_expected_fail_cmd("stonith_admin", "-Q true1 -t 15",195) ++ test.add_stonith_log_pattern("Attempted to execute agent fence_dummy_monitor_fail (list) the maximum number of times") ++ ++ # simple register test ++ for test_type in test_types: ++ test = self.new_test("%s_register" % test_type["prefix"], ++ "Verify devices can be registered and un-registered", test_type["use_cpg"]) ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"") ++ ++ test.add_cmd("stonith_admin", "-Q true1") ++ ++ test.add_cmd("stonith_admin", "-D true1") ++ ++ test.add_expected_fail_cmd("stonith_admin", "-Q true1", 237) ++ ++ ++ # simple reboot test ++ for test_type in test_types: ++ test = self.new_test("%s_reboot" % test_type["prefix"], ++ "Verify devices can be rebooted", test_type["use_cpg"]) ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"") ++ ++ test.add_cmd("stonith_admin", "-B node3 -t 2") ++ ++ test.add_cmd("stonith_admin", "-D true1") ++ ++ test.add_expected_fail_cmd("stonith_admin", "-Q true1", 237) ++ ++ # test fencing history. 
++ for test_type in test_types: ++ if test_type["use_cpg"] == 0: ++ continue ++ test = self.new_test("%s_fence_history" % test_type["prefix"], ++ "Verify last fencing operation is returned.", test_type["use_cpg"]) ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"") ++ ++ test.add_cmd("stonith_admin", "-F node3 -t 2 -V") ++ ++ test.add_cmd_check_stdout("stonith_admin", "-H node3", "was able to turn off node node3", "") ++ ++ # simple test of dynamic list query ++ for test_type in test_types: ++ test = self.new_test("%s_dynamic_list_query" % test_type["prefix"], ++ "Verify dynamic list of fencing devices can be retrieved.", test_type["use_cpg"]) ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_list") ++ test.add_cmd("stonith_admin", "-R true2 -a fence_dummy_list") ++ test.add_cmd("stonith_admin", "-R true3 -a fence_dummy_list") ++ ++ test.add_cmd_check_stdout("stonith_admin", "-l fake_port_1", "3 devices found") ++ ++ ++ # fence using dynamic list query ++ for test_type in test_types: ++ test = self.new_test("%s_fence_dynamic_list_query" % test_type["prefix"], ++ "Verify dynamic list of fencing devices can be retrieved.", test_type["use_cpg"]) ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_list") ++ test.add_cmd("stonith_admin", "-R true2 -a fence_dummy_list") ++ test.add_cmd("stonith_admin", "-R true3 -a fence_dummy_list") ++ ++ test.add_cmd("stonith_admin", "-F fake_port_1 -t 5 -V"); ++ ++ # simple test of query using status action ++ for test_type in test_types: ++ test = self.new_test("%s_status_query" % test_type["prefix"], ++ "Verify dynamic list of fencing devices can be retrieved.", test_type["use_cpg"]) ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_check=status\"") ++ test.add_cmd("stonith_admin", "-R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_check=status\"") ++ test.add_cmd("stonith_admin", "-R true3 -a fence_dummy -o \"mode=pass\" 
-o \"pcmk_host_check=status\"") ++ ++ test.add_cmd_check_stdout("stonith_admin", "-l fake_port_1", "3 devices found") ++ ++ # test what happens when no reboot action is advertised ++ for test_type in test_types: ++ test = self.new_test("%s_no_reboot_support" % test_type["prefix"], ++ "Verify reboot action defaults to off when no reboot action is advertised by agent.", test_type["use_cpg"]) ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_no_reboot -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") ++ test.add_cmd("stonith_admin", "-B node1 -t 5 -V"); ++ test.add_stonith_log_pattern("does not advertise support for 'reboot', performing 'off'") ++ test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)"); ++ ++ # make sure reboot is used when reboot action is advertised ++ for test_type in test_types: ++ test = self.new_test("%s_with_reboot_support" % test_type["prefix"], ++ "Verify reboot action can be used when metadata advertises it.", test_type["use_cpg"]) ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") ++ test.add_cmd("stonith_admin", "-B node1 -t 5 -V"); ++ test.add_stonith_negative_log_pattern("does not advertise support for 'reboot', performing 'off'") ++ test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)"); ++ ++ def build_nodeid_tests(self): ++ our_uname = output_from_command("uname -n") ++ if our_uname: ++ our_uname = our_uname[0] ++ ++ ### verify nodeid is supplied when nodeid is in the metadata parameters ++ test = self.new_test("cpg_supply_nodeid", ++ "Verify nodeid is given when fence agent has nodeid as parameter", 1) ++ ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname)) ++ test.add_cmd("stonith_admin", "-F %s -t 3" % (our_uname)) ++ test.add_stonith_log_pattern("For stonith action (off) for victim %s, adding nodeid" % (our_uname)) ++ ++ ### verify nodeid is _NOT_ supplied when 
nodeid is not in the metadata parameters ++ test = self.new_test("cpg_do_not_supply_nodeid", ++ "Verify nodeid is _NOT_ given when fence agent does not have nodeid as parameter", 1) ++ ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname)) ++ test.add_cmd("stonith_admin", "-F %s -t 3" % (our_uname)) ++ test.add_stonith_negative_log_pattern("For stonith action (off) for victim %s, adding nodeid" % (our_uname)) ++ ++ ### verify nodeid use doesn't explode standalone mode ++ test = self.new_test("standalone_do_not_supply_nodeid", ++ "Verify nodeid in metadata parameter list doesn't kill standalone mode", 0) ++ ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname)) ++ test.add_cmd("stonith_admin", "-F %s -t 3" % (our_uname)) ++ test.add_stonith_negative_log_pattern("For stonith action (off) for victim %s, adding nodeid" % (our_uname)) ++ ++ ++ def build_unfence_tests(self): ++ our_uname = output_from_command("uname -n") ++ if our_uname: ++ our_uname = our_uname[0] ++ ++ ### verify unfencing using automatic unfencing ++ test = self.new_test("cpg_unfence_required_1", ++ "Verify require unfencing on all devices when automatic=true in agent's metadata", 1) ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname)) ++ test.add_cmd("stonith_admin", "-R true2 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname)) ++ test.add_cmd("stonith_admin", "-U %s -t 3" % (our_uname)) ++ # both devices should be executed ++ test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)"); ++ test.add_stonith_log_pattern("with device 'true2' returned: 0 (OK)"); ++ ++ ++ ### verify unfencing using automatic unfencing fails if any of the required agents fail ++ test = self.new_test("cpg_unfence_required_2", ++ "Verify require unfencing on all devices when 
automatic=true in agent's metadata", 1) ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname)) ++ test.add_cmd("stonith_admin", "-R true2 -a fence_dummy_automatic_unfence -o \"mode=fail\" -o \"pcmk_host_list=%s\"" % (our_uname)) ++ test.add_expected_fail_cmd("stonith_admin", "-U %s -t 6" % (our_uname), 143) ++ ++ ### verify unfencing using automatic devices with topology ++ test = self.new_test("cpg_unfence_required_3", ++ "Verify require unfencing on all devices even when required devices are at different topology levels", 1) ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) ++ test.add_cmd("stonith_admin", "-R true2 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) ++ test.add_cmd("stonith_admin", "-r %s -i 1 -v true1" % (our_uname)) ++ test.add_cmd("stonith_admin", "-r %s -i 2 -v true2" % (our_uname)) ++ test.add_cmd("stonith_admin", "-U %s -t 3" % (our_uname)) ++ test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)"); ++ test.add_stonith_log_pattern("with device 'true2' returned: 0 (OK)"); ++ ++ ++ ### verify unfencing using automatic devices with topology ++ test = self.new_test("cpg_unfence_required_4", ++ "Verify all required devices are executed even with topology levels fail.", 1) ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) ++ test.add_cmd("stonith_admin", "-R true2 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) ++ test.add_cmd("stonith_admin", "-R true3 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) ++ test.add_cmd("stonith_admin", "-R true4 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) 
++ test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) ++ test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) ++ test.add_cmd("stonith_admin", "-R false3 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) ++ test.add_cmd("stonith_admin", "-R false4 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) ++ test.add_cmd("stonith_admin", "-r %s -i 1 -v true1" % (our_uname)) ++ test.add_cmd("stonith_admin", "-r %s -i 1 -v false1" % (our_uname)) ++ test.add_cmd("stonith_admin", "-r %s -i 2 -v false2" % (our_uname)) ++ test.add_cmd("stonith_admin", "-r %s -i 2 -v true2" % (our_uname)) ++ test.add_cmd("stonith_admin", "-r %s -i 2 -v false3" % (our_uname)) ++ test.add_cmd("stonith_admin", "-r %s -i 2 -v true3" % (our_uname)) ++ test.add_cmd("stonith_admin", "-r %s -i 3 -v false4" % (our_uname)) ++ test.add_cmd("stonith_admin", "-r %s -i 4 -v true4" % (our_uname)) ++ test.add_cmd("stonith_admin", "-U %s -t 3" % (our_uname)) ++ test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)"); ++ test.add_stonith_log_pattern("with device 'true2' returned: 0 (OK)"); ++ test.add_stonith_log_pattern("with device 'true3' returned: 0 (OK)"); ++ test.add_stonith_log_pattern("with device 'true4' returned: 0 (OK)"); ++ ++ ### verify unfencing using on_target device ++ test = self.new_test("cpg_unfence_on_target_1", ++ "Verify unfencing with on_target = true", 1) ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname)) ++ test.add_cmd("stonith_admin", "-U %s -t 3" % (our_uname)) ++ test.add_stonith_log_pattern("(on) to be executed on the target node") ++ ++ ++ ### verify failure of unfencing using on_target device ++ test = self.new_test("cpg_unfence_on_target_2", ++ "Verify failure unfencing with on_target = true", 1) ++ 
test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node_fake_1234\"" % (our_uname)) ++ test.add_expected_fail_cmd("stonith_admin", "-U node_fake_1234 -t 3", 237) ++ test.add_stonith_log_pattern("(on) to be executed on the target node") ++ ++ ++ ### verify unfencing using on_target device with topology ++ test = self.new_test("cpg_unfence_on_target_3", ++ "Verify unfencing with on_target = true using topology", 1) ++ ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) ++ test.add_cmd("stonith_admin", "-R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname)) ++ ++ test.add_cmd("stonith_admin", "-r %s -i 1 -v true1" % (our_uname)) ++ test.add_cmd("stonith_admin", "-r %s -i 2 -v true2" % (our_uname)) ++ ++ test.add_cmd("stonith_admin", "-U %s -t 3" % (our_uname)) ++ test.add_stonith_log_pattern("(on) to be executed on the target node") ++ ++ ### verify unfencing using on_target device with topology fails when victim node doesn't exist ++ test = self.new_test("cpg_unfence_on_target_4", ++ "Verify unfencing failure with on_target = true using topology", 1) ++ ++ test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node_fake\"" % (our_uname)) ++ test.add_cmd("stonith_admin", "-R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node_fake\"" % (our_uname)) ++ ++ test.add_cmd("stonith_admin", "-r node_fake -i 1 -v true1") ++ test.add_cmd("stonith_admin", "-r node_fake -i 2 -v true2") ++ ++ test.add_expected_fail_cmd("stonith_admin", "-U node_fake -t 3", 237) ++ test.add_stonith_log_pattern("(on) to be executed on the target node") ++ ++ def build_remap_tests(self): ++ test = self.new_test("cpg_remap_simple", ++ "Verify sequential topology reboot is remapped to all-off-then-all-on", 1) ++ test.add_cmd("stonith_admin", ++ """-R true1 -a fence_dummy -o "mode=pass" -o 
"pcmk_host_list=node_fake" """ ++ """-o "pcmk_off_timeout=1" -o "pcmk_reboot_timeout=10" """) ++ test.add_cmd("stonith_admin", ++ """-R true2 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """ ++ """-o "pcmk_off_timeout=2" -o "pcmk_reboot_timeout=20" """) ++ test.add_cmd("stonith_admin", "-r node_fake -i 1 -v true1 -v true2") ++ test.add_cmd("stonith_admin", "-B node_fake -t 5") ++ test.add_stonith_log_pattern("Remapping multiple-device reboot of node_fake") ++ # timeout should be sum of off timeouts (1+2=3), not reboot timeouts (10+20=30) ++ test.add_stonith_log_pattern("remote op timeout set to 3 for fencing of node node_fake") ++ test.add_stonith_log_pattern("perform op off node_fake with true1") ++ test.add_stonith_log_pattern("perform op off node_fake with true2") ++ test.add_stonith_log_pattern("Remapped off of node_fake complete, remapping to on") ++ # fence_dummy sets "on" as an on_target action ++ test.add_stonith_log_pattern("Ignoring true1 'on' failure (no capable peers) for node_fake") ++ test.add_stonith_log_pattern("Ignoring true2 'on' failure (no capable peers) for node_fake") ++ test.add_stonith_log_pattern("Undoing remap of reboot of node_fake") ++ ++ test = self.new_test("cpg_remap_automatic", ++ "Verify remapped topology reboot skips automatic 'on'", 1) ++ test.add_cmd("stonith_admin", ++ """-R true1 -a fence_dummy_automatic_unfence """ ++ """-o "mode=pass" -o "pcmk_host_list=node_fake" """) ++ test.add_cmd("stonith_admin", ++ """-R true2 -a fence_dummy_automatic_unfence """ ++ """-o "mode=pass" -o "pcmk_host_list=node_fake" """) ++ test.add_cmd("stonith_admin", "-r node_fake -i 1 -v true1 -v true2") ++ test.add_cmd("stonith_admin", "-B node_fake -t 5") ++ test.add_stonith_log_pattern("Remapping multiple-device reboot of node_fake") ++ test.add_stonith_log_pattern("perform op off node_fake with true1") ++ test.add_stonith_log_pattern("perform op off node_fake with true2") ++ test.add_stonith_log_pattern("Remapped off of node_fake 
complete, remapping to on") ++ test.add_stonith_log_pattern("Undoing remap of reboot of node_fake") ++ test.add_stonith_negative_log_pattern("perform op on node_fake with") ++ test.add_stonith_negative_log_pattern("'on' failure") ++ ++ test = self.new_test("cpg_remap_complex_1", ++ "Verify remapped topology reboot in second level works if non-remapped first level fails", 1) ++ test.add_cmd("stonith_admin", """-R false1 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node_fake" """) ++ test.add_cmd("stonith_admin", """-R true1 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """) ++ test.add_cmd("stonith_admin", """-R true2 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """) ++ test.add_cmd("stonith_admin", "-r node_fake -i 1 -v false1") ++ test.add_cmd("stonith_admin", "-r node_fake -i 2 -v true1 -v true2") ++ test.add_cmd("stonith_admin", "-B node_fake -t 5") ++ test.add_stonith_log_pattern("perform op reboot node_fake with false1") ++ test.add_stonith_log_pattern("Remapping multiple-device reboot of node_fake") ++ test.add_stonith_log_pattern("perform op off node_fake with true1") ++ test.add_stonith_log_pattern("perform op off node_fake with true2") ++ test.add_stonith_log_pattern("Remapped off of node_fake complete, remapping to on") ++ test.add_stonith_log_pattern("Ignoring true1 'on' failure (no capable peers) for node_fake") ++ test.add_stonith_log_pattern("Ignoring true2 'on' failure (no capable peers) for node_fake") ++ test.add_stonith_log_pattern("Undoing remap of reboot of node_fake") ++ ++ test = self.new_test("cpg_remap_complex_2", ++ "Verify remapped topology reboot failure in second level proceeds to third level", 1) ++ test.add_cmd("stonith_admin", """-R false1 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node_fake" """) ++ test.add_cmd("stonith_admin", """-R false2 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node_fake" """) ++ test.add_cmd("stonith_admin", """-R true1 -a fence_dummy -o "mode=pass" -o 
"pcmk_host_list=node_fake" """) ++ test.add_cmd("stonith_admin", """-R true2 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """) ++ test.add_cmd("stonith_admin", """-R true3 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """) ++ test.add_cmd("stonith_admin", "-r node_fake -i 1 -v false1") ++ test.add_cmd("stonith_admin", "-r node_fake -i 2 -v true1 -v false2 -v true3") ++ test.add_cmd("stonith_admin", "-r node_fake -i 3 -v true2") ++ test.add_cmd("stonith_admin", "-B node_fake -t 5") ++ test.add_stonith_log_pattern("perform op reboot node_fake with false1") ++ test.add_stonith_log_pattern("Remapping multiple-device reboot of node_fake") ++ test.add_stonith_log_pattern("perform op off node_fake with true1") ++ test.add_stonith_log_pattern("perform op off node_fake with false2") ++ test.add_stonith_log_pattern("Attempted to execute agent fence_dummy (off) the maximum number of times") ++ test.add_stonith_log_pattern("Undoing remap of reboot of node_fake") ++ test.add_stonith_log_pattern("perform op reboot node_fake with true2") ++ test.add_stonith_negative_log_pattern("node_fake with true3") ++ ++ def setup_environment(self, use_corosync): ++ if self.autogen_corosync_cfg and use_corosync: ++ corosync_conf = (""" + totem { + version: 2 + crypto_cipher: none +@@ -908,15 +984,15 @@ logging { + } + """) + +- os.system("cat <<-END >>/etc/corosync/corosync.conf\n%s\nEND" % (corosync_conf)) ++ os.system("cat <<-END >>/etc/corosync/corosync.conf\n%s\nEND" % (corosync_conf)) + + +- if use_corosync: +- ### make sure we are in control ### +- self.stop_corosync() +- self.start_corosync() ++ if use_corosync: ++ ### make sure we are in control ### ++ self.stop_corosync() ++ self.start_corosync() + +- monitor_fail_agent = ("""#!/usr/bin/python ++ monitor_fail_agent = ("""#!/usr/bin/python + import sys + def main(): + for line in sys.stdin.readlines(): +@@ -927,7 +1003,7 @@ if __name__ == "__main__": + main() + """) + +- dynamic_list_agent = 
("""#!/usr/bin/python ++ dynamic_list_agent = ("""#!/usr/bin/python + import sys + def main(): + for line in sys.stdin.readlines(): +@@ -942,140 +1018,141 @@ if __name__ == "__main__": + """) + + +- os.system("cat <<-END >>/usr/sbin/fence_dummy_list\n%s\nEND" % (dynamic_list_agent)) +- os.system("chmod 711 /usr/sbin/fence_dummy_list") ++ os.system("cat <<-END >>/usr/sbin/fence_dummy_list\n%s\nEND" % (dynamic_list_agent)) ++ os.system("chmod 711 /usr/sbin/fence_dummy_list") + +- os.system("cat <<-END >>/usr/sbin/fence_dummy_monitor_fail\n%s\nEND" % (monitor_fail_agent)) +- os.system("chmod 711 /usr/sbin/fence_dummy_monitor_fail") ++ os.system("cat <<-END >>/usr/sbin/fence_dummy_monitor_fail\n%s\nEND" % (monitor_fail_agent)) ++ os.system("chmod 711 /usr/sbin/fence_dummy_monitor_fail") + +- os.system("cp /usr/share/pacemaker/tests/cts/fence_dummy /usr/sbin/fence_dummy") ++ os.system("cp /usr/share/pacemaker/tests/cts/fence_dummy /usr/sbin/fence_dummy") + +- # modifies dummy agent to do require unfencing +- os.system("cat /usr/share/pacemaker/tests/cts/fence_dummy | sed 's/on_target=/automatic=/g' > /usr/sbin/fence_dummy_automatic_unfence"); +- os.system("chmod 711 /usr/sbin/fence_dummy_automatic_unfence") ++ # modifies dummy agent to do require unfencing ++ os.system("cat /usr/share/pacemaker/tests/cts/fence_dummy | sed 's/on_target=/automatic=/g' > /usr/sbin/fence_dummy_automatic_unfence"); ++ os.system("chmod 711 /usr/sbin/fence_dummy_automatic_unfence") + +- # modifies dummy agent to not advertise reboot +- os.system("cat /usr/share/pacemaker/tests/cts/fence_dummy | sed 's/^.*.*//g' > /usr/sbin/fence_dummy_no_reboot"); +- os.system("chmod 711 /usr/sbin/fence_dummy_no_reboot") ++ # modifies dummy agent to not advertise reboot ++ os.system("cat /usr/share/pacemaker/tests/cts/fence_dummy | sed 's/^.*.*//g' > /usr/sbin/fence_dummy_no_reboot"); ++ os.system("chmod 711 /usr/sbin/fence_dummy_no_reboot") + +- def cleanup_environment(self, use_corosync): +- if use_corosync: 
+- self.stop_corosync() ++ def cleanup_environment(self, use_corosync): ++ if use_corosync: ++ self.stop_corosync() + +- if self.verbose and os.path.exists('/var/log/corosync.log'): +- print "Corosync output" +- f = open('/var/log/corosync.log', 'r') +- for line in f.readlines(): +- print line.strip() +- os.remove('/var/log/corosync.log') ++ if self.verbose and os.path.exists('/var/log/corosync.log'): ++ print "Corosync output" ++ f = open('/var/log/corosync.log', 'r') ++ for line in f.readlines(): ++ print line.strip() ++ os.remove('/var/log/corosync.log') + +- if self.autogen_corosync_cfg: +- os.system("rm -f /etc/corosync/corosync.conf") ++ if self.autogen_corosync_cfg: ++ os.system("rm -f /etc/corosync/corosync.conf") + +- os.system("rm -f /usr/sbin/fence_dummy_monitor_fail") +- os.system("rm -f /usr/sbin/fence_dummy_list") +- os.system("rm -f /usr/sbin/fence_dummy") +- os.system("rm -f /usr/sbin/fence_dummy_automatic_unfence") +- os.system("rm -f /usr/sbin/fence_dummy_no_reboot") ++ os.system("rm -f /usr/sbin/fence_dummy_monitor_fail") ++ os.system("rm -f /usr/sbin/fence_dummy_list") ++ os.system("rm -f /usr/sbin/fence_dummy") ++ os.system("rm -f /usr/sbin/fence_dummy_automatic_unfence") ++ os.system("rm -f /usr/sbin/fence_dummy_no_reboot") + + class TestOptions: +- def __init__(self): +- self.options = {} +- self.options['list-tests'] = 0 +- self.options['run-all'] = 1 +- self.options['run-only'] = "" +- self.options['run-only-pattern'] = "" +- self.options['verbose'] = 0 +- self.options['invalid-arg'] = "" +- self.options['cpg-only'] = 0 +- self.options['no-cpg'] = 0 +- self.options['show-usage'] = 0 +- +- def build_options(self, argv): +- args = argv[1:] +- skip = 0 +- for i in range(0, len(args)): +- if skip: +- skip = 0 +- continue +- elif args[i] == "-h" or args[i] == "--help": +- self.options['show-usage'] = 1 +- elif args[i] == "-l" or args[i] == "--list-tests": +- self.options['list-tests'] = 1 +- elif args[i] == "-V" or args[i] == "--verbose": +- 
self.options['verbose'] = 1 +- elif args[i] == "-n" or args[i] == "--no-cpg": +- self.options['no-cpg'] = 1 +- elif args[i] == "-c" or args[i] == "--cpg-only": +- self.options['cpg-only'] = 1 +- elif args[i] == "-r" or args[i] == "--run-only": +- self.options['run-only'] = args[i+1] +- skip = 1 +- elif args[i] == "-p" or args[i] == "--run-only-pattern": +- self.options['run-only-pattern'] = args[i+1] +- skip = 1 +- +- def show_usage(self): +- print "usage: " + sys.argv[0] + " [options]" +- print "If no options are provided, all tests will run" +- print "Options:" +- print "\t [--help | -h] Show usage" +- print "\t [--list-tests | -l] Print out all registered tests." +- print "\t [--cpg-only | -c] Only run tests that require corosync." +- print "\t [--no-cpg | -n] Only run tests that do not require corosync" +- print "\t [--run-only | -r 'testname'] Run a specific test" +- print "\t [--verbose | -V] Verbose output" +- print "\t [--run-only-pattern | -p 'string'] Run only tests containing the string value" +- print "\n\tExample: Run only the test 'start_top'" +- print "\t\t python ./regression.py --run-only start_stop" +- print "\n\tExample: Run only the tests with the string 'systemd' present in them" +- print "\t\t python ./regression.py --run-only-pattern systemd" ++ def __init__(self): ++ self.options = {} ++ self.options['list-tests'] = 0 ++ self.options['run-all'] = 1 ++ self.options['run-only'] = "" ++ self.options['run-only-pattern'] = "" ++ self.options['verbose'] = 0 ++ self.options['invalid-arg'] = "" ++ self.options['cpg-only'] = 0 ++ self.options['no-cpg'] = 0 ++ self.options['show-usage'] = 0 ++ ++ def build_options(self, argv): ++ args = argv[1:] ++ skip = 0 ++ for i in range(0, len(args)): ++ if skip: ++ skip = 0 ++ continue ++ elif args[i] == "-h" or args[i] == "--help": ++ self.options['show-usage'] = 1 ++ elif args[i] == "-l" or args[i] == "--list-tests": ++ self.options['list-tests'] = 1 ++ elif args[i] == "-V" or args[i] == "--verbose": ++ 
self.options['verbose'] = 1 ++ elif args[i] == "-n" or args[i] == "--no-cpg": ++ self.options['no-cpg'] = 1 ++ elif args[i] == "-c" or args[i] == "--cpg-only": ++ self.options['cpg-only'] = 1 ++ elif args[i] == "-r" or args[i] == "--run-only": ++ self.options['run-only'] = args[i+1] ++ skip = 1 ++ elif args[i] == "-p" or args[i] == "--run-only-pattern": ++ self.options['run-only-pattern'] = args[i+1] ++ skip = 1 ++ ++ def show_usage(self): ++ print "usage: " + sys.argv[0] + " [options]" ++ print "If no options are provided, all tests will run" ++ print "Options:" ++ print "\t [--help | -h] Show usage" ++ print "\t [--list-tests | -l] Print out all registered tests." ++ print "\t [--cpg-only | -c] Only run tests that require corosync." ++ print "\t [--no-cpg | -n] Only run tests that do not require corosync" ++ print "\t [--run-only | -r 'testname'] Run a specific test" ++ print "\t [--verbose | -V] Verbose output" ++ print "\t [--run-only-pattern | -p 'string'] Run only tests containing the string value" ++ print "\n\tExample: Run only the test 'start_top'" ++ print "\t\t python ./regression.py --run-only start_stop" ++ print "\n\tExample: Run only the tests with the string 'systemd' present in them" ++ print "\t\t python ./regression.py --run-only-pattern systemd" + + def main(argv): +- o = TestOptions() +- o.build_options(argv) +- +- use_corosync = 1 +- +- tests = Tests(o.options['verbose']) +- tests.build_standalone_tests() +- tests.build_custom_timeout_tests() +- tests.build_api_sanity_tests() +- tests.build_fence_merge_tests() +- tests.build_unfence_tests() +- tests.build_nodeid_tests() +- +- if o.options['list-tests']: +- tests.print_list() +- sys.exit(0) +- elif o.options['show-usage']: +- o.show_usage() +- sys.exit(0) +- +- print "Starting ..." 
+- +- if o.options['no-cpg']: +- use_corosync = 0 +- +- tests.setup_environment(use_corosync) +- +- if o.options['run-only-pattern'] != "": +- tests.run_tests_matching(o.options['run-only-pattern']) +- tests.print_results() +- elif o.options['run-only'] != "": +- tests.run_single(o.options['run-only']) +- tests.print_results() +- elif o.options['no-cpg']: +- tests.run_no_cpg() +- tests.print_results() +- elif o.options['cpg-only']: +- tests.run_cpg_only() +- tests.print_results() +- else: +- tests.run_tests() +- tests.print_results() +- +- tests.cleanup_environment(use_corosync) +- tests.exit() ++ o = TestOptions() ++ o.build_options(argv) ++ ++ use_corosync = 1 ++ ++ tests = Tests(o.options['verbose']) ++ tests.build_standalone_tests() ++ tests.build_custom_timeout_tests() ++ tests.build_api_sanity_tests() ++ tests.build_fence_merge_tests() ++ tests.build_unfence_tests() ++ tests.build_nodeid_tests() ++ tests.build_remap_tests() ++ ++ if o.options['list-tests']: ++ tests.print_list() ++ sys.exit(0) ++ elif o.options['show-usage']: ++ o.show_usage() ++ sys.exit(0) ++ ++ print "Starting ..." 
++ ++ if o.options['no-cpg']: ++ use_corosync = 0 ++ ++ tests.setup_environment(use_corosync) ++ ++ if o.options['run-only-pattern'] != "": ++ tests.run_tests_matching(o.options['run-only-pattern']) ++ tests.print_results() ++ elif o.options['run-only'] != "": ++ tests.run_single(o.options['run-only']) ++ tests.print_results() ++ elif o.options['no-cpg']: ++ tests.run_no_cpg() ++ tests.print_results() ++ elif o.options['cpg-only']: ++ tests.run_cpg_only() ++ tests.print_results() ++ else: ++ tests.run_tests() ++ tests.print_results() ++ ++ tests.cleanup_environment(use_corosync) ++ tests.exit() + if __name__=="__main__": +- main(sys.argv) ++ main(sys.argv) +diff --git a/fencing/remote.c b/fencing/remote.c +index a568035..2c00b5f 100644 +--- a/fencing/remote.c ++++ b/fencing/remote.c +@@ -47,17 +47,37 @@ + + #define TIMEOUT_MULTIPLY_FACTOR 1.2 + ++/* When one stonithd queries its peers for devices able to handle a fencing ++ * request, each peer will reply with a list of such devices available to it. ++ * Each reply will be parsed into a st_query_result_t, with each device's ++ * information kept in a device_properties_t. 
++ */ ++ ++typedef struct device_properties_s { ++ /* Whether access to this device has been verified */ ++ gboolean verified; ++ ++ /* The remaining members are indexed by the operation's "phase" */ ++ ++ /* Whether this device has been executed in each phase */ ++ gboolean executed[3]; ++ /* Whether this device is disallowed from executing in each phase */ ++ gboolean disallowed[3]; ++ /* Action-specific timeout for each phase */ ++ int custom_action_timeout[3]; ++ /* Action-specific maximum random delay for each phase */ ++ int delay_max[3]; ++} device_properties_t; ++ + typedef struct st_query_result_s { ++ /* Name of peer that sent this result */ + char *host; +- int devices; +- /* only try peers for non-topology based operations once */ ++ /* Only try peers for non-topology based operations once */ + gboolean tried; +- GListPtr device_list; +- GHashTable *custom_action_timeouts; +- GHashTable *delay_maxes; +- /* Subset of devices that peer has verified connectivity on */ +- GHashTable *verified_devices; +- ++ /* Number of entries in the devices table */ ++ int ndevices; ++ /* Devices available to this host that are capable of fencing the target */ ++ GHashTable *devices; + } st_query_result_t; + + GHashTable *remote_op_list = NULL; +@@ -67,8 +87,8 @@ extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op + int call_options); + + static void report_timeout_period(remote_fencing_op_t * op, int op_timeout); +-static int get_op_total_timeout(remote_fencing_op_t * op, st_query_result_t * chosen_peer, +- int default_timeout); ++static int get_op_total_timeout(const remote_fencing_op_t *op, ++ const st_query_result_t *chosen_peer); + + static gint + sort_strings(gconstpointer a, gconstpointer b) +@@ -83,15 +103,126 @@ free_remote_query(gpointer data) + st_query_result_t *query = data; + + crm_trace("Free'ing query result from %s", query->host); ++ g_hash_table_destroy(query->devices); + free(query->host); +- 
g_list_free_full(query->device_list, free); +- g_hash_table_destroy(query->custom_action_timeouts); +- g_hash_table_destroy(query->delay_maxes); +- g_hash_table_destroy(query->verified_devices); + free(query); + } + } + ++struct peer_count_data { ++ const remote_fencing_op_t *op; ++ gboolean verified_only; ++ int count; ++}; ++ ++/*! ++ * \internal ++ * \brief Increment a counter if a device has not been executed yet ++ * ++ * \param[in] key Device ID (ignored) ++ * \param[in] value Device properties ++ * \param[in] user_data Peer count data ++ */ ++static void ++count_peer_device(gpointer key, gpointer value, gpointer user_data) ++{ ++ device_properties_t *props = (device_properties_t*)value; ++ struct peer_count_data *data = user_data; ++ ++ if (!props->executed[data->op->phase] ++ && (!data->verified_only || props->verified)) { ++ ++(data->count); ++ } ++} ++ ++/*! ++ * \internal ++ * \brief Check the number of available devices in a peer's query results ++ * ++ * \param[in] op Operation that results are for ++ * \param[in] peer Peer to count ++ * \param[in] verified_only Whether to count only verified devices ++ * ++ * \return Number of devices available to peer that were not already executed ++ */ ++static int ++count_peer_devices(const remote_fencing_op_t *op, const st_query_result_t *peer, ++ gboolean verified_only) ++{ ++ struct peer_count_data data; ++ ++ data.op = op; ++ data.verified_only = verified_only; ++ data.count = 0; ++ if (peer) { ++ g_hash_table_foreach(peer->devices, count_peer_device, &data); ++ } ++ return data.count; ++} ++ ++/*! 
++ * \internal ++ * \brief Search for a device in a query result ++ * ++ * \param[in] op Operation that result is for ++ * \param[in] peer Query result for a peer ++ * \param[in] device Device ID to search for ++ * ++ * \return Device properties if found, NULL otherwise ++ */ ++static device_properties_t * ++find_peer_device(const remote_fencing_op_t *op, const st_query_result_t *peer, ++ const char *device) ++{ ++ device_properties_t *props = g_hash_table_lookup(peer->devices, device); ++ ++ return (props && !props->executed[op->phase] ++ && !props->disallowed[op->phase])? props : NULL; ++} ++ ++/*! ++ * \internal ++ * \brief Find a device in a peer's device list and mark it as executed ++ * ++ * \param[in] op Operation that peer result is for ++ * \param[in,out] peer Peer with results to search ++ * \param[in] device ID of device to mark as done ++ * \param[in] verified_devices_only Only consider verified devices ++ * ++ * \return TRUE if device was found and marked, FALSE otherwise ++ */ ++static gboolean ++grab_peer_device(const remote_fencing_op_t *op, st_query_result_t *peer, ++ const char *device, gboolean verified_devices_only) ++{ ++ device_properties_t *props = find_peer_device(op, peer, device); ++ ++ if ((props == NULL) || (verified_devices_only && !props->verified)) { ++ return FALSE; ++ } ++ ++ crm_trace("Removing %s from %s (%d remaining)", ++ device, peer->host, count_peer_devices(op, peer, FALSE)); ++ props->executed[op->phase] = TRUE; ++ return TRUE; ++} ++ ++/* ++ * \internal ++ * \brief Free the list of required devices for a particular phase ++ * ++ * \param[in,out] op Operation to modify ++ * \param[in] phase Phase to modify ++ */ ++static void ++free_required_list(remote_fencing_op_t *op, enum st_remap_phase phase) ++{ ++ if (op->required_list[phase]) { ++ g_list_free_full(op->required_list[phase], free); ++ op->required_list[phase] = NULL; ++ } ++} ++ + static void + clear_remote_op_timers(remote_fencing_op_t * op) + { +@@ -137,13 +268,100 
@@ free_remote_op(gpointer data) + g_list_free_full(op->devices_list, free); + op->devices_list = NULL; + } +- if (op->required_list) { +- g_list_free_full(op->required_list, free); +- op->required_list = NULL; +- } ++ free_required_list(op, st_phase_requested); ++ free_required_list(op, st_phase_off); ++ free_required_list(op, st_phase_on); + free(op); + } + ++/* ++ * \internal ++ * \brief Return an operation's originally requested action (before any remap) ++ * ++ * \param[in] op Operation to check ++ * ++ * \return Operation's original action ++ */ ++static const char * ++op_requested_action(const remote_fencing_op_t *op) ++{ ++ return ((op->phase > st_phase_requested)? "reboot" : op->action); ++} ++ ++/* ++ * \internal ++ * \brief Remap a "reboot" operation to the "off" phase ++ * ++ * \param[in,out] op Operation to remap ++ */ ++static void ++op_phase_off(remote_fencing_op_t *op) ++{ ++ crm_info("Remapping multiple-device reboot of %s (%s) to off", ++ op->target, op->id); ++ op->phase = st_phase_off; ++ ++ /* Happily, "off" and "on" are shorter than "reboot", so we can reuse the ++ * memory allocation at each phase. ++ */ ++ strcpy(op->action, "off"); ++} ++ ++/*! ++ * \internal ++ * \brief Advance a remapped reboot operation to the "on" phase ++ * ++ * \param[in,out] op Operation to remap ++ */ ++static void ++op_phase_on(remote_fencing_op_t *op) ++{ ++ GListPtr iter = NULL; ++ ++ crm_info("Remapped off of %s complete, remapping to on for %s.%.8s", ++ op->target, op->client_name, op->id); ++ op->phase = st_phase_on; ++ strcpy(op->action, "on"); ++ ++ /* Any devices that are required for "on" will be automatically executed by ++ * the cluster when the node next joins, so we skip them here. 
++ */ ++ for (iter = op->required_list[op->phase]; iter != NULL; iter = iter->next) { ++ GListPtr match = g_list_find_custom(op->devices_list, iter->data, ++ sort_strings); ++ ++ if (match) { ++ op->devices_list = g_list_remove(op->devices_list, match->data); ++ } ++ } ++ ++ /* We know this level will succeed, because phase 1 completed successfully ++ * and we ignore any errors from phase 2. So we can free the required list, ++ * which will keep them from being executed after the device list is done. ++ */ ++ free_required_list(op, op->phase); ++ ++ /* Rewind device list pointer */ ++ op->devices = op->devices_list; ++} ++ ++/*! ++ * \internal ++ * \brief Reset a remapped reboot operation ++ * ++ * \param[in,out] op Operation to reset ++ */ ++static void ++undo_op_remap(remote_fencing_op_t *op) ++{ ++ if (op->phase > 0) { ++ crm_info("Undoing remap of reboot of %s for %s.%.8s", ++ op->target, op->client_name, op->id); ++ op->phase = st_phase_requested; ++ strcpy(op->action, "reboot"); ++ } ++} ++ + static xmlNode * + create_op_done_notify(remote_fencing_op_t * op, int rc) + { +@@ -271,6 +489,7 @@ remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup) + + op->completed = time(NULL); + clear_remote_op_timers(op); ++ undo_op_remap(op); + + if (op->notify_sent == TRUE) { + crm_err("Already sent notifications for '%s of %s by %s' (for=%s@%s.%.8s, state=%d): %s", +@@ -279,10 +498,12 @@ remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup) + goto remote_op_done_cleanup; + } + +- if (!op->delegate && data) { ++ if (!op->delegate && data && rc != -ENODEV && rc != -EHOSTUNREACH) { + xmlNode *ndata = get_xpath_object("//@" F_STONITH_DELEGATE, data, LOG_TRACE); + if(ndata) { + op->delegate = crm_element_value_copy(ndata, F_STONITH_DELEGATE); ++ } else { ++ op->delegate = crm_element_value_copy(data, F_ORIG); + } + } + +@@ -377,6 +598,16 @@ remote_op_timeout(gpointer userdata) + + crm_debug("Action %s (%s) for %s (%s) timed out", + 
op->action, op->id, op->target, op->client_name); ++ ++ if (op->phase == st_phase_on) { ++ /* A remapped reboot operation timed out in the "on" phase, but the ++ * "off" phase completed successfully, so quit trying any further ++ * devices, and return success. ++ */ ++ remote_op_done(op, NULL, pcmk_ok, FALSE); ++ return FALSE; ++ } ++ + op->state = st_failed; + + remote_op_done(op, NULL, -ETIME, FALSE); +@@ -426,22 +657,43 @@ topology_is_empty(stonith_topology_t *tp) + return TRUE; + } + ++/* ++ * \internal ++ * \brief Add a device to the required list for a particular phase ++ * ++ * \param[in,out] op Operation to modify ++ * \param[in] phase Phase to modify ++ * \param[in] device Device ID to add ++ */ + static void +-add_required_device(remote_fencing_op_t * op, const char *device) ++add_required_device(remote_fencing_op_t *op, enum st_remap_phase phase, ++ const char *device) + { +- GListPtr match = g_list_find_custom(op->required_list, device, sort_strings); +- if (match) { +- /* device already marked required */ +- return; ++ GListPtr match = g_list_find_custom(op->required_list[phase], device, ++ sort_strings); ++ ++ if (!match) { ++ op->required_list[phase] = g_list_prepend(op->required_list[phase], ++ strdup(device)); + } +- op->required_list = g_list_prepend(op->required_list, strdup(device)); ++} + +- /* make sure the required devices is in the current list of devices to be executed */ +- if (op->devices_list) { +- GListPtr match = g_list_find_custom(op->devices_list, device, sort_strings); +- if (match == NULL) { +- op->devices_list = g_list_append(op->devices_list, strdup(device)); +- } ++/* ++ * \internal ++ * \brief Remove a device from the required list for the current phase ++ * ++ * \param[in,out] op Operation to modify ++ * \param[in] device Device ID to remove ++ */ ++static void ++remove_required_device(remote_fencing_op_t *op, const char *device) ++{ ++ GListPtr match = g_list_find_custom(op->required_list[op->phase], device, ++ sort_strings); 
++ ++ if (match) { ++ op->required_list[op->phase] = g_list_remove(op->required_list[op->phase], ++ match->data); + } + } + +@@ -458,18 +710,6 @@ set_op_device_list(remote_fencing_op_t * op, GListPtr devices) + for (lpc = devices; lpc != NULL; lpc = lpc->next) { + op->devices_list = g_list_append(op->devices_list, strdup(lpc->data)); + } +- +- /* tack on whatever required devices have not been executed +- * to the end of the current devices list. This ensures that +- * the required devices will get executed regardless of what topology +- * level they exist at. */ +- for (lpc = op->required_list; lpc != NULL; lpc = lpc->next) { +- GListPtr match = g_list_find_custom(op->devices_list, lpc->data, sort_strings); +- if (match == NULL) { +- op->devices_list = g_list_append(op->devices_list, strdup(lpc->data)); +- } +- } +- + op->devices = op->devices_list; + } + +@@ -491,6 +731,7 @@ find_topology_for_host(const char *host) + crm_info("Bad regex '%s' for fencing level", tp->node); + } else { + status = regexec(&r_patt, host, 0, NULL, 0); ++ regfree(&r_patt); + } + + if (status == 0) { +@@ -529,6 +770,9 @@ stonith_topology_next(remote_fencing_op_t * op) + + set_bit(op->call_options, st_opt_topology); + ++ /* This is a new level, so undo any remapping left over from previous */ ++ undo_op_remap(op); ++ + do { + op->level++; + +@@ -539,6 +783,15 @@ stonith_topology_next(remote_fencing_op_t * op) + op->level, op->target, g_list_length(tp->levels[op->level]), + op->client_name, op->originator, op->id); + set_op_device_list(op, tp->levels[op->level]); ++ ++ if (g_list_next(op->devices_list) && safe_str_eq(op->action, "reboot")) { ++ /* A reboot has been requested for a topology level with multiple ++ * devices. Instead of rebooting the devices sequentially, we will ++ * turn them all off, then turn them all on again. (Think about ++ * switched power outlets for redundant power supplies.) 
++ */ ++ op_phase_off(op); ++ } + return pcmk_ok; + } + +@@ -563,6 +816,7 @@ merge_duplicates(remote_fencing_op_t * op) + g_hash_table_iter_init(&iter, remote_op_list); + while (g_hash_table_iter_next(&iter, NULL, (void **)&other)) { + crm_node_t *peer = NULL; ++ const char *other_action = op_requested_action(other); + + if (other->state > st_exec) { + /* Must be in-progress */ +@@ -570,8 +824,9 @@ merge_duplicates(remote_fencing_op_t * op) + } else if (safe_str_neq(op->target, other->target)) { + /* Must be for the same node */ + continue; +- } else if (safe_str_neq(op->action, other->action)) { +- crm_trace("Must be for the same action: %s vs. ", op->action, other->action); ++ } else if (safe_str_neq(op->action, other_action)) { ++ crm_trace("Must be for the same action: %s vs. %s", ++ op->action, other_action); + continue; + } else if (safe_str_eq(op->client_name, other->client_name)) { + crm_trace("Must be for different clients: %s", op->client_name); +@@ -602,7 +857,7 @@ merge_duplicates(remote_fencing_op_t * op) + if (other->total_timeout == 0) { + crm_trace("Making a best-guess as to the timeout used"); + other->total_timeout = op->total_timeout = +- TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, NULL, op->base_timeout); ++ TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, NULL); + } + crm_notice + ("Merging stonith action %s for node %s originating from client %s.%.8s with identical request from %s@%s.%.8s (%ds)", +@@ -792,16 +1047,16 @@ initiate_remote_stonith_op(crm_client_t * client, xmlNode * request, gboolean ma + op->id, op->state); + } + +- query = stonith_create_op(op->client_callid, op->id, STONITH_OP_QUERY, NULL, 0); ++ query = stonith_create_op(op->client_callid, op->id, STONITH_OP_QUERY, ++ NULL, op->call_options); + + crm_xml_add(query, F_STONITH_REMOTE_OP_ID, op->id); + crm_xml_add(query, F_STONITH_TARGET, op->target); +- crm_xml_add(query, F_STONITH_ACTION, op->action); ++ crm_xml_add(query, F_STONITH_ACTION, op_requested_action(op)); + 
crm_xml_add(query, F_STONITH_ORIGIN, op->originator); + crm_xml_add(query, F_STONITH_CLIENTID, op->client_id); + crm_xml_add(query, F_STONITH_CLIENTNAME, op->client_name); + crm_xml_add_int(query, F_STONITH_TIMEOUT, op->base_timeout); +- crm_xml_add_int(query, F_STONITH_CALLOPTS, op->call_options); + + send_cluster_message(NULL, crm_msg_stonith_ng, query, FALSE); + free_xml(query); +@@ -835,7 +1090,7 @@ find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer + st_query_result_t *peer = iter->data; + + crm_trace("Testing result from %s for %s with %d devices: %d %x", +- peer->host, op->target, peer->devices, peer->tried, options); ++ peer->host, op->target, peer->ndevices, peer->tried, options); + if ((options & FIND_PEER_SKIP_TARGET) && safe_str_eq(peer->host, op->target)) { + continue; + } +@@ -844,25 +1099,13 @@ find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer + } + + if (is_set(op->call_options, st_opt_topology)) { +- /* Do they have the next device of the current fencing level? 
*/ +- GListPtr match = NULL; +- +- if (verified_devices_only && !g_hash_table_lookup(peer->verified_devices, device)) { +- continue; +- } + +- match = g_list_find_custom(peer->device_list, device, sort_strings); +- if (match) { +- crm_trace("Removing %s from %s (%d remaining)", (char *)match->data, peer->host, +- g_list_length(peer->device_list)); +- peer->device_list = g_list_remove(peer->device_list, match->data); ++ if (grab_peer_device(op, peer, device, verified_devices_only)) { + return peer; + } + +- } else if (peer->devices > 0 && peer->tried == FALSE) { +- if (verified_devices_only && !g_hash_table_size(peer->verified_devices)) { +- continue; +- } ++ } else if ((peer->tried == FALSE) ++ && count_peer_devices(op, peer, verified_devices_only)) { + + /* No topology: Use the current best peer */ + crm_trace("Simple fencing"); +@@ -883,11 +1126,14 @@ stonith_choose_peer(remote_fencing_op_t * op) + do { + if (op->devices) { + device = op->devices->data; +- crm_trace("Checking for someone to fence %s with %s", op->target, device); ++ crm_trace("Checking for someone to fence (%s) %s with %s", ++ op->action, op->target, device); + } else { +- crm_trace("Checking for someone to fence %s", op->target); ++ crm_trace("Checking for someone to fence (%s) %s", ++ op->action, op->target); + } + ++ /* Best choice is a peer other than the target with verified access */ + peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET|FIND_PEER_VERIFIED_ONLY); + if (peer) { + crm_trace("Found verified peer %s for %s", peer->host, device?device:""); +@@ -899,62 +1145,101 @@ stonith_choose_peer(remote_fencing_op_t * op) + return NULL; + } + ++ /* If no other peer has verified access, next best is unverified access */ + peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET); + if (peer) { + crm_trace("Found best unverified peer %s", peer->host); + return peer; + } + +- peer = find_best_peer(device, op, FIND_PEER_TARGET_ONLY); +- if(peer) { +- crm_trace("%s will fence itself", 
peer->host); +- return peer; ++ /* If no other peer can do it, last option is self-fencing ++ * (which is never allowed for the "on" phase of a remapped reboot) ++ */ ++ if (op->phase != st_phase_on) { ++ peer = find_best_peer(device, op, FIND_PEER_TARGET_ONLY); ++ if (peer) { ++ crm_trace("%s will fence itself", peer->host); ++ return peer; ++ } + } + +- /* Try the next fencing level if there is one */ +- } while (is_set(op->call_options, st_opt_topology) ++ /* Try the next fencing level if there is one (unless we're in the "on" ++ * phase of a remapped "reboot", because we ignore errors in that case) ++ */ ++ } while ((op->phase != st_phase_on) ++ && is_set(op->call_options, st_opt_topology) + && stonith_topology_next(op) == pcmk_ok); + +- crm_notice("Couldn't find anyone to fence %s with %s", op->target, device?device:""); ++ crm_notice("Couldn't find anyone to fence (%s) %s with %s", ++ op->action, op->target, (device? device : "any device")); + return NULL; + } + + static int +-get_device_timeout(st_query_result_t * peer, const char *device, int default_timeout) ++get_device_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer, ++ const char *device) + { +- gpointer res; +- int delay_max = 0; ++ device_properties_t *props; + + if (!peer || !device) { +- return default_timeout; ++ return op->base_timeout; + } + +- res = g_hash_table_lookup(peer->delay_maxes, device); +- if (res && GPOINTER_TO_INT(res) > 0) { +- delay_max = GPOINTER_TO_INT(res); ++ props = g_hash_table_lookup(peer->devices, device); ++ if (!props) { ++ return op->base_timeout; + } + +- res = g_hash_table_lookup(peer->custom_action_timeouts, device); ++ return (props->custom_action_timeout[op->phase]? ++ props->custom_action_timeout[op->phase] : op->base_timeout) ++ + props->delay_max[op->phase]; ++} + +- return res ? 
GPOINTER_TO_INT(res) + delay_max : default_timeout + delay_max; ++struct timeout_data { ++ const remote_fencing_op_t *op; ++ const st_query_result_t *peer; ++ int total_timeout; ++}; ++ ++/*! ++ * \internal ++ * \brief Add timeout to a total if device has not been executed yet ++ * ++ * \param[in] key GHashTable key (device ID) ++ * \param[in] value GHashTable value (device properties) ++ * \param[in] user_data Timeout data ++ */ ++static void ++add_device_timeout(gpointer key, gpointer value, gpointer user_data) ++{ ++ const char *device_id = key; ++ device_properties_t *props = value; ++ struct timeout_data *timeout = user_data; ++ ++ if (!props->executed[timeout->op->phase] ++ && !props->disallowed[timeout->op->phase]) { ++ timeout->total_timeout += get_device_timeout(timeout->op, ++ timeout->peer, device_id); ++ } + } + + static int +-get_peer_timeout(st_query_result_t * peer, int default_timeout) ++get_peer_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer) + { +- int total_timeout = 0; ++ struct timeout_data timeout; + +- GListPtr cur = NULL; ++ timeout.op = op; ++ timeout.peer = peer; ++ timeout.total_timeout = 0; + +- for (cur = peer->device_list; cur; cur = cur->next) { +- total_timeout += get_device_timeout(peer, cur->data, default_timeout); +- } ++ g_hash_table_foreach(peer->devices, add_device_timeout, &timeout); + +- return total_timeout ? total_timeout : default_timeout; ++ return (timeout.total_timeout? 
timeout.total_timeout : op->base_timeout); + } + + static int +-get_op_total_timeout(remote_fencing_op_t * op, st_query_result_t * chosen_peer, int default_timeout) ++get_op_total_timeout(const remote_fencing_op_t *op, ++ const st_query_result_t *chosen_peer) + { + int total_timeout = 0; + stonith_topology_t *tp = find_topology_for_host(op->target); +@@ -977,11 +1262,11 @@ get_op_total_timeout(remote_fencing_op_t * op, st_query_result_t * chosen_peer, + } + for (device_list = tp->levels[i]; device_list; device_list = device_list->next) { + for (iter = op->query_results; iter != NULL; iter = iter->next) { +- st_query_result_t *peer = iter->data; ++ const st_query_result_t *peer = iter->data; + +- if (g_list_find_custom(peer->device_list, device_list->data, sort_strings)) { +- total_timeout += +- get_device_timeout(peer, device_list->data, default_timeout); ++ if (find_peer_device(op, peer, device_list->data)) { ++ total_timeout += get_device_timeout(op, peer, ++ device_list->data); + break; + } + } /* End Loop3: match device with peer that owns device, find device's timeout period */ +@@ -989,12 +1274,12 @@ get_op_total_timeout(remote_fencing_op_t * op, st_query_result_t * chosen_peer, + } /*End Loop1: iterate through fencing levels */ + + } else if (chosen_peer) { +- total_timeout = get_peer_timeout(chosen_peer, default_timeout); ++ total_timeout = get_peer_timeout(op, chosen_peer); + } else { +- total_timeout = default_timeout; ++ total_timeout = op->base_timeout; + } + +- return total_timeout ? total_timeout : default_timeout; ++ return total_timeout ? 
total_timeout : op->base_timeout; + } + + static void +@@ -1049,6 +1334,55 @@ report_timeout_period(remote_fencing_op_t * op, int op_timeout) + } + } + ++/* ++ * \internal ++ * \brief Advance an operation to the next device in its topology ++ * ++ * \param[in,out] op Operation to advance ++ * \param[in] device ID of device just completed ++ * \param[in] msg XML reply that contained device result (if available) ++ * \param[in] rc Return code of device's execution ++ */ ++static void ++advance_op_topology(remote_fencing_op_t *op, const char *device, xmlNode *msg, ++ int rc) ++{ ++ /* Advance to the next device at this topology level, if any */ ++ if (op->devices) { ++ op->devices = op->devices->next; ++ } ++ ++ /* If this device was required, it's not anymore */ ++ remove_required_device(op, device); ++ ++ /* If there are no more devices at this topology level, ++ * run through any required devices not already executed ++ */ ++ if (op->devices == NULL) { ++ op->devices = op->required_list[op->phase]; ++ } ++ ++ if ((op->devices == NULL) && (op->phase == st_phase_off)) { ++ /* We're done with this level and with required devices, but we had ++ * remapped "reboot" to "off", so start over with "on". If any devices ++ * need to be turned back on, op->devices will be non-NULL after this. 
++ */ ++ op_phase_on(op); ++ } ++ ++ if (op->devices) { ++ /* Necessary devices remain, so execute the next one */ ++ crm_trace("Next for %s on behalf of %s@%s (rc was %d)", ++ op->target, op->originator, op->client_name, rc); ++ call_remote_stonith(op, NULL); ++ } else { ++ /* We're done with all devices and phases, so finalize operation */ ++ crm_trace("Marking complex fencing op for %s as complete", op->target); ++ op->state = st_done; ++ remote_op_done(op, msg, rc, FALSE); ++ } ++} ++ + void + call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer) + { +@@ -1061,7 +1395,7 @@ call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer) + } + + if (!op->op_timer_total) { +- int total_timeout = get_op_total_timeout(op, peer, op->base_timeout); ++ int total_timeout = get_op_total_timeout(op, peer); + + op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * total_timeout; + op->op_timer_total = g_timeout_add(1000 * op->total_timeout, remote_op_timeout, op); +@@ -1071,13 +1405,13 @@ call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer) + } + + if (is_set(op->call_options, st_opt_topology) && op->devices) { +- /* Ignore any preference, they might not have the device we need */ +- /* When using topology, the stonith_choose_peer function pops off +- * the peer from the op's query results. 
Make sure to calculate +- * the op_timeout before calling this function when topology is in use */ ++ /* Ignore any peer preference, they might not have the device we need */ ++ /* When using topology, stonith_choose_peer() removes the device from ++ * further consideration, so be sure to calculate timeout beforehand */ + peer = stonith_choose_peer(op); ++ + device = op->devices->data; +- timeout = get_device_timeout(peer, device, op->base_timeout); ++ timeout = get_device_timeout(op, peer, device); + } + + if (peer) { +@@ -1094,15 +1428,15 @@ call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer) + crm_xml_add_int(remote_op, F_STONITH_CALLOPTS, op->call_options); + + if (device) { +- timeout_one = +- TIMEOUT_MULTIPLY_FACTOR * get_device_timeout(peer, device, op->base_timeout); ++ timeout_one = TIMEOUT_MULTIPLY_FACTOR * ++ get_device_timeout(op, peer, device); + crm_info("Requesting that %s perform op %s %s with %s for %s (%ds)", peer->host, + op->action, op->target, device, op->client_name, timeout_one); + crm_xml_add(remote_op, F_STONITH_DEVICE, device); + crm_xml_add(remote_op, F_STONITH_MODE, "slave"); + + } else { +- timeout_one = TIMEOUT_MULTIPLY_FACTOR * get_peer_timeout(peer, op->base_timeout); ++ timeout_one = TIMEOUT_MULTIPLY_FACTOR * get_peer_timeout(op, peer); + crm_info("Requesting that %s perform op %s %s for %s (%ds, %ds)", + peer->host, op->action, op->target, op->client_name, timeout_one, stonith_watchdog_timeout_ms); + crm_xml_add(remote_op, F_STONITH_MODE, "smart"); +@@ -1115,16 +1449,18 @@ call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer) + } + + if(stonith_watchdog_timeout_ms > 0 && device && safe_str_eq(device, "watchdog")) { +- crm_notice("Waiting %ds for %s to self-terminate for %s.%.8s (%p)", +- stonith_watchdog_timeout_ms/1000, op->target, op->client_name, op->id, device); ++ crm_notice("Waiting %ds for %s to self-fence (%s) for %s.%.8s (%p)", ++ stonith_watchdog_timeout_ms/1000, op->target, ++ 
op->action, op->client_name, op->id, device); + op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op); + +- /* TODO: We should probably look into peer->device_list to verify watchdog is going to be in use */ ++ /* TODO check devices to verify watchdog will be in use */ + } else if(stonith_watchdog_timeout_ms > 0 + && safe_str_eq(peer->host, op->target) + && safe_str_neq(op->action, "on")) { +- crm_notice("Waiting %ds for %s to self-terminate for %s.%.8s (%p)", +- stonith_watchdog_timeout_ms/1000, op->target, op->client_name, op->id, device); ++ crm_notice("Waiting %ds for %s to self-fence (%s) for %s.%.8s (%p)", ++ stonith_watchdog_timeout_ms/1000, op->target, ++ op->action, op->client_name, op->id, device); + op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op); + + } else { +@@ -1137,13 +1473,23 @@ call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer) + free_xml(remote_op); + return; + ++ } else if (op->phase == st_phase_on) { ++ /* A remapped "on" cannot be executed, but the node was already ++ * turned off successfully, so ignore the error and continue. 
++ */ ++ crm_warn("Ignoring %s 'on' failure (no capable peers) for %s after successful 'off'", ++ device, op->target); ++ advance_op_topology(op, device, NULL, pcmk_ok); ++ return; ++ + } else if (op->owner == FALSE) { +- crm_err("The termination of %s for %s is not ours to control", op->target, op->client_name); ++ crm_err("Fencing (%s) of %s for %s is not ours to control", ++ op->action, op->target, op->client_name); + + } else if (op->query_timer == 0) { + /* We've exhausted all available peers */ +- crm_info("No remaining peers capable of terminating %s for %s (%d)", op->target, +- op->client_name, op->state); ++ crm_info("No remaining peers capable of fencing (%s) %s for %s (%d)", ++ op->action, op->target, op->client_name, op->state); + CRM_LOG_ASSERT(op->state < st_done); + remote_op_timeout(op); + +@@ -1153,33 +1499,37 @@ call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer) + /* if the operation never left the query state, + * but we have all the expected replies, then no devices + * are available to execute the fencing operation. 
*/ ++ + if(stonith_watchdog_timeout_ms && (device == NULL || safe_str_eq(device, "watchdog"))) { +- crm_notice("Waiting %ds for %s to self-terminate for %s.%.8s (%p)", +- stonith_watchdog_timeout_ms/1000, op->target, op->client_name, op->id, device); ++ crm_notice("Waiting %ds for %s to self-fence (%s) for %s.%.8s (%p)", ++ stonith_watchdog_timeout_ms/1000, op->target, ++ op->action, op->client_name, op->id, device); + + op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op); + return; + } + + if (op->state == st_query) { +- crm_info("None of the %d peers have devices capable of terminating %s for %s (%d)", +- op->replies, op->target, op->client_name, op->state); ++ crm_info("None of the %d peers have devices capable of fencing (%s) %s for %s (%d)", ++ op->replies, op->action, op->target, op->client_name, ++ op->state); + + rc = -ENODEV; + } else { +- crm_info("None of the %d peers are capable of terminating %s for %s (%d)", +- op->replies, op->target, op->client_name, op->state); ++ crm_info("None of the %d peers are capable of fencing (%s) %s for %s (%d)", ++ op->replies, op->action, op->target, op->client_name, ++ op->state); + } + + op->state = st_failed; + remote_op_done(op, NULL, rc, FALSE); + + } else if (device) { +- crm_info("Waiting for additional peers capable of terminating %s with %s for %s.%.8s", +- op->target, device, op->client_name, op->id); ++ crm_info("Waiting for additional peers capable of fencing (%s) %s with %s for %s.%.8s", ++ op->action, op->target, device, op->client_name, op->id); + } else { +- crm_info("Waiting for additional peers capable of terminating %s for %s%.8s", +- op->target, op->client_name, op->id); ++ crm_info("Waiting for additional peers capable of fencing (%s) %s for %s%.8s", ++ op->action, op->target, op->client_name, op->id); + } + } + +@@ -1200,7 +1550,7 @@ sort_peers(gconstpointer a, gconstpointer b) + const st_query_result_t *peer_a = a; + const st_query_result_t *peer_b = b; + +- 
return (peer_b->devices - peer_a->devices); ++ return (peer_b->ndevices - peer_a->ndevices); + } + + /*! +@@ -1212,7 +1562,7 @@ all_topology_devices_found(remote_fencing_op_t * op) + { + GListPtr device = NULL; + GListPtr iter = NULL; +- GListPtr match = NULL; ++ device_properties_t *match = NULL; + stonith_topology_t *tp = NULL; + gboolean skip_target = FALSE; + int i; +@@ -1236,7 +1586,7 @@ all_topology_devices_found(remote_fencing_op_t * op) + if (skip_target && safe_str_eq(peer->host, op->target)) { + continue; + } +- match = g_list_find_custom(peer->device_list, device->data, sort_strings); ++ match = find_peer_device(op, peer, device->data); + } + if (!match) { + return FALSE; +@@ -1247,10 +1597,169 @@ all_topology_devices_found(remote_fencing_op_t * op) + return TRUE; + } + ++/* ++ * \internal ++ * \brief Parse action-specific device properties from XML ++ * ++ * \param[in] xml XML element containing the properties ++ * \param[in] peer Name of peer that sent XML (for logs) ++ * \param[in] device Device ID (for logs) ++ * \param[in] action Action the properties relate to (for logs) ++ * \param[in] phase Phase the properties relate to ++ * \param[in,out] props Device properties to update ++ */ ++static void ++parse_action_specific(xmlNode *xml, const char *peer, const char *device, ++ const char *action, remote_fencing_op_t *op, ++ enum st_remap_phase phase, device_properties_t *props) ++{ ++ int required; ++ ++ props->custom_action_timeout[phase] = 0; ++ crm_element_value_int(xml, F_STONITH_ACTION_TIMEOUT, ++ &props->custom_action_timeout[phase]); ++ if (props->custom_action_timeout[phase]) { ++ crm_trace("Peer %s with device %s returned %s action timeout %d", ++ peer, device, action, props->custom_action_timeout[phase]); ++ } ++ ++ props->delay_max[phase] = 0; ++ crm_element_value_int(xml, F_STONITH_DELAY_MAX, &props->delay_max[phase]); ++ if (props->delay_max[phase]) { ++ crm_trace("Peer %s with device %s returned maximum of random delay %d for %s", ++ 
peer, device, props->delay_max[phase], action); ++ } ++ ++ required = 0; ++ crm_element_value_int(xml, F_STONITH_DEVICE_REQUIRED, &required); ++ if (required) { ++ /* If the action is marked as required, add the device to the ++ * operation's list of required devices for this phase. We use this ++ * for unfencing when executing a topology. In phase 0 (requested ++ * action) or phase 1 (remapped "off"), required devices get executed ++ * regardless of their topology level; in phase 2 (remapped "on"), ++ * required devices are not attempted, because the cluster will ++ * execute them automatically later. ++ */ ++ crm_trace("Peer %s requires device %s to execute for action %s", ++ peer, device, action); ++ add_required_device(op, phase, device); ++ } ++ ++ /* If a reboot is remapped to off+on, it's possible that a node is allowed ++ * to perform one action but not another. ++ */ ++ if (crm_is_true(crm_element_value(xml, F_STONITH_ACTION_DISALLOWED))) { ++ props->disallowed[phase] = TRUE; ++ crm_trace("Peer %s is disallowed from executing %s for device %s", ++ peer, action, device); ++ } ++} ++ ++/* ++ * \internal ++ * \brief Parse one device's properties from peer's XML query reply ++ * ++ * \param[in] xml XML node containing device properties ++ * \param[in,out] op Operation that query and reply relate to ++ * \param[in,out] result Peer's results ++ * \param[in] device ID of device being parsed ++ */ ++static void ++add_device_properties(xmlNode *xml, remote_fencing_op_t *op, ++ st_query_result_t *result, const char *device) ++{ ++ xmlNode *child; ++ int verified = 0; ++ device_properties_t *props = calloc(1, sizeof(device_properties_t)); ++ ++ /* Add a new entry to this result's devices list */ ++ CRM_ASSERT(props != NULL); ++ g_hash_table_insert(result->devices, strdup(device), props); ++ ++ /* Peers with verified (monitored) access will be preferred */ ++ crm_element_value_int(xml, F_STONITH_DEVICE_VERIFIED, &verified); ++ if (verified) { ++ crm_trace("Peer %s has 
confirmed a verified device %s", ++ result->host, device); ++ props->verified = TRUE; ++ } ++ ++ /* Parse action-specific device properties */ ++ parse_action_specific(xml, result->host, device, op_requested_action(op), ++ op, st_phase_requested, props); ++ for (child = __xml_first_child(xml); child != NULL; child = __xml_next(child)) { ++ /* Replies for "reboot" operations will include the action-specific ++ * values for "off" and "on" in child elements, just in case the reboot ++ * winds up getting remapped. ++ */ ++ if (safe_str_eq(ID(child), "off")) { ++ parse_action_specific(child, result->host, device, "off", ++ op, st_phase_off, props); ++ } else if (safe_str_eq(ID(child), "on")) { ++ parse_action_specific(child, result->host, device, "on", ++ op, st_phase_on, props); ++ } ++ } ++} ++ ++/* ++ * \internal ++ * \brief Parse a peer's XML query reply and add it to operation's results ++ * ++ * \param[in,out] op Operation that query and reply relate to ++ * \param[in] host Name of peer that sent this reply ++ * \param[in] ndevices Number of devices expected in reply ++ * \param[in] xml XML node containing device list ++ * ++ * \return Newly allocated result structure with parsed reply ++ */ ++static st_query_result_t * ++add_result(remote_fencing_op_t *op, const char *host, int ndevices, xmlNode *xml) ++{ ++ st_query_result_t *result = calloc(1, sizeof(st_query_result_t)); ++ xmlNode *child; ++ ++ CRM_CHECK(result != NULL, return NULL); ++ result->host = strdup(host); ++ result->devices = g_hash_table_new_full(crm_str_hash, g_str_equal, free, free); ++ ++ /* Each child element describes one capable device available to the peer */ ++ for (child = __xml_first_child(xml); child != NULL; child = __xml_next(child)) { ++ const char *device = ID(child); ++ ++ if (device) { ++ add_device_properties(child, op, result, device); ++ } ++ } ++ ++ result->ndevices = g_hash_table_size(result->devices); ++ CRM_CHECK(ndevices == result->ndevices, ++ crm_err("Query claimed to have 
%d devices but %d found", ++ ndevices, result->ndevices)); ++ ++ op->query_results = g_list_insert_sorted(op->query_results, result, sort_peers); ++ return result; ++} ++ ++/* ++ * \internal ++ * \brief Handle a peer's reply to our fencing query ++ * ++ * Parse a query result from XML and store it in the remote operation ++ * table, and when enough replies have been received, issue a fencing request. ++ * ++ * \param[in] msg XML reply received ++ * ++ * \return pcmk_ok on success, -errno on error ++ * ++ * \note See initiate_remote_stonith_op() for how the XML query was initially ++ * formed, and stonith_query() for how the peer formed its XML reply. ++ */ + int + process_remote_stonith_query(xmlNode * msg) + { +- int devices = 0; ++ int ndevices = 0; + gboolean host_is_target = FALSE; + gboolean have_all_replies = FALSE; + const char *id = NULL; +@@ -1259,7 +1768,6 @@ process_remote_stonith_query(xmlNode * msg) + st_query_result_t *result = NULL; + uint32_t replies_expected; + xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR); +- xmlNode *child = NULL; + + CRM_CHECK(dev != NULL, return -EPROTO); + +@@ -1268,7 +1776,7 @@ process_remote_stonith_query(xmlNode * msg) + + dev = get_xpath_object("//@" F_STONITH_AVAILABLE_DEVICES, msg, LOG_ERR); + CRM_CHECK(dev != NULL, return -EPROTO); +- crm_element_value_int(dev, F_STONITH_AVAILABLE_DEVICES, &devices); ++ crm_element_value_int(dev, F_STONITH_AVAILABLE_DEVICES, &ndevices); + + op = g_hash_table_lookup(remote_op_list, id); + if (op == NULL) { +@@ -1283,75 +1791,13 @@ process_remote_stonith_query(xmlNode * msg) + host = crm_element_value(msg, F_ORIG); + host_is_target = safe_str_eq(host, op->target); + +- if (devices <= 0) { +- /* If we're doing 'known' then we might need to fire anyway */ +- crm_trace("Query result %d of %d from %s for %s/%s (%d devices) %s", +- op->replies, replies_expected, host, +- op->target, op->action, devices, id); +- if (have_all_replies) { +- crm_info("All query 
replies have arrived, continuing (%d expected/%d received for id %s)", +- replies_expected, op->replies, id); +- call_remote_stonith(op, NULL); +- } +- return pcmk_ok; +- } +- + crm_info("Query result %d of %d from %s for %s/%s (%d devices) %s", + op->replies, replies_expected, host, +- op->target, op->action, devices, id); +- result = calloc(1, sizeof(st_query_result_t)); +- result->host = strdup(host); +- result->devices = devices; +- result->custom_action_timeouts = g_hash_table_new_full(crm_str_hash, g_str_equal, free, NULL); +- result->delay_maxes = g_hash_table_new_full(crm_str_hash, g_str_equal, free, NULL); +- result->verified_devices = g_hash_table_new_full(crm_str_hash, g_str_equal, free, NULL); +- +- for (child = __xml_first_child(dev); child != NULL; child = __xml_next(child)) { +- const char *device = ID(child); +- int action_timeout = 0; +- int delay_max = 0; +- int verified = 0; +- int required = 0; +- +- if (device) { +- result->device_list = g_list_prepend(result->device_list, strdup(device)); +- crm_element_value_int(child, F_STONITH_ACTION_TIMEOUT, &action_timeout); +- crm_element_value_int(child, F_STONITH_DELAY_MAX, &delay_max); +- crm_element_value_int(child, F_STONITH_DEVICE_VERIFIED, &verified); +- crm_element_value_int(child, F_STONITH_DEVICE_REQUIRED, &required); +- if (action_timeout) { +- crm_trace("Peer %s with device %s returned action timeout %d", +- result->host, device, action_timeout); +- g_hash_table_insert(result->custom_action_timeouts, +- strdup(device), GINT_TO_POINTER(action_timeout)); +- } +- if (delay_max > 0) { +- crm_trace("Peer %s with device %s returned maximum of random delay %d", +- result->host, device, delay_max); +- g_hash_table_insert(result->delay_maxes, +- strdup(device), GINT_TO_POINTER(delay_max)); +- } +- if (verified) { +- crm_trace("Peer %s has confirmed a verified device %s", result->host, device); +- g_hash_table_insert(result->verified_devices, +- strdup(device), GINT_TO_POINTER(verified)); +- } +- if 
(required) { +- crm_trace("Peer %s requires device %s to execute for action %s", +- result->host, device, op->action); +- /* This matters when executing a topology. Required devices will get +- * executed regardless of their topology level. We use this for unfencing. */ +- add_required_device(op, device); +- } +- } ++ op->target, op->action, ndevices, id); ++ if (ndevices > 0) { ++ result = add_result(op, host, ndevices, dev); + } + +- CRM_CHECK(devices == g_list_length(result->device_list), +- crm_err("Mis-match: Query claimed to have %d devices but %d found", devices, +- g_list_length(result->device_list))); +- +- op->query_results = g_list_insert_sorted(op->query_results, result, sort_peers); +- + if (is_set(op->call_options, st_opt_topology)) { + /* If we start the fencing before all the topology results are in, + * it is possible fencing levels will be skipped because of the missing +@@ -1368,11 +1814,13 @@ process_remote_stonith_query(xmlNode * msg) + } + + } else if (op->state == st_query) { ++ int nverified = count_peer_devices(op, result, TRUE); ++ + /* We have a result for a non-topology fencing op that looks promising, + * go ahead and start fencing before query timeout */ +- if (host_is_target == FALSE && g_hash_table_size(result->verified_devices)) { ++ if (result && (host_is_target == FALSE) && nverified) { + /* we have a verified device living on a peer that is not the target */ +- crm_trace("Found %d verified devices", g_hash_table_size(result->verified_devices)); ++ crm_trace("Found %d verified devices", nverified); + call_remote_stonith(op, result); + + } else if (have_all_replies) { +@@ -1384,14 +1832,25 @@ process_remote_stonith_query(xmlNode * msg) + crm_trace("Waiting for more peer results before launching fencing operation"); + } + +- } else if (op->state == st_done) { ++ } else if (result && (op->state == st_done)) { + crm_info("Discarding query result from %s (%d devices): Operation is in state %d", +- result->host, result->devices, 
op->state); ++ result->host, result->ndevices, op->state); + } + + return pcmk_ok; + } + ++/* ++ * \internal ++ * \brief Handle a peer's reply to a fencing request ++ * ++ * Parse a fencing reply from XML, and either finalize the operation ++ * or attempt another device as appropriate. ++ * ++ * \param[in] msg XML reply received ++ * ++ * \return pcmk_ok on success, -errno on error ++ */ + int + process_remote_stonith_exec(xmlNode * msg) + { +@@ -1472,26 +1931,20 @@ process_remote_stonith_exec(xmlNode * msg) + return rc; + } + +- /* An operation completed succesfully but has not yet been marked as done. +- * Continue the topology if more devices exist at the current level, otherwise +- * mark as done. */ ++ if ((op->phase == st_phase_on) && (rc != pcmk_ok)) { ++ /* A remapped "on" failed, but the node was already turned off ++ * successfully, so ignore the error and continue. ++ */ ++ crm_warn("Ignoring %s 'on' failure (exit code %d) for %s after successful 'off'", ++ device, rc, op->target); ++ rc = pcmk_ok; ++ } ++ + if (rc == pcmk_ok) { +- GListPtr required_match = g_list_find_custom(op->required_list, device, sort_strings); +- if (op->devices) { +- /* Success, are there any more? */ +- op->devices = op->devices->next; +- } +- if (required_match) { +- op->required_list = g_list_remove(op->required_list, required_match->data); +- } +- /* if no more devices at this fencing level, we are done, +- * else we need to contine with executing the next device in the list */ +- if (op->devices == NULL) { +- crm_trace("Marking complex fencing op for %s as complete", op->target); +- op->state = st_done; +- remote_op_done(op, msg, rc, FALSE); +- return rc; +- } ++ /* An operation completed successfully. Try another device if ++ * necessary, otherwise mark the operation as done. */ ++ advance_op_topology(op, device, msg, rc); ++ return rc; + } else { + /* This device failed, time to try another topology level. 
If no other + * levels are available, mark this operation as failed and report results. */ +@@ -1516,7 +1969,7 @@ process_remote_stonith_exec(xmlNode * msg) + /* fall-through and attempt other fencing action using another peer */ + } + +- /* Retry on failure or execute the rest of the topology */ ++ /* Retry on failure */ + crm_trace("Next for %s on behalf of %s@%s (rc was %d)", op->target, op->originator, + op->client_name, rc); + call_remote_stonith(op, NULL); +@@ -1595,6 +2048,9 @@ stonith_check_fence_tolerance(int tolerance, const char *target, const char *act + continue; + } else if (rop->state != st_done) { + continue; ++ /* We don't have to worry about remapped reboots here ++ * because if state is done, any remapping has been undone ++ */ + } else if (strcmp(rop->action, action) != 0) { + continue; + } else if ((rop->completed + tolerance) < now) { +diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h +index a6f58b1..a59151b 100644 +--- a/include/crm/fencing/internal.h ++++ b/include/crm/fencing/internal.h +@@ -63,6 +63,8 @@ xmlNode *create_device_registration_xml(const char *id, const char *namespace, c + # define F_STONITH_TOLERANCE "st_tolerance" + /*! Action specific timeout period returned in query of fencing devices. */ + # define F_STONITH_ACTION_TIMEOUT "st_action_timeout" ++/*! Host in query result is not allowed to run this action */ ++# define F_STONITH_ACTION_DISALLOWED "st_action_disallowed" + /*! Maximum of random fencing delay for a device */ + # define F_STONITH_DELAY_MAX "st_delay_max" + /*! Has this device been verified using a monitor type +diff --git a/include/crm/lrmd.h b/include/crm/lrmd.h +index e3a0d63..730cad3 100644 +--- a/include/crm/lrmd.h ++++ b/include/crm/lrmd.h +@@ -200,8 +200,6 @@ typedef struct lrmd_event_data_s { + enum ocf_exitcode rc; + /*! The lrmd status returned for exec_complete events */ + int op_status; +- /*! 
exit failure reason string from resource agent operation */ +- const char *exit_reason; + /*! stdout from resource agent operation */ + const char *output; + /*! Timestamp of when op ran */ +@@ -226,6 +224,9 @@ typedef struct lrmd_event_data_s { + * to the proper client. */ + const char *remote_nodename; + ++ /*! exit failure reason string from resource agent operation */ ++ const char *exit_reason; ++ + } lrmd_event_data_t; + + lrmd_event_data_t *lrmd_copy_event(lrmd_event_data_t * event); +diff --git a/include/crm/pengine/status.h b/include/crm/pengine/status.h +index 4bfa3fe..4214959 100644 +--- a/include/crm/pengine/status.h ++++ b/include/crm/pengine/status.h +@@ -137,10 +137,6 @@ struct node_shared_s { + gboolean shutdown; + gboolean expected_up; + gboolean is_dc; +- gboolean rsc_discovery_enabled; +- +- gboolean remote_requires_reset; +- gboolean remote_was_fenced; + + int num_resources; + GListPtr running_rsc; /* resource_t* */ +@@ -157,14 +153,17 @@ struct node_shared_s { + GHashTable *digest_cache; + + gboolean maintenance; ++ gboolean rsc_discovery_enabled; ++ gboolean remote_requires_reset; ++ gboolean remote_was_fenced; + }; + + struct node_s { + int weight; + gboolean fixed; +- int rsc_discover_mode; + int count; + struct node_shared_s *details; ++ int rsc_discover_mode; + }; + + # include +@@ -262,7 +261,6 @@ struct resource_s { + int migration_threshold; + + gboolean is_remote_node; +- gboolean exclusive_discover; + + unsigned long long flags; + +@@ -296,6 +294,7 @@ struct resource_s { + char *pending_task; + + const char *isolation_wrapper; ++ gboolean exclusive_discover; + }; + + struct pe_action_s { +diff --git a/lib/cib/cib_ops.c b/lib/cib/cib_ops.c +index 5f73559..8966ae2 100644 +--- a/lib/cib/cib_ops.c ++++ b/lib/cib/cib_ops.c +@@ -373,7 +373,10 @@ cib_process_modify(const char *op, int options, const char *section, xmlNode * r + + for (lpc = 0; lpc < max; lpc++) { + xmlNode *match = getXpathResult(xpathObj, lpc); +- crm_debug("Destroying %s", 
(char *)xmlGetNodePath(match)); ++ xmlChar *match_path = xmlGetNodePath(match); ++ ++ crm_debug("Destroying %s", match_path); ++ free(match_path); + free_xml(match); + } + +diff --git a/lib/cib/cib_utils.c b/lib/cib/cib_utils.c +index 28b8e81..d321517 100644 +--- a/lib/cib/cib_utils.c ++++ b/lib/cib/cib_utils.c +@@ -533,7 +533,7 @@ cib_perform_op(const char *op, int call_options, cib_op_t * fn, gboolean is_quer + int current_schema = get_schema_version(schema); + + if (minimum_schema == 0) { +- minimum_schema = get_schema_version("pacemaker-1.1"); ++ minimum_schema = get_schema_version("pacemaker-1.2"); + } + + /* Does the CIB support the "update-*" attributes... */ +diff --git a/lib/cluster/membership.c b/lib/cluster/membership.c +index 28f41cb..b7958eb 100644 +--- a/lib/cluster/membership.c ++++ b/lib/cluster/membership.c +@@ -734,6 +734,14 @@ crm_update_peer_proc(const char *source, crm_node_t * node, uint32_t flag, const + if (crm_status_callback) { + crm_status_callback(crm_status_processes, node, &last); + } ++ ++ /* The client callback shouldn't touch the peer caches, ++ * but as a safety net, bail if the peer cache was destroyed. ++ */ ++ if (crm_peer_cache == NULL) { ++ return NULL; ++ } ++ + if (crm_autoreap) { + node = crm_update_peer_state(__FUNCTION__, node, + is_set(node->processes, crm_get_cluster_proc())? 
+diff --git a/lib/common/Makefile.am b/lib/common/Makefile.am +index f5c0766..a593f40 100644 +--- a/lib/common/Makefile.am ++++ b/lib/common/Makefile.am +@@ -37,7 +37,7 @@ if BUILD_CIBSECRETS + libcrmcommon_la_SOURCES += cib_secrets.c + endif + +-libcrmcommon_la_LDFLAGS = -version-info 8:0:5 ++libcrmcommon_la_LDFLAGS = -version-info 7:0:4 + libcrmcommon_la_LIBADD = @LIBADD_DL@ $(GNUTLSLIBS) + libcrmcommon_la_SOURCES += $(top_builddir)/lib/gnu/md5.c + +diff --git a/lib/common/xml.c b/lib/common/xml.c +index e272049..8eed245 100644 +--- a/lib/common/xml.c ++++ b/lib/common/xml.c +@@ -3430,12 +3430,18 @@ dump_xml_attr(xmlAttrPtr attr, int options, char **buffer, int *offset, int *max + { + char *p_value = NULL; + const char *p_name = NULL; ++ xml_private_t *p = NULL; + + CRM_ASSERT(buffer != NULL); + if (attr == NULL || attr->children == NULL) { + return; + } + ++ p = attr->_private; ++ if (p && is_set(p->flags, xpf_deleted)) { ++ return; ++ } ++ + p_name = (const char *)attr->name; + p_value = crm_xml_escape((const char *)attr->children->content); + buffer_print(*buffer, *max, *offset, " %s=\"%s\"", p_name, p_value); +@@ -3812,6 +3818,10 @@ dump_xml_comment(xmlNode * data, int options, char **buffer, int *offset, int *m + void + crm_xml_dump(xmlNode * data, int options, char **buffer, int *offset, int *max, int depth) + { ++ if(data == NULL) { ++ *offset = 0; ++ *max = 0; ++ } + #if 0 + if (is_not_set(options, xml_log_option_filtered)) { + /* Turning this code on also changes the PE tests for some reason +@@ -4564,6 +4574,8 @@ subtract_xml_object(xmlNode * parent, xmlNode * left, xmlNode * right, + /* changes to name/value pairs */ + for (xIter = crm_first_attr(left); xIter != NULL; xIter = xIter->next) { + const char *prop_name = (const char *)xIter->name; ++ xmlAttrPtr right_attr = NULL; ++ xml_private_t *p = NULL; + + if (strcmp(prop_name, XML_ATTR_ID) == 0) { + continue; +@@ -4582,8 +4594,13 @@ subtract_xml_object(xmlNode * parent, xmlNode * left, xmlNode * 
right, + continue; + } + ++ right_attr = xmlHasProp(right, (const xmlChar *)prop_name); ++ if (right_attr) { ++ p = right_attr->_private; ++ } ++ + right_val = crm_element_value(right, prop_name); +- if (right_val == NULL) { ++ if (right_val == NULL || (p && is_set(p->flags, xpf_deleted))) { + /* new */ + *changed = TRUE; + if (full) { +diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c +index 80f0064..67114c2 100644 +--- a/lib/fencing/st_client.c ++++ b/lib/fencing/st_client.c +@@ -1100,57 +1100,62 @@ stonith_api_device_metadata(stonith_t * stonith, int call_options, const char *a + if (safe_str_eq(provider, "redhat")) { + stonith_action_t *action = stonith_action_create(agent, "metadata", NULL, 0, 5, NULL, NULL); + int exec_rc = stonith_action_execute(action, &rc, &buffer); ++ xmlNode *xml = NULL; ++ xmlNode *actions = NULL; ++ xmlXPathObject *xpathObj = NULL; + + if (exec_rc < 0 || rc != 0 || buffer == NULL) { ++ crm_warn("Could not obtain metadata for %s", agent); + crm_debug("Query failed: %d %d: %s", exec_rc, rc, crm_str(buffer)); + free(buffer); /* Just in case */ + return -EINVAL; ++ } + +- } else { +- +- xmlNode *xml = string2xml(buffer); +- xmlNode *actions = NULL; +- xmlXPathObject *xpathObj = NULL; ++ xml = string2xml(buffer); ++ if(xml == NULL) { ++ crm_warn("Metadata for %s is invalid", agent); ++ free(buffer); ++ return -EINVAL; ++ } + +- xpathObj = xpath_search(xml, "//actions"); +- if (numXpathResults(xpathObj) > 0) { +- actions = getXpathResult(xpathObj, 0); +- } ++ xpathObj = xpath_search(xml, "//actions"); ++ if (numXpathResults(xpathObj) > 0) { ++ actions = getXpathResult(xpathObj, 0); ++ } + +- freeXpathObject(xpathObj); ++ freeXpathObject(xpathObj); + +- /* Now fudge the metadata so that the start/stop actions appear */ +- xpathObj = xpath_search(xml, "//action[@name='stop']"); +- if (numXpathResults(xpathObj) <= 0) { +- xmlNode *tmp = NULL; ++ /* Now fudge the metadata so that the start/stop actions appear */ ++ xpathObj = 
xpath_search(xml, "//action[@name='stop']"); ++ if (numXpathResults(xpathObj) <= 0) { ++ xmlNode *tmp = NULL; + +- tmp = create_xml_node(actions, "action"); +- crm_xml_add(tmp, "name", "stop"); +- crm_xml_add(tmp, "timeout", "20s"); ++ tmp = create_xml_node(actions, "action"); ++ crm_xml_add(tmp, "name", "stop"); ++ crm_xml_add(tmp, "timeout", "20s"); + +- tmp = create_xml_node(actions, "action"); +- crm_xml_add(tmp, "name", "start"); +- crm_xml_add(tmp, "timeout", "20s"); +- } ++ tmp = create_xml_node(actions, "action"); ++ crm_xml_add(tmp, "name", "start"); ++ crm_xml_add(tmp, "timeout", "20s"); ++ } + +- freeXpathObject(xpathObj); ++ freeXpathObject(xpathObj); + +- /* Now fudge the metadata so that the port isn't required in the configuration */ +- xpathObj = xpath_search(xml, "//parameter[@name='port']"); +- if (numXpathResults(xpathObj) > 0) { +- /* We'll fill this in */ +- xmlNode *tmp = getXpathResult(xpathObj, 0); ++ /* Now fudge the metadata so that the port isn't required in the configuration */ ++ xpathObj = xpath_search(xml, "//parameter[@name='port']"); ++ if (numXpathResults(xpathObj) > 0) { ++ /* We'll fill this in */ ++ xmlNode *tmp = getXpathResult(xpathObj, 0); + +- crm_xml_add(tmp, "required", "0"); +- } ++ crm_xml_add(tmp, "required", "0"); ++ } + +- freeXpathObject(xpathObj); +- free(buffer); +- buffer = dump_xml_formatted(xml); +- free_xml(xml); +- if (!buffer) { +- return -EINVAL; +- } ++ freeXpathObject(xpathObj); ++ free(buffer); ++ buffer = dump_xml_formatted(xml); ++ free_xml(xml); ++ if (!buffer) { ++ return -EINVAL; + } + + } else { +@@ -1280,7 +1285,10 @@ stonith_api_query(stonith_t * stonith, int call_options, const char *target, + + CRM_LOG_ASSERT(match != NULL); + if(match != NULL) { +- crm_info("%s[%d] = %s", "//@agent", lpc, xmlGetNodePath(match)); ++ xmlChar *match_path = xmlGetNodePath(match); ++ ++ crm_info("%s[%d] = %s", "//@agent", lpc, match_path); ++ free(match_path); + *devices = stonith_key_value_add(*devices, NULL, 
crm_element_value(match, XML_ATTR_ID)); + } + } +diff --git a/lib/lrmd/Makefile.am b/lib/lrmd/Makefile.am +index e98d1e5..f961ae1 100644 +--- a/lib/lrmd/Makefile.am ++++ b/lib/lrmd/Makefile.am +@@ -25,7 +25,7 @@ AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include \ + lib_LTLIBRARIES = liblrmd.la + + liblrmd_la_SOURCES = lrmd_client.c proxy_common.c +-liblrmd_la_LDFLAGS = -version-info 3:0:0 ++liblrmd_la_LDFLAGS = -version-info 3:0:2 + liblrmd_la_LIBADD = $(top_builddir)/lib/common/libcrmcommon.la \ + $(top_builddir)/lib/services/libcrmservice.la \ + $(top_builddir)/lib/fencing/libstonithd.la +diff --git a/lib/pengine/Makefile.am b/lib/pengine/Makefile.am +index 29b7206..78da075 100644 +--- a/lib/pengine/Makefile.am ++++ b/lib/pengine/Makefile.am +@@ -30,7 +30,7 @@ libpe_rules_la_LDFLAGS = -version-info 2:4:0 + libpe_rules_la_SOURCES = rules.c common.c + libpe_rules_la_LIBADD = $(top_builddir)/lib/common/libcrmcommon.la + +-libpe_status_la_LDFLAGS = -version-info 8:0:0 ++libpe_status_la_LDFLAGS = -version-info 8:0:4 + libpe_status_la_SOURCES = status.c unpack.c utils.c complex.c native.c group.c clone.c rules.c common.c + libpe_status_la_LIBADD = @CURSESLIBS@ $(top_builddir)/lib/common/libcrmcommon.la + +diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c +index 73c44a8..106c674 100644 +--- a/lib/pengine/unpack.c ++++ b/lib/pengine/unpack.c +@@ -2834,8 +2834,9 @@ static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNod + + node_t *remote_node = pe_find_node(data_set->nodes, rsc->id); + if (remote_node && remote_node->details->remote_was_fenced == 0) { +- +- crm_info("Waiting to clear monitor failure for remote node %s until fencing has occured", rsc->id); ++ if (strstr(ID(xml_op), "last_failure")) { ++ crm_info("Waiting to clear monitor failure for remote node %s until fencing has occured", rsc->id); ++ } + /* disabling failure timeout for this operation because we believe + * fencing of the remote node should occur first. 
*/ + failure_timeout = 0; +@@ -2866,6 +2867,9 @@ static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNod + } else { + expired = FALSE; + } ++ } else if (rsc->remote_reconnect_interval && strstr(ID(xml_op), "last_failure")) { ++ /* always clear last failure when reconnect interval is set */ ++ clear_failcount = 1; + } + } + +diff --git a/lib/services/pcmk-dbus.h b/lib/services/pcmk-dbus.h +index afb8a2a..b9a713b 100644 +--- a/lib/services/pcmk-dbus.h ++++ b/lib/services/pcmk-dbus.h +@@ -1,3 +1,7 @@ ++#ifndef DBUS_TIMEOUT_USE_DEFAULT ++# define DBUS_TIMEOUT_USE_DEFAULT -1 ++#endif ++ + DBusConnection *pcmk_dbus_connect(void); + void pcmk_dbus_connection_setup_with_select(DBusConnection *c); + void pcmk_dbus_disconnect(DBusConnection *connection); +diff --git a/lrmd/lrmd.c b/lrmd/lrmd.c +index bd4d33e..0cf98cc 100644 +--- a/lrmd/lrmd.c ++++ b/lrmd/lrmd.c +@@ -219,6 +219,7 @@ free_lrmd_cmd(lrmd_cmd_t * cmd) + } + free(cmd->origin); + free(cmd->action); ++ free(cmd->real_action); + free(cmd->userdata_str); + free(cmd->rsc_id); + free(cmd->output); +diff --git a/pacemaker.spec.in b/pacemaker.spec.in +index 0e3200f..2dfb4a6 100644 +--- a/pacemaker.spec.in ++++ b/pacemaker.spec.in +@@ -54,7 +54,7 @@ + + Name: pacemaker + Summary: Scalable High-Availability cluster resource manager +-Version: 1.1.11 ++Version: 1.1.13 + Release: %{pcmk_release}%{?dist} + License: GPLv2+ and LGPLv2+ + Url: http://www.clusterlabs.org +diff --git a/pengine/Makefile.am b/pengine/Makefile.am +index d14d911..31532cf 100644 +--- a/pengine/Makefile.am ++++ b/pengine/Makefile.am +@@ -61,7 +61,7 @@ endif + noinst_HEADERS = allocate.h utils.h pengine.h + #utils.h pengine.h + +-libpengine_la_LDFLAGS = -version-info 8:0:0 ++libpengine_la_LDFLAGS = -version-info 8:0:4 + # -L$(top_builddir)/lib/pils -lpils -export-dynamic -module -avoid-version + libpengine_la_SOURCES = pengine.c allocate.c utils.c constraints.c + libpengine_la_SOURCES += native.c group.c clone.c master.c graph.c 
utilization.c +diff --git a/pengine/allocate.c b/pengine/allocate.c +index 4b6fca1..68cafd4 100644 +--- a/pengine/allocate.c ++++ b/pengine/allocate.c +@@ -1681,10 +1681,38 @@ apply_remote_node_ordering(pe_working_set_t *data_set) + resource_t *remote_rsc = NULL; + resource_t *container = NULL; + ++ if (action->rsc == NULL) { ++ continue; ++ } ++ ++ /* Special case. */ ++ if (action->rsc && ++ action->rsc->is_remote_node && ++ safe_str_eq(action->task, CRM_OP_CLEAR_FAILCOUNT)) { ++ ++ /* if we are clearing the failcount of an actual remote node connect ++ * resource, then make sure this happens before allowing the connection ++ * to start if we are planning on starting the connection during this ++ * transition */ ++ custom_action_order(action->rsc, ++ NULL, ++ action, ++ action->rsc, ++ generate_op_key(action->rsc->id, RSC_START, 0), ++ NULL, ++ pe_order_optional, ++ data_set); ++ ++ continue; ++ } ++ ++ /* detect if the action occurs on a remote node. if so create ++ * ordering constraints that guarantee the action occurs while ++ * the remote node is active (after start, before stop...) 
things ++ * like that */ + if (action->node == NULL || + is_remote_node(action->node) == FALSE || + action->node->details->remote_rsc == NULL || +- action->rsc == NULL || + is_set(action->flags, pe_action_pseudo)) { + continue; + } +diff --git a/pengine/regression.sh b/pengine/regression.sh +index d57da17..d184798 100755 +--- a/pengine/regression.sh ++++ b/pengine/regression.sh +@@ -566,6 +566,8 @@ do_test colocated-utilization-primitive-2 "Colocated Utilization - Choose the mo + do_test colocated-utilization-group "Colocated Utilization - Group" + do_test colocated-utilization-clone "Colocated Utilization - Clone" + ++do_test utilization-check-allowed-nodes "Only check the capacities of the nodes that can run the resource" ++ + echo "" + do_test reprobe-target_rc "Ensure correct target_rc for reprobe of inactive resources" + do_test node-maintenance-1 "cl#5128 - Node maintenance" +diff --git a/pengine/test10/utilization-check-allowed-nodes.dot b/pengine/test10/utilization-check-allowed-nodes.dot +new file mode 100644 +index 0000000..d09efbc +--- /dev/null ++++ b/pengine/test10/utilization-check-allowed-nodes.dot +@@ -0,0 +1,19 @@ ++digraph "g" { ++"load_stopped_node1 node1" [ style=bold color="green" fontcolor="orange"] ++"load_stopped_node2 node2" [ style=bold color="green" fontcolor="orange"] ++"probe_complete node1" -> "probe_complete" [ style = bold] ++"probe_complete node1" [ style=bold color="green" fontcolor="black"] ++"probe_complete node2" -> "probe_complete" [ style = bold] ++"probe_complete node2" [ style=bold color="green" fontcolor="black"] ++"probe_complete" -> "rsc1_start_0 node2" [ style = bold] ++"probe_complete" [ style=bold color="green" fontcolor="orange"] ++"rsc1_monitor_0 node1" -> "probe_complete node1" [ style = bold] ++"rsc1_monitor_0 node1" [ style=bold color="green" fontcolor="black"] ++"rsc1_monitor_0 node2" -> "probe_complete node2" [ style = bold] ++"rsc1_monitor_0 node2" [ style=bold color="green" fontcolor="black"] ++"rsc1_start_0 
node2" [ style=bold color="green" fontcolor="black"] ++"rsc2_monitor_0 node1" -> "probe_complete node1" [ style = bold] ++"rsc2_monitor_0 node1" [ style=bold color="green" fontcolor="black"] ++"rsc2_monitor_0 node2" -> "probe_complete node2" [ style = bold] ++"rsc2_monitor_0 node2" [ style=bold color="green" fontcolor="black"] ++} +diff --git a/pengine/test10/utilization-check-allowed-nodes.exp b/pengine/test10/utilization-check-allowed-nodes.exp +new file mode 100644 +index 0000000..134ccb3 +--- /dev/null ++++ b/pengine/test10/utilization-check-allowed-nodes.exp +@@ -0,0 +1,112 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/pengine/test10/utilization-check-allowed-nodes.scores b/pengine/test10/utilization-check-allowed-nodes.scores +new file mode 100644 +index 0000000..26887e2 +--- /dev/null ++++ b/pengine/test10/utilization-check-allowed-nodes.scores +@@ -0,0 +1,5 @@ ++Allocation scores: ++native_color: rsc1 allocation score on node1: -INFINITY ++native_color: rsc1 allocation score on node2: 0 ++native_color: rsc2 allocation score on node1: -INFINITY ++native_color: rsc2 allocation score on node2: 0 +diff --git a/pengine/test10/utilization-check-allowed-nodes.summary b/pengine/test10/utilization-check-allowed-nodes.summary +new file mode 100644 +index 0000000..12bf19a +--- /dev/null ++++ b/pengine/test10/utilization-check-allowed-nodes.summary +@@ -0,0 +1,26 @@ ++ ++Current cluster status: ++Online: [ node1 node2 ] ++ ++ rsc1 (ocf::pacemaker:Dummy): Stopped ++ rsc2 (ocf::pacemaker:Dummy): Stopped ++ ++Transition Summary: ++ * Start rsc1 (node2) ++ ++Executing cluster transition: ++ * Resource action: rsc1 monitor on node2 ++ * Resource action: rsc1 monitor on 
node1 ++ * Resource action: rsc2 monitor on node2 ++ * Resource action: rsc2 monitor on node1 ++ * Pseudo action: probe_complete ++ * Pseudo action: load_stopped_node1 ++ * Pseudo action: load_stopped_node2 ++ * Resource action: rsc1 start on node2 ++ ++Revised cluster status: ++Online: [ node1 node2 ] ++ ++ rsc1 (ocf::pacemaker:Dummy): Started node2 ++ rsc2 (ocf::pacemaker:Dummy): Stopped ++ +diff --git a/pengine/test10/utilization-check-allowed-nodes.xml b/pengine/test10/utilization-check-allowed-nodes.xml +new file mode 100644 +index 0000000..39cf51f +--- /dev/null ++++ b/pengine/test10/utilization-check-allowed-nodes.xml +@@ -0,0 +1,39 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/pengine/utilization.c b/pengine/utilization.c +index 982fcc9..db41b21 100644 +--- a/pengine/utilization.c ++++ b/pengine/utilization.c +@@ -344,9 +344,10 @@ process_utilization(resource_t * rsc, node_t ** prefer, pe_working_set_t * data_ + int alloc_details = scores_log_level + 1; + + if (safe_str_neq(data_set->placement_strategy, "default")) { +- GListPtr gIter = NULL; ++ GHashTableIter iter; + GListPtr colocated_rscs = NULL; + gboolean any_capable = FALSE; ++ node_t *node = NULL; + + colocated_rscs = find_colocated_rscs(colocated_rscs, rsc, rsc); + if (colocated_rscs) { +@@ -356,8 +357,11 @@ process_utilization(resource_t * rsc, node_t ** prefer, pe_working_set_t * data_ + + unallocated_utilization = sum_unallocated_utilization(rsc, colocated_rscs); + +- for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) { +- node_t *node = (node_t *) gIter->data; ++ g_hash_table_iter_init(&iter, rsc->allowed_nodes); ++ while (g_hash_table_iter_next(&iter, NULL, (void **)&node)) { ++ if (can_run_resources(node) == FALSE || node->weight < 0) { ++ continue; ++ } + + if (have_enough_capacity(node, rscs_id, unallocated_utilization)) { + any_capable = TRUE; +@@ -371,8 +375,11 @@ 
process_utilization(resource_t * rsc, node_t ** prefer, pe_working_set_t * data_ + } + + if (any_capable) { +- for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) { +- node_t *node = (node_t *) gIter->data; ++ g_hash_table_iter_init(&iter, rsc->allowed_nodes); ++ while (g_hash_table_iter_next(&iter, NULL, (void **)&node)) { ++ if (can_run_resources(node) == FALSE || node->weight < 0) { ++ continue; ++ } + + if (have_enough_capacity(node, rscs_id, unallocated_utilization) == FALSE) { + pe_rsc_debug(rsc, "Resource %s and its colocated resources cannot be allocated to node %s: no enough capacity", +@@ -394,8 +401,11 @@ process_utilization(resource_t * rsc, node_t ** prefer, pe_working_set_t * data_ + } + + if (any_capable == FALSE) { +- for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) { +- node_t *node = (node_t *) gIter->data; ++ g_hash_table_iter_init(&iter, rsc->allowed_nodes); ++ while (g_hash_table_iter_next(&iter, NULL, (void **)&node)) { ++ if (can_run_resources(node) == FALSE || node->weight < 0) { ++ continue; ++ } + + if (have_enough_capacity(node, rsc->id, rsc->utilization) == FALSE) { + pe_rsc_debug(rsc, "Resource %s cannot be allocated to node %s: no enough capacity", +diff --git a/tools/fake_transition.c b/tools/fake_transition.c +index e8c37f7..fe5de95 100644 +--- a/tools/fake_transition.c ++++ b/tools/fake_transition.c +@@ -65,11 +65,14 @@ inject_transient_attr(xmlNode * cib_node, const char *name, const char *value) + xmlNode *attrs = NULL; + xmlNode *container = NULL; + xmlNode *nvp = NULL; ++ xmlChar *node_path; + const char *node_uuid = ID(cib_node); + char *nvp_id = crm_concat(name, node_uuid, '-'); + +- quiet_log("Injecting attribute %s=%s into %s '%s'", name, value, xmlGetNodePath(cib_node), ++ node_path = xmlGetNodePath(cib_node); ++ quiet_log("Injecting attribute %s=%s into %s '%s'", name, value, node_path, + ID(cib_node)); ++ free(node_path); + + attrs = first_named_child(cib_node, XML_TAG_TRANSIENT_NODEATTRS); 
+ if (attrs == NULL) { +diff --git a/valgrind-pcmk.suppressions b/valgrind-pcmk.suppressions +index e7caa55..2e382df 100644 +--- a/valgrind-pcmk.suppressions ++++ b/valgrind-pcmk.suppressions +@@ -20,6 +20,15 @@ + } + + { ++ Another bash leak ++ Memcheck:Leak ++ fun:malloc ++ fun:xmalloc ++ fun:set_default_locale ++ fun:main ++} ++ ++{ + Ignore option parsing + Memcheck:Leak + fun:realloc +@@ -294,4 +303,4 @@ + obj:*/libgobject-* + fun:call_init.part.0 + fun:_dl_init +-} +\ No newline at end of file ++} +diff --git a/version.m4 b/version.m4 +index 22faf65..3d5e96b 100644 +--- a/version.m4 ++++ b/version.m4 +@@ -1 +1 @@ +-m4_define([VERSION_NUMBER], [1.1.12]) ++m4_define([VERSION_NUMBER], [1.1.13]) diff --git a/SOURCES/pacemaker-rollup-3a7715d.patch b/SOURCES/pacemaker-rollup-3a7715d.patch new file mode 100644 index 0000000..6b1935c --- /dev/null +++ b/SOURCES/pacemaker-rollup-3a7715d.patch @@ -0,0 +1,4919 @@ +diff --git a/attrd/commands.c b/attrd/commands.c +index 18c0523..c6586c7 100644 +--- a/attrd/commands.c ++++ b/attrd/commands.c +@@ -832,7 +832,6 @@ attrd_cib_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *u + } + } + done: +- free(name); + if(a && a->changed && election_state(writer) == election_won) { + write_attribute(a); + } +@@ -1019,8 +1018,10 @@ write_attribute(attribute_t *a) + crm_info("Sent update %d with %d changes for %s, id=%s, set=%s", + a->update, cib_updates, a->id, (a->uuid? 
a->uuid : ""), a->set); + +- the_cib->cmds->register_callback( +- the_cib, a->update, 120, FALSE, strdup(a->id), "attrd_cib_callback", attrd_cib_callback); ++ the_cib->cmds->register_callback_full(the_cib, a->update, 120, FALSE, ++ strdup(a->id), ++ "attrd_cib_callback", ++ attrd_cib_callback, free); + } + free_xml(xml_top); + } +diff --git a/attrd/legacy.c b/attrd/legacy.c +index 4aae4c4..8a18c38 100644 +--- a/attrd/legacy.c ++++ b/attrd/legacy.c +@@ -635,6 +635,20 @@ struct attrd_callback_s { + char *value; + }; + ++/* ++ * \internal ++ * \brief Free an attrd callback structure ++ */ ++static void ++free_attrd_callback(void *user_data) ++{ ++ struct attrd_callback_s *data = user_data; ++ ++ free(data->attr); ++ free(data->value); ++ free(data); ++} ++ + static void + attrd_cib_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) + { +@@ -646,7 +660,7 @@ attrd_cib_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *u + + } else if (call_id < 0) { + crm_warn("Update %s=%s failed: %s", data->attr, data->value, pcmk_strerror(call_id)); +- goto cleanup; ++ return; + } + + switch (rc) { +@@ -674,10 +688,6 @@ attrd_cib_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *u + crm_err("Update %d for %s=%s failed: %s", + call_id, data->attr, data->value, pcmk_strerror(rc)); + } +- cleanup: +- free(data->value); +- free(data->attr); +- free(data); + } + + void +@@ -749,8 +759,10 @@ attrd_perform_update(attr_hash_entry_t * hash_entry) + if (hash_entry->value != NULL) { + data->value = strdup(hash_entry->value); + } +- cib_conn->cmds->register_callback(cib_conn, rc, 120, FALSE, data, "attrd_cib_callback", +- attrd_cib_callback); ++ cib_conn->cmds->register_callback_full(cib_conn, rc, 120, FALSE, data, ++ "attrd_cib_callback", ++ attrd_cib_callback, ++ free_attrd_callback); + return; + } + +diff --git a/bumplibs.sh b/bumplibs.sh +index 68f2f58..2044efa 100755 +--- a/bumplibs.sh ++++ b/bumplibs.sh +@@ -3,6 +3,7 @@ + 
declare -A headers + headers[crmcommon]="include/crm/common include/crm/crm.h" + headers[crmcluster]="include/crm/cluster.h" ++headers[crmservice]="include/crm/services.h" + headers[transitioner]="include/crm/transition.h" + headers[cib]="include/crm/cib.h include/crm/cib/util.h" + headers[pe_rules]="include/crm/pengine/rules.h" +@@ -11,8 +12,17 @@ headers[pengine]="include/crm/pengine/common.h include/crm/pengine/complex.h i + headers[stonithd]="include/crm/stonith-ng.h" + headers[lrmd]="include/crm/lrmd.h" + +-LAST_RELEASE=`test -e /Volumes || git tag -l | grep Pacemaker | grep -v rc | sort -Vr | head -n 1` +-for lib in crmcommon crmcluster transitioner cib pe_rules pe_status stonithd pengine lrmd; do ++if [ ! -z $1 ]; then ++ LAST_RELEASE=$1 ++else ++ LAST_RELEASE=`test -e /Volumes || git tag -l | grep Pacemaker | grep -v rc | sort -Vr | head -n 1` ++fi ++libs=$(find . -name "*.am" -exec grep "lib.*_la_LDFLAGS.*version-info" \{\} \; | sed -e s/_la_LDFLAGS.*// -e s/^lib//) ++for lib in $libs; do ++ if [ -z "${headers[$lib]}" ]; then ++ echo "Unknown headers for lib$lib" ++ exit 0 ++ fi + git diff -w $LAST_RELEASE..HEAD ${headers[$lib]} + echo "" + +@@ -27,6 +37,7 @@ for lib in crmcommon crmcluster transitioner cib pe_rules pe_status stonithd pen + fi + + sources=`grep "lib${lib}_la_SOURCES" $am | sed s/.*=// | sed 's:$(top_builddir)/::' | sed 's:$(top_srcdir)/::' | sed 's:\\\::' | sed 's:$(libpe_rules_la_SOURCES):rules.c\ common.c:'` ++ + full_sources="" + for f in $sources; do + if +@@ -48,6 +59,11 @@ for lib in crmcommon crmcluster transitioner cib pe_rules pe_status stonithd pen + echo "" + echo "New arguments to functions or changes to the middle of structs are incompatible additions" + echo "" ++ echo "Where possible:" ++ echo "- move new fields to the end of structs" ++ echo "- use bitfields instead of booleans" ++ echo "- when adding arguments, create new functions that the old version can call" ++ echo "" + read -p "Are the changes to lib$lib: 
[a]dditions, [i]ncompatible additions, [r]emovals or [f]ixes? [None]: " CHANGE + + git show $LAST_RELEASE:$am | grep version-info +diff --git a/cib/callbacks.c b/cib/callbacks.c +index 1452ded..28844b8 100644 +--- a/cib/callbacks.c ++++ b/cib/callbacks.c +@@ -1570,7 +1570,7 @@ static gboolean + cib_force_exit(gpointer data) + { + crm_notice("Forcing exit!"); +- terminate_cib(__FUNCTION__, TRUE); ++ terminate_cib(__FUNCTION__, -1); + return FALSE; + } + +@@ -1656,7 +1656,7 @@ initiate_exit(void) + + active = crm_active_peers(); + if (active < 2) { +- terminate_cib(__FUNCTION__, FALSE); ++ terminate_cib(__FUNCTION__, 0); + return; + } + +@@ -1675,9 +1675,19 @@ initiate_exit(void) + extern int remote_fd; + extern int remote_tls_fd; + ++/* ++ * \internal ++ * \brief Close remote sockets, free the global CIB and quit ++ * ++ * \param[in] caller Name of calling function (for log message) ++ * \param[in] fast If 1, skip disconnect; if -1, also exit error ++ */ + void +-terminate_cib(const char *caller, gboolean fast) ++terminate_cib(const char *caller, int fast) + { ++ crm_info("%s: Exiting%s...", caller, ++ (fast < 0)? " fast" : mainloop ? " from mainloop" : ""); ++ + if (remote_fd > 0) { + close(remote_fd); + remote_fd = 0; +@@ -1687,27 +1697,29 @@ terminate_cib(const char *caller, gboolean fast) + remote_tls_fd = 0; + } + +- if (!fast) { +- crm_info("%s: Disconnecting from cluster infrastructure", caller); +- crm_cluster_disconnect(&crm_cluster); +- } +- + uninitializeCib(); + +- crm_info("%s: Exiting%s...", caller, fast ? " fast" : mainloop ? " from mainloop" : ""); ++ if (fast < 0) { ++ /* Quit fast on error */ ++ cib_ipc_servers_destroy(ipcs_ro, ipcs_rw, ipcs_shm); ++ crm_exit(EINVAL); + +- if (fast == FALSE && mainloop != NULL && g_main_is_running(mainloop)) { ++ } else if ((mainloop != NULL) && g_main_is_running(mainloop)) { ++ /* Quit via returning from the main loop. 
If fast == 1, we skip the ++ * disconnect here, and it will be done when the main loop returns ++ * (this allows the peer status callback to avoid messing with the ++ * peer caches). ++ */ ++ if (fast == 0) { ++ crm_cluster_disconnect(&crm_cluster); ++ } + g_main_quit(mainloop); + + } else { +- qb_ipcs_destroy(ipcs_ro); +- qb_ipcs_destroy(ipcs_rw); +- qb_ipcs_destroy(ipcs_shm); +- +- if (fast) { +- crm_exit(EINVAL); +- } else { +- crm_exit(pcmk_ok); +- } ++ /* Quit via clean exit. Even the peer status callback can disconnect ++ * here, because we're not returning control to the caller. */ ++ crm_cluster_disconnect(&crm_cluster); ++ cib_ipc_servers_destroy(ipcs_ro, ipcs_rw, ipcs_shm); ++ crm_exit(pcmk_ok); + } + } +diff --git a/cib/callbacks.h b/cib/callbacks.h +index bca9992..a49428e 100644 +--- a/cib/callbacks.h ++++ b/cib/callbacks.h +@@ -71,7 +71,7 @@ extern void cib_common_callback_worker(uint32_t id, uint32_t flags, xmlNode * op + + void cib_shutdown(int nsig); + void initiate_exit(void); +-void terminate_cib(const char *caller, gboolean fast); ++void terminate_cib(const char *caller, int fast); + + extern gboolean cib_legacy_mode(void); + +diff --git a/cib/main.c b/cib/main.c +index e20a2b6..cbaf7b5 100644 +--- a/cib/main.c ++++ b/cib/main.c +@@ -71,8 +71,6 @@ gboolean cib_register_ha(ll_cluster_t * hb_cluster, const char *client_name); + void *hb_conn = NULL; + #endif + +-extern void terminate_cib(const char *caller, gboolean fast); +- + GMainLoop *mainloop = NULL; + const char *cib_root = NULL; + char *cib_our_uname = NULL; +@@ -414,7 +412,7 @@ cib_cs_destroy(gpointer user_data) + crm_info("Corosync disconnection complete"); + } else { + crm_err("Corosync connection lost! 
Exiting."); +- terminate_cib(__FUNCTION__, TRUE); ++ terminate_cib(__FUNCTION__, -1); + } + } + #endif +@@ -422,30 +420,29 @@ cib_cs_destroy(gpointer user_data) + static void + cib_peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data) + { +- if ((type == crm_status_processes) && legacy_mode +- && is_not_set(node->processes, crm_get_cluster_proc())) { +- uint32_t old = 0; +- +- if (data) { +- old = *(const uint32_t *)data; +- } ++ switch (type) { ++ case crm_status_processes: ++ if (legacy_mode && is_not_set(node->processes, crm_get_cluster_proc())) { ++ uint32_t old = data? *(const uint32_t *)data : 0; ++ ++ if ((node->processes ^ old) & crm_proc_cpg) { ++ crm_info("Attempting to disable legacy mode after %s left the cluster", ++ node->uname); ++ legacy_mode = FALSE; ++ } ++ } ++ break; + +- if ((node->processes ^ old) & crm_proc_cpg) { +- crm_info("Attempting to disable legacy mode after %s left the cluster", node->uname); +- legacy_mode = FALSE; +- } +- } ++ case crm_status_uname: ++ case crm_status_rstate: ++ case crm_status_nstate: ++ if (cib_shutdown_flag && (crm_active_peers() < 2) ++ && crm_hash_table_size(client_connections) == 0) { + +- if (cib_shutdown_flag && crm_active_peers() < 2 && crm_hash_table_size(client_connections) == 0) { +- crm_info("No more peers"); +- /* @TODO +- * terminate_cib() calls crm_cluster_disconnect() which calls +- * crm_peer_destroy() which destroys the peer caches, which a peer +- * status callback shouldn't do. For now, there is a workaround in +- * crm_update_peer_proc(), but CIB should be refactored to avoid +- * destroying the peer caches here. +- */ +- terminate_cib(__FUNCTION__, FALSE); ++ crm_info("No more peers"); ++ terminate_cib(__FUNCTION__, 1); ++ } ++ break; + } + } + +@@ -455,10 +452,10 @@ cib_ha_connection_destroy(gpointer user_data) + { + if (cib_shutdown_flag) { + crm_info("Heartbeat disconnection complete... 
exiting"); +- terminate_cib(__FUNCTION__, FALSE); ++ terminate_cib(__FUNCTION__, 0); + } else { + crm_err("Heartbeat connection lost! Exiting."); +- terminate_cib(__FUNCTION__, TRUE); ++ terminate_cib(__FUNCTION__, -1); + } + } + #endif +@@ -541,8 +538,12 @@ cib_init(void) + /* Create the mainloop and run it... */ + mainloop = g_main_new(FALSE); + crm_info("Starting %s mainloop", crm_system_name); +- + g_main_run(mainloop); ++ ++ /* If main loop returned, clean up and exit. We disconnect in case ++ * terminate_cib() was called with fast=1. ++ */ ++ crm_cluster_disconnect(&crm_cluster); + cib_ipc_servers_destroy(ipcs_ro, ipcs_rw, ipcs_shm); + + return crm_exit(pcmk_ok); +diff --git a/cib/messages.c b/cib/messages.c +index 363562c..eca63b9 100644 +--- a/cib/messages.c ++++ b/cib/messages.c +@@ -87,7 +87,7 @@ cib_process_shutdown_req(const char *op, int options, const char *section, xmlNo + + } else if (cib_shutdown_flag) { + crm_info("Shutdown ACK from %s", host); +- terminate_cib(__FUNCTION__, FALSE); ++ terminate_cib(__FUNCTION__, 0); + return pcmk_ok; + + } else { +diff --git a/crmd/crmd_utils.h b/crmd/crmd_utils.h +index 78ccad2..78214bf 100644 +--- a/crmd/crmd_utils.h ++++ b/crmd/crmd_utils.h +@@ -102,11 +102,14 @@ gboolean too_many_st_failures(void); + void st_fail_count_reset(const char * target); + void crmd_peer_down(crm_node_t *peer, bool full); + ++/* Convenience macro for registering a CIB callback ++ * (assumes that data can be freed with free()) ++ */ + # define fsa_register_cib_callback(id, flag, data, fn) do { \ + CRM_ASSERT(fsa_cib_conn); \ +- fsa_cib_conn->cmds->register_callback( \ ++ fsa_cib_conn->cmds->register_callback_full( \ + fsa_cib_conn, id, 10 * (1 + crm_active_peers()), \ +- flag, data, #fn, fn); \ ++ flag, data, #fn, fn, free); \ + } while(0) + + # define start_transition(state) do { \ +diff --git a/crmd/join_client.c b/crmd/join_client.c +index 286cd92..65e3bed 100644 +--- a/crmd/join_client.c ++++ b/crmd/join_client.c +@@ -116,8 +116,8 
@@ do_cl_join_offer_respond(long long action, + + /* we only ever want the last one */ + if (query_call_id > 0) { +- /* Calling remove_cib_op_callback() would result in a memory leak of the data field */ + crm_trace("Cancelling previous join query: %d", query_call_id); ++ remove_cib_op_callback(query_call_id, FALSE); + query_call_id = 0; + } + +@@ -173,7 +173,6 @@ join_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void * + + done: + free_xml(generation); +- free(join_id); + } + + /* A_CL_JOIN_RESULT */ +diff --git a/crmd/join_dc.c b/crmd/join_dc.c +index f777296..5280b6e 100644 +--- a/crmd/join_dc.c ++++ b/crmd/join_dc.c +@@ -452,8 +452,6 @@ finalize_sync_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, voi + crm_debug("No longer the DC in S_FINALIZE_JOIN: %s/%s", + AM_I_DC ? "DC" : "CRMd", fsa_state2string(fsa_state)); + } +- +- free(user_data); + } + + static void +diff --git a/crmd/lrm_state.c b/crmd/lrm_state.c +index 162ad03..c03fa0b 100644 +--- a/crmd/lrm_state.c ++++ b/crmd/lrm_state.c +@@ -490,7 +490,7 @@ remote_proxy_cb(lrmd_t *lrmd, void *userdata, xmlNode *msg) + if (remote_proxy_new(lrm_state->node_name, session, channel) == NULL) { + remote_proxy_notify_destroy(lrmd, session); + } +- crm_info("new remote proxy client established to %s, session id %s", channel, session); ++ crm_trace("new remote proxy client established to %s, session id %s", channel, session); + } else if (safe_str_eq(op, "destroy")) { + remote_proxy_end_session(session); + +@@ -534,7 +534,16 @@ remote_proxy_cb(lrmd_t *lrmd, void *userdata, xmlNode *msg) + } + + } else if(is_set(flags, crm_ipc_proxied)) { +- int rc = crm_ipc_send(proxy->ipc, request, flags, 5000, NULL); ++ const char *type = crm_element_value(request, F_TYPE); ++ int rc = 0; ++ ++ if (safe_str_eq(type, T_ATTRD) ++ && crm_element_value(request, F_ATTRD_HOST) == NULL) { ++ crm_xml_add(request, F_ATTRD_HOST, proxy->node_name); ++ crm_xml_add_int(request, F_ATTRD_HOST_ID, 
get_local_nodeid(0)); ++ } ++ ++ rc = crm_ipc_send(proxy->ipc, request, flags, 5000, NULL); + + if(rc < 0) { + xmlNode *op_reply = create_xml_node(NULL, "nack"); +diff --git a/crmd/membership.c b/crmd/membership.c +index 447e6a8..27ae710 100644 +--- a/crmd/membership.c ++++ b/crmd/membership.c +@@ -200,7 +200,6 @@ remove_conflicting_node_callback(xmlNode * msg, int call_id, int rc, + do_crm_log_unlikely(rc == 0 ? LOG_DEBUG : LOG_NOTICE, + "Deletion of the unknown conflicting node \"%s\": %s (rc=%d)", + node_uuid, pcmk_strerror(rc), rc); +- free(node_uuid); + } + + static void +@@ -215,11 +214,9 @@ search_conflicting_node_callback(xmlNode * msg, int call_id, int rc, + crm_notice("Searching conflicting nodes for %s failed: %s (%d)", + new_node_uuid, pcmk_strerror(rc), rc); + } +- free(new_node_uuid); + return; + + } else if (output == NULL) { +- free(new_node_uuid); + return; + } + +@@ -283,8 +280,6 @@ search_conflicting_node_callback(xmlNode * msg, int call_id, int rc, + free_xml(node_state_xml); + } + } +- +- free(new_node_uuid); + } + + static void +diff --git a/crmd/pengine.c b/crmd/pengine.c +index c9544a9..46df648 100644 +--- a/crmd/pengine.c ++++ b/crmd/pengine.c +@@ -77,8 +77,6 @@ save_cib_contents(xmlNode * msg, int call_id, int rc, xmlNode * output, void *us + + free(filename); + } +- +- free(id); + } + + static void +@@ -320,9 +318,10 @@ do_pe_invoke_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void + crm_debug("Discarding PE request in state: %s", fsa_state2string(fsa_state)); + return; + +- } else if (num_cib_op_callbacks() != 0) { +- crm_debug("Re-asking for the CIB: %d peer updates still pending", num_cib_op_callbacks()); +- ++ /* this callback counts as 1 */ ++ } else if (num_cib_op_callbacks() > 1) { ++ crm_debug("Re-asking for the CIB: %d other peer updates still pending", ++ (num_cib_op_callbacks() - 1)); + sleep(1); + register_fsa_action(A_PE_INVOKE); + return; +diff --git a/crmd/te_callbacks.c b/crmd/te_callbacks.c +index 
68742c2..c22b273 100644 +--- a/crmd/te_callbacks.c ++++ b/crmd/te_callbacks.c +@@ -294,6 +294,49 @@ static char *get_node_from_xpath(const char *xpath) + return nodeid; + } + ++static char *extract_node_uuid(const char *xpath) ++{ ++ char *mutable_path = strdup(xpath); ++ char *node_uuid = NULL; ++ char *search = NULL; ++ char *match = NULL; ++ ++ match = strstr(mutable_path, "node_state[@id=\'") + strlen("node_state[@id=\'"); ++ search = strchr(match, '\''); ++ search[0] = 0; ++ ++ node_uuid = strdup(match); ++ free(mutable_path); ++ return node_uuid; ++} ++ ++static void abort_unless_down(const char *xpath, const char *op, xmlNode *change, const char *reason) ++{ ++ char *node_uuid = NULL; ++ crm_action_t *down = NULL; ++ ++ if(safe_str_neq(op, "delete")) { ++ abort_transition(INFINITY, tg_restart, reason, change); ++ return; ++ } ++ ++ node_uuid = extract_node_uuid(xpath); ++ if(node_uuid == NULL) { ++ crm_err("Could not extract node ID from %s", xpath); ++ abort_transition(INFINITY, tg_restart, reason, change); ++ return; ++ } ++ ++ down = match_down_event(0, node_uuid, NULL, FALSE); ++ if(down == NULL || down->executed == false) { ++ crm_trace("Not expecting %s to be down (%s)", node_uuid, xpath); ++ abort_transition(INFINITY, tg_restart, reason, change); ++ } else { ++ crm_trace("Expecting changes to %s (%s)", node_uuid, xpath); ++ } ++ free(node_uuid); ++} ++ + void + te_update_diff(const char *event, xmlNode * msg) + { +@@ -388,27 +431,22 @@ te_update_diff(const char *event, xmlNode * msg) + break; /* Wont be packaged with any resource operations we may be waiting for */ + + } else if(strstr(xpath, "/"XML_TAG_TRANSIENT_NODEATTRS"[") || safe_str_eq(name, XML_TAG_TRANSIENT_NODEATTRS)) { +- abort_transition(INFINITY, tg_restart, "Transient attribute change", change); ++ abort_unless_down(xpath, op, change, "Transient attribute change"); + break; /* Wont be packaged with any resource operations we may be waiting for */ + + } else if(strstr(xpath, 
"/"XML_LRM_TAG_RSC_OP"[") && safe_str_eq(op, "delete")) { + crm_action_t *cancel = NULL; + char *mutable_key = strdup(xpath); +- char *mutable_node = strdup(xpath); + char *search = NULL; + + const char *key = NULL; +- const char *node_uuid = NULL; ++ char *node_uuid = extract_node_uuid(xpath); + + search = strrchr(mutable_key, '\''); + search[0] = 0; + + key = strrchr(mutable_key, '\'') + 1; + +- node_uuid = strstr(mutable_node, "node_state[@id=\'") + strlen("node_state[@id=\'"); +- search = strchr(node_uuid, '\''); +- search[0] = 0; +- + cancel = get_cancel_action(key, node_uuid); + if (cancel == NULL) { + abort_transition(INFINITY, tg_restart, "Resource operation removal", change); +@@ -422,14 +460,14 @@ te_update_diff(const char *event, xmlNode * msg) + trigger_graph(); + + } +- free(mutable_node); + free(mutable_key); ++ free(node_uuid); + + } else if(strstr(xpath, "/"XML_CIB_TAG_LRM"[") && safe_str_eq(op, "delete")) { +- abort_transition(INFINITY, tg_restart, "Resource state removal", change); ++ abort_unless_down(xpath, op, change, "Resource state removal"); + + } else if(strstr(xpath, "/"XML_CIB_TAG_STATE"[") && safe_str_eq(op, "delete")) { +- abort_transition(INFINITY, tg_restart, "Node state removal", change); ++ abort_unless_down(xpath, op, change, "Node state removal"); + + } else if(name == NULL) { + crm_debug("No result for %s operation to %s", op, xpath); +@@ -717,7 +755,6 @@ cib_fencing_updated(xmlNode * msg, int call_id, int rc, xmlNode * output, void * + } else { + crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data); + } +- free(user_data); + } + + void +diff --git a/crmd/utils.c b/crmd/utils.c +index 5ca4b9d..4fe3a49 100644 +--- a/crmd/utils.c ++++ b/crmd/utils.c +@@ -999,7 +999,6 @@ erase_xpath_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void + + do_crm_log_unlikely(rc == 0 ? 
LOG_DEBUG : LOG_NOTICE, + "Deletion of \"%s\": %s (rc=%d)", xpath, pcmk_strerror(rc), rc); +- free(xpath); + } + + void +diff --git a/cts/CIB.py b/cts/CIB.py +index 82d02d7..8fbba6c 100644 +--- a/cts/CIB.py ++++ b/cts/CIB.py +@@ -105,7 +105,7 @@ class CIB11(ConfigBase): + if not name: + name = "r%s%d" % (self.CM.Env["IPagent"], self.counter) + self.counter = self.counter + 1 +- r = Resource(self.Factory, name, self.CM.Env["IPagent"], standard) ++ r = Resource(self.Factory, name, self.CM.Env["IPagent"], standard) + + r.add_op("monitor", "5s") + return r +@@ -387,7 +387,7 @@ class ConfigFactory: + """register a constructor""" + _args = [constructor] + _args.extend(args) +- setattr(self, methodName, apply(ConfigFactoryItem,_args, kargs)) ++ setattr(self, methodName, ConfigFactoryItem(*_args, **kargs)) + + def unregister(self, methodName): + """unregister a constructor""" +@@ -415,7 +415,6 @@ class ConfigFactory: + + class ConfigFactoryItem: + def __init__(self, function, *args, **kargs): +- assert callable(function), "function should be a callable obj" + self._function = function + self._args = args + self._kargs = kargs +@@ -426,7 +425,7 @@ class ConfigFactoryItem: + _args.extend(args) + _kargs = self._kargs.copy() + _kargs.update(kargs) +- return apply(self._function,_args,_kargs) ++ return self._function(*_args,**_kargs) + + # Basic Sanity Testing + if __name__ == '__main__': +@@ -449,4 +448,4 @@ if __name__ == '__main__': + + CibFactory = ConfigFactory(manager) + cib = CibFactory.createConfig("pacemaker-1.1") +- print cib.contents() ++ print(cib.contents()) +diff --git a/cts/CM_ais.py b/cts/CM_ais.py +index a34f9b1..d2e2c1f 100644 +--- a/cts/CM_ais.py ++++ b/cts/CM_ais.py +@@ -80,7 +80,7 @@ class crm_ais(crm_lha): + # Processes running under valgrind can't be shot with "killall -9 processname", + # so don't include them in the returned list + vgrind = self.Env["valgrind-procs"].split() +- for key in self.fullcomplist.keys(): ++ for key in 
list(self.fullcomplist.keys()): + if self.Env["valgrind-tests"]: + if key in vgrind: + self.log("Filtering %s from the component list as it is being profiled by valgrind" % key) +diff --git a/cts/CM_lha.py b/cts/CM_lha.py +index b192272..28742d9 100755 +--- a/cts/CM_lha.py ++++ b/cts/CM_lha.py +@@ -92,7 +92,7 @@ class crm_lha(ClusterManager): + self.log("Node %s is not up." % node) + return None + +- if not self.CIBsync.has_key(node) and self.Env["ClobberCIB"] == 1: ++ if not node in self.CIBsync and self.Env["ClobberCIB"] == 1: + self.CIBsync[node] = 1 + self.rsh(node, "rm -f "+CTSvars.CRM_CONFIG_DIR+"/cib*") + +diff --git a/cts/CTS.py b/cts/CTS.py +index 9f9a291..634348a 100644 +--- a/cts/CTS.py ++++ b/cts/CTS.py +@@ -69,7 +69,7 @@ function status() { + function start() { + # Is it already running? + if +- status ++ status + then + return + fi +@@ -94,20 +94,20 @@ case $action in + nohup $0 $f start >/dev/null 2>&1 > $f +- echo " $*" >> $f ++ uptime | sed s/up.*:/,/ | tr '\\n' ',' >> $f ++ echo " $*" >> $f + start +- ;; ++ ;; + *) +- echo "Unknown action: $action." +- ;; ++ echo "Unknown action: $action." 
++ ;; + esac + """ + +@@ -157,7 +157,7 @@ class CtsLab: + self.Env.dump() + + def has_key(self, key): +- return self.Env.has_key(key) ++ return key in self.Env.keys() + + def __getitem__(self, key): + return self.Env[key] +@@ -275,7 +275,7 @@ class ClusterManager(UserDict): + None + + def _finalConditions(self): +- for key in self.keys(): ++ for key in list(self.keys()): + if self[key] == None: + raise ValueError("Improper derivation: self[" + key + "] must be overridden by subclass.") + +@@ -299,14 +299,14 @@ class ClusterManager(UserDict): + if key == "Name": + return self.name + +- print "FIXME: Getting %s from %s" % (key, repr(self)) +- if self.data.has_key(key): ++ print("FIXME: Getting %s from %s" % (key, repr(self))) ++ if key in self.data: + return self.data[key] + + return self.templates.get_patterns(self.Env["Name"], key) + + def __setitem__(self, key, value): +- print "FIXME: Setting %s=%s on %s" % (key, value, repr(self)) ++ print("FIXME: Setting %s=%s on %s" % (key, value, repr(self))) + self.data[key] = value + + def key_for_node(self, node): +@@ -333,7 +333,7 @@ class ClusterManager(UserDict): + def prepare(self): + '''Finish the Initialization process. 
Prepare to test...''' + +- print repr(self)+"prepare" ++ print(repr(self)+"prepare") + for node in self.Env["nodes"]: + if self.StataCM(node): + self.ShouldBeStatus[node] = "up" +@@ -387,11 +387,11 @@ class ClusterManager(UserDict): + return None + + if not self.templates["Pat:Fencing_start"]: +- print "No start pattern" ++ print("No start pattern") + return None + + if not self.templates["Pat:Fencing_ok"]: +- print "No ok pattern" ++ print("No ok pattern") + return None + + stonith = None +@@ -500,7 +500,7 @@ class ClusterManager(UserDict): + else: self.debug("Starting %s on node %s" % (self.templates["Name"], node)) + ret = 1 + +- if not self.ShouldBeStatus.has_key(node): ++ if not node in self.ShouldBeStatus: + self.ShouldBeStatus[node] = "down" + + if self.ShouldBeStatus[node] != "down": +@@ -871,13 +871,13 @@ class ClusterManager(UserDict): + + for host in self.Env["nodes"]: + log_stats_file = "%s/cts-stats.csv" % CTSvars.CRM_DAEMON_DIR +- if has_log_stats.has_key(host): ++ if host in has_log_stats: + self.rsh(host, '''bash %s %s stop''' % (log_stats_bin, log_stats_file)) + (rc, lines) = self.rsh(host, '''cat %s''' % log_stats_file, stdout=2) + self.rsh(host, '''bash %s %s delete''' % (log_stats_bin, log_stats_file)) + + fname = "cts-stats-%d-nodes-%s.csv" % (len(self.Env["nodes"]), host) +- print "Extracted stats: %s" % fname ++ print("Extracted stats: %s" % fname) + fd = open(fname, "a") + fd.writelines(lines) + fd.close() +@@ -891,7 +891,7 @@ class ClusterManager(UserDict): + + for host in self.Env["nodes"]: + log_stats_file = "%s/cts-stats.csv" % CTSvars.CRM_DAEMON_DIR +- if not has_log_stats.has_key(host): ++ if not host in has_log_stats: + + global log_stats + global log_stats_bin +@@ -986,7 +986,7 @@ class Process(Component): + self.CM = cm + self.badnews_ignore = badnews_ignore + self.badnews_ignore.extend(common_ignore) +- self.triggersreboot = triggersreboot ++ self.triggersreboot = triggersreboot + + if process: + self.proc = str(process) +diff 
--git a/cts/CTSaudits.py b/cts/CTSaudits.py +index 8d52062..e8663f2 100755 +--- a/cts/CTSaudits.py ++++ b/cts/CTSaudits.py +@@ -108,7 +108,7 @@ class LogAudit(ClusterAudit): + self.CM.log ("ERROR: Cannot execute remote command [%s] on %s" % (cmd, node)) + + for k in self.kinds: +- if watch.has_key(k): ++ if k in watch: + w = watch[k] + if watch_pref == "any": self.CM.log("Testing for %s logs" % (k)) + w.lookforall(silent=True) +@@ -118,7 +118,7 @@ class LogAudit(ClusterAudit): + self.CM.Env["LogWatcher"] = w.kind + return 1 + +- for k in watch.keys(): ++ for k in list(watch.keys()): + w = watch[k] + if w.unmatched: + for regex in w.unmatched: +@@ -226,7 +226,7 @@ class FileAudit(ClusterAudit): + self.known.append(line) + self.CM.log("Warning: Corosync core file on %s: %s" % (node, line)) + +- if self.CM.ShouldBeStatus.has_key(node) and self.CM.ShouldBeStatus[node] == "down": ++ if node in self.CM.ShouldBeStatus and self.CM.ShouldBeStatus[node] == "down": + clean = 0 + (rc, lsout) = self.CM.rsh(node, "ls -al /dev/shm | grep qb-", None) + for line in lsout: +@@ -532,7 +532,7 @@ class CrmdStateAudit(ClusterAudit): + , "auditfail":0} + + def has_key(self, key): +- return self.Stats.has_key(key) ++ return key in self.Stats + + def __setitem__(self, key, value): + self.Stats[key] = value +@@ -542,7 +542,7 @@ class CrmdStateAudit(ClusterAudit): + + def incr(self, name): + '''Increment (or initialize) the value associated with the given name''' +- if not self.Stats.has_key(name): ++ if not name in self.Stats: + self.Stats[name] = 0 + self.Stats[name] = self.Stats[name]+1 + +@@ -601,7 +601,7 @@ class CIBAudit(ClusterAudit): + , "auditfail":0} + + def has_key(self, key): +- return self.Stats.has_key(key) ++ return key in self.Stats + + def __setitem__(self, key, value): + self.Stats[key] = value +@@ -611,7 +611,7 @@ class CIBAudit(ClusterAudit): + + def incr(self, name): + '''Increment (or initialize) the value associated with the given name''' +- if not 
self.Stats.has_key(name): ++ if not name in self.Stats: + self.Stats[name] = 0 + self.Stats[name] = self.Stats[name]+1 + +@@ -726,7 +726,7 @@ class PartitionAudit(ClusterAudit): + + def incr(self, name): + '''Increment (or initialize) the value associated with the given name''' +- if not self.Stats.has_key(name): ++ if not name in self.Stats: + self.Stats[name] = 0 + self.Stats[name] = self.Stats[name]+1 + +diff --git a/cts/CTSscenarios.py b/cts/CTSscenarios.py +index 2f3a69b..cc6e67e 100644 +--- a/cts/CTSscenarios.py ++++ b/cts/CTSscenarios.py +@@ -124,7 +124,7 @@ A partially set up scenario is torn down if it fails during setup. + + def incr(self, name): + '''Increment (or initialize) the value associated with the given name''' +- if not self.Stats.has_key(name): ++ if not name in self.Stats: + self.Stats[name] = 0 + self.Stats[name] = self.Stats[name]+1 + +@@ -176,7 +176,7 @@ A partially set up scenario is torn down if it fails during setup. + + elapsed_time = stoptime - starttime + test_time = stoptime - test.get_timer() +- if not test.has_key("min_time"): ++ if not test["min_time"]: + test["elapsed_time"] = elapsed_time + test["min_time"] = test_time + test["max_time"] = test_time +@@ -211,7 +211,7 @@ A partially set up scenario is torn down if it fails during setup. 
+ } + self.ClusterManager.log("Test Summary") + for test in self.Tests: +- for key in stat_filter.keys(): ++ for key in list(stat_filter.keys()): + stat_filter[key] = test.Stats[key] + self.ClusterManager.log(("Test %s: "%test.name).ljust(25) + " %s"%repr(stat_filter)) + +@@ -387,7 +387,7 @@ According to the manual page for ping: + '''Start the PingFest!''' + + self.PingSize = 1024 +- if CM.Env.has_key("PingSize"): ++ if "PingSize" in CM.Env.keys(): + self.PingSize = CM.Env["PingSize"] + + CM.log("Starting %d byte flood pings" % self.PingSize) +@@ -550,7 +550,7 @@ Test a rolling upgrade between two versions of the stack + return self.install(node, self.CM.Env["previous-version"]) + + def SetUp(self, CM): +- print repr(self)+"prepare" ++ print(repr(self)+"prepare") + CM.prepare() + + # Clear out the cobwebs +diff --git a/cts/CTStests.py b/cts/CTStests.py +index f817004..00fcd13 100644 +--- a/cts/CTStests.py ++++ b/cts/CTStests.py +@@ -97,13 +97,18 @@ class CTSTest: + self.logger.debug(args) + + def has_key(self, key): +- return self.Stats.has_key(key) ++ return key in self.Stats + + def __setitem__(self, key, value): + self.Stats[key] = value + + def __getitem__(self, key): +- return self.Stats[key] ++ if str(key) == "0": ++ raise ValueError("Bad call to 'foo in X', should reference 'foo in X.Stats' instead") ++ ++ if key in self.Stats: ++ return self.Stats[key] ++ return None + + def log_mark(self, msg): + self.debug("MARK: test %s %s %d" % (self.name,msg,time.time())) +@@ -128,7 +133,7 @@ class CTSTest: + + def incr(self, name): + '''Increment (or initialize) the value associated with the given name''' +- if not self.Stats.has_key(name): ++ if not name in self.Stats: + self.Stats[name] = 0 + self.Stats[name] = self.Stats[name]+1 + +@@ -534,7 +539,7 @@ class StonithdTest(CTSTest): + if not self.is_applicable_common(): + return 0 + +- if self.Env.has_key("DoFencing"): ++ if "DoFencing" in self.Env.keys(): + return self.Env["DoFencing"] + + return 1 +@@ -1048,7 
+1053,7 @@ class BandwidthTest(CTSTest): + T1 = linesplit[0] + timesplit = string.split(T1,":") + time2split = string.split(timesplit[2],".") +- time1 = (long(timesplit[0])*60+long(timesplit[1]))*60+long(time2split[0])+long(time2split[1])*0.000001 ++ time1 = (int(timesplit[0])*60+int(timesplit[1]))*60+int(time2split[0])+int(time2split[1])*0.000001 + break + + while count < 100: +@@ -1070,7 +1075,7 @@ class BandwidthTest(CTSTest): + T2 = linessplit[0] + timesplit = string.split(T2,":") + time2split = string.split(timesplit[2],".") +- time2 = (long(timesplit[0])*60+long(timesplit[1]))*60+long(time2split[0])+long(time2split[1])*0.000001 ++ time2 = (int(timesplit[0])*60+int(timesplit[1]))*60+int(time2split[0])+int(time2split[1])*0.000001 + time = time2-time1 + if (time <= 0): + return 0 +@@ -1105,7 +1110,7 @@ class MaintenanceMode(CTSTest): + # fail the resource right after turning Maintenance mode on + # verify it is not recovered until maintenance mode is turned off + if action == "On": +- pats.append("pengine.*: warning:.* Processing failed op %s for %s on" % (self.action, self.rid)) ++ pats.append(r"pengine.*:\s+warning:.*Processing failed op %s for %s on" % (self.action, self.rid)) + else: + pats.append(self.templates["Pat:RscOpOK"] % (self.rid, "stop_0")) + pats.append(self.templates["Pat:RscOpOK"] % (self.rid, "start_0")) +@@ -1314,7 +1319,7 @@ class ResourceRecover(CTSTest): + self.debug("Shooting %s aka. 
%s" % (rsc.clone_id, rsc.id)) + + pats = [] +- pats.append(r"pengine.*: warning:.* Processing failed op %s for (%s|%s) on" % (self.action, ++ pats.append(r"pengine.*:\s+warning:.*Processing failed op %s for (%s|%s) on" % (self.action, + rsc.id, rsc.clone_id)) + + if rsc.managed(): +@@ -1574,7 +1579,7 @@ class SplitBrainTest(CTSTest): + p_max = len(self.Env["nodes"]) + for node in self.Env["nodes"]: + p = self.Env.RandomGen.randint(1, p_max) +- if not partitions.has_key(p): ++ if not p in partitions: + partitions[p] = [] + partitions[p].append(node) + p_max = len(partitions.keys()) +@@ -1583,13 +1588,13 @@ class SplitBrainTest(CTSTest): + # else, try again + + self.debug("Created %d partitions" % p_max) +- for key in partitions.keys(): ++ for key in list(partitions.keys()): + self.debug("Partition["+str(key)+"]:\t"+repr(partitions[key])) + + # Disabling STONITH to reduce test complexity for now + self.rsh(node, "crm_attribute -V -n stonith-enabled -v false") + +- for key in partitions.keys(): ++ for key in list(partitions.keys()): + self.isolate_partition(partitions[key]) + + count = 30 +@@ -1612,7 +1617,7 @@ class SplitBrainTest(CTSTest): + self.CM.partitions_expected = 1 + + # And heal them again +- for key in partitions.keys(): ++ for key in list(partitions.keys()): + self.heal_partition(partitions[key]) + + # Wait for a single partition to form +@@ -2247,11 +2252,11 @@ class RollingUpgradeTest(CTSTest): + if not self.is_applicable_common(): + return None + +- if not self.Env.has_key("rpm-dir"): ++ if not "rpm-dir" in self.Env.keys(): + return None +- if not self.Env.has_key("current-version"): ++ if not "current-version" in self.Env.keys(): + return None +- if not self.Env.has_key("previous-version"): ++ if not "previous-version" in self.Env.keys(): + return None + + return 1 +@@ -2305,7 +2310,7 @@ class BSC_AddResource(CTSTest): + if ":" in ip: + fields = ip.rpartition(":") + fields[2] = str(hex(int(fields[2], 16)+1)) +- print str(hex(int(f[2], 16)+1)) ++ 
print(str(hex(int(f[2], 16)+1))) + else: + fields = ip.rpartition('.') + fields[2] = str(int(fields[2])+1) +@@ -3109,7 +3114,7 @@ class RemoteStonithd(CTSTest): + if not self.driver.is_applicable(): + return False + +- if self.Env.has_key("DoFencing"): ++ if "DoFencing" in self.Env.keys(): + return self.Env["DoFencing"] + + return True +diff --git a/cts/OCFIPraTest.py b/cts/OCFIPraTest.py +index 9900a62..03d964b 100755 +--- a/cts/OCFIPraTest.py ++++ b/cts/OCFIPraTest.py +@@ -28,13 +28,13 @@ from cts.CTSvars import * + + + def usage(): +- print "usage: " + sys.argv[0] \ ++ print("usage: " + sys.argv[0] \ + + " [-2]"\ + + " [--ipbase|-i first-test-ip]"\ + + " [--ipnum|-n test-ip-num]"\ + + " [--help|-h]"\ + + " [--perform|-p op]"\ +- + " [number-of-iterations]" ++ + " [number-of-iterations]") + sys.exit(1) + + +@@ -71,7 +71,7 @@ def log(towrite): + t = time.strftime("%Y/%m/%d_%H:%M:%S\t", time.localtime(time.time())) + logstr = t + " "+str(towrite) + syslog.syslog(logstr) +- print logstr ++ print(logstr) + + if __name__ == '__main__': + ra = "IPaddr" +diff --git a/cts/cib_xml.py b/cts/cib_xml.py +index 0bd963b..3d8f8d4 100644 +--- a/cts/cib_xml.py ++++ b/cts/cib_xml.py +@@ -19,7 +19,7 @@ class XmlBase(CibBase): + text = '''<%s''' % self.tag + if self.name: + text += ''' id="%s"''' % (self.name) +- for k in self.kwargs.keys(): ++ for k in list(self.kwargs.keys()): + text += ''' %s="%s"''' % (k, self.kwargs[k]) + + if not self.children: +@@ -149,22 +149,22 @@ class Resource(XmlBase): + def constraints(self): + text = "" + +- for k in self.scores.keys(): ++ for k in list(self.scores.keys()): + text += '''''' % (k, self.name) + text += self.scores[k].show() + text += '''''' + +- for k in self.needs.keys(): ++ for k in list(self.needs.keys()): + text += '''''' + +- for k in self.coloc.keys(): ++ for k in list(self.coloc.keys()): + text += '''''' + +@@ -179,13 +179,13 @@ class Resource(XmlBase): + + if len(self.meta) > 0: + text += '''''' % self.name +- for p in 
self.meta.keys(): ++ for p in list(self.meta.keys()): + text += '''''' % (self.name, p, p, self.meta[p]) + text += '''''' + + if len(self.param) > 0: + text += '''''' % self.name +- for p in self.param.keys(): ++ for p in list(self.param.keys()): + text += '''''' % (self.name, p, p, self.param[p]) + text += '''''' + +@@ -219,7 +219,7 @@ class Group(Resource): + + if len(self.meta) > 0: + text += '''''' % self.name +- for p in self.meta.keys(): ++ for p in list(self.meta.keys()): + text += '''''' % (self.name, p, p, self.meta[p]) + text += '''''' + +diff --git a/cts/environment.py b/cts/environment.py +index 61d4211..4ed5ced 100644 +--- a/cts/environment.py ++++ b/cts/environment.py +@@ -92,7 +92,7 @@ class Environment: + + def dump(self): + keys = [] +- for key in self.data.keys(): ++ for key in list(self.data.keys()): + keys.append(key) + + keys.sort() +@@ -106,16 +106,19 @@ class Environment: + if key == "nodes": + return True + +- return self.data.has_key(key) ++ return key in self.data + + def __getitem__(self, key): ++ if str(key) == "0": ++ raise ValueError("Bad call to 'foo in X', should reference 'foo in X.keys()' instead") ++ + if key == "nodes": + return self.Nodes + + elif key == "Name": + return self.get_stack_short() + +- elif self.data.has_key(key): ++ elif key in self.data: + return self.data[key] + + else: +@@ -175,12 +178,12 @@ class Environment: + self.data["Stack"] = "corosync (plugin v0)" + + else: +- print "Unknown stack: "+name ++ raise ValueError("Unknown stack: "+name) + sys.exit(1) + + def get_stack_short(self): + # Create the Cluster Manager object +- if not self.data.has_key("Stack"): ++ if not "Stack" in self.data: + return "unknown" + + elif self.data["Stack"] == "heartbeat": +@@ -202,12 +205,12 @@ class Environment: + return "crm-plugin-v0" + + else: +- LogFactory().log("Unknown stack: "+self.data["stack"]) +- sys.exit(1) ++ LogFactory().log("Unknown stack: "+self["stack"]) ++ raise ValueError("Unknown stack: "+self["stack"]) + + def 
detect_syslog(self): + # Detect syslog variant +- if not self.has_key("syslogd"): ++ if not "syslogd" in self.data: + if self["have_systemd"]: + # Systemd + self["syslogd"] = self.rsh(self.target, "systemctl list-units | grep syslog.*\.service.*active.*running | sed 's:.service.*::'", stdout=1).strip() +@@ -215,13 +218,13 @@ class Environment: + # SYS-V + self["syslogd"] = self.rsh(self.target, "chkconfig --list | grep syslog.*on | awk '{print $1}' | head -n 1", stdout=1).strip() + +- if not self.has_key("syslogd") or not self["syslogd"]: ++ if not "syslogd" in self.data or not self["syslogd"]: + # default + self["syslogd"] = "rsyslog" + + def detect_at_boot(self): + # Detect if the cluster starts at boot +- if not self.has_key("at-boot"): ++ if not "at-boot" in self.data: + atboot = 0 + + if self["have_systemd"]: +@@ -237,7 +240,7 @@ class Environment: + + def detect_ip_offset(self): + # Try to determin an offset for IPaddr resources +- if self["CIBResource"] and not self.has_key("IPBase"): ++ if self["CIBResource"] and not "IPBase" in self.data: + network=self.rsh(self.target, "ip addr | grep inet | grep -v -e link -e inet6 -e '/32' -e ' lo' | awk '{print $2}'", stdout=1).strip() + self["IPBase"] = self.rsh(self.target, "nmap -sn -n %s | grep 'scan report' | awk '{print $NF}' | sed 's:(::' | sed 's:)::' | sort -V | tail -n 1" % network, stdout=1).strip() + if not self["IPBase"]: +@@ -261,7 +264,7 @@ class Environment: + + def validate(self): + if len(self["nodes"]) < 1: +- print "No nodes specified!" 
++ print("No nodes specified!") + sys.exit(1) + + def discover(self): +@@ -276,7 +279,7 @@ class Environment: + break; + self["cts-master"] = master + +- if not self.has_key("have_systemd"): ++ if not "have_systemd" in self.data: + self["have_systemd"] = not self.rsh(self.target, "systemctl list-units") + + self.detect_syslog() +@@ -390,7 +393,7 @@ class Environment: + self["DoStonith"]=1 + self["stonith-type"] = "fence_openstack" + +- print "Obtaining OpenStack credentials from the current environment" ++ print("Obtaining OpenStack credentials from the current environment") + self["stonith-params"] = "region=%s,tenant=%s,auth=%s,user=%s,password=%s" % ( + os.environ['OS_REGION_NAME'], + os.environ['OS_TENANT_NAME'], +@@ -403,7 +406,7 @@ class Environment: + self["DoStonith"]=1 + self["stonith-type"] = "fence_rhevm" + +- print "Obtaining RHEV-M credentials from the current environment" ++ print("Obtaining RHEV-M credentials from the current environment") + self["stonith-params"] = "login=%s,passwd=%s,ipaddr=%s,ipport=%s,ssl=1,shell_timeout=10" % ( + os.environ['RHEVM_USERNAME'], + os.environ['RHEVM_PASSWORD'], +@@ -442,7 +445,7 @@ class Environment: + try: + float(args[i+1]) + except ValueError: +- print ("--xmit-loss parameter should be float") ++ print("--xmit-loss parameter should be float") + self.usage(args[i+1]) + skipthis=1 + self["XmitLoss"] = args[i+1] +@@ -451,7 +454,7 @@ class Environment: + try: + float(args[i+1]) + except ValueError: +- print ("--recv-loss parameter should be float") ++ print("--recv-loss parameter should be float") + self.usage(args[i+1]) + skipthis=1 + self["RecvLoss"] = args[i+1] +@@ -503,7 +506,7 @@ class Environment: + self["DoStonith"]=1 + self["stonith-type"] = "fence_rhevm" + +- print "Obtaining RHEV-M credentials from the current environment" ++ print("Obtaining RHEV-M credentials from the current environment") + self["stonith-params"] = "login=%s,passwd=%s,ipaddr=%s,ipport=%s,ssl=1,shell_timeout=10" % ( + 
os.environ['RHEVM_USERNAME'], + os.environ['RHEVM_PASSWORD'], +@@ -605,7 +608,7 @@ class Environment: + skipthis=1 + (name, value) = args[i+1].split('=') + self[name] = value +- print "Setting %s = %s" % (name, value) ++ print("Setting %s = %s" % (name, value)) + + elif args[i] == "--help": + self.usage(args[i], 0) +@@ -622,52 +625,52 @@ class Environment: + + def usage(self, arg, status=1): + if status: +- print "Illegal argument %s" % arg +- print "usage: " + sys.argv[0] +" [options] number-of-iterations" +- print "\nCommon options: " +- print "\t [--nodes 'node list'] list of cluster nodes separated by whitespace" +- print "\t [--group | -g 'name'] use the nodes listed in the named DSH group (~/.dsh/groups/$name)" +- print "\t [--limit-nodes max] only use the first 'max' cluster nodes supplied with --nodes" +- print "\t [--stack (v0|v1|cman|corosync|heartbeat|openais)] which cluster stack is installed" +- print "\t [--list-tests] list the valid tests" +- print "\t [--benchmark] add the timing information" +- print "\t " +- print "Options that CTS will usually auto-detect correctly: " +- print "\t [--logfile path] where should the test software look for logs from cluster nodes" +- print "\t [--syslog-facility name] which syslog facility should the test software log to" +- print "\t [--at-boot (1|0)] does the cluster software start at boot time" +- print "\t [--test-ip-base ip] offset for generated IP address resources" +- print "\t " +- print "Options for release testing: " +- print "\t [--populate-resources | -r] generate a sample configuration" +- print "\t [--choose name] run only the named test" +- print "\t [--stonith (1 | 0 | yes | no | rhcs | ssh)]" +- print "\t [--once] run all valid tests once" +- print "\t " +- print "Additional (less common) options: " +- print "\t [--clobber-cib | -c ] erase any existing configuration" +- print "\t [--outputfile path] optional location for the test software to write logs to" +- print "\t [--trunc] truncate logfile 
before starting" +- print "\t [--xmit-loss lost-rate(0.0-1.0)]" +- print "\t [--recv-loss lost-rate(0.0-1.0)]" +- print "\t [--standby (1 | 0 | yes | no)]" +- print "\t [--fencing (1 | 0 | yes | no | rhcs | lha | openstack )]" +- print "\t [--stonith-type type]" +- print "\t [--stonith-args name=value]" +- print "\t [--bsc]" +- print "\t [--no-loop-tests] dont run looping/time-based tests" +- print "\t [--no-unsafe-tests] dont run tests that are unsafe for use with ocfs2/drbd" +- print "\t [--valgrind-tests] include tests using valgrind" +- print "\t [--experimental-tests] include experimental tests" +- print "\t [--container-tests] include pacemaker_remote tests that run in lxc container resources" +- print "\t [--oprofile 'node list'] list of cluster nodes to run oprofile on]" +- print "\t [--qarsh] use the QARSH backdoor to access nodes instead of SSH" +- print "\t [--docker] Indicates nodes are docker nodes." +- print "\t [--seed random_seed]" +- print "\t [--set option=value]" +- print "\t " +- print "\t Example: " +- print "\t python sys.argv[0] -g virt1 --stack cs -r --stonith ssh --schema pacemaker-1.0 500" ++ print("Illegal argument %s" % arg) ++ print("usage: " + sys.argv[0] +" [options] number-of-iterations") ++ print("\nCommon options: ") ++ print("\t [--nodes 'node list'] list of cluster nodes separated by whitespace") ++ print("\t [--group | -g 'name'] use the nodes listed in the named DSH group (~/.dsh/groups/$name)") ++ print("\t [--limit-nodes max] only use the first 'max' cluster nodes supplied with --nodes") ++ print("\t [--stack (v0|v1|cman|corosync|heartbeat|openais)] which cluster stack is installed") ++ print("\t [--list-tests] list the valid tests") ++ print("\t [--benchmark] add the timing information") ++ print("\t ") ++ print("Options that CTS will usually auto-detect correctly: ") ++ print("\t [--logfile path] where should the test software look for logs from cluster nodes") ++ print("\t [--syslog-facility name] which syslog facility 
should the test software log to") ++ print("\t [--at-boot (1|0)] does the cluster software start at boot time") ++ print("\t [--test-ip-base ip] offset for generated IP address resources") ++ print("\t ") ++ print("Options for release testing: ") ++ print("\t [--populate-resources | -r] generate a sample configuration") ++ print("\t [--choose name] run only the named test") ++ print("\t [--stonith (1 | 0 | yes | no | rhcs | ssh)]") ++ print("\t [--once] run all valid tests once") ++ print("\t ") ++ print("Additional (less common) options: ") ++ print("\t [--clobber-cib | -c ] erase any existing configuration") ++ print("\t [--outputfile path] optional location for the test software to write logs to") ++ print("\t [--trunc] truncate logfile before starting") ++ print("\t [--xmit-loss lost-rate(0.0-1.0)]") ++ print("\t [--recv-loss lost-rate(0.0-1.0)]") ++ print("\t [--standby (1 | 0 | yes | no)]") ++ print("\t [--fencing (1 | 0 | yes | no | rhcs | lha | openstack )]") ++ print("\t [--stonith-type type]") ++ print("\t [--stonith-args name=value]") ++ print("\t [--bsc]") ++ print("\t [--no-loop-tests] dont run looping/time-based tests") ++ print("\t [--no-unsafe-tests] dont run tests that are unsafe for use with ocfs2/drbd") ++ print("\t [--valgrind-tests] include tests using valgrind") ++ print("\t [--experimental-tests] include experimental tests") ++ print("\t [--container-tests] include pacemaker_remote tests that run in lxc container resources") ++ print("\t [--oprofile 'node list'] list of cluster nodes to run oprofile on]") ++ print("\t [--qarsh] use the QARSH backdoor to access nodes instead of SSH") ++ print("\t [--docker] Indicates nodes are docker nodes.") ++ print("\t [--seed random_seed]") ++ print("\t [--set option=value]") ++ print("\t ") ++ print("\t Example: ") ++ print("\t python sys.argv[0] -g virt1 --stack cs -r --stonith ssh --schema pacemaker-1.0 500") + + sys.exit(status) + +diff --git a/cts/logging.py b/cts/logging.py +index 8afa611..08da44a 
100644 +--- a/cts/logging.py ++++ b/cts/logging.py +@@ -22,7 +22,7 @@ Licensed under the GNU GPL. + # along with this program; if not, write to the Free Software + # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + +-import types, string, sys, time, os ++import string, sys, time, os + + class Logger: + TimeFormat = "%b %d %H:%M:%S\t" +@@ -47,7 +47,7 @@ class StdErrLog(Logger): + + def __call__(self, lines): + t = time.strftime(Logger.TimeFormat, time.localtime(time.time())) +- if isinstance(lines, types.StringType): ++ if isinstance(lines, basestring): + sys.__stderr__.writelines([t, lines, "\n"]) + else: + for line in lines: +@@ -71,7 +71,7 @@ class FileLog(Logger): + fd = open(self.logfile, "a") + t = time.strftime(Logger.TimeFormat, time.localtime(time.time())) + +- if isinstance(lines, types.StringType): ++ if isinstance(lines, basestring): + fd.writelines([t, self.hostname, self.source, lines, "\n"]) + else: + for line in lines: +diff --git a/cts/patterns.py b/cts/patterns.py +index 493b690..3cdce2f 100644 +--- a/cts/patterns.py ++++ b/cts/patterns.py +@@ -67,9 +67,9 @@ class BasePatterns: + } + + def get_component(self, key): +- if self.components.has_key(key): ++ if key in self.components: + return self.components[key] +- print "Unknown component '%s' for %s" % (key, self.name) ++ print("Unknown component '%s' for %s" % (key, self.name)) + return [] + + def get_patterns(self, key): +@@ -87,12 +87,12 @@ class BasePatterns: + def __getitem__(self, key): + if key == "Name": + return self.name +- elif self.commands.has_key(key): ++ elif key in self.commands: + return self.commands[key] +- elif self.search.has_key(key): ++ elif key in self.search: + return self.search[key] + else: +- print "Unknown template '%s' for %s" % (key, self.name) ++ print("Unknown template '%s' for %s" % (key, self.name)) + return None + + class crm_lha(BasePatterns): +@@ -489,9 +489,9 @@ class PatternSelector: + crm_mcp_docker(name) + + def get_variant(self, 
variant): +- if patternvariants.has_key(variant): ++ if variant in patternvariants: + return patternvariants[variant] +- print "defaulting to crm-base for %s" % variant ++ print("defaulting to crm-base for %s" % variant) + return self.base + + def get_patterns(self, variant, kind): +@@ -532,7 +532,7 @@ if __name__ == '__main__': + template = args[i+1] + + else: +- print "Illegal argument " + args[i] ++ print("Illegal argument " + args[i]) + + +- print PatternSelector(kind)[template] ++ print(PatternSelector(kind)[template]) +diff --git a/cts/remote.py b/cts/remote.py +index b32b028..040b48a 100644 +--- a/cts/remote.py ++++ b/cts/remote.py +@@ -147,7 +147,7 @@ class RemoteExec: + sysname = args[0] + command = args[1] + +- #print "sysname: %s, us: %s" % (sysname, self.OurNode) ++ #print("sysname: %s, us: %s" % (sysname, self.OurNode)) + if sysname == None or string.lower(sysname) == self.OurNode or sysname == "localhost": + ret = command + else: +@@ -164,7 +164,7 @@ class RemoteExec: + self.logger.debug(args) + + def call_async(self, node, command, completionDelegate=None): +- #if completionDelegate: print "Waiting for %d on %s: %s" % (proc.pid, node, command) ++ #if completionDelegate: print("Waiting for %d on %s: %s" % (proc.pid, node, command)) + aproc = AsyncRemoteCmd(node, self._cmd([node, command]), completionDelegate=completionDelegate) + aproc.start() + return aproc +@@ -186,7 +186,7 @@ class RemoteExec: + proc = Popen(self._cmd([node, command]), + stdout = PIPE, stderr = PIPE, close_fds = True, shell = True) + +- #if completionDelegate: print "Waiting for %d on %s: %s" % (proc.pid, node, command) ++ #if completionDelegate: print("Waiting for %d on %s: %s" % (proc.pid, node, command)) + if not synchronous and proc.pid > 0 and not self.silent: + aproc = AsyncWaitProc(proc, node, command, completionDelegate=completionDelegate) + aproc.start() +@@ -257,14 +257,14 @@ class RemoteFactory: + return RemoteExec(RemoteFactory.rsh, silent) + + def enable_docker(self): 
+- print "Using DOCKER backend for connections to cluster nodes" ++ print("Using DOCKER backend for connections to cluster nodes") + + RemoteFactory.rsh.Command = "/usr/libexec/phd/docker/phd_docker_remote_cmd " + RemoteFactory.rsh.CpCommand = "/usr/libexec/phd/docker/phd_docker_cp" + + def enable_qarsh(self): + # http://nstraz.wordpress.com/2008/12/03/introducing-qarsh/ +- print "Using QARSH for connections to cluster nodes" ++ print("Using QARSH for connections to cluster nodes") + + RemoteFactory.rsh.Command = "qarsh -t 300 -l root" + RemoteFactory.rsh.CpCommand = "qacp -q" +diff --git a/cts/watcher.py b/cts/watcher.py +index 1182c8b..de032f7 100644 +--- a/cts/watcher.py ++++ b/cts/watcher.py +@@ -73,7 +73,7 @@ for i in range(0, len(args)): + skipthis=1 + + if not os.access(filename, os.R_OK): +- print prefix + 'Last read: %d, limit=%d, count=%d - unreadable' % (0, limit, 0) ++ print(prefix + 'Last read: %d, limit=%d, count=%d - unreadable' % (0, limit, 0)) + sys.exit(1) + + logfile=open(filename, 'r') +@@ -85,7 +85,7 @@ if offset != 'EOF': + if newsize >= offset: + logfile.seek(offset) + else: +- print prefix + ('File truncated from %d to %d' % (offset, newsize)) ++ print(prefix + ('File truncated from %d to %d' % (offset, newsize))) + if (newsize*1.05) < offset: + logfile.seek(0) + # else: we probably just lost a few logs after a fencing op +@@ -103,10 +103,10 @@ while True: + line = logfile.readline() + if not line: break + +- print line.strip() ++ print(line.strip()) + count += 1 + +-print prefix + 'Last read: %d, limit=%d, count=%d' % (logfile.tell(), limit, count) ++print(prefix + 'Last read: %d, limit=%d, count=%d' % (logfile.tell(), limit, count)) + logfile.close() + """ + +@@ -158,7 +158,7 @@ class FileObj(SearchObj): + SearchObj.__init__(self, filename, host, name) + + if host is not None: +- if not has_log_watcher.has_key(host): ++ if not host in has_log_watcher: + + global log_watcher + global log_watcher_bin +@@ -381,7 +381,7 @@ class 
LogWatcher(RemoteExec): + else: + self.file_list.append(FileObj(self.filename)) + +- # print "%s now has %d files" % (self.name, len(self.file_list)) ++ # print("%s now has %d files" % (self.name, len(self.file_list))) + + def __del__(self): + if self.debug_level > 1: self.debug("Destroy") +@@ -406,7 +406,7 @@ class LogWatcher(RemoteExec): + raise ValueError("No sources to read from") + + pending = [] +- #print "%s waiting for %d operations" % (self.name, self.pending) ++ #print("%s waiting for %d operations" % (self.name, self.pending)) + for f in self.file_list: + t = f.harvest_async(self) + if t: +@@ -418,7 +418,7 @@ class LogWatcher(RemoteExec): + self.logger.log("%s: Aborting after 20s waiting for %s logging commands" % (self.name, repr(t))) + return + +- #print "Got %d lines" % len(self.line_cache) ++ #print("Got %d lines" % len(self.line_cache)) + + def end(self): + for f in self.file_list: +diff --git a/doc/Pacemaker_Explained/en-US/Ch-Resources.txt b/doc/Pacemaker_Explained/en-US/Ch-Resources.txt +index 5d5fa33..b0115fb 100644 +--- a/doc/Pacemaker_Explained/en-US/Ch-Resources.txt ++++ b/doc/Pacemaker_Explained/en-US/Ch-Resources.txt +@@ -643,6 +643,16 @@ indexterm:[Action,Property,on-fail] + indexterm:[enabled,Action Property] + indexterm:[Action,Property,enabled] + ++|role ++| ++|This option only makes sense for recurring operations. It restricts ++ the operation to a specific role. The truly paranoid can even ++ specify +role=Stopped+ which allows the cluster to detect an admin ++ that manually started cluster services. ++ Allowed values: +Stopped+, +Started+, +Slave+, +Master+. 
++ indexterm:[role,Action Property] ++ indexterm:[Action,Property,role] ++ + |========================================================= + + [[s-operation-defaults]] +diff --git a/fencing/commands.c b/fencing/commands.c +index 0d2d614..bd3b27d 100644 +--- a/fencing/commands.c ++++ b/fencing/commands.c +@@ -124,17 +124,7 @@ static xmlNode *stonith_construct_async_reply(async_command_t * cmd, const char + static gboolean + is_action_required(const char *action, stonith_device_t *device) + { +- if(device == NULL) { +- return FALSE; +- +- } else if (device->required_actions == NULL) { +- return FALSE; +- +- } else if (strstr(device->required_actions, action)) { +- return TRUE; +- } +- +- return FALSE; ++ return device && device->automatic_unfencing && safe_str_eq(action, "on"); + } + + static int +@@ -449,7 +439,6 @@ free_device(gpointer data) + free_xml(device->agent_metadata); + free(device->namespace); + free(device->on_target_actions); +- free(device->required_actions); + free(device->agent); + free(device->id); + free(device); +@@ -713,8 +702,6 @@ read_action_metadata(stonith_device_t *device) + for (lpc = 0; lpc < max; lpc++) { + const char *on_target = NULL; + const char *action = NULL; +- const char *automatic = NULL; +- const char *required = NULL; + xmlNode *match = getXpathResult(xpath, lpc); + + CRM_LOG_ASSERT(match != NULL); +@@ -722,8 +709,6 @@ read_action_metadata(stonith_device_t *device) + + on_target = crm_element_value(match, "on_target"); + action = crm_element_value(match, "name"); +- automatic = crm_element_value(match, "automatic"); +- required = crm_element_value(match, "required"); + + if(safe_str_eq(action, "list")) { + set_bit(device->flags, st_device_supports_list); +@@ -731,17 +716,21 @@ read_action_metadata(stonith_device_t *device) + set_bit(device->flags, st_device_supports_status); + } else if(safe_str_eq(action, "reboot")) { + set_bit(device->flags, st_device_supports_reboot); +- } else if(safe_str_eq(action, "on") && 
(crm_is_true(automatic))) { +- /* this setting implies required=true for unfencing */ +- required = "true"; ++ } else if (safe_str_eq(action, "on")) { ++ /* "automatic" means the cluster will unfence node when it joins */ ++ const char *automatic = crm_element_value(match, "automatic"); ++ ++ /* "required" is a deprecated synonym for "automatic" */ ++ const char *required = crm_element_value(match, "required"); ++ ++ if (crm_is_true(automatic) || crm_is_true(required)) { ++ device->automatic_unfencing = TRUE; ++ } + } + + if (action && crm_is_true(on_target)) { + device->on_target_actions = add_action(device->on_target_actions, action); + } +- if (action && crm_is_true(required)) { +- device->required_actions = add_action(device->required_actions, action); +- } + } + + freeXpathObject(xpath); +@@ -778,8 +767,7 @@ build_device_from_xml(xmlNode * msg) + + value = crm_element_value(dev, "rsc_provides"); + if (safe_str_eq(value, "unfencing")) { +- /* if this agent requires unfencing, 'on' is considered a required action */ +- device->required_actions = add_action(device->required_actions, "on"); ++ device->automatic_unfencing = TRUE; + } + + if (is_action_required("on", device)) { +@@ -1224,7 +1212,6 @@ stonith_device_action(xmlNode * msg, char **output) + } else if (device) { + cmd = create_async_command(msg); + if (cmd == NULL) { +- free_device(device); + return -EPROTO; + } + +diff --git a/fencing/internal.h b/fencing/internal.h +index 5fb8f9c..0f418ec 100644 +--- a/fencing/internal.h ++++ b/fencing/internal.h +@@ -26,12 +26,13 @@ typedef struct stonith_device_s { + + /*! list of actions that must execute on the target node. 
Used for unfencing */ + char *on_target_actions; +- char *required_actions; + GListPtr targets; + time_t targets_age; + gboolean has_attr_map; + /* should nodeid parameter for victim be included in agent arguments */ + gboolean include_nodeid; ++ /* whether the cluster should automatically unfence nodes with the device */ ++ gboolean automatic_unfencing; + guint priority; + guint active_pid; + +@@ -59,7 +60,8 @@ typedef struct stonith_device_s { + enum st_remap_phase { + st_phase_requested = 0, + st_phase_off = 1, +- st_phase_on = 2 ++ st_phase_on = 2, ++ st_phase_max = 3 + }; + + typedef struct remote_fencing_op_s { +@@ -128,15 +130,9 @@ typedef struct remote_fencing_op_s { + /*! The current operation phase being executed */ + enum st_remap_phase phase; + +- /* For phase 0 or 1 (requested action or a remapped "off"), required devices +- * will be executed regardless of what topology level is being executed +- * currently. For phase 1 (remapped "on"), required devices will not be +- * attempted, because the cluster will execute them automatically when the +- * node next joins the cluster. +- */ +- /*! Lists of devices marked as required for each phase */ +- GListPtr required_list[3]; +- /*! The device list of all the devices at the current executing topology level. */ ++ /*! Devices with automatic unfencing (always run if "on" requested, never if remapped) */ ++ GListPtr automatic_list; ++ /*! List of all devices at the currently executing topology level */ + GListPtr devices_list; + /*! 
Current entry in the topology device list */ + GListPtr devices; +diff --git a/fencing/main.c b/fencing/main.c +index 46d7352..c48e12d 100644 +--- a/fencing/main.c ++++ b/fencing/main.c +@@ -553,7 +553,7 @@ remove_fencing_topology(xmlXPathObjectPtr xpathObj) + } + + static void +-register_fencing_topology(xmlXPathObjectPtr xpathObj, gboolean force) ++register_fencing_topology(xmlXPathObjectPtr xpathObj) + { + int max = numXpathResults(xpathObj), lpc = 0; + +@@ -584,7 +584,7 @@ register_fencing_topology(xmlXPathObjectPtr xpathObj, gboolean force) + */ + + static void +-fencing_topology_init(xmlNode * msg) ++fencing_topology_init() + { + xmlXPathObjectPtr xpathObj = NULL; + const char *xpath = "//" XML_TAG_FENCING_LEVEL; +@@ -598,7 +598,7 @@ fencing_topology_init(xmlNode * msg) + + /* Grab everything */ + xpathObj = xpath_search(local_cib, xpath); +- register_fencing_topology(xpathObj, TRUE); ++ register_fencing_topology(xpathObj); + + freeXpathObject(xpathObj); + } +@@ -931,7 +931,7 @@ update_fencing_topology(const char *event, xmlNode * msg) + xpath = "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_TAG_FENCING_LEVEL; + xpathObj = xpath_search(msg, xpath); + +- register_fencing_topology(xpathObj, FALSE); ++ register_fencing_topology(xpathObj); + freeXpathObject(xpathObj); + + } else if(format == 2) { +@@ -969,7 +969,7 @@ update_fencing_topology(const char *event, xmlNode * msg) + /* Nuclear option, all we have is the path and an id... 
not enough to remove a specific entry */ + crm_info("Re-initializing fencing topology after %s operation %d.%d.%d for %s", + op, add[0], add[1], add[2], xpath); +- fencing_topology_init(NULL); ++ fencing_topology_init(); + return; + } + +@@ -977,7 +977,7 @@ update_fencing_topology(const char *event, xmlNode * msg) + /* Change to the topology in general */ + crm_info("Re-initializing fencing topology after top-level %s operation %d.%d.%d for %s", + op, add[0], add[1], add[2], xpath); +- fencing_topology_init(NULL); ++ fencing_topology_init(); + return; + + } else if (strstr(xpath, "/" XML_CIB_TAG_CONFIGURATION)) { +@@ -989,7 +989,7 @@ update_fencing_topology(const char *event, xmlNode * msg) + } else if(strcmp(op, "delete") == 0 || strcmp(op, "create") == 0) { + crm_info("Re-initializing fencing topology after top-level %s operation %d.%d.%d for %s.", + op, add[0], add[1], add[2], xpath); +- fencing_topology_init(NULL); ++ fencing_topology_init(); + return; + } + +@@ -1098,7 +1098,7 @@ update_cib_cache_cb(const char *event, xmlNode * msg) + } else if (stonith_enabled_saved == FALSE) { + crm_info("Updating stonith device and topology lists now that stonith is enabled"); + stonith_enabled_saved = TRUE; +- fencing_topology_init(NULL); ++ fencing_topology_init(); + cib_devices_update(); + + } else { +@@ -1114,7 +1114,7 @@ init_cib_cache_cb(xmlNode * msg, int call_id, int rc, xmlNode * output, void *us + have_cib_devices = TRUE; + local_cib = copy_xml(output); + +- fencing_topology_init(msg); ++ fencing_topology_init(); + cib_devices_update(); + } + +@@ -1239,7 +1239,7 @@ st_peer_update_callback(enum crm_status_type type, crm_node_t * node, const void + * This is a hack until we can send to a nodeid and/or we fix node name lookups + * These messages are ignored in stonith_peer_callback() + */ +- xmlNode *query = query = create_xml_node(NULL, "stonith_command"); ++ xmlNode *query = create_xml_node(NULL, "stonith_command"); + + crm_xml_add(query, F_XML_TAGNAME, 
"stonith_command"); + crm_xml_add(query, F_TYPE, T_STONITH_NG); +diff --git a/fencing/remote.c b/fencing/remote.c +index 2c00b5f..d741672 100644 +--- a/fencing/remote.c ++++ b/fencing/remote.c +@@ -60,13 +60,13 @@ typedef struct device_properties_s { + /* The remaining members are indexed by the operation's "phase" */ + + /* Whether this device has been executed in each phase */ +- gboolean executed[3]; ++ gboolean executed[st_phase_max]; + /* Whether this device is disallowed from executing in each phase */ +- gboolean disallowed[3]; ++ gboolean disallowed[st_phase_max]; + /* Action-specific timeout for each phase */ +- int custom_action_timeout[3]; ++ int custom_action_timeout[st_phase_max]; + /* Action-specific maximum random delay for each phase */ +- int delay_max[3]; ++ int delay_max[st_phase_max]; + } device_properties_t; + + typedef struct st_query_result_s { +@@ -207,22 +207,6 @@ grab_peer_device(const remote_fencing_op_t *op, st_query_result_t *peer, + return TRUE; + } + +-/* +- * \internal +- * \brief Free the list of required devices for a particular phase +- * +- * \param[in,out] op Operation to modify +- * \param[in] phase Phase to modify +- */ +-static void +-free_required_list(remote_fencing_op_t *op, enum st_remap_phase phase) +-{ +- if (op->required_list[phase]) { +- g_list_free_full(op->required_list[phase], free); +- op->required_list[phase] = NULL; +- } +-} +- + static void + clear_remote_op_timers(remote_fencing_op_t * op) + { +@@ -268,9 +252,7 @@ free_remote_op(gpointer data) + g_list_free_full(op->devices_list, free); + op->devices_list = NULL; + } +- free_required_list(op, st_phase_requested); +- free_required_list(op, st_phase_off); +- free_required_list(op, st_phase_on); ++ g_list_free_full(op->automatic_list, free); + free(op); + } + +@@ -323,10 +305,10 @@ op_phase_on(remote_fencing_op_t *op) + op->phase = st_phase_on; + strcpy(op->action, "on"); + +- /* Any devices that are required for "on" will be automatically executed by +- * the 
cluster when the node next joins, so we skip them here. ++ /* Skip devices with automatic unfencing, because the cluster will handle it ++ * when the node rejoins. + */ +- for (iter = op->required_list[op->phase]; iter != NULL; iter = iter->next) { ++ for (iter = op->automatic_list; iter != NULL; iter = iter->next) { + GListPtr match = g_list_find_custom(op->devices_list, iter->data, + sort_strings); + +@@ -334,12 +316,8 @@ op_phase_on(remote_fencing_op_t *op) + op->devices_list = g_list_remove(op->devices_list, match->data); + } + } +- +- /* We know this level will succeed, because phase 1 completed successfully +- * and we ignore any errors from phase 2. So we can free the required list, +- * which will keep them from being executed after the device list is done. +- */ +- free_required_list(op, op->phase); ++ g_list_free_full(op->automatic_list, free); ++ op->automatic_list = NULL; + + /* Rewind device list pointer */ + op->devices = op->devices_list; +@@ -659,28 +637,25 @@ topology_is_empty(stonith_topology_t *tp) + + /* + * \internal +- * \brief Add a device to the required list for a particular phase ++ * \brief Add a device to an operation's automatic unfencing list + * + * \param[in,out] op Operation to modify +- * \param[in] phase Phase to modify + * \param[in] device Device ID to add + */ + static void +-add_required_device(remote_fencing_op_t *op, enum st_remap_phase phase, +- const char *device) ++add_required_device(remote_fencing_op_t *op, const char *device) + { +- GListPtr match = g_list_find_custom(op->required_list[phase], device, ++ GListPtr match = g_list_find_custom(op->automatic_list, device, + sort_strings); + + if (!match) { +- op->required_list[phase] = g_list_prepend(op->required_list[phase], +- strdup(device)); ++ op->automatic_list = g_list_prepend(op->automatic_list, strdup(device)); + } + } + + /* + * \internal +- * \brief Remove a device from the required list for the current phase ++ * \brief Remove a device from the automatic 
unfencing list + * + * \param[in,out] op Operation to modify + * \param[in] device Device ID to remove +@@ -688,12 +663,11 @@ add_required_device(remote_fencing_op_t *op, enum st_remap_phase phase, + static void + remove_required_device(remote_fencing_op_t *op, const char *device) + { +- GListPtr match = g_list_find_custom(op->required_list[op->phase], device, ++ GListPtr match = g_list_find_custom(op->automatic_list, device, + sort_strings); + + if (match) { +- op->required_list[op->phase] = g_list_remove(op->required_list[op->phase], +- match->data); ++ op->automatic_list = g_list_remove(op->automatic_list, match->data); + } + } + +@@ -938,7 +912,7 @@ create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer) + + op = calloc(1, sizeof(remote_fencing_op_t)); + +- crm_element_value_int(request, F_STONITH_TIMEOUT, (int *)&(op->base_timeout)); ++ crm_element_value_int(request, F_STONITH_TIMEOUT, &(op->base_timeout)); + + if (peer && dev) { + op->id = crm_element_value_copy(dev, F_STONITH_REMOTE_OP_ID); +@@ -974,7 +948,7 @@ create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer) + crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options); + op->call_options = call_options; + +- crm_element_value_int(request, F_STONITH_CALLID, (int *)&(op->client_callid)); ++ crm_element_value_int(request, F_STONITH_CALLID, &(op->client_callid)); + + crm_trace("%s new stonith op: %s - %s of %s for %s", + (peer +@@ -1352,14 +1326,17 @@ advance_op_topology(remote_fencing_op_t *op, const char *device, xmlNode *msg, + op->devices = op->devices->next; + } + +- /* If this device was required, it's not anymore */ +- remove_required_device(op, device); ++ /* Handle automatic unfencing if an "on" action was requested */ ++ if ((op->phase == st_phase_requested) && safe_str_eq(op->action, "on")) { ++ /* If the device we just executed was required, it's not anymore */ ++ remove_required_device(op, device); + +- /* If there are no more devices at 
this topology level, +- * run through any required devices not already executed +- */ +- if (op->devices == NULL) { +- op->devices = op->required_list[op->phase]; ++ /* If there are no more devices at this topology level, run through any ++ * remaining devices with automatic unfencing ++ */ ++ if (op->devices == NULL) { ++ op->devices = op->automatic_list; ++ } + } + + if ((op->devices == NULL) && (op->phase == st_phase_off)) { +@@ -1613,8 +1590,6 @@ parse_action_specific(xmlNode *xml, const char *peer, const char *device, + const char *action, remote_fencing_op_t *op, + enum st_remap_phase phase, device_properties_t *props) + { +- int required; +- + props->custom_action_timeout[phase] = 0; + crm_element_value_int(xml, F_STONITH_ACTION_TIMEOUT, + &props->custom_action_timeout[phase]); +@@ -1630,20 +1605,16 @@ parse_action_specific(xmlNode *xml, const char *peer, const char *device, + peer, device, props->delay_max[phase], action); + } + +- required = 0; +- crm_element_value_int(xml, F_STONITH_DEVICE_REQUIRED, &required); +- if (required) { +- /* If the action is marked as required, add the device to the +- * operation's list of required devices for this phase. We use this +- * for unfencing when executing a topology. In phase 0 (requested +- * action) or phase 1 (remapped "off"), required devices get executed +- * regardless of their topology level; in phase 2 (remapped "on"), +- * required devices are not attempted, because the cluster will +- * execute them automatically later. 
+- */ +- crm_trace("Peer %s requires device %s to execute for action %s", +- peer, device, action); +- add_required_device(op, phase, device); ++ /* Handle devices with automatic unfencing */ ++ if (safe_str_eq(action, "on")) { ++ int required = 0; ++ ++ crm_element_value_int(xml, F_STONITH_DEVICE_REQUIRED, &required); ++ if (required) { ++ crm_trace("Peer %s requires device %s to execute for action %s", ++ peer, device, action); ++ add_required_device(op, device); ++ } + } + + /* If a reboot is remapped to off+on, it's possible that a node is allowed +diff --git a/include/crm/cib.h b/include/crm/cib.h +index cb465bf..306706e 100644 +--- a/include/crm/cib.h ++++ b/include/crm/cib.h +@@ -136,6 +136,13 @@ typedef struct cib_api_operations_s { + void *user_data, const char *callback_name, + void (*callback) (xmlNode *, int, int, xmlNode *, void *)); + ++ gboolean (*register_callback_full)(cib_t *cib, int call_id, int timeout, ++ gboolean only_success, void *user_data, ++ const char *callback_name, ++ void (*callback)(xmlNode *, int, int, ++ xmlNode *, void *), ++ void (*free_func)(void *)); ++ + } cib_api_operations_t; + + struct cib_s { +diff --git a/include/crm/cib/internal.h b/include/crm/cib/internal.h +index 431a2bd..adc2faf 100644 +--- a/include/crm/cib/internal.h ++++ b/include/crm/cib/internal.h +@@ -106,7 +106,7 @@ typedef struct cib_callback_client_s { + void *user_data; + gboolean only_success; + struct timer_rec_s *timer; +- ++ void (*free_func)(void *); + } cib_callback_client_t; + + struct timer_rec_s { +@@ -137,6 +137,13 @@ int cib_native_register_notification(cib_t * cib, const char *callback, int enab + gboolean cib_client_register_callback(cib_t * cib, int call_id, int timeout, gboolean only_success, + void *user_data, const char *callback_name, + void (*callback) (xmlNode *, int, int, xmlNode *, void *)); ++gboolean cib_client_register_callback_full(cib_t *cib, int call_id, ++ int timeout, gboolean only_success, ++ void *user_data, ++ const char 
*callback_name, ++ void (*callback)(xmlNode *, int, int, ++ xmlNode *, void *), ++ void (*free_func)(void *)); + + int cib_process_query(const char *op, int options, const char *section, xmlNode * req, + xmlNode * input, xmlNode * existing_cib, xmlNode ** result_cib, +diff --git a/include/crm/common/ipc.h b/include/crm/common/ipc.h +index db83b09..d6ceda2 100644 +--- a/include/crm/common/ipc.h ++++ b/include/crm/common/ipc.h +@@ -75,7 +75,7 @@ long crm_ipc_read(crm_ipc_t * client); + const char *crm_ipc_buffer(crm_ipc_t * client); + uint32_t crm_ipc_buffer_flags(crm_ipc_t * client); + const char *crm_ipc_name(crm_ipc_t * client); +-int crm_ipc_default_buffer_size(void); ++unsigned int crm_ipc_default_buffer_size(void); + + /* Utils */ + xmlNode *create_hello_message(const char *uuid, const char *client_name, +diff --git a/include/crm/common/ipcs.h b/include/crm/common/ipcs.h +index b43fc53..d825912 100644 +--- a/include/crm/common/ipcs.h ++++ b/include/crm/common/ipcs.h +@@ -110,7 +110,7 @@ void crm_ipcs_send_ack(crm_client_t * c, uint32_t request, uint32_t flags, + const char *tag, const char *function, int line); + + /* when max_send_size is 0, default ipc buffer size is used */ +-ssize_t crm_ipc_prepare(uint32_t request, xmlNode * message, struct iovec **result, int32_t max_send_size); ++ssize_t crm_ipc_prepare(uint32_t request, xmlNode * message, struct iovec ** result, uint32_t max_send_size); + ssize_t crm_ipcs_send(crm_client_t * c, uint32_t request, xmlNode * message, enum crm_ipc_flags flags); + ssize_t crm_ipcs_sendv(crm_client_t * c, struct iovec *iov, enum crm_ipc_flags flags); + xmlNode *crm_ipcs_recv(crm_client_t * c, void *data, size_t size, uint32_t * id, uint32_t * flags); +diff --git a/lib/cib/cib_client.c b/lib/cib/cib_client.c +index b13323e..f7a19b8 100644 +--- a/lib/cib/cib_client.c ++++ b/lib/cib/cib_client.c +@@ -198,6 +198,11 @@ cib_destroy_op_callback(gpointer data) + g_source_remove(blob->timer->ref); + } + free(blob->timer); ++ ++ if 
(blob->user_data && blob->free_func) { ++ blob->free_func(blob->user_data); ++ } ++ + free(blob); + } + +@@ -327,10 +332,15 @@ cib_new(void) + return cib_native_new(); + } + +-/* this is backwards... +- cib_*_new should call this not the other way around ++/* ++ * \internal ++ * \brief Create a generic CIB connection instance ++ * ++ * \return Newly allocated and initialized cib_t instance ++ * ++ * \note This is called by each variant's cib_*_new() function before setting ++ * variant-specific values. + */ +- + cib_t * + cib_new_variant(void) + { +@@ -364,6 +374,7 @@ cib_new_variant(void) + new_cib->cmds->add_notify_callback = cib_client_add_notify_callback; + new_cib->cmds->del_notify_callback = cib_client_del_notify_callback; + new_cib->cmds->register_callback = cib_client_register_callback; ++ new_cib->cmds->register_callback_full = cib_client_register_callback_full; + + new_cib->cmds->noop = cib_client_noop; + new_cib->cmds->ping = cib_client_ping; +@@ -545,6 +556,19 @@ cib_client_register_callback(cib_t * cib, int call_id, int timeout, gboolean onl + void *user_data, const char *callback_name, + void (*callback) (xmlNode *, int, int, xmlNode *, void *)) + { ++ return cib_client_register_callback_full(cib, call_id, timeout, ++ only_success, user_data, ++ callback_name, callback, NULL); ++} ++ ++gboolean ++cib_client_register_callback_full(cib_t *cib, int call_id, int timeout, ++ gboolean only_success, void *user_data, ++ const char *callback_name, ++ void (*callback)(xmlNode *, int, int, ++ xmlNode *, void *), ++ void (*free_func)(void *)) ++{ + cib_callback_client_t *blob = NULL; + + if (call_id < 0) { +@@ -553,6 +577,9 @@ cib_client_register_callback(cib_t * cib, int call_id, int timeout, gboolean onl + } else { + crm_warn("CIB call failed: %s", pcmk_strerror(call_id)); + } ++ if (user_data && free_func) { ++ free_func(user_data); ++ } + return FALSE; + } + +@@ -561,6 +588,7 @@ cib_client_register_callback(cib_t * cib, int call_id, int timeout, gboolean onl 
+ blob->only_success = only_success; + blob->user_data = user_data; + blob->callback = callback; ++ blob->free_func = free_func; + + if (timeout > 0) { + struct timer_rec_s *async_timer = NULL; +diff --git a/lib/cib/cib_utils.c b/lib/cib/cib_utils.c +index d321517..4dc65aa 100644 +--- a/lib/cib/cib_utils.c ++++ b/lib/cib/cib_utils.c +@@ -624,12 +624,6 @@ cib_native_callback(cib_t * cib, xmlNode * msg, int call_id, int rc) + { + xmlNode *output = NULL; + cib_callback_client_t *blob = NULL; +- cib_callback_client_t local_blob; +- +- local_blob.id = NULL; +- local_blob.callback = NULL; +- local_blob.user_data = NULL; +- local_blob.only_success = FALSE; + + if (msg != NULL) { + crm_element_value_int(msg, F_CIB_RC, &rc); +@@ -638,16 +632,8 @@ cib_native_callback(cib_t * cib, xmlNode * msg, int call_id, int rc) + } + + blob = g_hash_table_lookup(cib_op_callback_table, GINT_TO_POINTER(call_id)); +- +- if (blob != NULL) { +- local_blob = *blob; +- blob = NULL; +- +- remove_cib_op_callback(call_id, FALSE); +- +- } else { ++ if (blob == NULL) { + crm_trace("No callback found for call %d", call_id); +- local_blob.callback = NULL; + } + + if (cib == NULL) { +@@ -659,15 +645,20 @@ cib_native_callback(cib_t * cib, xmlNode * msg, int call_id, int rc) + rc = pcmk_ok; + } + +- if (local_blob.callback != NULL && (rc == pcmk_ok || local_blob.only_success == FALSE)) { +- crm_trace("Invoking callback %s for call %d", crm_str(local_blob.id), call_id); +- local_blob.callback(msg, call_id, rc, output, local_blob.user_data); ++ if (blob && blob->callback && (rc == pcmk_ok || blob->only_success == FALSE)) { ++ crm_trace("Invoking callback %s for call %d", crm_str(blob->id), call_id); ++ blob->callback(msg, call_id, rc, output, blob->user_data); + + } else if (cib && cib->op_callback == NULL && rc != pcmk_ok) { + crm_warn("CIB command failed: %s", pcmk_strerror(rc)); + crm_log_xml_debug(msg, "Failed CIB Update"); + } + ++ /* This may free user_data, so do it after the callback */ ++ if 
(blob) { ++ remove_cib_op_callback(call_id, FALSE); ++ } ++ + if (cib && cib->op_callback != NULL) { + crm_trace("Invoking global callback for call %d", call_id); + cib->op_callback(msg, call_id, rc, output); +diff --git a/lib/cluster/legacy.c b/lib/cluster/legacy.c +index d93613d..e9905f6 100644 +--- a/lib/cluster/legacy.c ++++ b/lib/cluster/legacy.c +@@ -52,6 +52,21 @@ void *ais_ipc_ctx = NULL; + + hdb_handle_t ais_ipc_handle = 0; + ++static bool valid_cman_name(const char *name, uint32_t nodeid) ++{ ++ bool rc = TRUE; ++ ++ /* Yes, %d, because that's what CMAN does */ ++ char *fakename = crm_strdup_printf("Node%d", nodeid); ++ ++ if(crm_str_eq(fakename, name, TRUE)) { ++ rc = FALSE; ++ crm_notice("Ignoring inferred name from cman: %s", fakename); ++ } ++ free(fakename); ++ return rc; ++} ++ + static gboolean + plugin_get_details(uint32_t * id, char **uname) + { +@@ -361,6 +376,7 @@ cman_event_callback(cman_handle_t handle, void *privdata, int reason, int arg) + arg ? "retained" : "still lost"); + } + ++ memset(cman_nodes, 0, MAX_NODES * sizeof(cman_node_t)); + rc = cman_get_nodes(pcmk_cman_handle, MAX_NODES, &node_count, cman_nodes); + if (rc < 0) { + crm_err("Couldn't query cman node list: %d %d", rc, errno); +@@ -369,6 +385,7 @@ cman_event_callback(cman_handle_t handle, void *privdata, int reason, int arg) + + for (lpc = 0; lpc < node_count; lpc++) { + crm_node_t *peer = NULL; ++ const char *name = NULL; + + if (cman_nodes[lpc].cn_nodeid == 0) { + /* Never allow node ID 0 to be considered a member #315711 */ +@@ -376,7 +393,11 @@ cman_event_callback(cman_handle_t handle, void *privdata, int reason, int arg) + continue; + } + +- peer = crm_get_peer(cman_nodes[lpc].cn_nodeid, cman_nodes[lpc].cn_name); ++ if(valid_cman_name(cman_nodes[lpc].cn_name, cman_nodes[lpc].cn_nodeid)) { ++ name = cman_nodes[lpc].cn_name; ++ } ++ ++ peer = crm_get_peer(cman_nodes[lpc].cn_nodeid, name); + if(cman_nodes[lpc].cn_member) { + crm_update_peer_state(__FUNCTION__, peer, 
CRM_NODE_MEMBER, crm_peer_seq); + +@@ -631,15 +652,17 @@ cman_node_name(uint32_t nodeid) + + cman = cman_init(NULL); + if (cman != NULL && cman_is_active(cman)) { +- us.cn_name[0] = 0; ++ ++ memset(&us, 0, sizeof(cman_node_t)); + cman_get_node(cman, nodeid, &us); +- name = strdup(us.cn_name); +- crm_info("Using CMAN node name %s for %u", name, nodeid); +- } ++ if(valid_cman_name(us.cn_name, nodeid)) { ++ name = strdup(us.cn_name); ++ crm_info("Using CMAN node name %s for %u", name, nodeid); ++ } ++ } + + cman_finish(cman); + # endif +- + if (name == NULL) { + crm_debug("Unable to get node name for nodeid %u", nodeid); + } +@@ -667,7 +690,6 @@ init_cs_connection_once(crm_cluster_t * cluster) + if (cluster_connect_cpg(cluster) == FALSE) { + return FALSE; + } +- cluster->uname = cman_node_name(0 /* CMAN_NODEID_US */ ); + break; + case pcmk_cluster_heartbeat: + crm_info("Could not find an active corosync based cluster"); +diff --git a/lib/common/ipc.c b/lib/common/ipc.c +index d71c54a..f4188ed 100644 +--- a/lib/common/ipc.c ++++ b/lib/common/ipc.c +@@ -46,8 +46,8 @@ struct crm_ipc_response_header { + }; + + static int hdr_offset = 0; +-static int ipc_buffer_max = 0; +-static unsigned int pick_ipc_buffer(int max); ++static unsigned int ipc_buffer_max = 0; ++static unsigned int pick_ipc_buffer(unsigned int max); + + static inline void + crm_ipc_init(void) +@@ -60,7 +60,7 @@ crm_ipc_init(void) + } + } + +-int ++unsigned int + crm_ipc_default_buffer_size(void) + { + return pick_ipc_buffer(0); +@@ -91,7 +91,7 @@ generateReference(const char *custom1, const char *custom2) + since_epoch = calloc(1, reference_len); + + if (since_epoch != NULL) { +- sprintf(since_epoch, "%s-%s-%ld-%u", ++ sprintf(since_epoch, "%s-%s-%lu-%u", + local_cust1, local_cust2, (unsigned long)time(NULL), ref_counter++); + } + +@@ -431,7 +431,7 @@ crm_ipcs_recv(crm_client_t * c, void *data, size_t size, uint32_t * id, uint32_t + unsigned int size_u = 1 + header->size_uncompressed; + uncompressed = 
calloc(1, size_u); + +- crm_trace("Decompressing message data %d bytes into %d bytes", ++ crm_trace("Decompressing message data %u bytes into %u bytes", + header->size_compressed, size_u); + + rc = BZ2_bzBuffToBuffDecompress(uncompressed, &size_u, text, header->size_compressed, 1, 0); +@@ -531,9 +531,9 @@ crm_ipcs_flush_events(crm_client_t * c) + } + + ssize_t +-crm_ipc_prepare(uint32_t request, xmlNode * message, struct iovec ** result, int32_t max_send_size) ++crm_ipc_prepare(uint32_t request, xmlNode * message, struct iovec ** result, uint32_t max_send_size) + { +- static int biggest = 0; ++ static unsigned int biggest = 0; + struct iovec *iov; + unsigned int total = 0; + char *compressed = NULL; +@@ -579,20 +579,18 @@ crm_ipc_prepare(uint32_t request, xmlNode * message, struct iovec ** result, int + + free(buffer); + +- if (header->size_compressed > biggest) { +- biggest = 2 * QB_MAX(header->size_compressed, biggest); +- } ++ biggest = QB_MAX(header->size_compressed, biggest); + + } else { + ssize_t rc = -EMSGSIZE; + + crm_log_xml_trace(message, "EMSGSIZE"); +- biggest = 2 * QB_MAX(header->size_uncompressed, biggest); ++ biggest = QB_MAX(header->size_uncompressed, biggest); + + crm_err +- ("Could not compress the message into less than the configured ipc limit (%d bytes)." +- "Set PCMK_ipc_buffer to a higher value (%d bytes suggested)", max_send_size, +- biggest); ++ ("Could not compress the message (%u bytes) into less than the configured ipc limit (%u bytes). 
" ++ "Set PCMK_ipc_buffer to a higher value (%u bytes suggested)", ++ header->size_uncompressed, max_send_size, 4 * biggest); + + free(compressed); + free(buffer); +@@ -656,7 +654,7 @@ crm_ipcs_sendv(crm_client_t * c, struct iovec * iov, enum crm_ipc_flags flags) + + rc = qb_ipcs_response_sendv(c->ipcs, iov, 2); + if (rc < header->qb.size) { +- crm_notice("Response %d to %p[%d] (%d bytes) failed: %s (%d)", ++ crm_notice("Response %d to %p[%d] (%u bytes) failed: %s (%d)", + header->qb.id, c->ipcs, c->pid, header->qb.size, pcmk_strerror(rc), rc); + + } else { +@@ -747,9 +745,9 @@ struct crm_ipc_s { + }; + + static unsigned int +-pick_ipc_buffer(int max) ++pick_ipc_buffer(unsigned int max) + { +- static int global_max = 0; ++ static unsigned int global_max = 0; + + if(global_max == 0) { + const char *env = getenv("PCMK_ipc_buffer"); +@@ -925,7 +923,7 @@ crm_ipc_decompress(crm_ipc_t * client) + unsigned int new_buf_size = QB_MAX((hdr_offset + size_u), client->max_buf_size); + char *uncompressed = calloc(1, new_buf_size); + +- crm_trace("Decompressing message data %d bytes into %d bytes", ++ crm_trace("Decompressing message data %u bytes into %u bytes", + header->size_compressed, size_u); + + rc = BZ2_bzBuffToBuffDecompress(uncompressed + hdr_offset, &size_u, +@@ -986,7 +984,7 @@ crm_ipc_read(crm_ipc_t * client) + return -EBADMSG; + } + +- crm_trace("Received %s event %d, size=%d, rc=%d, text: %.100s", ++ crm_trace("Received %s event %d, size=%u, rc=%d, text: %.100s", + client->name, header->qb.id, header->qb.size, client->msg_size, + client->buffer + hdr_offset); + +@@ -1166,9 +1164,9 @@ crm_ipc_send(crm_ipc_t * client, xmlNode * message, enum crm_ipc_flags flags, in + + if(header->size_compressed) { + if(factor < 10 && (client->max_buf_size / 10) < (rc / factor)) { +- crm_notice("Compressed message exceeds %d0%% of the configured ipc limit (%d bytes), " +- "consider setting PCMK_ipc_buffer to %d or higher", +- factor, client->max_buf_size, 2*client->max_buf_size); ++ 
crm_notice("Compressed message exceeds %d0%% of the configured ipc limit (%u bytes), " ++ "consider setting PCMK_ipc_buffer to %u or higher", ++ factor, client->max_buf_size, 2 * client->max_buf_size); + factor++; + } + } +@@ -1211,7 +1209,7 @@ crm_ipc_send(crm_ipc_t * client, xmlNode * message, enum crm_ipc_flags flags, in + if (rc > 0) { + struct crm_ipc_response_header *hdr = (struct crm_ipc_response_header *)(void*)client->buffer; + +- crm_trace("Received response %d, size=%d, rc=%ld, text: %.200s", hdr->qb.id, hdr->qb.size, ++ crm_trace("Received response %d, size=%u, rc=%ld, text: %.200s", hdr->qb.id, hdr->qb.size, + rc, crm_ipc_buffer(client)); + + if (reply) { +diff --git a/lib/common/xml.c b/lib/common/xml.c +index 8eed245..299c7bf 100644 +--- a/lib/common/xml.c ++++ b/lib/common/xml.c +@@ -3821,6 +3821,7 @@ crm_xml_dump(xmlNode * data, int options, char **buffer, int *offset, int *max, + if(data == NULL) { + *offset = 0; + *max = 0; ++ return; + } + #if 0 + if (is_not_set(options, xml_log_option_filtered)) { +@@ -5621,7 +5622,7 @@ update_validation(xmlNode ** xml_blob, int *best, int max, gboolean transform, g + break; + + } else if (known_schemas[lpc].transform == NULL) { +- crm_notice("%s-style configuration is also valid for %s", ++ crm_debug("%s-style configuration is also valid for %s", + known_schemas[lpc].name, known_schemas[next].name); + + if (validate_with(xml, next, to_logs)) { +diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c +index f5e34ee..42bdf2b 100644 +--- a/lib/lrmd/lrmd_client.c ++++ b/lib/lrmd/lrmd_client.c +@@ -1369,7 +1369,7 @@ lrmd_api_disconnect(lrmd_t * lrmd) + { + lrmd_private_t *native = lrmd->private; + +- crm_info("Disconnecting from lrmd service"); ++ crm_info("Disconnecting from %d lrmd service", native->type); + switch (native->type) { + case CRM_CLIENT_IPC: + lrmd_ipc_disconnect(lrmd); +diff --git a/lib/services/dbus.c b/lib/services/dbus.c +index e2efecb..d42affe 100644 +--- a/lib/services/dbus.c ++++ 
b/lib/services/dbus.c +@@ -329,9 +329,6 @@ pcmk_dbus_lookup_cb(DBusPendingCall *pending, void *user_data) + + pcmk_dbus_lookup_result(reply, user_data); + +- if(pending) { +- dbus_pending_call_unref(pending); +- } + if(reply) { + dbus_message_unref(reply); + } +diff --git a/lib/services/services.c b/lib/services/services.c +index 7e2b9f7..3f40078 100644 +--- a/lib/services/services.c ++++ b/lib/services/services.c +@@ -150,6 +150,7 @@ resources_action_create(const char *name, const char *standard, const char *prov + + op = calloc(1, sizeof(svc_action_t)); + op->opaque = calloc(1, sizeof(svc_action_private_t)); ++ op->opaque->pending = NULL; + op->rsc = strdup(name); + op->action = strdup(action); + op->interval = interval; +@@ -158,6 +159,7 @@ resources_action_create(const char *name, const char *standard, const char *prov + op->agent = strdup(agent); + op->sequence = ++operations; + op->flags = flags; ++ + if (asprintf(&op->id, "%s_%s_%d", name, action, interval) == -1) { + goto return_error; + } +@@ -335,6 +337,7 @@ services_action_create_generic(const char *exec, const char *args[]) + + op->opaque->exec = strdup(exec); + op->opaque->args[0] = strdup(exec); ++ op->opaque->pending = NULL; + + for (cur_arg = 1; args && args[cur_arg - 1]; cur_arg++) { + op->opaque->args[cur_arg] = strdup(args[cur_arg - 1]); +@@ -361,17 +364,17 @@ services_set_op_pending(svc_action_t *op, DBusPendingCall *pending) + { + if (op->opaque->pending && (op->opaque->pending != pending)) { + if (pending) { +- crm_info("Lost pending DBus call (%p)", op->opaque->pending); ++ crm_info("Lost pending %s DBus call (%p)", op->id, op->opaque->pending); + } else { +- crm_trace("Done with pending DBus call (%p)", op->opaque->pending); ++ crm_info("Done with pending %s DBus call (%p)", op->id, op->opaque->pending); + } + dbus_pending_call_unref(op->opaque->pending); + } + op->opaque->pending = pending; + if (pending) { +- crm_trace("Updated pending DBus call (%p)", pending); ++ crm_info("Updated 
pending %s DBus call (%p)", op->id, pending); + } else { +- crm_trace("Cleared pending DBus call"); ++ crm_info("Cleared pending %s DBus call", op->id); + } + } + #endif +@@ -457,7 +460,7 @@ services_action_free(svc_action_t * op) + gboolean + cancel_recurring_action(svc_action_t * op) + { +- crm_info("Cancelling operation %s", op->id); ++ crm_info("Cancelling %s operation %s", op->standard, op->id); + + if (recurring_actions) { + g_hash_table_remove(recurring_actions, op->id); +diff --git a/lib/services/systemd.c b/lib/services/systemd.c +index e1e1bc9..ca56915 100644 +--- a/lib/services/systemd.c ++++ b/lib/services/systemd.c +@@ -189,16 +189,13 @@ systemd_loadunit_cb(DBusPendingCall *pending, void *user_data) + reply = dbus_pending_call_steal_reply(pending); + } + +- if(op) { +- crm_trace("Got result: %p for %p for %s, %s", reply, pending, op->rsc, op->action); +- } else { +- crm_trace("Got result: %p for %p", reply, pending); +- } ++ crm_trace("Got result: %p for %p / %p for %s", reply, pending, op->opaque->pending, op->id); ++ ++ CRM_LOG_ASSERT(pending == op->opaque->pending); ++ services_set_op_pending(op, NULL); ++ + systemd_loadunit_result(reply, user_data); + +- if(pending) { +- dbus_pending_call_unref(pending); +- } + if(reply) { + dbus_message_unref(reply); + } +@@ -209,6 +206,7 @@ systemd_unit_by_name(const gchar * arg_name, svc_action_t *op) + { + DBusMessage *msg; + DBusMessage *reply = NULL; ++ DBusPendingCall* pending = NULL; + char *name = NULL; + + /* +@@ -249,7 +247,11 @@ systemd_unit_by_name(const gchar * arg_name, svc_action_t *op) + return munit; + } + +- pcmk_dbus_send(msg, systemd_proxy, systemd_loadunit_cb, op, op? 
op->timeout : DBUS_TIMEOUT_USE_DEFAULT); ++ pending = pcmk_dbus_send(msg, systemd_proxy, systemd_loadunit_cb, op, op->timeout); ++ if(pending) { ++ services_set_op_pending(op, pending); ++ } ++ + dbus_message_unref(msg); + return NULL; + } +@@ -459,23 +461,12 @@ systemd_async_dispatch(DBusPendingCall *pending, void *user_data) + reply = dbus_pending_call_steal_reply(pending); + } + +- if(op) { +- crm_trace("Got result: %p for %p for %s, %s", reply, pending, op->rsc, op->action); +- if (pending == op->opaque->pending) { +- op->opaque->pending = NULL; +- } else { +- crm_info("Received unexpected reply for pending DBus call (%p vs %p)", +- op->opaque->pending, pending); +- } +- systemd_exec_result(reply, op); ++ crm_trace("Got result: %p for %p for %s, %s", reply, pending, op->rsc, op->action); + +- } else { +- crm_trace("Got result: %p for %p", reply, pending); +- } ++ CRM_LOG_ASSERT(pending == op->opaque->pending); ++ services_set_op_pending(op, NULL); ++ systemd_exec_result(reply, op); + +- if(pending) { +- dbus_pending_call_unref(pending); +- } + if(reply) { + dbus_message_unref(reply); + } +@@ -536,7 +527,6 @@ systemd_unit_exec_with_unit(svc_action_t * op, const char *unit) + free(state); + return op->rc == PCMK_OCF_OK; + } else if (pending) { +- dbus_pending_call_ref(pending); + services_set_op_pending(op, pending); + return TRUE; + } +diff --git a/lib/services/upstart.c b/lib/services/upstart.c +index 31b875b..eb8cfa8 100644 +--- a/lib/services/upstart.c ++++ b/lib/services/upstart.c +@@ -322,10 +322,7 @@ upstart_job_check(const char *name, const char *state, void *userdata) + } + + if (op->synchronous == FALSE) { +- if (op->opaque->pending) { +- dbus_pending_call_unref(op->opaque->pending); +- } +- op->opaque->pending = NULL; ++ services_set_op_pending(op, NULL); + operation_finalize(op); + } + } +@@ -392,6 +389,7 @@ upstart_async_dispatch(DBusPendingCall *pending, void *user_data) + if(pending) { + reply = dbus_pending_call_steal_reply(pending); + } ++ + 
if(pcmk_dbus_find_error(op->action, pending, reply, &error)) { + + /* ignore "already started" or "not running" errors */ +@@ -419,11 +417,10 @@ upstart_async_dispatch(DBusPendingCall *pending, void *user_data) + } + } + ++ CRM_LOG_ASSERT(pending == op->opaque->pending); ++ services_set_op_pending(op, NULL); + operation_finalize(op); + +- if(pending) { +- dbus_pending_call_unref(pending); +- } + if(reply) { + dbus_message_unref(reply); + } +@@ -483,8 +480,7 @@ upstart_job_exec(svc_action_t * op, gboolean synchronous) + free(state); + return op->rc == PCMK_OCF_OK; + } else if (pending) { +- dbus_pending_call_ref(pending); +- op->opaque->pending = pending; ++ services_set_op_pending(op, pending); + return TRUE; + } + return FALSE; +@@ -527,8 +523,7 @@ upstart_job_exec(svc_action_t * op, gboolean synchronous) + free(job); + + if(pending) { +- dbus_pending_call_ref(pending); +- op->opaque->pending = pending; ++ services_set_op_pending(op, pending); + return TRUE; + } + return FALSE; +diff --git a/lrmd/ipc_proxy.c b/lrmd/ipc_proxy.c +index 72d83c4..9427393 100644 +--- a/lrmd/ipc_proxy.c ++++ b/lrmd/ipc_proxy.c +@@ -165,14 +165,14 @@ ipc_proxy_forward_client(crm_client_t *ipc_proxy, xmlNode *xml) + */ + + if (safe_str_eq(msg_type, "event")) { +- crm_info("Sending event to %s", ipc_client->id); ++ crm_trace("Sending event to %s", ipc_client->id); + rc = crm_ipcs_send(ipc_client, 0, msg, crm_ipc_server_event); + + } else if (safe_str_eq(msg_type, "response")) { + int msg_id = 0; + + crm_element_value_int(xml, F_LRMD_IPC_MSG_ID, &msg_id); +- crm_info("Sending response to %d - %s", ipc_client->request_id, ipc_client->id); ++ crm_trace("Sending response to %d - %s", ipc_client->request_id, ipc_client->id); + rc = crm_ipcs_send(ipc_client, msg_id, msg, FALSE); + + CRM_LOG_ASSERT(msg_id == ipc_client->request_id); +diff --git a/lrmd/pacemaker_remote.service.in b/lrmd/pacemaker_remote.service.in +index 7ec42b4..15e61fb 100644 +--- a/lrmd/pacemaker_remote.service.in ++++ 
b/lrmd/pacemaker_remote.service.in +@@ -9,7 +9,6 @@ WantedBy=multi-user.target + Type=simple + KillMode=process + NotifyAccess=none +-SysVStartPriority=99 + EnvironmentFile=-/etc/sysconfig/pacemaker + + ExecStart=@sbindir@/pacemaker_remoted +diff --git a/mcp/pacemaker.service.in b/mcp/pacemaker.service.in +index 2ef9454..9b0a824 100644 +--- a/mcp/pacemaker.service.in ++++ b/mcp/pacemaker.service.in +@@ -20,7 +20,6 @@ WantedBy=multi-user.target + Type=simple + KillMode=process + NotifyAccess=main +-SysVStartPriority=99 + EnvironmentFile=-@sysconfdir@/sysconfig/pacemaker + EnvironmentFile=-@sysconfdir@/sysconfig/sbd + SuccessExitStatus=100 +diff --git a/pengine/allocate.c b/pengine/allocate.c +index ec5a18d..c2e56f9 100644 +--- a/pengine/allocate.c ++++ b/pengine/allocate.c +@@ -1495,11 +1495,12 @@ stage6(pe_working_set_t * data_set) + } + } + +- if (last_stonith) { +- order_actions(last_stonith, done, pe_order_implies_then); + +- } else if (dc_fence) { ++ if (dc_fence) { + order_actions(dc_down, done, pe_order_implies_then); ++ ++ } else if (last_stonith) { ++ order_actions(last_stonith, done, pe_order_implies_then); + } + + order_actions(done, all_stopped, pe_order_implies_then); +diff --git a/pengine/test10/rec-node-14.dot b/pengine/test10/rec-node-14.dot +index 395fa89..5ceef92 100644 +--- a/pengine/test10/rec-node-14.dot ++++ b/pengine/test10/rec-node-14.dot +@@ -2,9 +2,9 @@ + "all_stopped" [ style=bold color="green" fontcolor="orange" ] + "stonith 'reboot' node1" -> "stonith 'reboot' node3" [ style = bold] + "stonith 'reboot' node1" [ style=bold color="green" fontcolor="black"] ++"stonith 'reboot' node2" -> "stonith_complete" [ style = bold] + "stonith 'reboot' node2" [ style=bold color="green" fontcolor="black"] + "stonith 'reboot' node3" -> "stonith 'reboot' node2" [ style = bold] +-"stonith 'reboot' node3" -> "stonith_complete" [ style = bold] + "stonith 'reboot' node3" [ style=bold color="green" fontcolor="black"] + "stonith_complete" -> "all_stopped" [ 
style = bold] + "stonith_complete" [ style=bold color="green" fontcolor="orange" ] +diff --git a/pengine/test10/rec-node-14.exp b/pengine/test10/rec-node-14.exp +index 58bb5ca..0e5e163 100644 +--- a/pengine/test10/rec-node-14.exp ++++ b/pengine/test10/rec-node-14.exp +@@ -39,7 +39,7 @@ + + + +- ++ + + + +diff --git a/pengine/test10/stonith-0.dot b/pengine/test10/stonith-0.dot +index 29cdd59..8ad32fd 100644 +--- a/pengine/test10/stonith-0.dot ++++ b/pengine/test10/stonith-0.dot +@@ -71,13 +71,13 @@ digraph "g" { + "stonith 'reboot' c001n03" -> "ocf_192.168.100.181_stop_0 c001n03" [ style = bold] + "stonith 'reboot' c001n03" -> "ocf_192.168.100.183_stop_0 c001n03" [ style = bold] + "stonith 'reboot' c001n03" -> "rsc_c001n07_stop_0 c001n03" [ style = bold] ++"stonith 'reboot' c001n03" -> "stonith_complete" [ style = bold] + "stonith 'reboot' c001n03" [ style=bold color="green" fontcolor="black"] + "stonith 'reboot' c001n05" -> "group-1_stop_0" [ style = bold] + "stonith 'reboot' c001n05" -> "ocf_192.168.100.181_stop_0 c001n05" [ style = bold] + "stonith 'reboot' c001n05" -> "ocf_192.168.100.183_stop_0 c001n05" [ style = bold] + "stonith 'reboot' c001n05" -> "rsc_c001n05_stop_0 c001n05" [ style = bold] + "stonith 'reboot' c001n05" -> "stonith 'reboot' c001n03" [ style = bold] +-"stonith 'reboot' c001n05" -> "stonith_complete" [ style = bold] + "stonith 'reboot' c001n05" [ style=bold color="green" fontcolor="black"] + "stonith_complete" -> "all_stopped" [ style = bold] + "stonith_complete" -> "heartbeat_192.168.100.182_start_0 c001n02" [ style = bold] +diff --git a/pengine/test10/stonith-0.exp b/pengine/test10/stonith-0.exp +index 9d47215..a6695c9 100644 +--- a/pengine/test10/stonith-0.exp ++++ b/pengine/test10/stonith-0.exp +@@ -394,7 +394,7 @@ + + + +- ++ + + + +diff --git a/pengine/test10/systemhealth1.dot b/pengine/test10/systemhealth1.dot +index 28841b7..a29f519 100644 +--- a/pengine/test10/systemhealth1.dot ++++ b/pengine/test10/systemhealth1.dot +@@ -1,8 +1,8 @@ 
+ digraph "g" { + "all_stopped" [ style=bold color="green" fontcolor="orange" ] ++"stonith 'reboot' hs21c" -> "stonith_complete" [ style = bold] + "stonith 'reboot' hs21c" [ style=bold color="green" fontcolor="black"] + "stonith 'reboot' hs21d" -> "stonith 'reboot' hs21c" [ style = bold] +-"stonith 'reboot' hs21d" -> "stonith_complete" [ style = bold] + "stonith 'reboot' hs21d" [ style=bold color="green" fontcolor="black"] + "stonith_complete" -> "all_stopped" [ style = bold] + "stonith_complete" [ style=bold color="green" fontcolor="orange" ] +diff --git a/pengine/test10/systemhealth1.exp b/pengine/test10/systemhealth1.exp +index 80a2329..aa2afe1 100644 +--- a/pengine/test10/systemhealth1.exp ++++ b/pengine/test10/systemhealth1.exp +@@ -27,7 +27,7 @@ + + + +- ++ + + + +diff --git a/pengine/test10/systemhealthm1.dot b/pengine/test10/systemhealthm1.dot +index 28841b7..a29f519 100644 +--- a/pengine/test10/systemhealthm1.dot ++++ b/pengine/test10/systemhealthm1.dot +@@ -1,8 +1,8 @@ + digraph "g" { + "all_stopped" [ style=bold color="green" fontcolor="orange" ] ++"stonith 'reboot' hs21c" -> "stonith_complete" [ style = bold] + "stonith 'reboot' hs21c" [ style=bold color="green" fontcolor="black"] + "stonith 'reboot' hs21d" -> "stonith 'reboot' hs21c" [ style = bold] +-"stonith 'reboot' hs21d" -> "stonith_complete" [ style = bold] + "stonith 'reboot' hs21d" [ style=bold color="green" fontcolor="black"] + "stonith_complete" -> "all_stopped" [ style = bold] + "stonith_complete" [ style=bold color="green" fontcolor="orange" ] +diff --git a/pengine/test10/systemhealthm1.exp b/pengine/test10/systemhealthm1.exp +index 80a2329..aa2afe1 100644 +--- a/pengine/test10/systemhealthm1.exp ++++ b/pengine/test10/systemhealthm1.exp +@@ -27,7 +27,7 @@ + + + +- ++ + + + +diff --git a/pengine/test10/systemhealthn1.dot b/pengine/test10/systemhealthn1.dot +index 28841b7..a29f519 100644 +--- a/pengine/test10/systemhealthn1.dot ++++ b/pengine/test10/systemhealthn1.dot +@@ -1,8 +1,8 @@ + 
digraph "g" { + "all_stopped" [ style=bold color="green" fontcolor="orange" ] ++"stonith 'reboot' hs21c" -> "stonith_complete" [ style = bold] + "stonith 'reboot' hs21c" [ style=bold color="green" fontcolor="black"] + "stonith 'reboot' hs21d" -> "stonith 'reboot' hs21c" [ style = bold] +-"stonith 'reboot' hs21d" -> "stonith_complete" [ style = bold] + "stonith 'reboot' hs21d" [ style=bold color="green" fontcolor="black"] + "stonith_complete" -> "all_stopped" [ style = bold] + "stonith_complete" [ style=bold color="green" fontcolor="orange" ] +diff --git a/pengine/test10/systemhealthn1.exp b/pengine/test10/systemhealthn1.exp +index 80a2329..aa2afe1 100644 +--- a/pengine/test10/systemhealthn1.exp ++++ b/pengine/test10/systemhealthn1.exp +@@ -27,7 +27,7 @@ + + + +- ++ + + + +diff --git a/pengine/test10/systemhealtho1.dot b/pengine/test10/systemhealtho1.dot +index 28841b7..a29f519 100644 +--- a/pengine/test10/systemhealtho1.dot ++++ b/pengine/test10/systemhealtho1.dot +@@ -1,8 +1,8 @@ + digraph "g" { + "all_stopped" [ style=bold color="green" fontcolor="orange" ] ++"stonith 'reboot' hs21c" -> "stonith_complete" [ style = bold] + "stonith 'reboot' hs21c" [ style=bold color="green" fontcolor="black"] + "stonith 'reboot' hs21d" -> "stonith 'reboot' hs21c" [ style = bold] +-"stonith 'reboot' hs21d" -> "stonith_complete" [ style = bold] + "stonith 'reboot' hs21d" [ style=bold color="green" fontcolor="black"] + "stonith_complete" -> "all_stopped" [ style = bold] + "stonith_complete" [ style=bold color="green" fontcolor="orange" ] +diff --git a/pengine/test10/systemhealtho1.exp b/pengine/test10/systemhealtho1.exp +index 80a2329..aa2afe1 100644 +--- a/pengine/test10/systemhealtho1.exp ++++ b/pengine/test10/systemhealtho1.exp +@@ -27,7 +27,7 @@ + + + +- ++ + + + +diff --git a/pengine/test10/systemhealthp1.dot b/pengine/test10/systemhealthp1.dot +index 28841b7..a29f519 100644 +--- a/pengine/test10/systemhealthp1.dot ++++ b/pengine/test10/systemhealthp1.dot +@@ -1,8 +1,8 @@ + 
digraph "g" { + "all_stopped" [ style=bold color="green" fontcolor="orange" ] ++"stonith 'reboot' hs21c" -> "stonith_complete" [ style = bold] + "stonith 'reboot' hs21c" [ style=bold color="green" fontcolor="black"] + "stonith 'reboot' hs21d" -> "stonith 'reboot' hs21c" [ style = bold] +-"stonith 'reboot' hs21d" -> "stonith_complete" [ style = bold] + "stonith 'reboot' hs21d" [ style=bold color="green" fontcolor="black"] + "stonith_complete" -> "all_stopped" [ style = bold] + "stonith_complete" [ style=bold color="green" fontcolor="orange" ] +diff --git a/pengine/test10/systemhealthp1.exp b/pengine/test10/systemhealthp1.exp +index 80a2329..aa2afe1 100644 +--- a/pengine/test10/systemhealthp1.exp ++++ b/pengine/test10/systemhealthp1.exp +@@ -27,7 +27,7 @@ + + + +- ++ + + + +diff --git a/tools/1node2heartbeat b/tools/1node2heartbeat +deleted file mode 100755 +index b63a0c8..0000000 +--- a/tools/1node2heartbeat ++++ /dev/null +@@ -1,326 +0,0 @@ +-#!/usr/bin/python +-# +-# Program to determine current list of enabled services for init state 3 +-# and create heartbeat CRM configuration for heartbeat to manage them +-# +-__copyright__=''' +-Author: Alan Robertson +-Copyright (C) 2006 International Business Machines +-''' +- +-# This program is free software; you can redistribute it and/or +-# modify it under the terms of the GNU General Public License +-# as published by the Free Software Foundation; either version 2 +-# of the License, or (at your option) any later version. +-# +-# This program is distributed in the hope that it will be useful, +-# but WITHOUT ANY WARRANTY; without even the implied warranty of +-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +-# GNU General Public License for more details. +-# +-# You should have received a copy of the GNU General Public License +-# along with this program; if not, write to the Free Software +-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+-import os,re +-# +-# Here's the plan: +-# Find out the default run level +-# Find out what (additional?) services are enabled in that run level +-# Figure out which of them start after the network (or heartbeat?) +-# Ignore heartbeat :-) +-# Figure out which services supply the $services +-# Look to see if the SUSE /etc/insserv.conf file exists +-# If so, then scan it for who provides the $services +-# defined by the LSB +-# If we're on Red Hat, then make some Red Hat type assumptions +-# (whatever those might be) +-# If we're not, then make some generic assumptions... +-# Scan the init scripts for their dependencies... +-# Eliminate anything at or before 'network'. +-# Create resources corresponding to all active services +-# Include monitor actions for those services +-# that can be started after 'network' +-# Add the start-after dependencies +-# +-# Things to consider doing in the future: +-# Constrain them to only run on the local system? +-# Put them all in a convenience group (no colocation, no ordering) +-# Add start and stop timeouts +- +-ServiceKeywords = {} +-ServiceMap = {} +-ProvidesMap = {} +-RequiresMap = {} +-SkipMap = {'heartbeat': None, 'random': None} +-NoMonitor = {'microcode': None} +-PreReqs = ['network'] +-IgnoreList = [] +-sysname = os.uname()[1] +-InitDir = "/etc/init.d" +- +-def service_is_hb_compatible(service): +- scriptname = os.path.join(InitDir, service) +- command=scriptname + " status >/dev/null 2>&1"; +- rc = os.system(command) +- return rc == 0 +- +-def find_ordered_services(dir): +- allscripts = os.listdir(dir) +- allscripts.sort() +- services = [] +- for entry in allscripts: +- matchobj = re.match("S[0-9]+(.*)", entry) +- if not matchobj: +- continue +- service = matchobj.group(1) +- if SkipMap.has_key(service): +- continue +- if service_is_hb_compatible(service): +- services.append(service) +- else: +- IgnoreList.append(service) +- return services +- +- +-def register_services(initdir, services): +- for service in services: +- 
if not ServiceMap.has_key(service): +- ServiceMap[service] = os.path.join(initdir, service) +- for service in services: +- script_dependency_scan(service, os.path.join(initdir, service), ServiceMap) +- +-# +-# From the LSB version 3.1: "Comment Conventions for Init Scripts" +-# +-### BEGIN INIT INFO +-### END INIT INFO +-# +-# The delimiter lines may contain trailing whitespace, which shall be ignored. +-# All lines inside the block shall begin with a hash character '#' in the +-# first column, so the shell interprets them as comment lines which do not +-# affect operation of the script. The lines shall be of the form: +-# {keyword}: arg1 [arg2...] +-# with exactly one space character between the '#' and the keyword, with a +-# single exception. In lines following a line containing the Description +-# keyword, and until the next keyword or block ending delimiter is seen, +-# a line where the '#' is followed by more than one space or a tab +-# character shall be treated as a continuation of the previous line. +-# +- +-# Make this a class to avoid recompiling it for each script we scan. 
+-class pats: +- begin=re.compile("###\s+BEGIN\s+INIT\s+INFO") +- end=re.compile("###\s+END\s+INIT\s+INFO") +- desc=re.compile("# Description:\s*(.*)", re.IGNORECASE) +- desc_continue=re.compile("#( +|\t)\s*(.*)") +- keyword=re.compile("# ([^\s:]+):\s*(.*)\s*\Z") +- +-def script_keyword_scan(filename, servicename): +- keywords = {} +- ST_START=0 +- ST_INITINFO=1 +- ST_DESCRIPTION=1 +- description="" +- state=ST_START +- +- try: +- fd = open(filename) +- except IOError: +- return keywords +- +- while 1: +- line = fd.readline() +- if not line: +- break +- +- if state == ST_START: +- if pats.begin.match(line): +- state = ST_INITINFO +- continue +- if pats.end.match(line): +- break +- +- if state == ST_DESCRIPTION: +- match = pats.desc_continue.match(line) +- if match: +- description += ("\n" + match.group(2)) +- continue +- state = ST_INITINFO +- +- match = pats.desc.match(line) +- if match: +- state = ST_DESCRIPTION +- description = match.group(1) +- continue +- +- match = pats.keyword.match(line) +- if match: +- keywords[match.group(1)] = match.group(2) +- +- # Clean up and return +- fd.close() +- if description != "": +- keywords["Description"] = description +- keywords["_PATHNAME_"] = filename +- keywords["_RESOURCENAME_"] = "R_" + sysname + "_" + servicename +- return keywords +- +-def script_dependency_scan(service, script, servicemap): +- keywords=script_keyword_scan(script, service) +- ServiceKeywords[service] = keywords +- +-SysServiceGuesses = { +- '$local_fs': ['boot.localfs'], +- '$network': ['network'], +- '$named': ['named'], +- '$portmap': ['portmap'], +- '$remote_fs': ['nfs'], +- '$syslog': ['syslog'], +- '$netdaemons': ['portmap', 'inetd'], +- '$time': ['ntp'], +-} +- +-# +-# For specific versions of Linux, there are often better ways +-# to do this... 
+-# +-# (e.g., for SUSE Linux, one should look at /etc/insserv.conf file) +-# +-def map_sys_services(servicemap): +- sysservicemap = {} +- for sysserv in SysServiceGuesses.keys(): +- servlist = SysServiceGuesses[sysserv] +- result = [] +- for service in servlist: +- if servicemap.has_key(service): +- result.append(service) +- +- sysservicemap[sysserv] = result +- return sysservicemap +- +-# +-# +-# +-def create_service_dependencies(servicekeywords, systemservicemap): +- dependencies = {} +- for service in servicekeywords.keys(): +- if not dependencies.has_key(service): +- dependencies[service] = {} +- for key in ('Required-Start', 'Should-Start'): +- if not servicekeywords[service].has_key(key): +- continue +- for depserv in servicekeywords[service][key].split(): +- if systemservicemap.has_key(depserv): +- sysserv = systemservicemap[depserv] +- for serv in sysserv: +- dependencies[service][serv] = None +- else: +- if servicekeywords.has_key(depserv): +- dependencies[service][depserv] = None +- if len(dependencies[service]) == 0: +- del dependencies[service] +- return dependencies +- +-# +-# Modify the service name map to include all the mappings from +-# 'Provides' services to real service script names... 
+-# +-def map_script_services(sysservmap, servicekeywords): +- for service in servicekeywords.keys(): +- if not servicekeywords[service].has_key('Provides'): +- continue +- for provided in servicekeywords[service]['Provides'].split(): +- if not sysservmap.has_key(provided): +- sysservmap[provided] = [] +- sysservmap[provided].append(service) +- return sysservmap +- +-def create_cib_update(keywords, depmap): +- services = keywords.keys() +- services.sort() +- result = "" +- # Create the XML for the resources +- result += '\n' +- result += '\n' +- result += '\n' +- result += '\n' +- result += '\n' +- groupname="G_" + sysname + "_localinit" +- result += ' \n' +- for service in services: +- rid = keywords[service]["_RESOURCENAME_"] +- monid = "OPmon_" + sysname + '_' + service +- result += \ +- ' \n' + \ +- ' \n' + \ +- ' \n' +- if not NoMonitor.has_key(service): +- result += \ +- ' \n' +- result += \ +- ' \n' \ +- ' \n' +- result += ' \n' +- result += '\n' +- services = depmap.keys() +- services.sort() +- result += '\n' +- for service in services: +- rid = keywords[service]["_RESOURCENAME_"] +- deps = depmap[service].keys() +- deps.sort() +- for dep in deps: +- if not keywords.has_key(dep): +- continue +- depid = keywords[dep]["_RESOURCENAME_"] +- orderid='O_' + sysname + '_' + service + '_' + dep +- result += ' \n' +- loc_id="Loc_" + sysname + "_localinit" +- rule_id="LocRule_" + sysname + "_localinit" +- expr_id="LocExp_" + sysname + "_localinit" +- +- result += ' \n' +- result += ' \n' +- result += ' \n' +- result += ' \n' +- result += ' \n' +- result += '\n' +- result += '\n' +- result += '\n' +- result += '\n' +- return result +- +- +- +-def remove_a_prereq(service, servicemap, keywords, deps): +- if deps.has_key(service): +- parents = deps[service].keys() +- del deps[service] +- else: +- parents = [] +- if servicemap.has_key(service): +- del servicemap[service] +- if keywords.has_key(service): +- del keywords[service] +- for parent in parents: +- if not 
deps.has_key(parent): +- continue +- remove_a_prereq(parent, servicemap, keywords, deps) +- +- +-def remove_important_prereqs(prereqs, servicemap, keywords, deps): +- # Find everything these important prereqs need and get rid of them... +- for service in prereqs: +- remove_a_prereq(service, servicemap, keywords, deps) +- +-ServiceList = find_ordered_services(os.path.join(InitDir, "rc3.d")) +-register_services(InitDir, ServiceList) +-SysServiceMap = map_sys_services(ServiceMap) +-map_script_services(SysServiceMap, ServiceKeywords) +-ServiceDependencies = create_service_dependencies(ServiceKeywords,SysServiceMap) +-remove_important_prereqs(PreReqs, SysServiceMap, ServiceKeywords, ServiceDependencies) +- +-print create_cib_update(ServiceKeywords, ServiceDependencies) +diff --git a/tools/crm_commands.py.in b/tools/crm_commands.py.in +deleted file mode 100644 +index c48d82c..0000000 +--- a/tools/crm_commands.py.in ++++ /dev/null +@@ -1,132 +0,0 @@ +-# +-# +-# pingd OCF Resource Agent +-# Records (in the CIB) the current number of ping nodes a +-# cluster node can connect to. +-# +-# Copyright (c) 2006 Andrew Beekhof +-# All Rights Reserved. +-# +-# This program is free software; you can redistribute it and/or modify +-# it under the terms of version 2 of the GNU General Public License as +-# published by the Free Software Foundation. +-# +-# This program is distributed in the hope that it would be useful, but +-# WITHOUT ANY WARRANTY; without even the implied warranty of +-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +-# +-# Further, this software is distributed without any warranty that it is +-# free of the rightful claim of any third person regarding infringement +-# or the like. Any license provided herein, whether implied or +-# otherwise, applies only to this software file. Patent licenses, if +-# any, provided herein do not apply to combinations of this program with +-# other software, or any other product whatsoever. 
+-# +-# You should have received a copy of the GNU General Public License +-# along with this program; if not, write the Free Software Foundation, +-# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +-# +-####################################################################### +- +-import crm_utils as utl +- +-class HelpRequest(Exception): +- """Exception raised when a help listing is required.""" +- +-class ReparseRequest(Exception): +- """Exception raised when a command changed the command-line.""" +- +-def up(*args, **cmdoptions): +- l = len(utl.topic_stack) +- if l > 1: +- utl.topic_stack.pop() +- utl.set_topic(utl.topic_stack[-1]) +- else: +- utl.log_debug("Already at the top of the stack") +- +-def toggle_flag(*args, **cmdoptions): +- flag = cmdoptions["flag"] +- if utl.global_opts[flag]: +- utl.global_opts[flag] = 0 +- else: +- utl.global_opts[flag] = 1 +- +- return utl.global_opts[flag] +- +-def cd_(*args, **cmdoptions): +- utl.log_dev("args: %s\nopts: %s" % (repr(args), repr(cmdoptions))) +- if not cmdoptions["topic"]: +- utl.log_err("No topic specified") +- return 1 +- +- if cmdoptions["topic"]: +- utl.set_topic(cmdoptions["topic"]) +- if args: +- raise ReparseRequest() +- if utl.crm_topic not in utl.topic_stack: +- utl.topic_stack.append(cmdoptions["topic"]) +- if not utl.global_opts["interactive"]: +- help(cmdoptions["topic"]) +- return 0 +- +-def exit(*args, **cmdoptions): +- sys.exit(0) +- +-def help(*args, **cmdoptions): +- if args: +- raise HelpRequest(args[0]) +- raise HelpRequest(utl.crm_topic) +- +-def debugstate(*args, **cmdoptions): +- utl.log_info("Global Options: ") +- for opt in utl.global_opts.keys(): +- utl.log_info(" * %s:\t%s" % (opt, utl.global_opts[opt])) +- utl.log_info("Stack: "+repr(utl.topic_stack)) +- utl.log_info("Stack Head: "+utl.crm_topic) +- return 0 +- +-def do_list(*args, **cmdoptions): +- topic = utl.crm_topic +- if cmdoptions.has_key("topic") and cmdoptions["topic"]: +- topic = cmdoptions["topic"] +- +- 
utl.log_debug("Complete '%s' listing" % topic) +- if topic == "resources": +- utl.os_system("crm_resource -l", True) +- elif topic == "nodes": +- lines = utl.os_system("cibadmin -Q -o nodes", False) +- for line in lines: +- if line.find("node ") >= 0: +- print line.rstrip() +- else: +- utl.log_err("%s: Topic %s is not (yet) supported" % ("list", topic)) +- return 1 +- return 0 +- +-def do_status(*args, **cmdoptions): +- topic = utl.crm_topic +- if cmdoptions.has_key("topic") and cmdoptions["topic"]: +- topic = cmdoptions["topic"] +- +- if topic == "resources": +- if not args: +- utl.os_system("crm_resource -L", True) +- for rsc in args: +- utl.os_system("crm_resource -W -r %s"%rsc, True) +- +- elif topic == "nodes": +- lines = utl.os_system("cibadmin -Q -o status", False) +- for line in lines: +- line = line.rstrip() +- utl.log_dev("status line: "+line) +- if line.find("node_state ") >= 0: +- if not args: +- print line +- for node in args: +- if line.find(node) >= 0: +- print line +- else: +- utl.log_err("Topic %s is not (yet) supported" % topic) +- return 1 +- +- return 0 +diff --git a/tools/crm_mon.c b/tools/crm_mon.c +index 0b71275..46a59d6 100644 +--- a/tools/crm_mon.c ++++ b/tools/crm_mon.c +@@ -2715,6 +2715,7 @@ print_status(pe_working_set_t * data_set) + } else { + online_nodes = add_list_element(online_nodes, node_name); + } ++ free(node_name); + continue; + } + } else { +@@ -2727,6 +2728,7 @@ print_status(pe_working_set_t * data_set) + } else { + offline_nodes = add_list_element(offline_nodes, node_name); + } ++ free(node_name); + continue; + } + } +@@ -3078,6 +3080,7 @@ print_html_status(pe_working_set_t * data_set, const char *filename) + fprintf(stream, "\n"); + } + fprintf(stream, "\n"); ++ free(node_name); + } + fprintf(stream, "\n"); + +diff --git a/tools/crm_node.c b/tools/crm_node.c +index c484e17..d0195e3 100644 +--- a/tools/crm_node.c ++++ b/tools/crm_node.c +@@ -470,6 +470,7 @@ try_cman(int command, enum cluster_type_e stack) + + case 'l': + 
case 'p': ++ memset(cman_nodes, 0, MAX_NODES * sizeof(cman_node_t)); + rc = cman_get_nodes(cman_handle, MAX_NODES, &node_count, cman_nodes); + if (rc != 0) { + fprintf(stderr, "Couldn't query cman node list: %d %d", rc, errno); +@@ -489,6 +490,7 @@ try_cman(int command, enum cluster_type_e stack) + break; + + case 'i': ++ memset(&node, 0, sizeof(cman_node_t)); + rc = cman_get_node(cman_handle, CMAN_NODEID_US, &node); + if (rc != 0) { + fprintf(stderr, "Couldn't query cman node id: %d %d", rc, errno); +diff --git a/tools/crm_primitive.py.in b/tools/crm_primitive.py.in +deleted file mode 100644 +index cfe0b5c..0000000 +--- a/tools/crm_primitive.py.in ++++ /dev/null +@@ -1,268 +0,0 @@ +-#!@PYTHON@ +- +-'''Create an XML fragment describing a new resource +-''' +- +-__copyright__=''' +-Author: Andrew Beekhof +-Copyright (C) 2005 Andrew Beekhof +-''' +- +-# +-# This program is free software; you can redistribute it and/or +-# modify it under the terms of the GNU General Public License +-# as published by the Free Software Foundation; either version 2 +-# of the License, or (at your option) any later version. +-# +-# This program is distributed in the hope that it will be useful, +-# but WITHOUT ANY WARRANTY; without even the implied warranty of +-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +-# GNU General Public License for more details. +-# +-# You should have received a copy of the GNU General Public License +-# along with this program; if not, write to the Free Software +-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+- +-import sys,string,os +-import xml.dom.minidom +- +-print_rsc_only = 0 +-rsc_name = None +-rsc_class = None +-rsc_type = None +-rsc_provider = None +-start_timeout = None +-stop_timeout = None +-monitor_interval = None +-monitor_timeout = None +-rsc_options = [] +-rsc_location = [] +-rsc_colocation = [] +- +-def create_cib() : +- doc = xml.dom.minidom.Document() +- cib = doc.createElement("cib") +- doc.appendChild(cib) +- +- configuration = doc.createElement("configuration") +- cib.appendChild(configuration) +- +- #crm_config = doc.createElement("crm_config") +- #configuration.appendChild(crm_config) +- +- resources = doc.createElement("resources") +- configuration.appendChild(resources) +- constraints = doc.createElement("constraints") +- configuration.appendChild(constraints) +- +- return doc, resources, constraints +- +-def cib_resource(doc, id, ra_class, type, provider): +- +- params = None +- +- resource = doc.createElement("primitive") +- +- resource.setAttribute("id", id) +- resource.setAttribute("type", type) +- resource.setAttribute("class", ra_class) +- +- if ra_class == "ocf": +- if not provider: +- provider = "heartbeat" +- resource.setAttribute("provider", provider) +- +- elif ra_class != "lsb" and ra_class != "heartbeat": +- print "Unknown resource class: "+ ra_class +- return None +- +- operations = doc.createElement("operations") +- resource.appendChild(operations) +- +- if monitor_interval != None: +- op = doc.createElement("op") +- operations.appendChild(op) +- op.setAttribute("id", id + "_mon_" + monitor_interval) +- op.setAttribute("name", "monitor") +- op.setAttribute("interval", monitor_interval) +- if monitor_timeout != None: +- op.setAttribute("timeout", monitor_timeout) +- +- if start_timeout != None: +- op = doc.createElement("op") +- operations.appendChild(op) +- op.setAttribute("id", id + "_start") +- op.setAttribute("name", "start") +- op.setAttribute("timeout", start_timeout) +- +- if stop_timeout != None: +- op = 
doc.createElement("op") +- operations.appendChild(op) +- op.setAttribute("id", id + "_stop") +- op.setAttribute("name", "stop") +- op.setAttribute("timeout", stop_timeout) +- +- instance_attributes = doc.createElement("instance_attributes") +- instance_attributes.setAttribute("id", id) +- resource.appendChild(instance_attributes) +- attributes = doc.createElement("attributes") +- instance_attributes.appendChild(attributes) +- for i in range(0,len(rsc_options)) : +- if rsc_options[i] == None : +- continue +- +- param = string.split(rsc_options[i], "=") +- nvpair = doc.createElement("nvpair") +- nvpair.setAttribute("id", id + "_" + param[0]) +- nvpair.setAttribute("name", param[0]) +- nvpair.setAttribute("value", param[1]) +- attributes.appendChild(nvpair) +- +- return resource +- +-def cib_rsc_location(doc, id, node, score): +- rule = doc.createElement("rule") +- rule.setAttribute("id", id+"_prefer_"+node+"_rule") +- rule.setAttribute("score", score) +- expression = doc.createElement("expression") +- expression.setAttribute("id",id+"_prefer_"+node+"_expr") +- expression.setAttribute("attribute","#uname") +- expression.setAttribute("operation","eq") +- expression.setAttribute("value", node) +- rule.appendChild(expression) +- return rule +- +-def cib_rsc_colocation(doc, id, other_resource, score): +- rsc_colocation = doc.createElement("rsc_colocation") +- rsc_colocation.setAttribute("id", id+"_colocate_with_"+other_resource) +- rsc_colocation.setAttribute("from", id) +- rsc_colocation.setAttribute("to", other_resource) +- rsc_colocation.setAttribute("score", score) +- return rsc_colocation +- +-def print_usage(): +- print "usage: " \ +- + sys.argv[0] \ +- + " --name "\ +- + " --class "\ +- + " --type "\ +- + " [--provider ]"\ +- + "\n\t"\ +- + " [--start-timeout ]"\ +- + " [--stop-timeout ]"\ +- + " [--monitor ]"\ +- + " [--monitor-timeout ]"\ +- + "\n\t"\ +- + " [--rsc-option name=value]*"\ +- + " [--rsc-location uname=score]*"\ +- + " [--rsc-colocation 
resource=score]*" +- print "Example:\n\t" + sys.argv[0] \ +- + " --name cluster_ip_1 --type IPaddr --provider heartbeat --class ocf "\ +- + "--rsc-option ip=192.168.1.101 --rsc-location node1=500 | cibadmin -C -p" +- sys.exit(1) +- +-if __name__=="__main__" : +- +- # Process arguments... +- skipthis = None +- args = sys.argv[1:] +- if len(args) == 0: +- print_usage() +- +- for i in range(0, len(args)) : +- if skipthis : +- skipthis = None +- continue +- +- elif args[i] == "--name" : +- skipthis = True +- rsc_name = args[i+1] +- +- elif args[i] == "--class" : +- skipthis = True +- rsc_class = args[i+1] +- +- elif args[i] == "--type" : +- skipthis = True +- rsc_type = args[i+1] +- +- elif args[i] == "--provider" : +- skipthis = True +- rsc_provider = args[i+1] +- +- elif args[i] == "--start-timeout" : +- skipthis = True +- start_timeout = args[i+1] +- +- elif args[i] == "--stop-timeout" : +- skipthis = True +- stop_timeout = args[i+1] +- +- elif args[i] == "--monitor" : +- skipthis = True +- monitor_interval = args[i+1] +- +- elif args[i] == "--monitor-timeout" : +- skipthis = True +- monitor_timeout = args[i+1] +- +- elif args[i] == "--rsc-option" : +- skipthis = True +- params = string.split(args[i+1], "=") +- if params[1] != None: +- rsc_options.append(args[i+1]) +- else: +- print "option '"+args[i+1]+"' must be of the form name=value" +- +- elif args[i] == "--rsc-location" : +- skipthis = True +- params = string.split(args[i+1], "=") +- if params[1] != None: +- rsc_location.append(args[i+1]) +- else: +- print "option '"+args[i+1]+"' must be of the form host=score" +- +- elif args[i] == "--rsc-colocation" : +- skipthis = True +- params = string.split(args[i+1], "=") +- if params[1] != None: +- rsc_colocation.append(args[i+1]) +- else: +- print "option '"+args[i+1]+"' must be of the form resource=score" +- +- elif args[i] == "--rsc-only" : +- print_rsc_only = 1 +- else: +- print "Unknown argument: "+ args[i] +- print_usage() +- +- cib = create_cib() +- pre_line = 
"" +- id_index = 1 +- resource = cib_resource(cib[0], rsc_name, rsc_class, rsc_type, rsc_provider) +- +- if print_rsc_only: +- print resource.toprettyxml() +- sys.exit(0) +- +- cib[1].appendChild(resource) +- +- if rsc_location != None : +- rsc_loc = cib[0].createElement("rsc_location") +- rsc_loc.setAttribute("id", rsc_name+"_preferences") +- rsc_loc.setAttribute("rsc", rsc_name) +- for i in range(0, len(rsc_location)) : +- param = string.split(rsc_location[i], "=") +- location_rule = cib_rsc_location(cib[0], rsc_name, param[0], param[1]) +- rsc_loc.appendChild(location_rule) +- cib[2].appendChild(rsc_loc) +- +- for i in range(0, len(rsc_colocation)) : +- if rsc_location[i] == None : +- continue +- +- param = string.split(rsc_colocation[i], "=") +- colocation_rule = cib_rsc_colocation(cib[0], rsc_name, param[0], param[1]) +- cib[2].appendChild(colocation_rule) +- +- print cib[0].toprettyxml() +diff --git a/tools/crm_resource.c b/tools/crm_resource.c +index 31136ef..2fce3b7 100644 +--- a/tools/crm_resource.c ++++ b/tools/crm_resource.c +@@ -853,6 +853,7 @@ main(int argc, char **argv) + rc = -ENXIO; + goto bail; + } ++ + rc = cli_resource_print_attribute(rsc_id, prop_name, &data_set); + + } else if (rsc_cmd == 'p') { +@@ -883,6 +884,10 @@ main(int argc, char **argv) + } else if (rsc_cmd == 'C' && rsc_id) { + resource_t *rsc = pe_find_resource(data_set.resources, rsc_id); + ++ if(do_force == FALSE) { ++ rsc = uber_parent(rsc); ++ } ++ + crm_debug("Re-checking the state of %s on %s", rsc_id, host_uname); + if(rsc) { + crmd_replies_needed = 0; +@@ -891,6 +896,11 @@ main(int argc, char **argv) + rc = -ENODEV; + } + ++ if(rc == pcmk_ok && BE_QUIET == FALSE) { ++ /* Now check XML_RSC_ATTR_TARGET_ROLE and XML_RSC_ATTR_MANAGED */ ++ cli_resource_check(cib_conn, rsc); ++ } ++ + if (rc == pcmk_ok) { + start_mainloop(); + } +diff --git a/tools/crm_resource.h b/tools/crm_resource.h +index 49b6138..5a206e0 100644 +--- a/tools/crm_resource.h ++++ b/tools/crm_resource.h +@@ -68,6 
+68,7 @@ int cli_resource_print_property(const char *rsc, const char *attr, pe_working_se + int cli_resource_print_operations(const char *rsc_id, const char *host_uname, bool active, pe_working_set_t * data_set); + + /* runtime */ ++void cli_resource_check(cib_t * cib, resource_t *rsc); + int cli_resource_fail(crm_ipc_t * crmd_channel, const char *host_uname, const char *rsc_id, pe_working_set_t * data_set); + int cli_resource_search(const char *rsc, pe_working_set_t * data_set); + int cli_resource_delete(cib_t *cib_conn, crm_ipc_t * crmd_channel, const char *host_uname, resource_t * rsc, pe_working_set_t * data_set); +diff --git a/tools/crm_resource_print.c b/tools/crm_resource_print.c +index 9c3711c..946b9e3 100644 +--- a/tools/crm_resource_print.c ++++ b/tools/crm_resource_print.c +@@ -352,8 +352,11 @@ cli_resource_print_attribute(const char *rsc, const char *attr, pe_working_set_t + + if (safe_str_eq(attr_set_type, XML_TAG_ATTR_SETS)) { + get_rsc_attributes(params, the_rsc, current, data_set); ++ + } else if (safe_str_eq(attr_set_type, XML_TAG_META_SETS)) { ++ /* No need to redirect to the parent */ + get_meta_attributes(params, the_rsc, current, data_set); ++ + } else { + unpack_instance_attributes(data_set->input, the_rsc->xml, XML_TAG_UTILIZATION, NULL, + params, NULL, FALSE, data_set->now); +diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c +index 006ec08..a270cbf 100644 +--- a/tools/crm_resource_runtime.c ++++ b/tools/crm_resource_runtime.c +@@ -198,6 +198,7 @@ cli_resource_update_attribute(const char *rsc_id, const char *attr_set, const ch + int rc = pcmk_ok; + static bool need_init = TRUE; + ++ char *lookup_id = NULL; + char *local_attr_id = NULL; + char *local_attr_set = NULL; + +@@ -212,14 +213,39 @@ cli_resource_update_attribute(const char *rsc_id, const char *attr_set, const ch + } + + if (safe_str_eq(attr_set_type, XML_TAG_ATTR_SETS)) { +- rc = find_resource_attr(cib, XML_ATTR_ID, rsc_id, XML_TAG_META_SETS, attr_set, attr_id, 
++ rc = find_resource_attr(cib, XML_ATTR_ID, uber_parent(rsc)->id, XML_TAG_META_SETS, attr_set, attr_id, + attr_name, &local_attr_id); +- if (rc == pcmk_ok) { +- printf("WARNING: There is already a meta attribute called %s (id=%s)\n", attr_name, +- local_attr_id); ++ if(rc == pcmk_ok && do_force == FALSE) { ++ if (BE_QUIET == FALSE) { ++ printf("WARNING: There is already a meta attribute for '%s' called '%s' (id=%s)\n", ++ uber_parent(rsc)->id, attr_name, local_attr_id); ++ printf(" Delete '%s' first or use --force to override\n", local_attr_id); ++ } ++ return -ENOTUNIQ; ++ } ++ ++ } else if(rsc->parent) { ++ ++ switch(rsc->parent->variant) { ++ case pe_group: ++ if (BE_QUIET == FALSE) { ++ printf("Updating '%s' for '%s' will not apply to its peers in '%s'\n", attr_name, rsc_id, rsc->parent->id); ++ } ++ break; ++ case pe_master: ++ case pe_clone: ++ rsc = rsc->parent; ++ if (BE_QUIET == FALSE) { ++ printf("Updating '%s' for '%s'...\n", rsc->id, rsc_id); ++ } ++ break; ++ default: ++ break; + } + } +- rc = find_resource_attr(cib, XML_ATTR_ID, rsc_id, attr_set_type, attr_set, attr_id, attr_name, ++ ++ lookup_id = clone_strip(rsc->id); /* Could be a cloned group! 
*/ ++ rc = find_resource_attr(cib, XML_ATTR_ID, lookup_id, attr_set_type, attr_set, attr_id, attr_name, + &local_attr_id); + + if (rc == pcmk_ok) { +@@ -227,6 +253,7 @@ cli_resource_update_attribute(const char *rsc_id, const char *attr_set, const ch + attr_id = local_attr_id; + + } else if (rc != -ENXIO) { ++ free(lookup_id); + free(local_attr_id); + return rc; + +@@ -250,7 +277,7 @@ cli_resource_update_attribute(const char *rsc_id, const char *attr_set, const ch + free_xml(cib_top); + + if (attr_set == NULL) { +- local_attr_set = crm_concat(rsc_id, attr_set_type, '-'); ++ local_attr_set = crm_concat(lookup_id, attr_set_type, '-'); + attr_set = local_attr_set; + } + if (attr_id == NULL) { +@@ -263,7 +290,7 @@ cli_resource_update_attribute(const char *rsc_id, const char *attr_set, const ch + } + + xml_top = create_xml_node(NULL, tag); +- crm_xml_add(xml_top, XML_ATTR_ID, rsc_id); ++ crm_xml_add(xml_top, XML_ATTR_ID, lookup_id); + + xml_obj = create_xml_node(xml_top, attr_set_type); + crm_xml_add(xml_obj, XML_ATTR_ID, attr_set); +@@ -285,7 +312,15 @@ cli_resource_update_attribute(const char *rsc_id, const char *attr_set, const ch + crm_log_xml_debug(xml_top, "Update"); + + rc = cib->cmds->modify(cib, XML_CIB_TAG_RESOURCES, xml_top, cib_options); ++ if (rc == pcmk_ok && BE_QUIET == FALSE) { ++ printf("Set '%s' option: id=%s%s%s%s%s=%s\n", lookup_id, local_attr_id, ++ attr_set ? " set=" : "", attr_set ? attr_set : "", ++ attr_name ? " name=" : "", attr_name ? 
attr_name : "", attr_value); ++ } ++ + free_xml(xml_top); ++ ++ free(lookup_id); + free(local_attr_id); + free(local_attr_set); + +@@ -330,6 +365,7 @@ cli_resource_delete_attribute(const char *rsc_id, const char *attr_set, const ch + xmlNode *xml_obj = NULL; + + int rc = pcmk_ok; ++ char *lookup_id = NULL; + char *local_attr_id = NULL; + resource_t *rsc = find_rsc_or_clone(rsc_id, data_set); + +@@ -337,7 +373,29 @@ cli_resource_delete_attribute(const char *rsc_id, const char *attr_set, const ch + return -ENXIO; + } + +- rc = find_resource_attr(cib, XML_ATTR_ID, rsc_id, attr_set_type, attr_set, attr_id, attr_name, ++ if(rsc->parent && safe_str_eq(attr_set_type, XML_TAG_META_SETS)) { ++ ++ switch(rsc->parent->variant) { ++ case pe_group: ++ if (BE_QUIET == FALSE) { ++ printf("Removing '%s' for '%s' will not apply to its peers in '%s'\n", attr_name, rsc_id, rsc->parent->id); ++ } ++ break; ++ case pe_master: ++ case pe_clone: ++ rsc = rsc->parent; ++ if (BE_QUIET == FALSE) { ++ printf("Removing '%s' from '%s' for '%s'...\n", attr_name, rsc->id, rsc_id); ++ } ++ break; ++ default: ++ break; ++ } ++ ++ } ++ ++ lookup_id = clone_strip(rsc->id); ++ rc = find_resource_attr(cib, XML_ATTR_ID, lookup_id, attr_set_type, attr_set, attr_id, attr_name, + &local_attr_id); + + if (rc == -ENXIO) { +@@ -360,8 +418,8 @@ cli_resource_delete_attribute(const char *rsc_id, const char *attr_set, const ch + CRM_ASSERT(cib); + rc = cib->cmds->delete(cib, XML_CIB_TAG_RESOURCES, xml_obj, cib_options); + +- if (rc == pcmk_ok) { +- printf("Deleted %s option: id=%s%s%s%s%s\n", rsc_id, local_attr_id, ++ if (rc == pcmk_ok && BE_QUIET == FALSE) { ++ printf("Deleted '%s' option: id=%s%s%s%s%s\n", lookup_id, local_attr_id, + attr_set ? " set=" : "", attr_set ? attr_set : "", + attr_name ? " name=" : "", attr_name ? 
attr_name : ""); + } +@@ -493,7 +551,10 @@ cli_resource_delete(cib_t *cib_conn, crm_ipc_t * crmd_channel, const char *host_ + for (lpc = rsc->children; lpc != NULL; lpc = lpc->next) { + resource_t *child = (resource_t *) lpc->data; + +- cli_resource_delete(cib_conn, crmd_channel, host_uname, child, data_set); ++ rc = cli_resource_delete(cib_conn, crmd_channel, host_uname, child, data_set); ++ if(rc != pcmk_ok || is_not_set(rsc->flags, pe_rsc_unique)) { ++ return rc; ++ } + } + return pcmk_ok; + +@@ -514,31 +575,78 @@ cli_resource_delete(cib_t *cib_conn, crm_ipc_t * crmd_channel, const char *host_ + node = pe_find_node(data_set->nodes, host_uname); + + if (node && node->details->rsc_discovery_enabled) { +- printf("Cleaning up %s on %s\n", rsc->id, host_uname); ++ printf("Cleaning up %s on %s", rsc->id, host_uname); + rc = send_lrm_rsc_op(crmd_channel, CRM_OP_LRM_DELETE, host_uname, rsc->id, TRUE, data_set); + } else { + printf("Resource discovery disabled on %s. Unable to delete lrm state.\n", host_uname); ++ rc = -EOPNOTSUPP; + } + + if (rc == pcmk_ok) { + char *attr_name = NULL; +- const char *id = rsc->id; + + if(node && node->details->remote_rsc == NULL && node->details->rsc_discovery_enabled) { + crmd_replies_needed++; + } +- if (rsc->clone_name) { +- id = rsc->clone_name; ++ ++ if(is_not_set(rsc->flags, pe_rsc_unique)) { ++ char *id = clone_strip(rsc->id); ++ attr_name = crm_strdup_printf("fail-count-%s", id); ++ free(id); ++ ++ } else if (rsc->clone_name) { ++ attr_name = crm_strdup_printf("fail-count-%s", rsc->clone_name); ++ ++ } else { ++ attr_name = crm_strdup_printf("fail-count-%s", rsc->id); + } + +- attr_name = crm_concat("fail-count", id, '-'); ++ printf(", removing %s\n", attr_name); + rc = attrd_update_delegate(NULL, 'D', host_uname, attr_name, NULL, XML_CIB_TAG_STATUS, NULL, + NULL, NULL, node ? 
is_remote_node(node) : FALSE); + free(attr_name); ++ ++ } else if(rc != -EOPNOTSUPP) { ++ printf(" - FAILED\n"); + } ++ + return rc; + } + ++void ++cli_resource_check(cib_t * cib_conn, resource_t *rsc) ++{ ++ ++ char *role_s = NULL; ++ char *managed = NULL; ++ resource_t *parent = uber_parent(rsc); ++ ++ find_resource_attr(cib_conn, XML_ATTR_ID, parent->id, ++ XML_TAG_META_SETS, NULL, NULL, XML_RSC_ATTR_MANAGED, &managed); ++ ++ find_resource_attr(cib_conn, XML_ATTR_ID, parent->id, ++ XML_TAG_META_SETS, NULL, NULL, XML_RSC_ATTR_TARGET_ROLE, &role_s); ++ ++ if(managed == NULL) { ++ managed = strdup("1"); ++ } ++ if(crm_is_true(managed) == FALSE) { ++ printf("\n\t*Resource %s is configured to not be managed by the cluster\n", parent->id); ++ } ++ if(role_s) { ++ enum rsc_role_e role = text2role(role_s); ++ if(role == RSC_ROLE_UNKNOWN) { ++ // Treated as if unset ++ ++ } else if(role == RSC_ROLE_STOPPED) { ++ printf("\n\t* The configuration specifies that '%s' should remain stopped\n", parent->id); ++ ++ } else if(parent->variant > pe_clone && role != RSC_ROLE_MASTER) { ++ printf("\n\t* The configuration specifies that '%s' should not be promoted\n", parent->id); ++ } ++ } ++} ++ + int + cli_resource_fail(crm_ipc_t * crmd_channel, const char *host_uname, + const char *rsc_id, pe_working_set_t * data_set) +diff --git a/tools/crm_simulate.c b/tools/crm_simulate.c +index 0051112..7d0a8eb 100644 +--- a/tools/crm_simulate.c ++++ b/tools/crm_simulate.c +@@ -59,8 +59,11 @@ char *use_date = NULL; + static void + get_date(pe_working_set_t * data_set) + { ++ int value = 0; + time_t original_date = 0; +- crm_element_value_int(data_set->input, "execution-date", (int*)&original_date); ++ ++ crm_element_value_int(data_set->input, "execution-date", &value); ++ original_date = value; + + if (use_date) { + data_set->now = crm_time_new(use_date); +diff --git a/tools/crm_utils.py.in b/tools/crm_utils.py.in +deleted file mode 100644 +index 67d6918..0000000 +--- a/tools/crm_utils.py.in 
++++ /dev/null +@@ -1,188 +0,0 @@ +-#!/bin/env python +-# +-# +-# pingd OCF Resource Agent +-# Records (in the CIB) the current number of ping nodes a +-# cluster node can connect to. +-# +-# Copyright (c) 2006 Andrew Beekhof +-# All Rights Reserved. +-# +-# This program is free software; you can redistribute it and/or modify +-# it under the terms of version 2 of the GNU General Public License as +-# published by the Free Software Foundation. +-# +-# This program is distributed in the hope that it would be useful, but +-# WITHOUT ANY WARRANTY; without even the implied warranty of +-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +-# +-# Further, this software is distributed without any warranty that it is +-# free of the rightful claim of any third person regarding infringement +-# or the like. Any license provided herein, whether implied or +-# otherwise, applies only to this software file. Patent licenses, if +-# any, provided herein do not apply to combinations of this program with +-# other software, or any other product whatsoever. +-# +-# You should have received a copy of the GNU General Public License +-# along with this program; if not, write the Free Software Foundation, +-# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +-# +-####################################################################### +- +-import os +-import sys +-import getopt +-import readline +-import traceback +-from popen2 import Popen3 +- +-crm_topic = "crm" +-topic_stack = [ crm_topic ] +-hist_file = os.environ.get('HOME')+"/.crm_history" +-global_opts = {} +- +-def exit_(code=0): +- if global_opts["interactive"]: +- log_info("Exiting... 
") +- try: +- readline.write_history_file(hist_file) +- log_debug("Wrote history to: "+hist_file) +- except: +- log_debug("Couldnt write history to: "+hist_file) +- sys.exit(code) +- +-def log_debug(log): +- if global_opts.has_key("debug") and global_opts["debug"]: +- print log +- +-def log_dev(log): +- if global_opts.has_key("devlog") and global_opts["devlog"]: +- print log +- +-def log_info(log): +- print log +- +-def log_err(log): +- print "ERROR: "+log +- +-def set_topic(name): +- global crm_topic +- if crm_topic != name: +- log_dev("topic: %s->%s" % (crm_topic, name)) +- crm_topic = name +- +-def os_system(cmd, print_raw=False): +- log_debug("Performing command: "+cmd) +- p = Popen3(cmd, None) +- p.tochild.close() +- result = p.fromchild.readlines() +- p.fromchild.close() +- p.wait() +- if print_raw: +- for line in result: +- print line.rstrip() +- return result +- +-# +-# Creates an argv-style array (that preserves quoting) for use in shell-mode +-# +-def create_argv(text): +- args = [] +- word = [] +- index = 0 +- total = len(text) +- +- in_word = False +- in_verbatum = False +- +- while index < total: +- finish_word = False +- append_word = False +- #log_debug("processing: "+text[index]) +- if text[index] == '\\': +- index = index +1 +- append_word = True +- +- elif text[index].isspace(): +- if in_verbatum or in_word: +- append_word = True +- else: +- finish_word = True +- +- elif text[index] == '"': +- if in_verbatum: +- append_word = True +- else: +- finish_word = True +- if in_word: +- in_word = False +- else: +- in_word = True +- +- elif text[index] == '\'': +- finish_word = True +- if in_verbatum: +- in_verbatum = False +- else: +- in_verbatum = True +- else: +- append_word = True +- +- if finish_word: +- if word: +- args.append(''.join(word)) +- word = [] +- elif append_word: +- word.append(text[index]) +- #log_debug("Added %s to word: %s" % (text[index], str(word))) +- +- index = index +1 +- +- if in_verbatum or in_word: +- text="" +- if word: +- 
text=" after: '%s'"%''.join(word) +- raise QuotingError("Un-matched quoting%s"%text, args) +- +- elif word: +- args.append(''.join(word)) +- +- return args +- +-def init_readline(func): +- readline.set_completer(func) +- readline.parse_and_bind("tab: complete") +- readline.set_history_length(100) +- +- try: +- readline.read_history_file(hist_file) +- except: +- pass +- +-def fancyopts(args, options, state): +- long = [] +- short = '' +- map = {} +- dt = {} +- +- for s, l, d, c in options: +- pl = l.replace('-', '_') +- map['-'+s] = map['--'+l] = pl +- state[pl] = d +- dt[pl] = type(d) +- if not d is None and not callable(d): +- if s: s += ':' +- if l: l += '=' +- if s: short = short + s +- if l: long.append(l) +- +- opts, args = getopt.getopt(args, short, long) +- +- for opt, arg in opts: +- if dt[map[opt]] is type(fancyopts): state[map[opt]](state,map[opt],arg) +- elif dt[map[opt]] is type(1): state[map[opt]] = int(arg) +- elif dt[map[opt]] is type(''): state[map[opt]] = arg +- elif dt[map[opt]] is type([]): state[map[opt]].append(arg) +- elif dt[map[opt]] is type(None): state[map[opt]] = 1 +- +- return args +diff --git a/tools/regression.acls.exp b/tools/regression.acls.exp +index ae6735a..ac7ae0c 100644 +--- a/tools/regression.acls.exp ++++ b/tools/regression.acls.exp +@@ -253,10 +253,10 @@ Error performing operation: Permission denied + =#=#=#= End test: unknownguy: Set stonith-enabled - Permission denied (13) =#=#=#= + * Passed: crm_attribute - unknownguy: Set stonith-enabled + =#=#=#= Begin test: unknownguy: Create a resource =#=#=#= +-__xml_acl_check: Ordinary user unknownguy cannot access the CIB without any defined ACLs +-__xml_acl_check: Ordinary user unknownguy cannot access the CIB without any defined ACLs +-__xml_acl_check: Ordinary user unknownguy cannot access the CIB without any defined ACLs +-__xml_acl_check: Ordinary user unknownguy cannot access the CIB without any defined ACLs ++__xml_acl_check: Ordinary user unknownguy cannot access the CIB 
without any defined ACLs ++__xml_acl_check: Ordinary user unknownguy cannot access the CIB without any defined ACLs ++__xml_acl_check: Ordinary user unknownguy cannot access the CIB without any defined ACLs ++__xml_acl_check: Ordinary user unknownguy cannot access the CIB without any defined ACLs + Call failed: Permission denied + =#=#=#= End test: unknownguy: Create a resource - Permission denied (13) =#=#=#= + * Passed: cibadmin - unknownguy: Create a resource +@@ -273,8 +273,8 @@ Error performing operation: Permission denied + =#=#=#= End test: l33t-haxor: Set stonith-enabled - Permission denied (13) =#=#=#= + * Passed: crm_attribute - l33t-haxor: Set stonith-enabled + =#=#=#= Begin test: l33t-haxor: Create a resource =#=#=#= +-__xml_acl_check: 400 access denied to /cib/configuration/resources/primitive[@id='dummy']: parent +-__xml_acl_post_process: Cannot add new node primitive at /cib/configuration/resources/primitive[@id='dummy'] ++__xml_acl_check: 400 access denied to /cib/configuration/resources/primitive[@id='dummy']: parent ++__xml_acl_post_process: Cannot add new node primitive at /cib/configuration/resources/primitive[@id='dummy'] + Call failed: Permission denied + =#=#=#= End test: l33t-haxor: Create a resource - Permission denied (13) =#=#=#= + * Passed: cibadmin - l33t-haxor: Create a resource +@@ -323,13 +323,13 @@ Call failed: Permission denied + =#=#=#= End test: niceguy: Query configuration - OK (0) =#=#=#= + * Passed: cibadmin - niceguy: Query configuration + =#=#=#= Begin test: niceguy: Set enable-acl =#=#=#= +-__xml_acl_check: 400 access denied to /cib/configuration/crm_config/cluster_property_set[@id='cib-bootstrap-options']/nvpair[@id='cib-bootstrap-options-enable-acl'][@value]: default ++__xml_acl_check: 400 access denied to /cib/configuration/crm_config/cluster_property_set[@id='cib-bootstrap-options']/nvpair[@id='cib-bootstrap-options-enable-acl'][@value]: default + Error performing operation: Permission denied + Error setting 
enable-acl=false (section=crm_config, set=): Permission denied + =#=#=#= End test: niceguy: Set enable-acl - Permission denied (13) =#=#=#= + * Passed: crm_attribute - niceguy: Set enable-acl + =#=#=#= Begin test: niceguy: Set stonith-enabled =#=#=#= +-__xml_acl_post_process: Creation of nvpair=cib-bootstrap-options-stonith-enabled is allowed ++__xml_acl_post_process: Creation of nvpair=cib-bootstrap-options-stonith-enabled is allowed + =#=#=#= Current cib after: niceguy: Set stonith-enabled =#=#=#= + + +@@ -376,8 +376,8 @@ __xml_acl_post_process: Creation of nvpair=cib-bootstrap-options-stonith-enable + =#=#=#= End test: niceguy: Set stonith-enabled - OK (0) =#=#=#= + * Passed: crm_attribute - niceguy: Set stonith-enabled + =#=#=#= Begin test: niceguy: Create a resource =#=#=#= +-__xml_acl_check: 400 access denied to /cib/configuration/resources/primitive[@id='dummy']: default +-__xml_acl_post_process: Cannot add new node primitive at /cib/configuration/resources/primitive[@id='dummy'] ++__xml_acl_check: 400 access denied to /cib/configuration/resources/primitive[@id='dummy']: default ++__xml_acl_post_process: Cannot add new node primitive at /cib/configuration/resources/primitive[@id='dummy'] + Call failed: Permission denied + =#=#=#= End test: niceguy: Create a resource - Permission denied (13) =#=#=#= + * Passed: cibadmin - niceguy: Create a resource +@@ -533,10 +533,11 @@ Error performing operation: Permission denied + =#=#=#= End test: l33t-haxor: Remove a resource meta attribute - Permission denied (13) =#=#=#= + * Passed: crm_resource - l33t-haxor: Remove a resource meta attribute + =#=#=#= Begin test: niceguy: Create a resource meta attribute =#=#=#= +-error: unpack_resources: Resource start-up disabled since no STONITH resources have been defined +-error: unpack_resources: Either configure some or disable STONITH with the stonith-enabled option +-error: unpack_resources: NOTE: Clusters with shared data need STONITH to ensure data integrity 
+-__xml_acl_post_process: Creation of nvpair=dummy-meta_attributes-target-role is allowed ++error: unpack_resources: Resource start-up disabled since no STONITH resources have been defined ++error: unpack_resources: Either configure some or disable STONITH with the stonith-enabled option ++error: unpack_resources: NOTE: Clusters with shared data need STONITH to ensure data integrity ++__xml_acl_post_process: Creation of nvpair=dummy-meta_attributes-target-role is allowed ++Set 'dummy' option: id=dummy-meta_attributes-target-role set=dummy-meta_attributes name=target-role=Stopped + =#=#=#= Current cib after: niceguy: Create a resource meta attribute =#=#=#= + + +@@ -589,9 +590,9 @@ __xml_acl_post_process: Creation of nvpair=dummy-meta_attributes-target-role is + =#=#=#= End test: niceguy: Create a resource meta attribute - OK (0) =#=#=#= + * Passed: crm_resource - niceguy: Create a resource meta attribute + =#=#=#= Begin test: niceguy: Query a resource meta attribute =#=#=#= +-error: unpack_resources: Resource start-up disabled since no STONITH resources have been defined +-error: unpack_resources: Either configure some or disable STONITH with the stonith-enabled option +-error: unpack_resources: NOTE: Clusters with shared data need STONITH to ensure data integrity ++error: unpack_resources: Resource start-up disabled since no STONITH resources have been defined ++error: unpack_resources: Either configure some or disable STONITH with the stonith-enabled option ++error: unpack_resources: NOTE: Clusters with shared data need STONITH to ensure data integrity + Stopped + =#=#=#= Current cib after: niceguy: Query a resource meta attribute =#=#=#= + +@@ -645,10 +646,10 @@ Stopped + =#=#=#= End test: niceguy: Query a resource meta attribute - OK (0) =#=#=#= + * Passed: crm_resource - niceguy: Query a resource meta attribute + =#=#=#= Begin test: niceguy: Remove a resource meta attribute =#=#=#= +-error: unpack_resources: Resource start-up disabled since no STONITH 
resources have been defined +-error: unpack_resources: Either configure some or disable STONITH with the stonith-enabled option +-error: unpack_resources: NOTE: Clusters with shared data need STONITH to ensure data integrity +-Deleted dummy option: id=dummy-meta_attributes-target-role name=target-role ++error: unpack_resources: Resource start-up disabled since no STONITH resources have been defined ++error: unpack_resources: Either configure some or disable STONITH with the stonith-enabled option ++error: unpack_resources: NOTE: Clusters with shared data need STONITH to ensure data integrity ++Deleted 'dummy' option: id=dummy-meta_attributes-target-role name=target-role + =#=#=#= Current cib after: niceguy: Remove a resource meta attribute =#=#=#= + + +@@ -699,10 +700,11 @@ Deleted dummy option: id=dummy-meta_attributes-target-role name=target-role + =#=#=#= End test: niceguy: Remove a resource meta attribute - OK (0) =#=#=#= + * Passed: crm_resource - niceguy: Remove a resource meta attribute + =#=#=#= Begin test: niceguy: Create a resource meta attribute =#=#=#= +-error: unpack_resources: Resource start-up disabled since no STONITH resources have been defined +-error: unpack_resources: Either configure some or disable STONITH with the stonith-enabled option +-error: unpack_resources: NOTE: Clusters with shared data need STONITH to ensure data integrity +-__xml_acl_post_process: Creation of nvpair=dummy-meta_attributes-target-role is allowed ++error: unpack_resources: Resource start-up disabled since no STONITH resources have been defined ++error: unpack_resources: Either configure some or disable STONITH with the stonith-enabled option ++error: unpack_resources: NOTE: Clusters with shared data need STONITH to ensure data integrity ++__xml_acl_post_process: Creation of nvpair=dummy-meta_attributes-target-role is allowed ++Set 'dummy' option: id=dummy-meta_attributes-target-role set=dummy-meta_attributes name=target-role=Started + =#=#=#= Current cib after: 
niceguy: Create a resource meta attribute =#=#=#= + + +@@ -804,8 +806,8 @@ __xml_acl_post_process: Creation of nvpair=dummy-meta_attributes-target-role is + + + =#=#=#= Begin test: niceguy: Replace - remove acls =#=#=#= +-__xml_acl_check: 400 access denied to /cib[@epoch]: default +-__xml_acl_check: 400 access denied to /cib/configuration/acls: default ++__xml_acl_check: 400 access denied to /cib[@epoch]: default ++__xml_acl_check: 400 access denied to /cib/configuration/acls: default + Call failed: Permission denied + =#=#=#= End test: niceguy: Replace - remove acls - Permission denied (13) =#=#=#= + * Passed: cibadmin - niceguy: Replace - remove acls +@@ -859,9 +861,9 @@ Call failed: Permission denied + + + =#=#=#= Begin test: niceguy: Replace - create resource =#=#=#= +-__xml_acl_check: 400 access denied to /cib[@epoch]: default +-__xml_acl_check: 400 access denied to /cib/configuration/resources/primitive[@id='dummy2']: default +-__xml_acl_post_process: Cannot add new node primitive at /cib/configuration/resources/primitive[@id='dummy2'] ++__xml_acl_check: 400 access denied to /cib[@epoch]: default ++__xml_acl_check: 400 access denied to /cib/configuration/resources/primitive[@id='dummy2']: default ++__xml_acl_post_process: Cannot add new node primitive at /cib/configuration/resources/primitive[@id='dummy2'] + Call failed: Permission denied + =#=#=#= End test: niceguy: Replace - create resource - Permission denied (13) =#=#=#= + * Passed: cibadmin - niceguy: Replace - create resource +@@ -914,8 +916,8 @@ Call failed: Permission denied + + + =#=#=#= Begin test: niceguy: Replace - modify attribute (deny) =#=#=#= +-__xml_acl_check: 400 access denied to /cib[@epoch]: default +-__xml_acl_check: 400 access denied to /cib/configuration/crm_config/cluster_property_set[@id='cib-bootstrap-options']/nvpair[@id='cib-bootstrap-options-enable-acl'][@value]: default ++__xml_acl_check: 400 access denied to /cib[@epoch]: default ++__xml_acl_check: 400 access denied to 
/cib/configuration/crm_config/cluster_property_set[@id='cib-bootstrap-options']/nvpair[@id='cib-bootstrap-options-enable-acl'][@value]: default + Call failed: Permission denied + =#=#=#= End test: niceguy: Replace - modify attribute (deny) - Permission denied (13) =#=#=#= + * Passed: cibadmin - niceguy: Replace - modify attribute (deny) +@@ -968,8 +970,8 @@ Call failed: Permission denied + + + =#=#=#= Begin test: niceguy: Replace - delete attribute (deny) =#=#=#= +-__xml_acl_check: 400 access denied to /cib[@epoch]: default +-__xml_acl_check: 400 access denied to /cib/configuration/crm_config/cluster_property_set[@id='cib-bootstrap-options']/nvpair[@id='cib-bootstrap-options-enable-acl']: default ++__xml_acl_check: 400 access denied to /cib[@epoch]: default ++__xml_acl_check: 400 access denied to /cib/configuration/crm_config/cluster_property_set[@id='cib-bootstrap-options']/nvpair[@id='cib-bootstrap-options-enable-acl']: default + Call failed: Permission denied + =#=#=#= End test: niceguy: Replace - delete attribute (deny) - Permission denied (13) =#=#=#= + * Passed: cibadmin - niceguy: Replace - delete attribute (deny) +@@ -1022,8 +1024,8 @@ Call failed: Permission denied + + + =#=#=#= Begin test: niceguy: Replace - create attribute (deny) =#=#=#= +-__xml_acl_check: 400 access denied to /cib[@epoch]: default +-__xml_acl_check: 400 access denied to /cib/configuration/resources/primitive[@id='dummy'][@description]: default ++__xml_acl_check: 400 access denied to /cib[@epoch]: default ++__xml_acl_check: 400 access denied to /cib/configuration/resources/primitive[@id='dummy'][@description]: default + Call failed: Permission denied + =#=#=#= End test: niceguy: Replace - create attribute (deny) - Permission denied (13) =#=#=#= + * Passed: cibadmin - niceguy: Replace - create attribute (deny) +@@ -1180,28 +1182,28 @@ Call failed: Permission denied + + !#!#!#!#! Upgrading to pacemaker-2.0 and retesting !#!#!#!#! 
+ =#=#=#= Begin test: root: Upgrade to pacemaker-2.0 =#=#=#= +-__xml_acl_post_process: Creation of acl_permission=observer-read-1 is allowed +-__xml_acl_post_process: Creation of acl_permission=observer-write-1 is allowed +-__xml_acl_post_process: Creation of acl_permission=observer-write-2 is allowed +-__xml_acl_post_process: Creation of acl_permission=admin-read-1 is allowed +-__xml_acl_post_process: Creation of acl_permission=admin-write-1 is allowed +-__xml_acl_post_process: Creation of acl_target=l33t-haxor is allowed +-__xml_acl_post_process: Creation of role=auto-l33t-haxor is allowed +-__xml_acl_post_process: Creation of acl_role=auto-l33t-haxor is allowed +-__xml_acl_post_process: Creation of acl_permission=crook-nothing is allowed +-__xml_acl_post_process: Creation of acl_target=niceguy is allowed +-__xml_acl_post_process: Creation of role=observer is allowed +-__xml_acl_post_process: Creation of acl_target=bob is allowed +-__xml_acl_post_process: Creation of role=admin is allowed +-__xml_acl_post_process: Creation of acl_target=badidea is allowed +-__xml_acl_post_process: Creation of role=auto-badidea is allowed +-__xml_acl_post_process: Creation of acl_role=auto-badidea is allowed +-__xml_acl_post_process: Creation of acl_permission=badidea-resources is allowed +-__xml_acl_post_process: Creation of acl_target=betteridea is allowed +-__xml_acl_post_process: Creation of role=auto-betteridea is allowed +-__xml_acl_post_process: Creation of acl_role=auto-betteridea is allowed +-__xml_acl_post_process: Creation of acl_permission=betteridea-nothing is allowed +-__xml_acl_post_process: Creation of acl_permission=betteridea-resources is allowed ++__xml_acl_post_process: Creation of acl_permission=observer-read-1 is allowed ++__xml_acl_post_process: Creation of acl_permission=observer-write-1 is allowed ++__xml_acl_post_process: Creation of acl_permission=observer-write-2 is allowed ++__xml_acl_post_process: Creation of acl_permission=admin-read-1 is allowed 
++__xml_acl_post_process: Creation of acl_permission=admin-write-1 is allowed ++__xml_acl_post_process: Creation of acl_target=l33t-haxor is allowed ++__xml_acl_post_process: Creation of role=auto-l33t-haxor is allowed ++__xml_acl_post_process: Creation of acl_role=auto-l33t-haxor is allowed ++__xml_acl_post_process: Creation of acl_permission=crook-nothing is allowed ++__xml_acl_post_process: Creation of acl_target=niceguy is allowed ++__xml_acl_post_process: Creation of role=observer is allowed ++__xml_acl_post_process: Creation of acl_target=bob is allowed ++__xml_acl_post_process: Creation of role=admin is allowed ++__xml_acl_post_process: Creation of acl_target=badidea is allowed ++__xml_acl_post_process: Creation of role=auto-badidea is allowed ++__xml_acl_post_process: Creation of acl_role=auto-badidea is allowed ++__xml_acl_post_process: Creation of acl_permission=badidea-resources is allowed ++__xml_acl_post_process: Creation of acl_target=betteridea is allowed ++__xml_acl_post_process: Creation of role=auto-betteridea is allowed ++__xml_acl_post_process: Creation of acl_role=auto-betteridea is allowed ++__xml_acl_post_process: Creation of acl_permission=betteridea-nothing is allowed ++__xml_acl_post_process: Creation of acl_permission=betteridea-resources is allowed + =#=#=#= Current cib after: root: Upgrade to pacemaker-2.0 =#=#=#= + + +@@ -1271,10 +1273,10 @@ Error performing operation: Permission denied + =#=#=#= End test: unknownguy: Set stonith-enabled - Permission denied (13) =#=#=#= + * Passed: crm_attribute - unknownguy: Set stonith-enabled + =#=#=#= Begin test: unknownguy: Create a resource =#=#=#= +-__xml_acl_check: Ordinary user unknownguy cannot access the CIB without any defined ACLs +-__xml_acl_check: Ordinary user unknownguy cannot access the CIB without any defined ACLs +-__xml_acl_check: Ordinary user unknownguy cannot access the CIB without any defined ACLs +-__xml_acl_check: Ordinary user unknownguy cannot access the CIB without any 
defined ACLs ++__xml_acl_check: Ordinary user unknownguy cannot access the CIB without any defined ACLs ++__xml_acl_check: Ordinary user unknownguy cannot access the CIB without any defined ACLs ++__xml_acl_check: Ordinary user unknownguy cannot access the CIB without any defined ACLs ++__xml_acl_check: Ordinary user unknownguy cannot access the CIB without any defined ACLs + Call failed: Permission denied + =#=#=#= End test: unknownguy: Create a resource - Permission denied (13) =#=#=#= + * Passed: cibadmin - unknownguy: Create a resource +@@ -1291,8 +1293,8 @@ Error performing operation: Permission denied + =#=#=#= End test: l33t-haxor: Set stonith-enabled - Permission denied (13) =#=#=#= + * Passed: crm_attribute - l33t-haxor: Set stonith-enabled + =#=#=#= Begin test: l33t-haxor: Create a resource =#=#=#= +-__xml_acl_check: 400 access denied to /cib/configuration/resources/primitive[@id='dummy']: parent +-__xml_acl_post_process: Cannot add new node primitive at /cib/configuration/resources/primitive[@id='dummy'] ++__xml_acl_check: 400 access denied to /cib/configuration/resources/primitive[@id='dummy']: parent ++__xml_acl_post_process: Cannot add new node primitive at /cib/configuration/resources/primitive[@id='dummy'] + Call failed: Permission denied + =#=#=#= End test: l33t-haxor: Create a resource - Permission denied (13) =#=#=#= + * Passed: cibadmin - l33t-haxor: Create a resource +@@ -1351,7 +1353,7 @@ Call failed: Permission denied + =#=#=#= End test: niceguy: Query configuration - OK (0) =#=#=#= + * Passed: cibadmin - niceguy: Query configuration + =#=#=#= Begin test: niceguy: Set enable-acl =#=#=#= +-__xml_acl_check: 400 access denied to /cib/configuration/crm_config/cluster_property_set[@id='cib-bootstrap-options']/nvpair[@id='cib-bootstrap-options-enable-acl'][@value]: default ++__xml_acl_check: 400 access denied to /cib/configuration/crm_config/cluster_property_set[@id='cib-bootstrap-options']/nvpair[@id='cib-bootstrap-options-enable-acl'][@value]: 
default + Error performing operation: Permission denied + Error setting enable-acl=false (section=crm_config, set=): Permission denied + =#=#=#= End test: niceguy: Set enable-acl - Permission denied (13) =#=#=#= +@@ -1412,8 +1414,8 @@ Error setting enable-acl=false (section=crm_config, set=): Permission deni + =#=#=#= End test: niceguy: Set stonith-enabled - OK (0) =#=#=#= + * Passed: crm_attribute - niceguy: Set stonith-enabled + =#=#=#= Begin test: niceguy: Create a resource =#=#=#= +-__xml_acl_check: 400 access denied to /cib/configuration/resources/primitive[@id='dummy']: default +-__xml_acl_post_process: Cannot add new node primitive at /cib/configuration/resources/primitive[@id='dummy'] ++__xml_acl_check: 400 access denied to /cib/configuration/resources/primitive[@id='dummy']: default ++__xml_acl_post_process: Cannot add new node primitive at /cib/configuration/resources/primitive[@id='dummy'] + Call failed: Permission denied + =#=#=#= End test: niceguy: Create a resource - Permission denied (13) =#=#=#= + * Passed: cibadmin - niceguy: Create a resource +@@ -1596,10 +1598,11 @@ Error performing operation: Permission denied + =#=#=#= End test: l33t-haxor: Remove a resource meta attribute - Permission denied (13) =#=#=#= + * Passed: crm_resource - l33t-haxor: Remove a resource meta attribute + =#=#=#= Begin test: niceguy: Create a resource meta attribute =#=#=#= +-error: unpack_resources: Resource start-up disabled since no STONITH resources have been defined +-error: unpack_resources: Either configure some or disable STONITH with the stonith-enabled option +-error: unpack_resources: NOTE: Clusters with shared data need STONITH to ensure data integrity +-__xml_acl_post_process: Creation of nvpair=dummy-meta_attributes-target-role is allowed ++error: unpack_resources: Resource start-up disabled since no STONITH resources have been defined ++error: unpack_resources: Either configure some or disable STONITH with the stonith-enabled option ++error: 
unpack_resources: NOTE: Clusters with shared data need STONITH to ensure data integrity ++__xml_acl_post_process: Creation of nvpair=dummy-meta_attributes-target-role is allowed ++Set 'dummy' option: id=dummy-meta_attributes-target-role set=dummy-meta_attributes name=target-role=Stopped + =#=#=#= Current cib after: niceguy: Create a resource meta attribute =#=#=#= + + +@@ -1661,9 +1664,9 @@ __xml_acl_post_process: Creation of nvpair=dummy-meta_attributes-target-role is + =#=#=#= End test: niceguy: Create a resource meta attribute - OK (0) =#=#=#= + * Passed: crm_resource - niceguy: Create a resource meta attribute + =#=#=#= Begin test: niceguy: Query a resource meta attribute =#=#=#= +-error: unpack_resources: Resource start-up disabled since no STONITH resources have been defined +-error: unpack_resources: Either configure some or disable STONITH with the stonith-enabled option +-error: unpack_resources: NOTE: Clusters with shared data need STONITH to ensure data integrity ++error: unpack_resources: Resource start-up disabled since no STONITH resources have been defined ++error: unpack_resources: Either configure some or disable STONITH with the stonith-enabled option ++error: unpack_resources: NOTE: Clusters with shared data need STONITH to ensure data integrity + Stopped + =#=#=#= Current cib after: niceguy: Query a resource meta attribute =#=#=#= + +@@ -1726,10 +1729,10 @@ Stopped + =#=#=#= End test: niceguy: Query a resource meta attribute - OK (0) =#=#=#= + * Passed: crm_resource - niceguy: Query a resource meta attribute + =#=#=#= Begin test: niceguy: Remove a resource meta attribute =#=#=#= +-error: unpack_resources: Resource start-up disabled since no STONITH resources have been defined +-error: unpack_resources: Either configure some or disable STONITH with the stonith-enabled option +-error: unpack_resources: NOTE: Clusters with shared data need STONITH to ensure data integrity +-Deleted dummy option: id=dummy-meta_attributes-target-role name=target-role 
++error: unpack_resources: Resource start-up disabled since no STONITH resources have been defined ++error: unpack_resources: Either configure some or disable STONITH with the stonith-enabled option ++error: unpack_resources: NOTE: Clusters with shared data need STONITH to ensure data integrity ++Deleted 'dummy' option: id=dummy-meta_attributes-target-role name=target-role + =#=#=#= Current cib after: niceguy: Remove a resource meta attribute =#=#=#= + + +@@ -1789,10 +1792,11 @@ Deleted dummy option: id=dummy-meta_attributes-target-role name=target-role + =#=#=#= End test: niceguy: Remove a resource meta attribute - OK (0) =#=#=#= + * Passed: crm_resource - niceguy: Remove a resource meta attribute + =#=#=#= Begin test: niceguy: Create a resource meta attribute =#=#=#= +-error: unpack_resources: Resource start-up disabled since no STONITH resources have been defined +-error: unpack_resources: Either configure some or disable STONITH with the stonith-enabled option +-error: unpack_resources: NOTE: Clusters with shared data need STONITH to ensure data integrity +-__xml_acl_post_process: Creation of nvpair=dummy-meta_attributes-target-role is allowed ++error: unpack_resources: Resource start-up disabled since no STONITH resources have been defined ++error: unpack_resources: Either configure some or disable STONITH with the stonith-enabled option ++error: unpack_resources: NOTE: Clusters with shared data need STONITH to ensure data integrity ++__xml_acl_post_process: Creation of nvpair=dummy-meta_attributes-target-role is allowed ++Set 'dummy' option: id=dummy-meta_attributes-target-role set=dummy-meta_attributes name=target-role=Started + =#=#=#= Current cib after: niceguy: Create a resource meta attribute =#=#=#= + + +@@ -1903,8 +1907,8 @@ __xml_acl_post_process: Creation of nvpair=dummy-meta_attributes-target-role is + + + =#=#=#= Begin test: niceguy: Replace - remove acls =#=#=#= +-__xml_acl_check: 400 access denied to /cib[@epoch]: default +-__xml_acl_check: 400 
access denied to /cib/configuration/acls: default ++__xml_acl_check: 400 access denied to /cib[@epoch]: default ++__xml_acl_check: 400 access denied to /cib/configuration/acls: default + Call failed: Permission denied + =#=#=#= End test: niceguy: Replace - remove acls - Permission denied (13) =#=#=#= + * Passed: cibadmin - niceguy: Replace - remove acls +@@ -1967,9 +1971,9 @@ Call failed: Permission denied + + + =#=#=#= Begin test: niceguy: Replace - create resource =#=#=#= +-__xml_acl_check: 400 access denied to /cib[@epoch]: default +-__xml_acl_check: 400 access denied to /cib/configuration/resources/primitive[@id='dummy2']: default +-__xml_acl_post_process: Cannot add new node primitive at /cib/configuration/resources/primitive[@id='dummy2'] ++__xml_acl_check: 400 access denied to /cib[@epoch]: default ++__xml_acl_check: 400 access denied to /cib/configuration/resources/primitive[@id='dummy2']: default ++__xml_acl_post_process: Cannot add new node primitive at /cib/configuration/resources/primitive[@id='dummy2'] + Call failed: Permission denied + =#=#=#= End test: niceguy: Replace - create resource - Permission denied (13) =#=#=#= + * Passed: cibadmin - niceguy: Replace - create resource +@@ -2031,8 +2035,8 @@ Call failed: Permission denied + + + =#=#=#= Begin test: niceguy: Replace - modify attribute (deny) =#=#=#= +-__xml_acl_check: 400 access denied to /cib[@epoch]: default +-__xml_acl_check: 400 access denied to /cib/configuration/crm_config/cluster_property_set[@id='cib-bootstrap-options']/nvpair[@id='cib-bootstrap-options-enable-acl'][@value]: default ++__xml_acl_check: 400 access denied to /cib[@epoch]: default ++__xml_acl_check: 400 access denied to /cib/configuration/crm_config/cluster_property_set[@id='cib-bootstrap-options']/nvpair[@id='cib-bootstrap-options-enable-acl'][@value]: default + Call failed: Permission denied + =#=#=#= End test: niceguy: Replace - modify attribute (deny) - Permission denied (13) =#=#=#= + * Passed: cibadmin - niceguy: 
Replace - modify attribute (deny) +@@ -2094,8 +2098,8 @@ Call failed: Permission denied + + + =#=#=#= Begin test: niceguy: Replace - delete attribute (deny) =#=#=#= +-__xml_acl_check: 400 access denied to /cib[@epoch]: default +-__xml_acl_check: 400 access denied to /cib/configuration/crm_config/cluster_property_set[@id='cib-bootstrap-options']/nvpair[@id='cib-bootstrap-options-enable-acl']: default ++__xml_acl_check: 400 access denied to /cib[@epoch]: default ++__xml_acl_check: 400 access denied to /cib/configuration/crm_config/cluster_property_set[@id='cib-bootstrap-options']/nvpair[@id='cib-bootstrap-options-enable-acl']: default + Call failed: Permission denied + =#=#=#= End test: niceguy: Replace - delete attribute (deny) - Permission denied (13) =#=#=#= + * Passed: cibadmin - niceguy: Replace - delete attribute (deny) +@@ -2157,8 +2161,8 @@ Call failed: Permission denied + + + =#=#=#= Begin test: niceguy: Replace - create attribute (deny) =#=#=#= +-__xml_acl_check: 400 access denied to /cib[@epoch]: default +-__xml_acl_check: 400 access denied to /cib/configuration/resources/primitive[@id='dummy'][@description]: default ++__xml_acl_check: 400 access denied to /cib[@epoch]: default ++__xml_acl_check: 400 access denied to /cib/configuration/resources/primitive[@id='dummy'][@description]: default + Call failed: Permission denied + =#=#=#= End test: niceguy: Replace - create attribute (deny) - Permission denied (13) =#=#=#= + * Passed: cibadmin - niceguy: Replace - create attribute (deny) +diff --git a/tools/regression.tools.exp b/tools/regression.tools.exp +index 287caf9..b2f4df1 100644 +--- a/tools/regression.tools.exp ++++ b/tools/regression.tools.exp +@@ -626,6 +626,7 @@ Deleted nodes attribute: id=nodes-node1-standby name=standby + =#=#=#= End test: Create a resource - OK (0) =#=#=#= + * Passed: cibadmin - Create a resource + =#=#=#= Begin test: Create a resource meta attribute =#=#=#= ++Set 'dummy' option: id=dummy-meta_attributes-is-managed 
set=dummy-meta_attributes name=is-managed=false + =#=#=#= Current cib after: Create a resource meta attribute =#=#=#= + + +@@ -695,7 +696,7 @@ false + =#=#=#= End test: Query a resource meta attribute - OK (0) =#=#=#= + * Passed: crm_resource - Query a resource meta attribute + =#=#=#= Begin test: Remove a resource meta attribute =#=#=#= +-Deleted dummy option: id=dummy-meta_attributes-is-managed name=is-managed ++Deleted 'dummy' option: id=dummy-meta_attributes-is-managed name=is-managed + =#=#=#= Current cib after: Remove a resource meta attribute =#=#=#= + + +@@ -728,6 +729,7 @@ Deleted dummy option: id=dummy-meta_attributes-is-managed name=is-managed + =#=#=#= End test: Remove a resource meta attribute - OK (0) =#=#=#= + * Passed: crm_resource - Remove a resource meta attribute + =#=#=#= Begin test: Create a resource attribute =#=#=#= ++Set 'dummy' option: id=dummy-instance_attributes-delay set=dummy-instance_attributes name=delay=10s + =#=#=#= Current cib after: Create a resource attribute =#=#=#= + + +@@ -763,7 +765,7 @@ Deleted dummy option: id=dummy-meta_attributes-is-managed name=is-managed + =#=#=#= End test: Create a resource attribute - OK (0) =#=#=#= + * Passed: crm_resource - Create a resource attribute + =#=#=#= Begin test: List the configured resources =#=#=#= +- dummy (ocf::pacemaker:Dummy): Stopped ++ dummy (ocf::pacemaker:Dummy): Stopped + =#=#=#= Current cib after: List the configured resources =#=#=#= + + +@@ -973,8 +975,8 @@ Error performing operation: No such device or address + Current cluster status: + Online: [ node1 ] + +- dummy (ocf::pacemaker:Dummy): Stopped +- Fence (stonith:fence_true): Stopped ++ dummy (ocf::pacemaker:Dummy): Stopped ++ Fence (stonith:fence_true): Stopped + + Transition Summary: + * Start dummy (node1) +@@ -990,8 +992,8 @@ Executing cluster transition: + Revised cluster status: + Online: [ node1 ] + +- dummy (ocf::pacemaker:Dummy): Started node1 +- Fence (stonith:fence_true): Started node1 ++ dummy 
(ocf::pacemaker:Dummy): Started node1 ++ Fence (stonith:fence_true): Started node1 + + =#=#=#= Current cib after: Bring resources online =#=#=#= + +@@ -1710,8 +1712,8 @@ Error performing operation: No such device or address + Current cluster status: + Online: [ node1 ] + +- dummy (ocf::pacemaker:Dummy): Started node1 +- Fence (stonith:fence_true): Started node1 ++ dummy (ocf::pacemaker:Dummy): Started node1 ++ Fence (stonith:fence_true): Started node1 + + Performing requested modifications + + Bringing node node2 online +@@ -1733,8 +1735,8 @@ Executing cluster transition: + Revised cluster status: + Online: [ node1 node2 node3 ] + +- dummy (ocf::pacemaker:Dummy): Started node1 +- Fence (stonith:fence_true): Started node2 ++ dummy (ocf::pacemaker:Dummy): Started node1 ++ Fence (stonith:fence_true): Started node2 + + =#=#=#= Current cib after: Create two more nodes and bring them online =#=#=#= + +@@ -1996,8 +1998,8 @@ WARNING: Creating rsc_location constraint 'cli-ban-dummy-on-node2' with a score + Current cluster status: + Online: [ node1 node2 node3 ] + +- dummy (ocf::pacemaker:Dummy): Started node1 +- Fence (stonith:fence_true): Started node2 ++ dummy (ocf::pacemaker:Dummy): Started node1 ++ Fence (stonith:fence_true): Started node2 + + Transition Summary: + * Move dummy (Started node1 -> node3) +@@ -2010,8 +2012,8 @@ Executing cluster transition: + Revised cluster status: + Online: [ node1 node2 node3 ] + +- dummy (ocf::pacemaker:Dummy): Started node3 +- Fence (stonith:fence_true): Started node2 ++ dummy (ocf::pacemaker:Dummy): Started node3 ++ Fence (stonith:fence_true): Started node2 + + =#=#=#= Current cib after: Relocate resources due to ban =#=#=#= + +diff --git a/valgrind-pcmk.suppressions b/valgrind-pcmk.suppressions +index 2e382df..0a47096 100644 +--- a/valgrind-pcmk.suppressions ++++ b/valgrind-pcmk.suppressions +@@ -1,4 +1,4 @@ +-# Valgrind suppressions for PE testing ++# Valgrind suppressions for Pacemaker testing + { + Valgrind bug + Memcheck:Addr8 
+@@ -57,6 +57,15 @@ + } + + { ++ Cman - Who cares if unused bytes are uninitialized ++ Memcheck:Param ++ sendmsg(msg) ++ fun:__sendmsg_nocancel ++ obj:*/libcman.so.3.0 ++ obj:*/libcman.so.3.0 ++} ++ ++{ + Cman - Jump or move depends on uninitialized values + Memcheck:Cond + obj:*/libcman.so.3.0 diff --git a/SOURCES/pacemaker-rollup-7-1-63f8e9a.patch b/SOURCES/pacemaker-rollup-7-1-63f8e9a.patch deleted file mode 100644 index c79ea14..0000000 --- a/SOURCES/pacemaker-rollup-7-1-63f8e9a.patch +++ /dev/null @@ -1,121831 +0,0 @@ -diff --git a/.gitignore b/.gitignore -index ac0f761..5a12fca 100644 ---- a/.gitignore -+++ b/.gitignore -@@ -12,6 +12,7 @@ Makefile.in - *.pc - *.pyc - *.bz2 -+*.tar.gz - *.rpm - *.la - *.lo -@@ -48,24 +49,41 @@ m4/ltoptions.m4 - m4/ltsugar.m4 - m4/ltversion.m4 - m4/lt~obsolete.m4 -+test-driver -+ylwrap - - # Configure targets -+Doxyfile -+coverage.sh - cts/CTSvars.py -+cts/HBDummy - cts/LSBDummy - cts/benchmark/clubench -+cts/lxc_autogen.sh -+extra/logrotate/pacemaker - include/config.h - include/config.h.in - include/crm_config.h -+lrmd/pacemaker_remote -+lrmd/pacemaker_remoted -+lrmd/pacemaker_remote.service - mcp/pacemaker -+mcp/pacemaker.combined.upstart - mcp/pacemaker.service -+mcp/pacemaker.upstart - pengine/regression.core.sh - publican.cfg - shell/modules/help.py - shell/modules/ra.py - shell/modules/ui.py - shell/modules/vars.py -+tools/cibsecret - tools/coverage.sh -+tools/crm_error -+tools/crm_mon.service -+tools/crm_mon.upstart - tools/crm_report -+tools/report.common - lrmd/regression.py - fencing/regression.py - -@@ -76,6 +94,7 @@ fencing/regression.py - *.8 - *.8.xml - *.8.html -+attrd/attrd - doc/*/en-US/images/*.png - doc/*/tmp/** - doc/*/publish -@@ -84,6 +103,7 @@ cib/cibmon - cib/cibpipe - crmd/atest - crmd/crmd -+doc/api/* - doc/Clusters_from_Scratch.txt - doc/Pacemaker_Explained.txt - doc/acls.html -@@ -93,6 +113,7 @@ fencing/stonith_admin - fencing/stonithd - fencing/stonithd.xml - lrmd/lrmd -+lrmd/lrmd_internal_ctl - 
lrmd/lrmd_test - mcp/pacemakerd - pengine/pengine -@@ -100,7 +121,6 @@ pengine/pengine.xml - pengine/ptest - shell/regression/testcases/confbasic-xml.filter - scratch --tools/attrd - tools/attrd_updater - tools/cibadmin - tools/crm_attribute -@@ -117,14 +137,16 @@ tools/iso8601 - tools/crm_ticket - tools/report.collector.1 - xml/crm.dtd --xml/pacemaker.rng --extra/rgmanager/ccs2cib --extra/rgmanager/ccs_flatten --extra/rgmanager/disable_rgmanager -+xml/pacemaker*.rng -+xml/versions.rng -+doc/Clusters_from_Scratch.build - doc/Clusters_from_Scratch/en-US/Ap-*.xml - doc/Clusters_from_Scratch/en-US/Ch-*.xml -+doc/Pacemaker_Explained.build - doc/Pacemaker_Explained/en-US/Ch-*.xml - doc/Pacemaker_Explained/en-US/Ap-*.xml -+doc/Pacemaker_Remote.build -+doc/Pacemaker_Remote/en-US/Ch-*.xml - lib/gnu/libgnu.a - lib/gnu/stdalign.h - *.coverity -@@ -132,9 +154,8 @@ lib/gnu/stdalign.h - #Other - mock - HTML --pacemaker.spec -+pacemaker*.spec - pengine/.regression.failed.diff --ClusterLabs-pacemaker-*.tar.gz - coverity-* - - compat_reports -diff --git a/.travis.yml b/.travis.yml -index 8ca2c57..9634df7 100644 ---- a/.travis.yml -+++ b/.travis.yml -@@ -29,13 +29,13 @@ env: - - # sudo add-apt-repository ppa:hotot-team - before_install: -- - sudo add-apt-repository "deb http://archive.ubuntu.com/ubuntu/ saucy main" -+ - sudo add-apt-repository "deb http://archive.ubuntu.com/ubuntu/ utopic main" - - sudo apt-get update -qq - - install: - - sudo apt-get install -qq automake autoconf chrpath libglib2.0-dev perl net-tools python libtool libxml2-dev bison flex uuid-dev libbz2-dev zlib1g-dev libltdl3-dev libgnutls-dev python-central python-dev libpam0g-dev libncurses5-dev libcorosync-dev libxslt1-dev libdbus-1-dev - - sudo apt-get install -qq cluster-glue-dev heartbeat-dev libheartbeat2-dev -- - sudo apt-get install -qq libqb-dev/saucy -+ - sudo apt-get install -qq libqb-dev - - before_script: - # Save and restore CC so that ./configure can pass -@@ -69,3 +69,4 @@ notifications: - 
branches: - only: - - master -+ - 1.1 -diff --git a/ChangeLog b/ChangeLog -index d70edbd..e445890 100644 ---- a/ChangeLog -+++ b/ChangeLog -@@ -1,4 +1,218 @@ - -+* Wed Jun 24 2015 Andrew Beekhof Pacemaker-1.1.13-1 -+- Update source tarball to revision: 2a1847e -+- Changesets: 750 -+- Diff: 156 files changed, 11323 insertions(+), 3725 deletions(-) -+ -+- Features added since Pacemaker-1.1.12 -+ + Allow fail-counts to be removed en-mass when the new attrd is in operation -+ + attrd supports private attributes (not written to CIB) -+ + crmd: Ensure a watchdog device is in use if stonith-watchdog-timeout is configured -+ + crmd: If configured, trigger the watchdog immediately if we loose quorum and no-quorum-policy=suicide -+ + crm_diff: Support generating a difference without versions details if --no-version/-u is supplied -+ + crm_resource: Implement an intelligent restart capability -+ + Fencing: Advertise the watchdog device for fencing operations -+ + Fencing: Allow the cluster to recover resources if the watchdog is in use -+ + fencing: cl#5134 - Support random fencing delay to avoid double fencing -+ + mcp: Allow orphan children to initiate node panic via SIGQUIT -+ + mcp: Turn on sbd integration if pacemakerd finds it running -+ + mcp: Two new error codes that result in machine reset or power off -+ + Officially support the resource-discovery attribute for location constraints -+ + PE: Allow natural ordering of colocation sets -+ + PE: Support non-actionable degraded mode for OCF -+ + pengine: cl#5207 - Display "UNCLEAN" for resources running on unclean offline nodes -+ + remote: pcmk remote client tool for use with container wrapper script -+ + Support machine panics for some kinds of errors (via sbd if available) -+ + tools: add crm_resource --wait option -+ + tools: attrd_updater supports --query and --all options -+ + tools: attrd_updater: Allow attributes to be set for other nodes -+ -+- Changes since Pacemaker-1.1.12 -+ + pengine: exclusive discovery 
implies rsc is only allowed on exclusive subset of nodes -+ + acl: Correctly implement the 'reference' acl directive -+ + acl: Do not delay evaluation of added nodes in some situations -+ + attrd: b22b1fe did uuid test too early -+ + attrd: Clean out the node cache when requested by the admin -+ + attrd: fixes double free in attrd legacy -+ + attrd: properly write attributes for peers once uuid is discovered -+ + attrd: refresh should force an immediate write-out of all attributes -+ + attrd: Simplify how node deletions happen -+ + Bug rhbz#1067544 - Tools: Correctly handle --ban, --move and --locate for master/slave groups -+ + Bug rhbz#1181824 - Ensure the DC can be reliably fenced -+ + cib: Ability to upgrade cib validation schema in legacy mode -+ + cib: Always generate digests for cib diffs in legacy mode -+ + cib: assignment where comparison intended -+ + cib: Avoid nodeid conflicts we don't care about -+ + cib: Correctly add "update-origin", "update-client" and "update-user" attributes for cib -+ + cib: Correctly set up signal handlers -+ + cib: Correctly track node state -+ + cib: Do not update on disk backups if we're just querying them -+ + cib: Enable cib legacy mode for plugin-based clusters -+ + cib: Ensure file-based backends treat '-o section' consistently with the native backend -+ + cib: Ensure upgrade operations from a non-DC get an acknowledgement -+ + cib: No need to enforce cib digests for v2 diffs in legacy mode -+ + cib: Revert d153b86 to instantly get cib synchronized in legacy mode -+ + cib: tls sock cleanup for remote cib connections -+ + cli: Ensure subsequent unknown long options are correctly detected -+ + cluster: Invoke crm_remove_conflicting_peer() only when the new node's uname is being assigned in the node cache -+ + common: Increment current and age for lib common as a result of APIs being added -+ + corosync: Bug cl#5232 - Somewhat gracefully handle nodes with invalid UUIDs -+ + corosync: Avoid unnecessary repeated CMAP API calls 
-+ + crmd/pengine: handle on-fail=ignore properly -+ + crmd: Add "on_node" attribute for *_last_failure_0 lrm resource operations -+ + crmd: All peers need to track node shutdown requests -+ + crmd: Cached copies of transient attributes cease to be valid once a node leaves the membership -+ + crmd: Correctly add the local option that validates against schema for pengine to calculate -+ + crmd: Disable debug logging that results in significant overhead -+ + crmd: do not remove connection resources during re-probe -+ + crmd: don't update fail count twice for same failure -+ + crmd: Ensure remote connection resources timeout properly during 'migrate_from' action -+ + crmd: Ensure throttle_mode() does something on Linux -+ + crmd: Fixes crash when remote connection migration fails -+ + crmd: gracefully handle remote node disconnects during op execution -+ + crmd: Handle remote connection failures while executing ops on remote connection -+ + crmd: include remote nodes when forcing cluster wide resource reprobe -+ + crmd: never stop recurring monitor ops for pcmk remote during incomplete migration -+ + crmd: Prevent the old version of DC from being fenced when it shuts down for rolling-upgrade -+ + crmd: Prevent use-of-NULL during reprobe -+ + crmd: properly update job limit for baremetal remote-nodes -+ + crmd: Remote-node throttle jobs count towards cluster-node hosting conneciton rsc -+ + crmd: Reset stonith failcount to recover transitioner when the node rejoins -+ + crmd: resolves memory leak in crmd. 
-+ + crmd: respect start-failure-is-fatal even for artifically injected events -+ + crmd: Wait for all pending operations to complete before poking the policy engine -+ + crmd: When container's host is fenced, cancel in-flight operations -+ + crm_attribute: Correctly update config options when -o crm_config is specified -+ + crm_failcount: Better error reporting when no resource is specified -+ + crm_mon: add exit reason to resource failure output -+ + crm_mon: Fill CRM_notify_node in traps with node's uname rather than node's id if possible -+ + crm_mon: Repair notification delivery when the v2 patch format is in use -+ + crm_node: Correctly remove nodes from the CIB by nodeid -+ + crm_report: More patterns for finding logs on non-DC nodes -+ + crm_resource: Allow resource restart operations to be node specific -+ + crm_resource: avoid deletion of lrm cache on node with resource discovery disabled. -+ + crm_resource: Calculate how long to wait for a restart based on the resource timeouts -+ + crm_resource: Clean up memory in --restart error paths -+ + crm_resource: Display the locations of all anonymous clone children when supplying the children's common ID -+ + crm_resource: Ensure --restart sets/clears meta attributes -+ + crm_resource: Ensure fail-counts are purged when we redetect the state of all resources -+ + crm_resource: Implement --timeout for resource restart operations -+ + crm_resource: Include group members when calculating the next timeout -+ + crm_resource: Memory leak in error paths -+ + crm_resource: Prevent use-after-free -+ + crm_resource: Repair regression test outputs -+ + crm_resource: Use-after-free when restarting a resource -+ + dbus: ref count leaks -+ + dbus: Ensure both the read and write queues get dispatched -+ + dbus: Fail gracefully if malloc fails -+ + dbus: handle dispatch queue when multiple replies need to be processed -+ + dbus: Notice when dbus connections get disabled -+ + dbus: Remove double-free introduced while trying to 
make coverity shut up -+ + ensure if B is colocated with A, B can never run without A -+ + fence_legacy: Avoid passing 'port' to cluster-glue agents -+ + fencing: Allow nodes to be purged from the member cache -+ + fencing: Correctly make args for fencing agents -+ + fencing: Correctly wait for self-fencing to occur when the watchdog is in use -+ + fencing: Ensure the hostlist parameter is set for watchdog agents -+ + fencing: Force 'stonith-ng' as the system name -+ + fencing: Gracefully handle invalid metadata from agents -+ + fencing: If configured, wait stonith-watchdog-timer seconds for self-fencing to complete -+ + fencing: Reject actions for devices that haven't been explicitly registered yet -+ + ipc: properly allocate server enforced buffer size on client -+ + ipc: use server enforced buffer during ipc client send -+ + lrmd, services: interpret LSB status codes properly -+ + lrmd: add back support for class heartbeat agents -+ + lrmd: cancel pending async connection during disconnect -+ + lrmd: enable ipc proxy for docker-wrapper privileged mode -+ + lrmd: fix rescheduling of systemd monitor op during start -+ + lrmd: Handle systemd reporting 'done' before a resource is actually stopped -+ + lrmd: Hint to child processes that using sd_notify is not required -+ + lrmd: Log with the correct personality -+ + lrmd: Prevent glib assert triggered by timers being removed from mainloop more than once -+ + lrmd: report original timeout when systemd operation completes -+ + lrmd: store failed operation exit reason in cib -+ + mainloop: resolves race condition mainloop poll involving modification of ipc connections -+ + make targetted reprobe for remote node work, crm_resource -C -N -+ + mcp: Allow a configurable delay when debugging shutdown issues -+ + mcp: Avoid requiring 'export' for SYS-V sysconfig options -+ + Membership: Detect and resolve nodes that change their ID -+ + pacemakerd: resolves memory leak of xml structure in pacemakerd -+ + pengine: ability to 
launch resources in isolated containers -+ + pengine: add #kind=remote for baremetal remote-nodes -+ + pengine: allow baremetal remote-nodes to recover without requiring fencing when cluster-node fails -+ + pengine: allow remote-nodes to be placed in maintenance mode -+ + pengine: Avoid trailing whitespaces when printing resource state -+ + pengine: cl#5130 - Choose nodes capable of running all the colocated utilization resources -+ + pengine: cl#5130 - Only check the capacities of the nodes that are allowed to run the resource -+ + pengine: Correctly compare feature set to determine how to unpack meta attributes -+ + pengine: disable migrations for resources with isolation containers -+ + pengine: disable reloading of resources within isolated container wrappers -+ + pengine: Do not aggregate children in a pending state into the started/stopped/etc lists -+ + pengine: Do not record duplicate copies of the failed actions -+ + pengine: Do not reschedule monitors that are no longer needed while resource definitions have changed -+ + pengine: Fence baremetal remote when recurring monitor op fails -+ + pengine: Fix colocation with unmanaged resources -+ + pengine: Fix the behaviors of multi-state resources with asymmetrical ordering -+ + pengine: fixes pengine crash with orphaned remote node connection resource -+ + pengine: fixes segfault caused by malformed log warning -+ + pengine: handle cloned isolated resources in a sane way -+ + pengine: handle isolated resource scenario, cloned group of isolated resources -+ + pengine: Handle ordering between stateful and migratable resources -+ + pengine: imply stop in container node resources when host node is fenced -+ + pengine: only fence baremetal remote when connection can fails or can not be recovered -+ + pengine: only kill process group on timeout when on-fail does not equal block. 
-+ + pengine: per-node control over resource discovery -+ + pengine: prefer migration target for remote node connections -+ + pengine: prevent disabling rsc discovery per node in certain situations -+ + pengine: Prevent use-after-free in sort_rsc_process_order() -+ + pengine: properly handle ordering during remote connection partial migration -+ + pengine: properly recover remote-nodes when cluster-node proxy goes offline -+ + pengine: remove unnecessary whitespace from notify environment variables -+ + pengine: require-all feature for ordered clones -+ + pengine: Resolve memory leaks -+ + pengine: resource discovery mode for location constraints -+ + pengine: restart master instances on instance attribute changes -+ + pengine: Turn off legacy unpacking of resource options into the meta hashtable -+ + pengine: Watchdog integration is sufficient for fencing -+ + Perform systemd reloads asynchronously -+ + ping: Correctly advertise multiplier default -+ + Prefer to inherit the watchdog timeout from SBD -+ + properly record stop args after reload -+ + provide fake meta data for ra class heartbeat -+ + remote: report timestamps for remote connection resource operations -+ + remote: Treat recv msg timeout as a disconnect -+ + service: Prevent potential use-of-NULL in metadata lookups -+ + solaris: Allow compilation when dirent.d_type is not available -+ + solaris: Correctly replace the linux swab functions -+ + solaris: Disable throttling since /proc doesn't exist -+ + stonith-ng: Correctly observe the watchdog completion timeout -+ + stonith-ng: Correctly track node state -+ + stonith-ng: Reset mainloop source IDs after removing them -+ + systemd: Correctly handle long running stop actions -+ + systemd: Ensure failed monitor operations always return -+ + systemd: Ensure we don't call dbus_message_unref() with NULL -+ + systemd: fix crash caused when canceling in-flight operation -+ + systemd: Kindly ask dbus NOT to kill the process if the dbus connection fails -+ + 
systemd: Perform actions asynchronously -+ + systemd: Perform monitor operations without blocking -+ + systemd: Tell systemd not to take DBus down from underneath us -+ + systemd: Trick systemd into not stopping our services before us during shutdown -+ + tools: Improve crm_mon output with certain option combinations -+ + upstart: Monitor actions always return 'ok' or 'not running' -+ + upstart: Perform more parts of monitor operations without blocking -+ + xml: add 'require-all' to xml schema for constraints -+ + xml: cl#5231 - Unset the deleted attributes in the resulting diffs -+ + xml: Clone the latest constraint schema in preparation for changes" -+ + xml: Correctly create v1 patchsets when deleting attributes -+ + xml: Do not change the ordering of properties when applying v1 cib diffs -+ + xml: Do not dump deleted attributes -+ + xml: Do not prune leaves from v1 cib diffs that are being created with digests -+ + xml: Ensure ACLs are reapplied before calculating what a replace operation changed -+ + xml: Fix upgrade-1.3.xsl to correctly transform ACL rules with "attribute" -+ + xml: Prevent assert errors in crm_element_value() on applying a patch without version information -+ + xml: Prevent potential use-of-NULL -+ -+ - * Tue Jul 22 2014 Andrew Beekhof Pacemaker-1.1.12-1 - - Update source tarball to revision: 93a037d - - Changesets: 795 -diff --git a/Doxyfile.in b/Doxyfile.in -index 81f21d6..68fc8ce 100644 ---- a/Doxyfile.in -+++ b/Doxyfile.in -@@ -1,109 +1,119 @@ --# Doxyfile 1.7.4 -+# Doxyfile 1.8.5 - - # This file describes the settings to be used by the documentation system - # doxygen (www.doxygen.org) for a project. - # --# All text after a hash (#) is considered a comment and will be ignored. -+# All text after a double hash (##) is considered a comment and is placed in -+# front of the TAG it is preceding. -+# -+# All text after a single hash (#) is considered a comment and will be ignored. - # The format is: --# TAG = value [value, ...] 
--# For lists items can also be appended using: --# TAG += value [value, ...] --# Values that contain spaces should be placed between quotes (" "). -+# TAG = value [value, ...] -+# For lists, items can also be appended using: -+# TAG += value [value, ...] -+# Values that contain spaces should be placed between quotes (\" \"). - - #--------------------------------------------------------------------------- - # Project related configuration options - #--------------------------------------------------------------------------- - - # This tag specifies the encoding used for all characters in the config file --# that follow. The default is UTF-8 which is also the encoding used for all --# text before the first occurrence of this tag. Doxygen uses libiconv (or the --# iconv built into libc) for the transcoding. See --# http://www.gnu.org/software/libiconv for the list of possible encodings. -+# that follow. The default is UTF-8 which is also the encoding used for all text -+# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv -+# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv -+# for the list of possible encodings. -+# The default value is: UTF-8. - - DOXYFILE_ENCODING = UTF-8 - --# The PROJECT_NAME tag is a single word (or a sequence of words surrounded --# by quotes) that should identify the project. -+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by -+# double-quotes, unless you are using Doxywizard) that should identify the -+# project for which the documentation is generated. This name is used in the -+# title of most generated pages and in a few other places. -+# The default value is: My Project. - - PROJECT_NAME = @PACKAGE_NAME@ - --# The PROJECT_NUMBER tag can be used to enter a project or revision number. --# This could be handy for archiving the generated documentation or --# if some version control system is used. 
-+# The PROJECT_NUMBER tag can be used to enter a project or revision number. This -+# could be handy for archiving the generated documentation or if some version -+# control system is used. - - PROJECT_NUMBER = @PACKAGE_VERSION@-@BUILD_VERSION@ - - # Using the PROJECT_BRIEF tag one can provide an optional one line description --# for a project that appears at the top of each page and should give viewer --# a quick idea about the purpose of the project. Keep the description short. -+# for a project that appears at the top of each page and should give viewer a -+# quick idea about the purpose of the project. Keep the description short. - - PROJECT_BRIEF = "Scalable High-Availability cluster resource manager" - --# With the PROJECT_LOGO tag one can specify an logo or icon that is --# included in the documentation. The maximum height of the logo should not --# exceed 55 pixels and the maximum width should not exceed 200 pixels. --# Doxygen will copy the logo to the output directory. -+# With the PROJECT_LOGO tag one can specify an logo or icon that is included in -+# the documentation. The maximum height of the logo should not exceed 55 pixels -+# and the maximum width should not exceed 200 pixels. Doxygen will copy the logo -+# to the output directory. - - PROJECT_LOGO = doc/publican-clusterlabs/en-US/images/title_logo.png - --# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) --# base path where the generated documentation will be put. --# If a relative path is entered, it will be relative to the location --# where doxygen was started. If left blank the current directory will be used. -+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path -+# into which the generated documentation will be written. If a relative path is -+# entered, it will be relative to the location where doxygen was started. If -+# left blank the current directory will be used. 
- - OUTPUT_DIRECTORY = doc/api/ - --# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create --# 4096 sub-directories (in 2 levels) under the output directory of each output --# format and will distribute the generated files over these directories. --# Enabling this option can be useful when feeding doxygen a huge amount of --# source files, where putting all generated files in the same directory would --# otherwise cause performance problems for the file system. -+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 4096 sub- -+# directories (in 2 levels) under the output directory of each output format and -+# will distribute the generated files over these directories. Enabling this -+# option can be useful when feeding doxygen a huge amount of source files, where -+# putting all generated files in the same directory would otherwise causes -+# performance problems for the file system. -+# The default value is: NO. - - CREATE_SUBDIRS = NO - - # The OUTPUT_LANGUAGE tag is used to specify the language in which all - # documentation generated by doxygen is written. Doxygen will use this - # information to generate all constant output in the proper language. --# The default language is English, other supported languages are: --# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, --# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, --# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English --# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, --# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, --# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. 
-+# Possible values are: Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese- -+# Traditional, Croatian, Czech, Danish, Dutch, English, Esperanto, Farsi, -+# Finnish, French, German, Greek, Hungarian, Italian, Japanese, Japanese-en, -+# Korean, Korean-en, Latvian, Norwegian, Macedonian, Persian, Polish, -+# Portuguese, Romanian, Russian, Serbian, Slovak, Slovene, Spanish, Swedish, -+# Turkish, Ukrainian and Vietnamese. -+# The default value is: English. - - OUTPUT_LANGUAGE = English - --# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will --# include brief member descriptions after the members that are listed in --# the file and class documentation (similar to JavaDoc). --# Set to NO to disable this. -+# If the BRIEF_MEMBER_DESC tag is set to YES doxygen will include brief member -+# descriptions after the members that are listed in the file and class -+# documentation (similar to Javadoc). Set to NO to disable this. -+# The default value is: YES. - - BRIEF_MEMBER_DESC = YES - --# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend --# the brief description of a member or function before the detailed description. --# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the -+# If the REPEAT_BRIEF tag is set to YES doxygen will prepend the brief -+# description of a member or function before the detailed description -+# -+# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the - # brief descriptions will be completely suppressed. -+# The default value is: YES. - - REPEAT_BRIEF = YES - --# This tag implements a quasi-intelligent brief description abbreviator --# that is used to form the text in various listings. Each string --# in this list, if found as the leading text of the brief description, will be --# stripped from the text and the result after processing the whole list, is --# used as the annotated text. Otherwise, the brief description is used as-is. 
--# If left blank, the following values are used ("$name" is automatically --# replaced with the name of the entity): "The $name class" "The $name widget" --# "The $name file" "is" "provides" "specifies" "contains" --# "represents" "a" "an" "the" -+# This tag implements a quasi-intelligent brief description abbreviator that is -+# used to form the text in various listings. Each string in this list, if found -+# as the leading text of the brief description, will be stripped from the text -+# and the result, after processing the whole list, is used as the annotated -+# text. Otherwise, the brief description is used as-is. If left blank, the -+# following values are used ($name is automatically replaced with the name of -+# the entity):The $name class, The $name widget, The $name file, is, provides, -+# specifies, contains, represents, a, an and the. - - ABBREVIATE_BRIEF = - - # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then --# Doxygen will generate a detailed section even if there is only a brief -+# doxygen will generate a detailed section even if there is only a brief - # description. -+# The default value is: NO. - - ALWAYS_DETAILED_SEC = NO - -@@ -111,153 +121,204 @@ ALWAYS_DETAILED_SEC = NO - # inherited members of a class in the documentation of that class as if those - # members were ordinary class members. Constructors, destructors and assignment - # operators of the base classes will not be shown. -+# The default value is: NO. - - INLINE_INHERITED_MEMB = NO - --# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full --# path before files name in the file list and in the header files. If set --# to NO the shortest path that makes the file name unique will be used. -+# If the FULL_PATH_NAMES tag is set to YES doxygen will prepend the full path -+# before files name in the file list and in the header files. 
If set to NO the -+# shortest path that makes the file name unique will be used -+# The default value is: YES. - - FULL_PATH_NAMES = YES - --# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag --# can be used to strip a user-defined part of the path. Stripping is --# only done if one of the specified strings matches the left-hand part of --# the path. The tag can be used to show relative paths in the file list. --# If left blank the directory from which doxygen is run is used as the --# path to strip. -+# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. -+# Stripping is only done if one of the specified strings matches the left-hand -+# part of the path. The tag can be used to show relative paths in the file list. -+# If left blank the directory from which doxygen is run is used as the path to -+# strip. -+# -+# Note that you can specify absolute paths here, but also relative paths, which -+# will be relative from the directory where doxygen is started. -+# This tag requires that the tag FULL_PATH_NAMES is set to YES. - - STRIP_FROM_PATH = - --# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of --# the path mentioned in the documentation of a class, which tells --# the reader which header file to include in order to use a class. --# If left blank only the name of the header file containing the class --# definition is used. Otherwise one should specify the include paths that --# are normally passed to the compiler using the -I flag. -+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the -+# path mentioned in the documentation of a class, which tells the reader which -+# header file to include in order to use a class. If left blank only the name of -+# the header file containing the class definition is used. Otherwise one should -+# specify the list of include paths that are normally passed to the compiler -+# using the -I flag. 
- - STRIP_FROM_INC_PATH = - --# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter --# (but less readable) file names. This can be useful if your file system --# doesn't support long names like on DOS, Mac, or CD-ROM. -+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but -+# less readable) file names. This can be useful if your file system doesn't -+# support long names like on DOS, Mac, or CD-ROM. -+# The default value is: NO. - - SHORT_NAMES = NO - --# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen --# will interpret the first line (until the first dot) of a JavaDoc-style --# comment as the brief description. If set to NO, the JavaDoc --# comments will behave just like regular Qt-style comments --# (thus requiring an explicit @brief command for a brief description.) -+# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the -+# first line (until the first dot) of a Javadoc-style comment as the brief -+# description. If set to NO, the Javadoc-style will behave just like regular Qt- -+# style comments (thus requiring an explicit @brief command for a brief -+# description.) -+# The default value is: NO. - - JAVADOC_AUTOBRIEF = NO - --# If the QT_AUTOBRIEF tag is set to YES then Doxygen will --# interpret the first line (until the first dot) of a Qt-style --# comment as the brief description. If set to NO, the comments --# will behave just like regular Qt-style comments (thus requiring --# an explicit \brief command for a brief description.) -+# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first -+# line (until the first dot) of a Qt-style comment as the brief description. If -+# set to NO, the Qt-style will behave just like regular Qt-style comments (thus -+# requiring an explicit \brief command for a brief description.) -+# The default value is: NO.
- - QT_AUTOBRIEF = NO - --# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen --# treat a multi-line C++ special comment block (i.e. a block of //! or /// --# comments) as a brief description. This used to be the default behaviour. --# The new default is to treat a multi-line C++ comment block as a detailed --# description. Set this tag to YES if you prefer the old behaviour instead. -+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a -+# multi-line C++ special comment block (i.e. a block of //! or /// comments) as -+# a brief description. This used to be the default behavior. The new default is -+# to treat a multi-line C++ comment block as a detailed description. Set this -+# tag to YES if you prefer the old behavior instead. -+# -+# Note that setting this tag to YES also means that rational rose comments are -+# not recognized any more. -+# The default value is: NO. - - MULTILINE_CPP_IS_BRIEF = NO - --# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented --# member inherits the documentation from any documented member that it --# re-implements. -+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the -+# documentation from any documented member that it re-implements. -+# The default value is: YES. - - INHERIT_DOCS = YES - --# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce --# a new page for each member. If set to NO, the documentation of a member will --# be part of the file/class/namespace that contains it. -+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce a -+# new page for each member. If set to NO, the documentation of a member will be -+# part of the file/class/namespace that contains it. -+# The default value is: NO. - - SEPARATE_MEMBER_PAGES = NO - --# The TAB_SIZE tag can be used to set the number of spaces in a tab. --# Doxygen uses this value to replace tabs by spaces in code fragments. 
-+# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen -+# uses this value to replace tabs by spaces in code fragments. -+# Minimum value: 1, maximum value: 16, default value: 4. - - TAB_SIZE = 4 - --# This tag can be used to specify a number of aliases that acts --# as commands in the documentation. An alias has the form "name=value". --# For example adding "sideeffect=\par Side Effects:\n" will allow you to --# put the command \sideeffect (or @sideeffect) in the documentation, which --# will result in a user-defined paragraph with heading "Side Effects:". --# You can put \n's in the value part of an alias to insert newlines. -+# This tag can be used to specify a number of aliases that act as commands in -+# the documentation. An alias has the form: -+# name=value -+# For example adding -+# "sideeffect=@par Side Effects:\n" -+# will allow you to put the command \sideeffect (or @sideeffect) in the -+# documentation, which will result in a user-defined paragraph with heading -+# "Side Effects:". You can put \n's in the value part of an alias to insert -+# newlines. - - ALIASES = - --# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C --# sources only. Doxygen will then generate output that is more tailored for C. --# For instance, some of the names that are used will be different. The list --# of all members will be omitted, etc. -+# This tag can be used to specify a number of word-keyword mappings (TCL only). -+# A mapping has the form "name=value". For example adding "class=itcl::class" -+# will allow you to use the command class in the itcl::class meaning. -+ -+TCL_SUBST = -+ -+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources -+# only. Doxygen will then generate output that is more tailored for C. For -+# instance, some of the names that are used will be different. The list of all -+# members will be omitted, etc. -+# The default value is: NO. 
- - OPTIMIZE_OUTPUT_FOR_C = YES - --# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java --# sources only. Doxygen will then generate output that is more tailored for --# Java. For instance, namespaces will be presented as packages, qualified --# scopes will look different, etc. -+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or -+# Python sources only. Doxygen will then generate output that is more tailored -+# for that language. For instance, namespaces will be presented as packages, -+# qualified scopes will look different, etc. -+# The default value is: NO. - - OPTIMIZE_OUTPUT_JAVA = NO - - # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran --# sources only. Doxygen will then generate output that is more tailored for --# Fortran. -+# sources. Doxygen will then generate output that is tailored for Fortran. -+# The default value is: NO. - - OPTIMIZE_FOR_FORTRAN = NO - - # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL --# sources. Doxygen will then generate output that is tailored for --# VHDL. -+# sources. Doxygen will then generate output that is tailored for VHDL. -+# The default value is: NO. - - OPTIMIZE_OUTPUT_VHDL = NO - - # Doxygen selects the parser to use depending on the extension of the files it --# parses. With this tag you can assign which parser to use for a given extension. --# Doxygen has a built-in mapping, but you can override or extend it using this --# tag. The format is ext=language, where ext is a file extension, and language --# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C, --# C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, C++. For instance to make -+# parses. With this tag you can assign which parser to use for a given -+# extension. Doxygen has a built-in mapping, but you can override or extend it -+# using this tag. 
The format is ext=language, where ext is a file extension, and -+# language is one of the parsers supported by doxygen: IDL, Java, Javascript, -+# C#, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL. For instance to make - # doxygen treat .inc files as Fortran files (default is PHP), and .f files as C --# (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions --# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen. -+# (default is Fortran), use: inc=Fortran f=C. -+# -+# Note: For files without extension you can use no_extension as a placeholder. -+# -+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise -+# the files are not read by doxygen. - - EXTENSION_MAPPING = - -+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments -+# according to the Markdown format, which allows for more readable -+# documentation. See http://daringfireball.net/projects/markdown/ for details. -+# The output of markdown processing is further processed by doxygen, so you can -+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in -+# case of backward compatibility issues. -+# The default value is: YES. -+ -+MARKDOWN_SUPPORT = YES -+ -+# When enabled doxygen tries to link words that correspond to documented -+# classes, or namespaces to their corresponding documentation. Such a link can -+# be prevented in individual cases by putting a % sign in front of the word -+# or globally by setting AUTOLINK_SUPPORT to NO. -+# The default value is: YES. -+ -+AUTOLINK_SUPPORT = YES -+ - # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want --# to include (a tag file for) the STL sources as input, then you should --# set this tag to YES in order to let doxygen match functions declarations and --# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. --# func(std::string) {}).
This also makes the inheritance and collaboration -+# to include (a tag file for) the STL sources as input, then you should set this -+# tag to YES in order to let doxygen match functions declarations and -+# definitions whose arguments contain STL classes (e.g. func(std::string); -+# versus func(std::string) {}). This also makes the inheritance and collaboration - # diagrams that involve STL classes more complete and accurate. -+# The default value is: NO. - - BUILTIN_STL_SUPPORT = NO - - # If you use Microsoft's C++/CLI language, you should set this option to YES to - # enable parsing support. -+# The default value is: NO. - - CPP_CLI_SUPPORT = NO - --# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. --# Doxygen will parse them like normal C++ but will assume all classes use public --# instead of private inheritance when no explicit protection keyword is present. -+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -+# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen -+# will parse them like normal C++ but will assume all classes use public instead -+# of private inheritance when no explicit protection keyword is present. -+# The default value is: NO. - - SIP_SUPPORT = NO - --# For Microsoft's IDL there are propget and propput attributes to indicate getter --# and setter methods for a property. Setting this option to YES (the default) --# will make doxygen replace the get and set methods by a property in the --# documentation. This will only work if the methods are indeed getting or --# setting a simple type. If this is not the case, or you want to show the --# methods anyway, you should set this option to NO. -+# For Microsoft's IDL there are propget and propput attributes to indicate -+# getter and setter methods for a property. Setting this option to YES will make -+# doxygen replace the get and set methods by a property in the documentation.
-+# This will only work if the methods are indeed getting or setting a simple -+# type. If this is not the case, or you want to show the methods anyway, you -+# should set this option to NO. -+# The default value is: YES. - - IDL_PROPERTY_SUPPORT = YES - -@@ -265,394 +326,464 @@ IDL_PROPERTY_SUPPORT = YES - # tag is set to YES, then doxygen will reuse the documentation of the first - # member in the group (if any) for the other members of the group. By default - # all members of a group must be documented explicitly. -+# The default value is: NO. - - DISTRIBUTE_GROUP_DOC = YES - --# Set the SUBGROUPING tag to YES (the default) to allow class member groups of --# the same type (for instance a group of public functions) to be put as a --# subgroup of that type (e.g. under the Public Functions section). Set it to --# NO to prevent subgrouping. Alternatively, this can be done per class using --# the \nosubgrouping command. -+# Set the SUBGROUPING tag to YES to allow class member groups of the same type -+# (for instance a group of public functions) to be put as a subgroup of that -+# type (e.g. under the Public Functions section). Set it to NO to prevent -+# subgrouping. Alternatively, this can be done per class using the -+# \nosubgrouping command. -+# The default value is: YES. - - SUBGROUPING = YES - --# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and --# unions are shown inside the group in which they are included (e.g. using --# @ingroup) instead of on a separate page (for HTML and Man pages) or --# section (for LaTeX and RTF). -+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions -+# are shown inside the group in which they are included (e.g. using \ingroup) -+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX -+# and RTF). -+# -+# Note that this feature does not work in combination with -+# SEPARATE_MEMBER_PAGES. -+# The default value is: NO. 
- - INLINE_GROUPED_CLASSES = NO - --# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum --# is documented as struct, union, or enum with the name of the typedef. So -+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions -+# with only public data fields or simple typedef fields will be shown inline in -+# the documentation of the scope in which they are defined (i.e. file, -+# namespace, or group documentation), provided this scope is documented. If set -+# to NO, structs, classes, and unions are shown on a separate page (for HTML and -+# Man pages) or section (for LaTeX and RTF). -+# The default value is: NO. -+ -+INLINE_SIMPLE_STRUCTS = NO -+ -+# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or -+# enum is documented as struct, union, or enum with the name of the typedef. So - # typedef struct TypeS {} TypeT, will appear in the documentation as a struct - # with name TypeT. When disabled the typedef will appear as a member of a file, --# namespace, or class. And the struct will be named TypeS. This can typically --# be useful for C code in case the coding convention dictates that all compound -+# namespace, or class. And the struct will be named TypeS. This can typically be -+# useful for C code in case the coding convention dictates that all compound - # types are typedef'ed and only the typedef is referenced, never the tag name. -+# The default value is: NO. - - TYPEDEF_HIDES_STRUCT = NO - --# The SYMBOL_CACHE_SIZE determines the size of the internal cache use to --# determine which symbols to keep in memory and which to flush to disk. --# When the cache is full, less often used symbols will be written to disk. --# For small to medium size projects (<1000 input files) the default value is --# probably good enough. 
For larger projects a too small cache size can cause --# doxygen to be busy swapping symbols to and from disk most of the time --# causing a significant performance penalty. --# If the system has enough physical memory increasing the cache will improve the --# performance by keeping more symbols in memory. Note that the value works on --# a logarithmic scale so increasing the size by one will roughly double the --# memory usage. The cache size is given by this formula: --# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, --# corresponding to a cache size of 2^16 = 65536 symbols -- --SYMBOL_CACHE_SIZE = 0 -+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This -+# cache is used to resolve symbols given their name and scope. Since this can be -+# an expensive process and often the same symbol appears multiple times in the -+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small -+# doxygen will become slower. If the cache is too large, memory is wasted. The -+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range -+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 -+# symbols. At the end of a run doxygen will report the cache usage and suggest -+# the optimal cache size from a speed point of view. -+# Minimum value: 0, maximum value: 9, default value: 0. -+ -+LOOKUP_CACHE_SIZE = 0 - - #--------------------------------------------------------------------------- - # Build related configuration options - #--------------------------------------------------------------------------- - - # If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in --# documentation are documented, even if no documentation was available. --# Private class members and static file members will be hidden unless --# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES -+# documentation are documented, even if no documentation was available. 
Private -+# class members and static file members will be hidden unless the -+# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. -+# Note: This will also disable the warnings about undocumented members that are -+# normally produced when WARNINGS is set to YES. -+# The default value is: NO. - - EXTRACT_ALL = YES - --# If the EXTRACT_PRIVATE tag is set to YES all private members of a class --# will be included in the documentation. -+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class will -+# be included in the documentation. -+# The default value is: NO. - - EXTRACT_PRIVATE = NO - --# If the EXTRACT_STATIC tag is set to YES all static members of a file --# will be included in the documentation. -+# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal -+# scope will be included in the documentation. -+# The default value is: NO. -+ -+EXTRACT_PACKAGE = NO -+ -+# If the EXTRACT_STATIC tag is set to YES all static members of a file will be -+# included in the documentation. -+# The default value is: NO. - - EXTRACT_STATIC = NO - --# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) --# defined locally in source files will be included in the documentation. --# If set to NO only classes defined in header files are included. -+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) defined -+# locally in source files will be included in the documentation. If set to NO -+# only classes defined in header files are included. Does not have any effect -+# for Java sources. -+# The default value is: YES. - - EXTRACT_LOCAL_CLASSES = NO - --# This flag is only useful for Objective-C code. When set to YES local --# methods, which are defined in the implementation section but not in --# the interface are included in the documentation. --# If set to NO (the default) only methods in the interface are included. -+# This flag is only useful for Objective-C code. 
When set to YES local methods, -+# which are defined in the implementation section but not in the interface are -+# included in the documentation. If set to NO only methods in the interface are -+# included. -+# The default value is: NO. - - EXTRACT_LOCAL_METHODS = NO - - # If this flag is set to YES, the members of anonymous namespaces will be - # extracted and appear in the documentation as a namespace called --# 'anonymous_namespace{file}', where file will be replaced with the base --# name of the file that contains the anonymous namespace. By default --# anonymous namespaces are hidden. -+# 'anonymous_namespace{file}', where file will be replaced with the base name of -+# the file that contains the anonymous namespace. By default anonymous namespaces -+# are hidden. -+# The default value is: NO. - - EXTRACT_ANON_NSPACES = NO - --# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all --# undocumented members of documented classes, files or namespaces. --# If set to NO (the default) these members will be included in the --# various overviews, but no documentation section is generated. --# This option has no effect if EXTRACT_ALL is enabled. -+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all -+# undocumented members inside documented classes or files. If set to NO these -+# members will be included in the various overviews, but no documentation -+# section is generated. This option has no effect if EXTRACT_ALL is enabled. -+# The default value is: NO. - - HIDE_UNDOC_MEMBERS = NO - --# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all --# undocumented classes that are normally visible in the class hierarchy. --# If set to NO (the default) these classes will be included in the various --# overviews. This option has no effect if EXTRACT_ALL is enabled. -+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all -+# undocumented classes that are normally visible in the class hierarchy.
If set -+# to NO these classes will be included in the various overviews. This option has -+# no effect if EXTRACT_ALL is enabled. -+# The default value is: NO. - - HIDE_UNDOC_CLASSES = NO - --# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all --# friend (class|struct|union) declarations. --# If set to NO (the default) these declarations will be included in the --# documentation. -+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend -+# (class|struct|union) declarations. If set to NO these declarations will be -+# included in the documentation. -+# The default value is: NO. - - HIDE_FRIEND_COMPOUNDS = NO - --# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any --# documentation blocks found inside the body of a function. --# If set to NO (the default) these blocks will be appended to the --# function's detailed documentation block. -+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any -+# documentation blocks found inside the body of a function. If set to NO these -+# blocks will be appended to the function's detailed documentation block. -+# The default value is: NO. - - HIDE_IN_BODY_DOCS = NO - --# The INTERNAL_DOCS tag determines if documentation --# that is typed after a \internal command is included. If the tag is set --# to NO (the default) then the documentation will be excluded. --# Set it to YES to include the internal documentation. -+# The INTERNAL_DOCS tag determines if documentation that is typed after a -+# \internal command is included. If the tag is set to NO then the documentation -+# will be excluded. Set it to YES to include the internal documentation. -+# The default value is: NO. - - INTERNAL_DOCS = NO - --# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate --# file names in lower-case letters. 
If set to YES upper-case letters are also -+# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file -+# names in lower-case letters. If set to YES upper-case letters are also - # allowed. This is useful if you have classes or files whose names only differ - # in case and if your file system supports case sensitive file names. Windows - # and Mac users are advised to set this option to NO. -+# The default value is: system dependent. - - CASE_SENSE_NAMES = YES - --# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen --# will show members with their full class and namespace scopes in the --# documentation. If set to YES the scope will be hidden. -+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with -+# their full class and namespace scopes in the documentation. If set to YES the -+# scope will be hidden. -+# The default value is: NO. - - HIDE_SCOPE_NAMES = NO - --# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen --# will put a list of the files that are included by a file in the documentation --# of that file. -+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of -+# the files that are included by a file in the documentation of that file. -+# The default value is: YES. - - SHOW_INCLUDE_FILES = YES - --# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen --# will list include files with double quotes in the documentation --# rather than with sharp brackets. -+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include -+# files with double quotes in the documentation rather than with sharp brackets. -+# The default value is: NO. - - FORCE_LOCAL_INCLUDES = NO - --# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] --# is inserted in the documentation for inline members. -+# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the -+# documentation for inline members. 
-+# The default value is: YES. - - INLINE_INFO = YES - --# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen --# will sort the (detailed) documentation of file and class members --# alphabetically by member name. If set to NO the members will appear in --# declaration order. -+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the -+# (detailed) documentation of file and class members alphabetically by member -+# name. If set to NO the members will appear in declaration order. -+# The default value is: YES. - - SORT_MEMBER_DOCS = YES - --# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the --# brief documentation of file, namespace and class members alphabetically --# by member name. If set to NO (the default) the members will appear in --# declaration order. -+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief -+# descriptions of file, namespace and class members alphabetically by member -+# name. If set to NO the members will appear in declaration order. -+# The default value is: NO. - - SORT_BRIEF_DOCS = NO - --# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen --# will sort the (brief and detailed) documentation of class members so that --# constructors and destructors are listed first. If set to NO (the default) --# the constructors will appear in the respective orders defined by --# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. --# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO --# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. -+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the -+# (brief and detailed) documentation of class members so that constructors and -+# destructors are listed first. If set to NO the constructors will appear in the -+# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. 
-+# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief -+# member documentation. -+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting -+# detailed member documentation. -+# The default value is: NO. - - SORT_MEMBERS_CTORS_1ST = YES - --# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the --# hierarchy of group names into alphabetical order. If set to NO (the default) --# the group names will appear in their defined order. -+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy -+# of group names into alphabetical order. If set to NO the group names will -+# appear in their defined order. -+# The default value is: NO. - - SORT_GROUP_NAMES = NO - --# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be --# sorted by fully-qualified names, including namespaces. If set to --# NO (the default), the class list will be sorted only by class name, --# not including the namespace part. -+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by -+# fully-qualified names, including namespaces. If set to NO, the class list will -+# be sorted only by class name, not including the namespace part. - # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. --# Note: This option applies only to the class list, not to the --# alphabetical list. -+# Note: This option applies only to the class list, not to the alphabetical -+# list. -+# The default value is: NO. - - SORT_BY_SCOPE_NAME = NO - --# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to --# do proper type resolution of all parameters of a function it will reject a --# match between the prototype and the implementation of a member function even --# if there is only one candidate or it is obvious which candidate to choose --# by doing a simple string match. 
By disabling STRICT_PROTO_MATCHING doxygen --# will still accept a match between prototype and implementation in such cases. -+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper -+# type resolution of all parameters of a function it will reject a match between -+# the prototype and the implementation of a member function even if there is -+# only one candidate or it is obvious which candidate to choose by doing a -+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still -+# accept a match between prototype and implementation in such cases. -+# The default value is: NO. - - STRICT_PROTO_MATCHING = NO - --# The GENERATE_TODOLIST tag can be used to enable (YES) or --# disable (NO) the todo list. This list is created by putting \todo --# commands in the documentation. -+# The GENERATE_TODOLIST tag can be used to enable ( YES) or disable ( NO) the -+# todo list. This list is created by putting \todo commands in the -+# documentation. -+# The default value is: YES. - - GENERATE_TODOLIST = YES - --# The GENERATE_TESTLIST tag can be used to enable (YES) or --# disable (NO) the test list. This list is created by putting \test --# commands in the documentation. -+# The GENERATE_TESTLIST tag can be used to enable ( YES) or disable ( NO) the -+# test list. This list is created by putting \test commands in the -+# documentation. -+# The default value is: YES. - - GENERATE_TESTLIST = YES - --# The GENERATE_BUGLIST tag can be used to enable (YES) or --# disable (NO) the bug list. This list is created by putting \bug --# commands in the documentation. -+# The GENERATE_BUGLIST tag can be used to enable ( YES) or disable ( NO) the bug -+# list. This list is created by putting \bug commands in the documentation. -+# The default value is: YES. - - GENERATE_BUGLIST = YES - --# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or --# disable (NO) the deprecated list. 
This list is created by putting --# \deprecated commands in the documentation. -+# The GENERATE_DEPRECATEDLIST tag can be used to enable ( YES) or disable ( NO) -+# the deprecated list. This list is created by putting \deprecated commands in -+# the documentation. -+# The default value is: YES. - - GENERATE_DEPRECATEDLIST= YES - --# The ENABLED_SECTIONS tag can be used to enable conditional --# documentation sections, marked by \if sectionname ... \endif. -+# The ENABLED_SECTIONS tag can be used to enable conditional documentation -+# sections, marked by \if ... \endif and \cond -+# ... \endcond blocks. - - ENABLED_SECTIONS = - --# The MAX_INITIALIZER_LINES tag determines the maximum number of lines --# the initial value of a variable or macro consists of for it to appear in --# the documentation. If the initializer consists of more lines than specified --# here it will be hidden. Use a value of 0 to hide initializers completely. --# The appearance of the initializer of individual variables and macros in the --# documentation can be controlled using \showinitializer or \hideinitializer --# command in the documentation regardless of this setting. -+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the -+# initial value of a variable or macro / define can have for it to appear in the -+# documentation. If the initializer consists of more lines than specified here -+# it will be hidden. Use a value of 0 to hide initializers completely. The -+# appearance of the value of individual variables and macros / defines can be -+# controlled using \showinitializer or \hideinitializer command in the -+# documentation regardless of this setting. -+# Minimum value: 0, maximum value: 10000, default value: 30. - - MAX_INITIALIZER_LINES = 30 - --# Set the SHOW_USED_FILES tag to NO to disable the list of files generated --# at the bottom of the documentation of classes and structs. 
If set to YES the --# list will mention the files that were used to generate the documentation. -+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at -+# the bottom of the documentation of classes and structs. If set to YES the list -+# will mention the files that were used to generate the documentation. -+# The default value is: YES. - - SHOW_USED_FILES = YES - --# If the sources in your project are distributed over multiple directories --# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy --# in the documentation. The default is NO. -- --SHOW_DIRECTORIES = NO -- --# Set the SHOW_FILES tag to NO to disable the generation of the Files page. --# This will remove the Files entry from the Quick Index and from the --# Folder Tree View (if specified). The default is YES. -+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This -+# will remove the Files entry from the Quick Index and from the Folder Tree View -+# (if specified). -+# The default value is: YES. - - SHOW_FILES = YES - --# Set the SHOW_NAMESPACES tag to NO to disable the generation of the --# Namespaces page. --# This will remove the Namespaces entry from the Quick Index --# and from the Folder Tree View (if specified). The default is YES. -+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces -+# page. This will remove the Namespaces entry from the Quick Index and from the -+# Folder Tree View (if specified). -+# The default value is: YES. - - SHOW_NAMESPACES = YES - - # The FILE_VERSION_FILTER tag can be used to specify a program or script that - # doxygen should invoke to get the current version for each file (typically from - # the version control system). Doxygen will invoke the program by executing (via --# popen()) the command , where is the value of --# the FILE_VERSION_FILTER tag, and is the name of an input file --# provided by doxygen. 
Whatever the program writes to standard output --# is used as the file version. See the manual for examples. -+# popen()) the command command input-file, where command is the value of the -+# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided -+# by doxygen. Whatever the program writes to standard output is used as the file -+# version. For an example see the documentation. - - FILE_VERSION_FILTER = - - # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed - # by doxygen. The layout file controls the global structure of the generated --# output files in an output format independent way. The create the layout file --# that represents doxygen's defaults, run doxygen with the -l option. --# You can optionally specify a file name after the option, if omitted --# DoxygenLayout.xml will be used as the name of the layout file. -+# output files in an output format independent way. To create the layout file -+# that represents doxygen's defaults, run doxygen with the -l option. You can -+# optionally specify a file name after the option, if omitted DoxygenLayout.xml -+# will be used as the name of the layout file. -+# -+# Note that if you run doxygen from a directory containing a file called -+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE -+# tag is left empty. - - LAYOUT_FILE = - -+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing -+# the reference definitions. This must be a list of .bib files. The .bib -+# extension is automatically appended if omitted. This requires the bibtex tool -+# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. -+# For LaTeX the style of the bibliography can be controlled using -+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the -+# search path. Do not use file names with spaces, bibtex cannot handle them. See -+# also \cite for info how to create references. 
-+ -+CITE_BIB_FILES = -+ - #--------------------------------------------------------------------------- --# configuration options related to warning and progress messages -+# Configuration options related to warning and progress messages - #--------------------------------------------------------------------------- - --# The QUIET tag can be used to turn on/off the messages that are generated --# by doxygen. Possible values are YES and NO. If left blank NO is used. -+# The QUIET tag can be used to turn on/off the messages that are generated to -+# standard output by doxygen. If QUIET is set to YES this implies that the -+# messages are off. -+# The default value is: NO. - - QUIET = NO - - # The WARNINGS tag can be used to turn on/off the warning messages that are --# generated by doxygen. Possible values are YES and NO. If left blank --# NO is used. -+# generated to standard error ( stderr) by doxygen. If WARNINGS is set to YES -+# this implies that the warnings are on. -+# -+# Tip: Turn warnings on while writing the documentation. -+# The default value is: YES. - - WARNINGS = YES - --# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings --# for undocumented members. If EXTRACT_ALL is set to YES then this flag will --# automatically be disabled. -+# If the WARN_IF_UNDOCUMENTED tag is set to YES, then doxygen will generate -+# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag -+# will automatically be disabled. -+# The default value is: YES. - - WARN_IF_UNDOCUMENTED = YES - --# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for --# potential errors in the documentation, such as not documenting some --# parameters in a documented function, or documenting parameters that --# don't exist or using markup commands wrongly. 
-+# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for -+# potential errors in the documentation, such as not documenting some parameters -+# in a documented function, or documenting parameters that don't exist or using -+# markup commands wrongly. -+# The default value is: YES. - - WARN_IF_DOC_ERROR = YES - --# The WARN_NO_PARAMDOC option can be enabled to get warnings for --# functions that are documented, but have no documentation for their parameters --# or return value. If set to NO (the default) doxygen will only warn about --# wrong or incomplete parameter documentation, but not about the absence of --# documentation. -+# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that -+# are documented, but have no documentation for their parameters or return -+# value. If set to NO doxygen will only warn about wrong or incomplete parameter -+# documentation, but not about the absence of documentation. -+# The default value is: NO. - - WARN_NO_PARAMDOC = NO - --# The WARN_FORMAT tag determines the format of the warning messages that --# doxygen can produce. The string should contain the $file, $line, and $text --# tags, which will be replaced by the file and line number from which the --# warning originated and the warning text. Optionally the format may contain --# $version, which will be replaced by the version of the file (if it could --# be obtained via FILE_VERSION_FILTER) -+# The WARN_FORMAT tag determines the format of the warning messages that doxygen -+# can produce. The string should contain the $file, $line, and $text tags, which -+# will be replaced by the file and line number from which the warning originated -+# and the warning text. Optionally the format may contain $version, which will -+# be replaced by the version of the file (if it could be obtained via -+# FILE_VERSION_FILTER) -+# The default value is: $file:$line: $text. 
- - WARN_FORMAT = "$file:$line: $text" - --# The WARN_LOGFILE tag can be used to specify a file to which warning --# and error messages should be written. If left blank the output is written --# to stderr. -+# The WARN_LOGFILE tag can be used to specify a file to which warning and error -+# messages should be written. If left blank the output is written to standard -+# error (stderr). - - WARN_LOGFILE = - - #--------------------------------------------------------------------------- --# configuration options related to the input files -+# Configuration options related to the input files - #--------------------------------------------------------------------------- - --# The INPUT tag can be used to specify the files and/or directories that contain --# documented source files. You may enter file names like "myfile.cpp" or --# directories like "/usr/src/myproject". Separate the files or directories --# with spaces. -+# The INPUT tag is used to specify the files and/or directories that contain -+# documented source files. You may enter file names like myfile.cpp or -+# directories like /usr/src/myproject. Separate the files or directories with -+# spaces. -+# Note: If this tag is empty the current directory is searched. - --INPUT = include/crm include/crm_config.h include/doxygen.h -+INPUT = include \ -+ lib - - # This tag can be used to specify the character encoding of the source files --# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is --# also the default input encoding. Doxygen uses libiconv (or the iconv built --# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for --# the list of possible encodings. -+# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses -+# libiconv (or the iconv built into libc) for the transcoding. See the libiconv -+# documentation (see: http://www.gnu.org/software/libiconv) for the list of -+# possible encodings. -+# The default value is: UTF-8. 
- - INPUT_ENCODING = UTF-8 - - # If the value of the INPUT tag contains directories, you can use the --# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp --# and *.h) to filter out the source-files in the directories. If left --# blank the following patterns are tested: --# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh --# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py --# *.f90 *.f *.for *.vhd *.vhdl -+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and -+# *.h) to filter out the source-files in the directories. If left blank the -+# following patterns are tested:*.c, *.cc, *.cxx, *.cpp, *.c++, *.java, *.ii, -+# *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp, -+# *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown, -+# *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf, -+# *.qsf, *.as and *.js. - - FILE_PATTERNS = - --# The RECURSIVE tag can be used to turn specify whether or not subdirectories --# should be searched for input files as well. Possible values are YES and NO. --# If left blank NO is used. -+# The RECURSIVE tag can be used to specify whether or not subdirectories should -+# be searched for input files as well. -+# The default value is: NO. - - RECURSIVE = YES - --# The EXCLUDE tag can be used to specify files and/or directories that should -+# The EXCLUDE tag can be used to specify files and/or directories that should be - # excluded from the INPUT source files. This way you can easily exclude a - # subdirectory from a directory tree whose root is specified with the INPUT tag. -+# -+# Note that relative paths are relative to the directory from which doxygen is -+# run. 
- - EXCLUDE = - --# The EXCLUDE_SYMLINKS tag can be used select whether or not files or -+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or - # directories that are symbolic links (a Unix file system feature) are excluded - # from the input. -+# The default value is: NO. - - EXCLUDE_SYMLINKS = NO - - # If the value of the INPUT tag contains directories, you can use the - # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude --# certain files from those directories. Note that the wildcards are matched --# against the file with absolute path, so to exclude all test directories --# for example use the pattern */test/* -+# certain files from those directories. -+# -+# Note that the wildcards are matched against the file with absolute path, so to -+# exclude all test directories for example use the pattern */test/* - - EXCLUDE_PATTERNS = - -@@ -661,744 +792,1080 @@ EXCLUDE_PATTERNS = - # output. The symbol name can be a fully qualified name, a word, or if the - # wildcard * is used, a substring. Examples: ANamespace, AClass, - # AClass::ANamespace, ANamespace::*Test -+# -+# Note that the wildcards are matched against the file with absolute path, so to -+# exclude all test directories use the pattern */test/* - - EXCLUDE_SYMBOLS = - --# The EXAMPLE_PATH tag can be used to specify one or more files or --# directories that contain example code fragments that are included (see --# the \include command). -+# The EXAMPLE_PATH tag can be used to specify one or more files or directories -+# that contain example code fragments that are included (see the \include -+# command). - - EXAMPLE_PATH = . - - # If the value of the EXAMPLE_PATH tag contains directories, you can use the --# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp --# and *.h) to filter out the source-files in the directories. If left --# blank all files are included. 
-+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and -+# *.h) to filter out the source-files in the directories. If left blank all -+# files are included. - - EXAMPLE_PATTERNS = - - # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be --# searched for input files to be used with the \include or \dontinclude --# commands irrespective of the value of the RECURSIVE tag. --# Possible values are YES and NO. If left blank NO is used. -+# searched for input files to be used with the \include or \dontinclude commands -+# irrespective of the value of the RECURSIVE tag. -+# The default value is: NO. - - EXAMPLE_RECURSIVE = YES - --# The IMAGE_PATH tag can be used to specify one or more files or --# directories that contain image that are included in the documentation (see --# the \image command). -+# The IMAGE_PATH tag can be used to specify one or more files or directories -+# that contain images that are to be included in the documentation (see the -+# \image command). - - IMAGE_PATH = - - # The INPUT_FILTER tag can be used to specify a program that doxygen should - # invoke to filter for each input file. Doxygen will invoke the filter program --# by executing (via popen()) the command <filter> <input-file>, where <filter> --# is the value of the INPUT_FILTER tag, and <input-file> is the name of an --# input file. Doxygen will then use the output that the filter program writes --# to standard output. --# If FILTER_PATTERNS is specified, this tag will be --# ignored. -+# by executing (via popen()) the command: -+# -+# <filter> <input-file> -+# -+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the -+# name of an input file. Doxygen will then use the output that the filter -+# program writes to standard output. If FILTER_PATTERNS is specified, this tag -+# will be ignored. -+# -+# Note that the filter must not add or remove lines; it is applied before the -+# code is scanned, but not when the output code is generated.
If lines are added -+# or removed, the anchors will not be placed correctly. - - INPUT_FILTER = - - # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern --# basis. --# Doxygen will compare the file name with each pattern and apply the --# filter if there is a match. --# The filters are a list of the form: --# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further --# info on how filters are used. If FILTER_PATTERNS is empty or if --# non of the patterns match the file name, INPUT_FILTER is applied. -+# basis. Doxygen will compare the file name with each pattern and apply the -+# filter if there is a match. The filters are a list of the form: pattern=filter -+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how -+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the -+# patterns match the file name, INPUT_FILTER is applied. - - FILTER_PATTERNS = - - # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using --# INPUT_FILTER) will be used to filter the input files when producing source --# files to browse (i.e. when SOURCE_BROWSER is set to YES). -+# INPUT_FILTER ) will also be used to filter the input files that are used for -+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). -+# The default value is: NO. - - FILTER_SOURCE_FILES = NO - - # The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file --# pattern. A pattern will override the setting for FILTER_PATTERN (if any) --# and it is also possible to disable source filtering for a specific pattern --# using *.ext= (so without naming a filter). This option only has effect when --# FILTER_SOURCE_FILES is enabled. -+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and -+# it is also possible to disable source filtering for a specific pattern using -+# *.ext= (so without naming a filter). 
-+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. - - FILTER_SOURCE_PATTERNS = - -+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that -+# is part of the input, its contents will be placed on the main page -+# (index.html). This can be useful if you have a project on for instance GitHub -+# and want to reuse the introduction page also for the doxygen output. -+ -+USE_MDFILE_AS_MAINPAGE = -+ - #--------------------------------------------------------------------------- --# configuration options related to source browsing -+# Configuration options related to source browsing - #--------------------------------------------------------------------------- - --# If the SOURCE_BROWSER tag is set to YES then a list of source files will --# be generated. Documented entities will be cross-referenced with these sources. --# Note: To get rid of all source code in the generated output, make sure also --# VERBATIM_HEADERS is set to NO. -+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be -+# generated. Documented entities will be cross-referenced with these sources. -+# -+# Note: To get rid of all source code in the generated output, make sure that -+# also VERBATIM_HEADERS is set to NO. -+# The default value is: NO. - - SOURCE_BROWSER = YES - --# Setting the INLINE_SOURCES tag to YES will include the body --# of functions and classes directly in the documentation. -+# Setting the INLINE_SOURCES tag to YES will include the body of functions, -+# classes and enums directly into the documentation. -+# The default value is: NO. - - INLINE_SOURCES = NO - --# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct --# doxygen to hide any special comment blocks from generated source code --# fragments. Normal C and C++ comments will always remain visible. -+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any -+# special comment blocks from generated source code fragments. 
Normal C, C++ and -+# Fortran comments will always remain visible. -+# The default value is: YES. - - STRIP_CODE_COMMENTS = YES - --# If the REFERENCED_BY_RELATION tag is set to YES --# then for each documented function all documented --# functions referencing it will be listed. -+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented -+# function all documented functions referencing it will be listed. -+# The default value is: NO. - - REFERENCED_BY_RELATION = NO - --# If the REFERENCES_RELATION tag is set to YES --# then for each documented function all documented entities --# called/used by that function will be listed. -+# If the REFERENCES_RELATION tag is set to YES then for each documented function -+# all documented entities called/used by that function will be listed. -+# The default value is: NO. - - REFERENCES_RELATION = NO - --# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) --# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from --# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will --# link to the source code. --# Otherwise they will link to the documentation. -+# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set -+# to YES, then the hyperlinks from functions in REFERENCES_RELATION and -+# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will -+# link to the documentation. -+# The default value is: YES. - - REFERENCES_LINK_SOURCE = YES - --# If the USE_HTAGS tag is set to YES then the references to source code --# will point to the HTML generated by the htags(1) tool instead of doxygen --# built-in source browser. The htags tool is part of GNU's global source --# tagging system (see http://www.gnu.org/software/global/global.html). You --# will need version 4.8.6 or higher. 
-+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the -+# source code will show a tooltip with additional information such as prototype, -+# brief description and links to the definition and documentation. Since this -+# will make the HTML file larger and loading of large files a bit slower, you -+# can opt to disable this feature. -+# The default value is: YES. -+# This tag requires that the tag SOURCE_BROWSER is set to YES. -+ -+SOURCE_TOOLTIPS = YES -+ -+# If the USE_HTAGS tag is set to YES then the references to source code will -+# point to the HTML generated by the htags(1) tool instead of doxygen built-in -+# source browser. The htags tool is part of GNU's global source tagging system -+# (see http://www.gnu.org/software/global/global.html). You will need version -+# 4.8.6 or higher. -+# -+# To use it do the following: -+# - Install the latest version of global -+# - Enable SOURCE_BROWSER and USE_HTAGS in the config file -+# - Make sure the INPUT points to the root of the source tree -+# - Run doxygen as normal -+# -+# Doxygen will invoke htags (and that will in turn invoke gtags), so these -+# tools must be available from the command line (i.e. in the search path). -+# -+# The result: instead of the source browser generated by doxygen, the links to -+# source code will now point to the output of htags. -+# The default value is: NO. -+# This tag requires that the tag SOURCE_BROWSER is set to YES. - - USE_HTAGS = NO - --# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen --# will generate a verbatim copy of the header file for each class for --# which an include is specified. Set to NO to disable this. -+# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a -+# verbatim copy of the header file for each class for which an include is -+# specified. Set to NO to disable this. -+# See also: Section \class. -+# The default value is: YES.
- - VERBATIM_HEADERS = YES - - #--------------------------------------------------------------------------- --# configuration options related to the alphabetical class index -+# Configuration options related to the alphabetical class index - #--------------------------------------------------------------------------- - --# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index --# of all compounds will be generated. Enable this if the project --# contains a lot of classes, structs, unions or interfaces. -+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all -+# compounds will be generated. Enable this if the project contains a lot of -+# classes, structs, unions or interfaces. -+# The default value is: YES. - - ALPHABETICAL_INDEX = YES - --# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then --# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns --# in which this list will be split (can be a number in the range [1..20]) -+# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in -+# which the alphabetical index list will be split. -+# Minimum value: 1, maximum value: 20, default value: 5. -+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. - - COLS_IN_ALPHA_INDEX = 5 - --# In case all classes in a project start with a common prefix, all --# classes will be put under the same header in the alphabetical index. --# The IGNORE_PREFIX tag can be used to specify one or more prefixes that --# should be ignored while generating the index headers. -+# In case all classes in a project start with a common prefix, all classes will -+# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag -+# can be used to specify a prefix (or a list of prefixes) that should be ignored -+# while generating the index headers. -+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. 
- - IGNORE_PREFIX = - - #--------------------------------------------------------------------------- --# configuration options related to the HTML output -+# Configuration options related to the HTML output - #--------------------------------------------------------------------------- - --# If the GENERATE_HTML tag is set to YES (the default) Doxygen will --# generate HTML output. -+# If the GENERATE_HTML tag is set to YES doxygen will generate HTML output -+# The default value is: YES. - - GENERATE_HTML = YES - --# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. --# If a relative path is entered the value of OUTPUT_DIRECTORY will be --# put in front of it. If left blank `html' will be used as the default path. -+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a -+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of -+# it. -+# The default directory is: html. -+# This tag requires that the tag GENERATE_HTML is set to YES. - - HTML_OUTPUT = html - --# The HTML_FILE_EXTENSION tag can be used to specify the file extension for --# each generated HTML page (for example: .htm,.php,.asp). If it is left blank --# doxygen will generate files with .html extension. -+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each -+# generated HTML page (for example: .htm, .php, .asp). -+# The default value is: .html. -+# This tag requires that the tag GENERATE_HTML is set to YES. - - HTML_FILE_EXTENSION = .html - --# The HTML_HEADER tag can be used to specify a personal HTML header for --# each generated HTML page. If it is left blank doxygen will generate a --# standard header. Note that when using a custom header you are responsible --# for the proper inclusion of any scripts and style sheets that doxygen --# needs, which is dependent on the configuration options used. 
--# It is adviced to generate a default header using "doxygen -w html --# header.html footer.html stylesheet.css YourConfigFile" and then modify --# that header. Note that the header is subject to change so you typically --# have to redo this when upgrading to a newer version of doxygen or when changing the value of configuration settings such as GENERATE_TREEVIEW! -+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for -+# each generated HTML page. If the tag is left blank doxygen will generate a -+# standard header. -+# -+# To get valid HTML the header file that includes any scripts and style sheets -+# that doxygen needs, which is dependent on the configuration options used (e.g. -+# the setting GENERATE_TREEVIEW). It is highly recommended to start with a -+# default header using -+# doxygen -w html new_header.html new_footer.html new_stylesheet.css -+# YourConfigFile -+# and then modify the file new_header.html. See also section "Doxygen usage" -+# for information on how to generate the default header that doxygen normally -+# uses. -+# Note: The header is subject to change so you typically have to regenerate the -+# default header when upgrading to a newer version of doxygen. For a description -+# of the possible markers and block names see the documentation. -+# This tag requires that the tag GENERATE_HTML is set to YES. - - HTML_HEADER = - --# The HTML_FOOTER tag can be used to specify a personal HTML footer for --# each generated HTML page. If it is left blank doxygen will generate a --# standard footer. -+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each -+# generated HTML page. If the tag is left blank doxygen will generate a standard -+# footer. See HTML_HEADER for more information on how to generate a default -+# footer and what special commands can be used inside the footer. See also -+# section "Doxygen usage" for information on how to generate the default footer -+# that doxygen normally uses. 
-+# This tag requires that the tag GENERATE_HTML is set to YES. - - HTML_FOOTER = - --# If the HTML_TIMESTAMP tag is set to YES then the generated HTML documentation will contain the timesstamp. -+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style -+# sheet that is used by each HTML page. It can be used to fine-tune the look of -+# the HTML output. If left blank doxygen will generate a default style sheet. -+# See also section "Doxygen usage" for information on how to generate the style -+# sheet that doxygen normally uses. -+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as -+# it is more robust and this tag (HTML_STYLESHEET) will in the future become -+# obsolete. -+# This tag requires that the tag GENERATE_HTML is set to YES. - --HTML_TIMESTAMP = NO -+HTML_STYLESHEET = - --# The HTML_STYLESHEET tag can be used to specify a user-defined cascading --# style sheet that is used by each HTML page. It can be used to --# fine-tune the look of the HTML output. If the tag is left blank doxygen --# will generate a default style sheet. Note that doxygen will try to copy --# the style sheet file to the HTML output directory, so don't put your own --# stylesheet in the HTML output directory as well, or it will be erased! -+# The HTML_EXTRA_STYLESHEET tag can be used to specify an additional user- -+# defined cascading style sheet that is included after the standard style sheets -+# created by doxygen. Using this option one can overrule certain style aspects. -+# This is preferred over using HTML_STYLESHEET since it does not replace the -+# standard style sheet and is therefore more robust against future updates. -+# Doxygen will copy the style sheet file to the output directory. For an example -+# see the documentation. -+# This tag requires that the tag GENERATE_HTML is set to YES.
- --HTML_STYLESHEET = -+HTML_EXTRA_STYLESHEET = - - # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or - # other source files which should be copied to the HTML output directory. Note - # that these files will be copied to the base HTML output directory. Use the --# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these --# files. In the HTML_STYLESHEET file, use the file name only. Also note that --# the files will be copied as-is; there are no commands or markers available. -+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these -+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the -+# files will be copied as-is; there are no commands or markers available. -+# This tag requires that the tag GENERATE_HTML is set to YES. - - HTML_EXTRA_FILES = - --# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. --# Doxygen will adjust the colors in the stylesheet and background images --# according to this color. Hue is specified as an angle on a colorwheel, --# see http://en.wikipedia.org/wiki/Hue for more information. --# For instance the value 0 represents red, 60 is yellow, 120 is green, --# 180 is cyan, 240 is blue, 300 purple, and 360 is red again. --# The allowed range is 0 to 359. -+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen -+# will adjust the colors in the stylesheet and background images according to -+# this color. Hue is specified as an angle on a colorwheel, see -+# http://en.wikipedia.org/wiki/Hue for more information. For instance the value -+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 -+# purple, and 360 is red again. -+# Minimum value: 0, maximum value: 359, default value: 220. -+# This tag requires that the tag GENERATE_HTML is set to YES. - - HTML_COLORSTYLE_HUE = 220 - --# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of --# the colors in the HTML output. 
For a value of 0 the output will use --# grayscales only. A value of 255 will produce the most vivid colors. -+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors -+# in the HTML output. For a value of 0 the output will use grayscales only. A -+# value of 255 will produce the most vivid colors. -+# Minimum value: 0, maximum value: 255, default value: 100. -+# This tag requires that the tag GENERATE_HTML is set to YES. - - HTML_COLORSTYLE_SAT = 100 - --# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to --# the luminance component of the colors in the HTML output. Values below --# 100 gradually make the output lighter, whereas values above 100 make --# the output darker. The value divided by 100 is the actual gamma applied, --# so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2, --# and 100 does not change the gamma. -+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the -+# luminance component of the colors in the HTML output. Values below 100 -+# gradually make the output lighter, whereas values above 100 make the output -+# darker. The value divided by 100 is the actual gamma applied, so 80 represents -+# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not -+# change the gamma. -+# Minimum value: 40, maximum value: 240, default value: 80. -+# This tag requires that the tag GENERATE_HTML is set to YES. - - HTML_COLORSTYLE_GAMMA = 80 - - # If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML --# page will contain the date and time when the page was generated. Setting --# this to NO can help when comparing the output of multiple runs. -+# page will contain the date and time when the page was generated. Setting this -+# to NO can help when comparing the output of multiple runs. -+# The default value is: YES. -+# This tag requires that the tag GENERATE_HTML is set to YES. 
- - HTML_TIMESTAMP = YES - --# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, --# files or namespaces will be aligned in HTML using tables. If set to --# NO a bullet list will be used. -- --HTML_ALIGN_MEMBERS = YES -- - # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML - # documentation will contain sections that can be hidden and shown after the --# page has loaded. For this to work a browser that supports --# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox --# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari). -+# page has loaded. -+# The default value is: NO. -+# This tag requires that the tag GENERATE_HTML is set to YES. - - HTML_DYNAMIC_SECTIONS = NO - --# If the GENERATE_DOCSET tag is set to YES, additional index files --# will be generated that can be used as input for Apple's Xcode 3 --# integrated development environment, introduced with OSX 10.5 (Leopard). --# To create a documentation set, doxygen will generate a Makefile in the --# HTML output directory. Running make will produce the docset in that --# directory and running "make install" will install the docset in --# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find --# it at startup. --# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html -+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries -+# shown in the various tree structured indices initially; the user can expand -+# and collapse entries dynamically later on. Doxygen will expand the tree to -+# such a level that at most the specified number of entries are visible (unless -+# a fully collapsed tree already exceeds this amount). So setting the number of -+# entries 1 will produce a full collapsed tree by default. 0 is a special value -+# representing an infinite number of entries and will result in a full expanded -+# tree by default. -+# Minimum value: 0, maximum value: 9999, default value: 100. 
-+# This tag requires that the tag GENERATE_HTML is set to YES. -+ -+HTML_INDEX_NUM_ENTRIES = 100 -+ -+# If the GENERATE_DOCSET tag is set to YES, additional index files will be -+# generated that can be used as input for Apple's Xcode 3 integrated development -+# environment (see: http://developer.apple.com/tools/xcode/), introduced with -+# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a -+# Makefile in the HTML output directory. Running make will produce the docset in -+# that directory and running make install will install the docset in -+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at -+# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html - # for more information. -+# The default value is: NO. -+# This tag requires that the tag GENERATE_HTML is set to YES. - - GENERATE_DOCSET = NO - --# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the --# feed. A documentation feed provides an umbrella under which multiple --# documentation sets from a single provider (such as a company or product suite) --# can be grouped. -+# This tag determines the name of the docset feed. A documentation feed provides -+# an umbrella under which multiple documentation sets from a single provider -+# (such as a company or product suite) can be grouped. -+# The default value is: Doxygen generated docs. -+# This tag requires that the tag GENERATE_DOCSET is set to YES. - - DOCSET_FEEDNAME = "Doxygen generated docs" - --# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that --# should uniquely identify the documentation set bundle. This should be a --# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen --# will append .docset to the name. -+# This tag specifies a string that should uniquely identify the documentation -+# set bundle. This should be a reverse domain-name style string, e.g. -+# com.mycompany.MyDocSet. 
Doxygen will append .docset to the name. -+# The default value is: org.doxygen.Project. -+# This tag requires that the tag GENERATE_DOCSET is set to YES. - - DOCSET_BUNDLE_ID = org.doxygen.Pacemaker - --# When GENERATE_PUBLISHER_ID tag specifies a string that should uniquely identify -+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify - # the documentation publisher. This should be a reverse domain-name style - # string, e.g. com.mycompany.MyDocSet.documentation. -+# The default value is: org.doxygen.Publisher. -+# This tag requires that the tag GENERATE_DOCSET is set to YES. - - DOCSET_PUBLISHER_ID = org.doxygen.ClusterLabs - --# The GENERATE_PUBLISHER_NAME tag identifies the documentation publisher. -+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. -+# The default value is: Publisher. -+# This tag requires that the tag GENERATE_DOCSET is set to YES. - - DOCSET_PUBLISHER_NAME = ClusterLabs - --# If the GENERATE_HTMLHELP tag is set to YES, additional index files --# will be generated that can be used as input for tools like the --# Microsoft HTML help workshop to generate a compiled HTML help file (.chm) --# of the generated HTML documentation. -+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three -+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The -+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -+# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on -+# Windows. -+# -+# The HTML Help Workshop contains a compiler that can convert all HTML output -+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML -+# files are now used as the Windows 98 help format, and will replace the old -+# Windows help format (.hlp) on all Windows platforms in the future. Compressed -+# HTML files also contain an index, a table of contents, and you can search for -+# words in the documentation. 
The HTML workshop also contains a viewer for -+# compressed HTML files. -+# The default value is: NO. -+# This tag requires that the tag GENERATE_HTML is set to YES. - - GENERATE_HTMLHELP = NO - --# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can --# be used to specify the file name of the resulting .chm file. You --# can add a path in front of the file if the result should not be -+# The CHM_FILE tag can be used to specify the file name of the resulting .chm -+# file. You can add a path in front of the file if the result should not be - # written to the html output directory. -+# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - - CHM_FILE = - --# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can --# be used to specify the location (absolute path including file name) of --# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run --# the HTML help compiler on the generated index.hhp. -+# The HHC_LOCATION tag can be used to specify the location (absolute path -+# including file name) of the HTML help compiler ( hhc.exe). If non-empty -+# doxygen will try to run the HTML help compiler on the generated index.hhp. -+# The file has to be specified with full path. -+# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - - HHC_LOCATION = - --# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag --# controls if a separate .chi index file is generated (YES) or that --# it should be included in the master .chm file (NO). -+# The GENERATE_CHI flag controls if a separate .chi index file is generated ( -+# YES) or that it should be included in the master .chm file ( NO). -+# The default value is: NO. -+# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - - GENERATE_CHI = NO - --# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING --# is used to encode HtmlHelp index (hhk), content (hhc) and project file --# content. 
-+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index ( hhk), content ( hhc) -+# and project file content. -+# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - - CHM_INDEX_ENCODING = - --# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag --# controls whether a binary table of contents is generated (YES) or a --# normal table of contents (NO) in the .chm file. -+# The BINARY_TOC flag controls whether a binary table of contents is generated ( -+# YES) or a normal table of contents ( NO) in the .chm file. -+# The default value is: NO. -+# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - - BINARY_TOC = NO - --# The TOC_EXPAND flag can be set to YES to add extra items for group members --# to the contents of the HTML help documentation and to the tree view. -+# The TOC_EXPAND flag can be set to YES to add extra items for group members to -+# the table of contents of the HTML help documentation and to the tree view. -+# The default value is: NO. -+# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - - TOC_EXPAND = NO - - # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and --# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated --# that can be used as input for Qt's qhelpgenerator to generate a --# Qt Compressed Help (.qch) of the generated HTML documentation. -+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that -+# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help -+# (.qch) of the generated HTML documentation. -+# The default value is: NO. -+# This tag requires that the tag GENERATE_HTML is set to YES. - - GENERATE_QHP = NO - --# If the QHG_LOCATION tag is specified, the QCH_FILE tag can --# be used to specify the file name of the resulting .qch file. --# The path specified is relative to the HTML output folder. 
-+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify -+# the file name of the resulting .qch file. The path specified is relative to -+# the HTML output folder. -+# This tag requires that the tag GENERATE_QHP is set to YES. - - QCH_FILE = - --# The QHP_NAMESPACE tag specifies the namespace to use when generating --# Qt Help Project output. For more information please see --# http://doc.trolltech.com/qthelpproject.html#namespace -+# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help -+# Project output. For more information please see Qt Help Project / Namespace -+# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). -+# The default value is: org.doxygen.Project. -+# This tag requires that the tag GENERATE_QHP is set to YES. - - QHP_NAMESPACE = org.doxygen.Project - --# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating --# Qt Help Project output. For more information please see --# http://doc.trolltech.com/qthelpproject.html#virtual-folders -+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt -+# Help Project output. For more information please see Qt Help Project / Virtual -+# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- -+# folders). -+# The default value is: doc. -+# This tag requires that the tag GENERATE_QHP is set to YES. - - QHP_VIRTUAL_FOLDER = doc - --# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to --# add. For more information please see --# http://doc.trolltech.com/qthelpproject.html#custom-filters -+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom -+# filter to add. For more information please see Qt Help Project / Custom -+# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- -+# filters). -+# This tag requires that the tag GENERATE_QHP is set to YES. 
- - QHP_CUST_FILTER_NAME = - --# The QHP_CUST_FILT_ATTRS tag specifies the list of the attributes of the --# custom filter to add. For more information please see --# --# Qt Help Project / Custom Filters. -+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the -+# custom filter to add. For more information please see Qt Help Project / Custom -+# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- -+# filters). -+# This tag requires that the tag GENERATE_QHP is set to YES. - - QHP_CUST_FILTER_ATTRS = - - # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this --# project's --# filter section matches. --# --# Qt Help Project / Filter Attributes. -+# project's filter section matches. Qt Help Project / Filter Attributes (see: -+# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). -+# This tag requires that the tag GENERATE_QHP is set to YES. - - QHP_SECT_FILTER_ATTRS = - --# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can --# be used to specify the location of Qt's qhelpgenerator. --# If non-empty doxygen will try to run qhelpgenerator on the generated --# .qhp file. -+# The QHG_LOCATION tag can be used to specify the location of Qt's -+# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the -+# generated .qhp file. -+# This tag requires that the tag GENERATE_QHP is set to YES. - - QHG_LOCATION = - --# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files --# will be generated, which together with the HTML files, form an Eclipse help --# plugin. To install this plugin and make it available under the help contents --# menu in Eclipse, the contents of the directory containing the HTML and XML --# files needs to be copied into the plugins directory of eclipse. The name of --# the directory within the plugins directory should be the same as --# the ECLIPSE_DOC_ID value. 
After copying Eclipse needs to be restarted before --# the help appears. -+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be -+# generated, together with the HTML files, they form an Eclipse help plugin. To -+# install this plugin and make it available under the help contents menu in -+# Eclipse, the contents of the directory containing the HTML and XML files needs -+# to be copied into the plugins directory of eclipse. The name of the directory -+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. -+# After copying Eclipse needs to be restarted before the help appears. -+# The default value is: NO. -+# This tag requires that the tag GENERATE_HTML is set to YES. - - GENERATE_ECLIPSEHELP = NO - --# A unique identifier for the eclipse help plugin. When installing the plugin --# the directory name containing the HTML and XML files should also have --# this name. -+# A unique identifier for the Eclipse help plugin. When installing the plugin -+# the directory name containing the HTML and XML files should also have this -+# name. Each documentation set should have its own identifier. -+# The default value is: org.doxygen.Project. -+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. - - ECLIPSE_DOC_ID = org.doxygen.Project - --# The DISABLE_INDEX tag can be used to turn on/off the condensed index at --# top of each HTML page. The value NO (the default) enables the index and --# the value YES disables it. -+# If you want full control over the layout of the generated HTML pages it might -+# be necessary to disable the index and replace it with your own. The -+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top -+# of each HTML page. A value of NO enables the index and the value YES disables -+# it. Since the tabs in the index contain the same information as the navigation -+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. 
-+# The default value is: NO. -+# This tag requires that the tag GENERATE_HTML is set to YES. - - DISABLE_INDEX = NO - --# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values --# (range [0,1..20]) that doxygen will group on one line in the generated HTML --# documentation. Note that a value of 0 will completely suppress the enum --# values from appearing in the overview section. -- --ENUM_VALUES_PER_LINE = 4 -- - # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index --# structure should be generated to display hierarchical information. --# If the tag value is set to YES, a side panel will be generated --# containing a tree-like index structure (just like the one that --# is generated for HTML Help). For this to work a browser that supports --# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). --# Windows users are probably better off using the HTML help feature. -+# structure should be generated to display hierarchical information. If the tag -+# value is set to YES, a side panel will be generated containing a tree-like -+# index structure (just like the one that is generated for HTML Help). For this -+# to work a browser that supports JavaScript, DHTML, CSS and frames is required -+# (i.e. any modern browser). Windows users are probably better off using the -+# HTML help feature. Via custom stylesheets (see HTML_EXTRA_STYLESHEET) one can -+# further fine-tune the look of the index. As an example, the default style -+# sheet generated by doxygen has an example that shows how to put an image at -+# the root of the tree instead of the PROJECT_NAME. Since the tree basically has -+# the same information as the tab index, you could consider setting -+# DISABLE_INDEX to YES when enabling this option. -+# The default value is: NO. -+# This tag requires that the tag GENERATE_HTML is set to YES. 
- - GENERATE_TREEVIEW = NO - --# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories, --# and Class Hierarchy pages using a tree view instead of an ordered list. -+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that -+# doxygen will group on one line in the generated HTML documentation. -+# -+# Note that a value of 0 will completely suppress the enum values from appearing -+# in the overview section. -+# Minimum value: 0, maximum value: 20, default value: 4. -+# This tag requires that the tag GENERATE_HTML is set to YES. - --USE_INLINE_TREES = NO -+ENUM_VALUES_PER_LINE = 4 - --# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be --# used to set the initial width (in pixels) of the frame in which the tree --# is shown. -+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used -+# to set the initial width (in pixels) of the frame in which the tree is shown. -+# Minimum value: 0, maximum value: 1500, default value: 250. -+# This tag requires that the tag GENERATE_HTML is set to YES. - - TREEVIEW_WIDTH = 250 - --# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open --# links to external symbols imported via tag files in a separate window. -+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open links to -+# external symbols imported via tag files in a separate window. -+# The default value is: NO. -+# This tag requires that the tag GENERATE_HTML is set to YES. - - EXT_LINKS_IN_WINDOW = NO - --# Use this tag to change the font size of Latex formulas included --# as images in the HTML documentation. The default is 10. Note that --# when you change the font size after a successful doxygen run you need --# to manually remove any form_*.png images from the HTML output directory --# to force them to be regenerated. -+# Use this tag to change the font size of LaTeX formulas included as images in -+# the HTML documentation. 
When you change the font size after a successful -+# doxygen run you need to manually remove any form_*.png images from the HTML -+# output directory to force them to be regenerated. -+# Minimum value: 8, maximum value: 50, default value: 10. -+# This tag requires that the tag GENERATE_HTML is set to YES. - - FORMULA_FONTSIZE = 10 - - # Use the FORMULA_TRANPARENT tag to determine whether or not the images --# generated for formulas are transparent PNGs. Transparent PNGs are --# not supported properly for IE 6.0, but are supported on all modern browsers. --# Note that when changing this option you need to delete any form_*.png files --# in the HTML output before the changes have effect. -+# generated for formulas are transparent PNGs. Transparent PNGs are not -+# supported properly for IE 6.0, but are supported on all modern browsers. -+# -+# Note that when changing this option you need to delete any form_*.png files in -+# the HTML output directory before the changes have effect. -+# The default value is: YES. -+# This tag requires that the tag GENERATE_HTML is set to YES. - - FORMULA_TRANSPARENT = YES - --# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax --# (see http://www.mathjax.org) which uses client side Javascript for the --# rendering instead of using prerendered bitmaps. Use this if you do not --# have LaTeX installed or if you want to formulas look prettier in the HTML --# output. When enabled you also need to install MathJax separately and --# configure the path to it using the MATHJAX_RELPATH option. -+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see -+# http://www.mathjax.org) which uses client side Javascript for the rendering -+# instead of using prerendered bitmaps. Use this if you do not have LaTeX -+# installed or if you want formulas to look prettier in the HTML output. When -+# enabled you may also need to install MathJax separately and configure the path -+# to it using the MATHJAX_RELPATH option. 
-+# The default value is: NO. -+# This tag requires that the tag GENERATE_HTML is set to YES. - - USE_MATHJAX = NO - --# When MathJax is enabled you need to specify the location relative to the --# HTML output directory using the MATHJAX_RELPATH option. The destination --# directory should contain the MathJax.js script. For instance, if the mathjax --# directory is located at the same level as the HTML output directory, then --# MATHJAX_RELPATH should be ../mathjax. The default value points to the --# mathjax.org site, so you can quickly see the result without installing --# MathJax, but it is strongly recommended to install a local copy of MathJax --# before deployment. -+# When MathJax is enabled you can set the default output format to be used for -+# the MathJax output. See the MathJax site (see: -+# http://docs.mathjax.org/en/latest/output.html) for more details. -+# Possible values are: HTML-CSS (which is slower, but has the best -+# compatibility), NativeMML (i.e. MathML) and SVG. -+# The default value is: HTML-CSS. -+# This tag requires that the tag USE_MATHJAX is set to YES. -+ -+MATHJAX_FORMAT = HTML-CSS -+ -+# When MathJax is enabled you need to specify the location relative to the HTML -+# output directory using the MATHJAX_RELPATH option. The destination directory -+# should contain the MathJax.js script. For instance, if the mathjax directory -+# is located at the same level as the HTML output directory, then -+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax -+# Content Delivery Network so you can quickly see the result without installing -+# MathJax. However, it is strongly recommended to install a local copy of -+# MathJax from http://www.mathjax.org before deployment. -+# The default value is: http://cdn.mathjax.org/mathjax/latest. -+# This tag requires that the tag USE_MATHJAX is set to YES. 
- - MATHJAX_RELPATH = http://www.mathjax.org/mathjax - --# When the SEARCHENGINE tag is enabled doxygen will generate a search box --# for the HTML output. The underlying search engine uses javascript --# and DHTML and should work on any modern browser. Note that when using --# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets --# (GENERATE_DOCSET) there is already a search function so this one should --# typically be disabled. For large projects the javascript based search engine --# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution. -+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax -+# extension names that should be enabled during MathJax rendering. For example -+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols -+# This tag requires that the tag USE_MATHJAX is set to YES. -+ -+MATHJAX_EXTENSIONS = -+ -+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces -+# of code that will be used on startup of the MathJax code. See the MathJax site -+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an -+# example see the documentation. -+# This tag requires that the tag USE_MATHJAX is set to YES. -+ -+MATHJAX_CODEFILE = -+ -+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for -+# the HTML output. The underlying search engine uses javascript and DHTML and -+# should work on any modern browser. Note that when using HTML help -+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) -+# there is already a search function so this one should typically be disabled. -+# For large projects the javascript based search engine can be slow, then -+# enabling SERVER_BASED_SEARCH may provide a better solution. 
It is possible to -+# search using the keyboard; to jump to the search box use <access key> + S -+# (what the <access key> is depends on the OS and browser, but it is typically -+# <CTRL>, <ALT>/<option>, or both). -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ - - ---- --endif::[] -- --ifdef::crmsh[] --..... --# crm configure show --node pcmk-1 --node pcmk-2 --primitive WebData ocf:linbit:drbd \ -- params drbd_resource="wwwdata" \ -- op monitor interval="60s" --primitive WebFS ocf:heartbeat:Filesystem \ -- params device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" fstype="gfs2" --primitive WebSite ocf:heartbeat:apache \ -- params configfile="/etc/httpd/conf/httpd.conf" \ -- op monitor interval="1min" --primitive ClusterIP ocf:heartbeat:IPaddr2 \ -- params ip="192.168.122.101" cidr_netmask="32" clusterip_hash="sourceip" \ -- op monitor interval="30s" --primitive ipmi-fencing stonith::fence_ipmilan \ -- params pcmk_host_list="pcmk-1 pcmk-2" ipaddr=10.0.0.1 login=testuser passwd=abc123 \ -- op monitor interval="60s" --ms WebDataClone WebData \ -- meta master-max="2" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" --clone WebFSClone WebFS --clone WebIP ClusterIP \ -- meta globally-unique="true" clone-max="2" clone-node-max="2" --clone WebSiteClone WebSite --colocation WebSite-with-WebFS inf: WebSiteClone WebFSClone --colocation fs_on_drbd inf: WebFSClone WebDataClone:Master --colocation website-with-ip inf: WebSiteClone WebIP --order WebFS-after-WebData inf: WebDataClone:promote WebFSClone:start --order WebSite-after-WebFS inf: WebFSClone WebSiteClone --order apache-after-ip inf: WebIP WebSiteClone --property $id="cib-bootstrap-options" \ -- dc-version="1.1.5-bdd89e69ba545404d02445be1f3d72e6a203ba2f" \ -- cluster-infrastructure="openais" \ -- expected-quorum-votes="2" \ -- stonith-enabled="true" \ -- no-quorum-policy="ignore" 
--rsc_defaults $id="rsc-options" \ -- resource-stickiness="100" --..... --endif::[] -- - - === Node List === - --The list of cluster nodes is automatically populated by the cluster. -- --ifdef::pcs[] --..... -+---- -+[root@pcmk-1 ~]# pcs status nodes - Pacemaker Nodes: -- Online: [ pcmk-1 pcmk-2 ] --..... --endif::[] -- --ifdef::crmsh[] --..... --node pcmk-1 --node pcmk-2 --..... --endif::[] -+ Online: pcmk-1 pcmk-2 -+ Standby: -+ Offline: -+---- - - === Cluster Options === - --This is where the cluster automatically stores some information about --the cluster -- --* dc-version - the version (including upstream source-code hash) of Pacemaker used on the DC -- --* cluster-infrastructure - the cluster infrastructure being used (heartbeat or openais) -- --* expected-quorum-votes - the maximum number of nodes expected to be part of the cluster -- --and where the admin can set options that control the way the cluster --operates -- --* stonith-enabled=true - Make use of STONITH -- --* no-quorum-policy=ignore - Ignore loss of quorum and continue to host resources. -- --ifdef::pcs[] --[source,C] - ---- --# pcs property --dc-version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 --cluster-infrastructure: corosync --no-quorum-policy: ignore --stonith-enabled: true -+[root@pcmk-1 ~]# pcs property -+Cluster Properties: -+ cluster-infrastructure: corosync -+ cluster-name: mycluster -+ dc-version: 1.1.12-a9c8177 -+ have-watchdog: false -+ last-lrm-refresh: 1419129162 -+ stonith-enabled: true - ---- --endif::[] - --ifdef::crmsh[] --..... --property $id="cib-bootstrap-options" \ -- dc-version="1.1.5-bdd89e69ba545404d02445be1f3d72e6a203ba2f" \ -- cluster-infrastructure="openais" \ -- expected-quorum-votes="2" \ -- stonith-enabled="true" \ -- no-quorum-policy="ignore" --..... 
--endif::[] -+The output shows state information automatically obtained about the cluster, including: -+* *cluster-infrastructure* - the cluster communications layer in use (heartbeat or corosync) -+* *cluster-name* - the cluster name chosen by the administrator when the cluster was created -+* *dc-version* - the version (including upstream source-code hash) of Pacemaker used on the Designated Controller - --=== Resources === -+The output also shows options set by the administrator that control the way the cluster operates, including: -+* *stonith-enabled=true* - whether the cluster is allowed to use STONITH resources - -+=== Resources === - - ==== Default Options ==== - --Here we configure cluster options that apply to every resource. -- --ifdef::pcs[] --* resource-stickiness - Specify the aversion to moving resources to other machines --[source,C] - ---- --# pcs resource defaults -+[root@pcmk-1 ~]# pcs resource defaults - resource-stickiness: 100 - ---- --endif::[] - --ifdef::crmsh[] --* resource-stickiness - Specify the aversion to moving resources to other machines --..... --rsc_defaults $id="rsc-options" \ -- resource-stickiness="100" --..... --endif::[] -+This shows cluster option defaults that apply to every resource that does not -+explicitly set the option itself. Above: -+* *resource-stickiness* - Specify the aversion to moving healthy resources to other machines - - ==== Fencing ==== - --ifdef::pcs[] --[source,C] - ---- --# pcs stonith show -- impi-fencing (stonith:fence_ipmilan) Started --# pcs stonith show impi-fencing --Resource: impi-fencing -- pcmk_host_list: pcmk-1 pcmk-2 -- ipaddr: 10.0.0.1 -- login: testuser -- passwd: acd123 ------ --endif::[] -- --ifdef::crmsh[] --..... --primitive ipmi-fencing stonith::fence_ipmilan \ -- params pcmk_host_list="pcmk-1 pcmk-2" ipaddr=10.0.0.1 login=testuser passwd=abc123 \ -- op monitor interval="60s" --clone Fencing rsa-fencing --..... 
--endif::[] -+[root@pcmk-1 ~]# pcs stonith show -+ ipmi-fencing (stonith:fence_ipmilan) Started -+[root@pcmk-1 ~]# pcs stonith show ipmi-fencing -+ Resource: ipmi-fencing (class=stonith type=fence_ipmilan) -+ Attributes: ipaddr="10.0.0.1" login="testuser" passwd="acd123" pcmk_host_list="pcmk-1 pcmk-2" -+ Operations: monitor interval=60s (fence-monitor-interval-60s) -+---- - - ==== Service Address ==== - -@@ -352,147 +383,86 @@ that we want two instances of the clone (one "request bucket" for each - node) and that if one node fails, then the remaining node should hold - both. - --ifdef::pcs[] --[source,C] ------ --# pcs resource show ClusterIP-clone --Resource: ClusterIP-clone -- ip: 192.168.0.120 -- cidr_netmask: 32 -- clusterip_hash: sourceip -- globally-unique: true -- clone-max: 2 -- clone-node-max: 2 -- op monitor interval=30s ------ --endif::[] -- --ifdef::crmsh[] --..... --primitive ClusterIP ocf:heartbeat:IPaddr2 \ -- params ip="192.168.122.101" cidr_netmask="32" clusterip_hash="sourceip" \ -- op monitor interval="30s" --clone WebIP ClusterIP -- meta globally-unique="true" clone-max="2" clone-node-max="2" --..... --endif::[] -- --[NOTE] --======= --TODO: The RA should check for globally-unique=true when cloned --======= -+---- -+[root@pcmk-1 ~]# pcs resource show ClusterIP-clone -+ Clone: ClusterIP-clone -+ Meta Attrs: clone-max=2 clone-node-max=2 globally-unique=true -+ Resource: ClusterIP (class=ocf provider=heartbeat type=IPaddr2) -+ Attributes: ip=192.168.122.120 cidr_netmask=32 clusterip_hash=sourceip -+ Operations: start interval=0s timeout=20s (ClusterIP-start-timeout-20s) -+ stop interval=0s timeout=20s (ClusterIP-stop-timeout-20s) -+ monitor interval=30s (ClusterIP-monitor-interval-30s) -+---- - - ==== DRBD - Shared Storage ==== - --Here we define the DRBD service and specify which DRBD resource (from --drbd.conf) it should manage. 
We make it a master/slave resource and, in --order to have an active/active setup, allow both instances to be promoted --by specifying master-max=2. We also set the notify option so that the --cluster will tell DRBD agent when it's peer changes state. -+Here, we define the DRBD service and specify which DRBD resource (from -+/etc/drbd.d/*.res) it should manage. We make it a master/slave resource and, in -+order to have an active/active setup, allow both instances to be promoted to master -+at the same time. We also set the notify option so that the -+cluster will tell DRBD agent when its peer changes state. - --ifdef::pcs[] --[source,C] - ---- --# pcs resource show WebDataClone --Resource: WebDataClone -- drbd_resource: wwwdata -- master-node-max: 1 -- clone-max: 2 -- clone-node-max: 1 -- notify: true -- master-max: 2 -- op monitor interval=60s --# pcs constraint ref WebDataClone -+[root@pcmk-1 ~]# pcs resource show WebDataClone -+ Master: WebDataClone -+ Meta Attrs: master-max=2 master-node-max=1 clone-max=2 clone-node-max=1 notify=true -+ Resource: WebData (class=ocf provider=linbit type=drbd) -+ Attributes: drbd_resource=wwwdata -+ Operations: start interval=0s timeout=240 (WebData-start-timeout-240) -+ promote interval=0s timeout=90 (WebData-promote-timeout-90) -+ demote interval=0s timeout=90 (WebData-demote-timeout-90) -+ stop interval=0s timeout=100 (WebData-stop-timeout-100) -+ monitor interval=60s (WebData-monitor-interval-60s) -+[root@pcmk-1 ~]# pcs constraint ref WebDataClone - Resource: WebDataClone - colocation-WebFS-WebDataClone-INFINITY - order-WebDataClone-WebFS-mandatory - ---- --endif::[] -- --ifdef::crmsh[] --..... --primitive WebData ocf:linbit:drbd \ -- params drbd_resource="wwwdata" \ -- op monitor interval="60s" --ms WebDataClone WebData \ -- meta master-max="2" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" --..... 
--endif::[] -- - - ==== Cluster Filesystem ==== - - The cluster filesystem ensures that files are read and written correctly. - We need to specify the block device (provided by DRBD), where we want it --mounted and that we are using GFS2. Again it is a clone because it is -+mounted and that we are using GFS2. Again, it is a clone because it is - intended to be active on both nodes. The additional constraints ensure --that it can only be started on nodes with active gfs-control and drbd --instances. -+that it can only be started on nodes with active DLM and DRBD instances. - --ifdef::pcs[] --[source,C] - ---- --# pcs resource show WebFS-clone --Resource: WebFS-clone -- device: /dev/drbd/by-res/wwwdata -- directory: /var/www/html -- fstype: gfs2 --# pcs constraint ref WebFS-clone -+[root@pcmk-1 ~]# pcs resource show WebFS-clone -+ Clone: WebFS-clone -+ Resource: WebFS (class=ocf provider=heartbeat type=Filesystem) -+ Attributes: device=/dev/drbd1 directory=/var/www/html fstype=gfs2 -+ Operations: start interval=0s timeout=60 (WebFS-start-timeout-60) -+ stop interval=0s timeout=60 (WebFS-stop-timeout-60) -+ monitor interval=20 timeout=40 (WebFS-monitor-interval-20) -+[root@pcmk-1 ~]# pcs constraint ref WebFS-clone - Resource: WebFS-clone - colocation-WebFS-WebDataClone-INFINITY - colocation-WebSite-WebFS-INFINITY -- order-WebFS-WebSite-mandatory -+ colocation-WebFS-clone-dlm-clone-INFINITY - order-WebDataClone-WebFS-mandatory -+ order-WebFS-WebSite-mandatory -+ order-dlm-clone-WebFS-clone-mandatory - ---- --endif::[] -- --ifdef::crmsh[] --..... --primitive WebFS ocf:heartbeat:Filesystem \ -- params device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" fstype="gfs2" --clone WebFSClone WebFS --colocation WebFS-with-gfs-control inf: WebFSClone gfs-clone --colocation fs_on_drbd inf: WebFSClone WebDataClone:Master --order WebFS-after-WebData inf: WebDataClone:promote WebFSClone:start --order start-WebFS-after-gfs-control inf: gfs-clone WebFSClone --..... 
--endif::[] - - ==== Apache ==== - --Lastly we have the actual service, Apache. We need only tell the cluster --where to find it's main configuration file and restrict it to running on --nodes that have the required filesystem mounted and the IP address --active. -+Lastly, we have the actual service, Apache. We need only tell the cluster -+where to find its main configuration file and restrict it to running on -+nodes that have the required filesystem mounted and the IP address active. - --ifdef::pcs[] --[source,C] - ---- --# pcs resource show WebSite-clone --Resource: WebSite-clone -- configfile: /etc/httpd/conf/httpd.conf -- statusurl: http://localhost/server-status -- master-max: 2 -- op monitor interval=1min --# pcs constraint ref WebSite-clone -+[root@pcmk-1 ~]# pcs resource show WebSite-clone -+ Clone: WebSite-clone -+ Resource: WebSite (class=ocf provider=heartbeat type=apache) -+ Attributes: configfile=/etc/httpd/conf/httpd.conf statusurl=http://localhost/server-status -+ Operations: start interval=0s timeout=40s (WebSite-start-timeout-40s) -+ stop interval=0s timeout=60s (WebSite-stop-timeout-60s) -+ monitor interval=1min (WebSite-monitor-interval-1min) -+[root@pcmk-1 ~]# pcs constraint ref WebSite-clone - Resource: WebSite-clone - colocation-WebSite-ClusterIP-INFINITY - colocation-WebSite-WebFS-INFINITY - order-ClusterIP-WebSite-mandatory - order-WebFS-WebSite-mandatory - ---- --endif::[] -- --ifdef::crmsh[] --..... --primitive WebSite ocf:heartbeat:apache \ -- params configfile="/etc/httpd/conf/httpd.conf" \ -- op monitor interval="1min" --clone WebSiteClone WebSite --colocation WebSite-with-WebFS inf: WebSiteClone WebFSClone --colocation website-with-ip inf: WebSiteClone WebIP --order apache-after-ip inf: WebIP WebSiteClone --order WebSite-after-WebFS inf: WebFSClone WebSiteClone --..... 
--endif::[] -- -diff --git a/doc/Clusters_from_Scratch/en-US/Ap-Corosync-Conf.txt b/doc/Clusters_from_Scratch/en-US/Ap-Corosync-Conf.txt -index 0ab5d57..df14dd1 100644 ---- a/doc/Clusters_from_Scratch/en-US/Ap-Corosync-Conf.txt -+++ b/doc/Clusters_from_Scratch/en-US/Ap-Corosync-Conf.txt -@@ -1,12 +1,9 @@ - [appendix] - -- - == Sample Corosync Configuration == - --ifdef::pcs[] --.Sample corosync.conf for two-node cluster using a node list. -+.Sample +corosync.conf+ for two-node cluster created by `pcs`. - ..... --# Please read the corosync.conf.5 manual page - totem { - version: 2 - secauth: off -@@ -18,104 +15,19 @@ nodelist { - node { - ring0_addr: pcmk-1 - nodeid: 1 -- } -+ } - node { - ring0_addr: pcmk-2 - nodeid: 2 -- } -+ } - } - - quorum { -- provider: corosync_votequorum --} -- --logging { -- to_syslog: yes --} --..... --endif::[] -- --ifdef::crmsh[] --.Sample Corosync.conf for a two-node cluster using multicast. --..... --# Please read the corosync.conf.5 manual page --totem { -- version: 2 -- -- # cypto_cipher and crypto_hash: Used for mutual node authentication. -- # If you choose to enable this, then do remember to create a shared -- # secret with "corosync-keygen". -- crypto_cipher: none -- crypto_hash: none -- -- # interface: define at least one interface to communicate -- # over. If you define more than one interface stanza, you must -- # also set rrp_mode. -- interface { -- # Rings must be consecutively numbered, starting at 0. -- ringnumber: 0 -- # This is normally the *network* address of the -- # interface to bind to. This ensures that you can use -- # identical instances of this configuration file -- # across all your cluster nodes, without having to -- # modify this option. --bindnetaddr: 192.168.122.0 -- # However, if you have multiple physical network -- # interfaces configured for the same subnet, then the -- # network address alone is not sufficient to identify -- # the interface Corosync should bind to. 
In that case, -- # configure the *host* address of the interface -- # instead: -- # bindnetaddr: 192.168.1.1 -- # When selecting a multicast address, consider RFC -- # 2365 (which, among other things, specifies that -- # 239.255.x.x addresses are left to the discretion of -- # the network administrator). Do not reuse multicast -- # addresses across multiple Corosync clusters sharing -- # the same network. --mcastaddr: 239.255.1.1 -- # Corosync uses the port you specify here for UDP -- # messaging, and also the immediately preceding -- # port. Thus if you set this to 5405, Corosync sends -- # messages over UDP ports 5405 and 5404. --mcastport: 4000 -- # Time-to-live for cluster communication packets. The -- # number of hops (routers) that this ring will allow -- # itself to pass. Note that multicast routing must be -- # specifically enabled on most network routers. -- ttl: 1 -- } -+provider: corosync_votequorum -+two_node: 1 - } - - logging { -- # Log the source file and line where messages are being -- # generated. When in doubt, leave off. Potentially useful for -- # debugging. -- fileline: off -- # Log to standard error. When in doubt, set to no. Useful when -- # running in the foreground (when invoking "corosync -f") -- to_stderr: no -- # Log to a log file. When set to "no", the "logfile" option -- # must not be set. -- to_logfile: yes -- logfile: /var/log/cluster/corosync.log -- # Log to the system log daemon. When in doubt, set to yes. -- to_syslog: yes -- # Log debug messages (very verbose). When in doubt, leave off. -- debug: off -- # Log messages with time stamps. When in doubt, set to on -- # (unless you are only logging to syslog, where double -- # timestamps can be annoying). -- timestamp: on -- logger_subsys { -- subsys: QUORUM -- debug: off -- } --} -- --quorum { -- provider: corosync_votequorum -- expected_votes: 2 -+to_syslog: yes - } - ..... 
--endif::[] -diff --git a/doc/Clusters_from_Scratch/en-US/Ap-Reading.txt b/doc/Clusters_from_Scratch/en-US/Ap-Reading.txt -index a8ef3e3..26d5d7e 100644 ---- a/doc/Clusters_from_Scratch/en-US/Ap-Reading.txt -+++ b/doc/Clusters_from_Scratch/en-US/Ap-Reading.txt -@@ -2,11 +2,11 @@ - == Further Reading == - - - Project Website --http://www.clusterlabs.org -+http://www.clusterlabs.org/ - --- Cluster Commands --A comprehensive guide to cluster commands has been written by SuSE and can be found at: -+- SuSE has a comprehensive guide to cluster commands (though using the `crmsh` command-line -+ shell rather than `pcs`) at: - http://www.suse.com/documentation/sle_ha/book_sleha/?page=/documentation/sle_ha/book_sleha/data/book_sleha.html - - - Corosync -- http://www.corosync.org -+ http://www.corosync.org/ -diff --git a/doc/Clusters_from_Scratch/en-US/Book_Info.xml b/doc/Clusters_from_Scratch/en-US/Book_Info.xml -index 521394b..4eb6943 100644 ---- a/doc/Clusters_from_Scratch/en-US/Book_Info.xml -+++ b/doc/Clusters_from_Scratch/en-US/Book_Info.xml -@@ -8,8 +8,14 @@ - Creating Active/Passive and Active/Active Clusters on Fedora - Pacemaker - 1.1 -- 5 -- 0 -+ -+ 8 -+ 1 - - - The purpose of this document is to provide a start-to-finish guide to building an example active/passive cluster with Pacemaker and show how it can be converted to an active/active one. -diff --git a/doc/Clusters_from_Scratch/en-US/Ch-Active-Active.txt b/doc/Clusters_from_Scratch/en-US/Ch-Active-Active.txt -index 5943c19..ca980c4 100644 ---- a/doc/Clusters_from_Scratch/en-US/Ch-Active-Active.txt -+++ b/doc/Clusters_from_Scratch/en-US/Ch-Active-Active.txt -@@ -1,755 +1,380 @@ --= Conversion to Active/Active = -- --== Requirements == -+= Convert Cluster to Active/Active = - - The primary requirement for an Active/Active cluster is that the data - required for your services is available, simultaneously, on both --machines. 
Pacemaker makes no requirement on how this is achieved, you --could use a SAN if you had one available, however since DRBD supports --multiple Primaries, we can also use that. -+machines. Pacemaker makes no requirement on how this is achieved; you -+could use a SAN if you had one available, but since DRBD supports -+multiple Primaries, we can continue to use it here. -+ -+== Install Cluster Filesystem Software == - - The only hitch is that we need to use a cluster-aware filesystem. The - one we used earlier with DRBD, ext4, is not one of those. Both OCFS2 --and GFS2 are supported, however here we will use GFS2 which comes with --Fedora 17. -- --=== Installing the required Software === -- --[source,C] --# yum install -y gfs2-utils dlm kernel-modules-extra --..... --Loaded plugins: langpacks, presto, refresh-packagekit --Resolving Dependencies ----> Running transaction check -----> Package dlm.x86_64 0:3.99.4-1.fc17 will be installed -----> Package gfs2-utils.x86_64 0:3.1.4-3.fc17 will be installed -----> Package kernel-modules-extra.x86_64 0:3.4.4-3.fc17 will be installed ----> Finished Dependency Resolution -- --Dependencies Resolved -- --================================================================================ -- Package Arch Version Repository Size --================================================================================ --Installing: -- dlm x86_64 3.99.4-1.fc17 updates 83 k -- gfs2-utils x86_64 3.1.4-3.fc17 fedora 214 k -- kernel-modules-extra x86_64 3.4.4-3.fc17 updates 1.7 M -- --Transaction Summary --================================================================================ --Install 3 Packages -- --Total download size: 1.9 M --Installed size: 7.7 M --Downloading Packages: --(1/3): dlm-3.99.4-1.fc17.x86_64.rpm | 83 kB 00:00 --(2/3): gfs2-utils-3.1.4-3.fc17.x86_64.rpm | 214 kB 00:00 --(3/3): kernel-modules-extra-3.4.4-3.fc17.x86_64.rpm | 1.7 MB 00:01 ---------------------------------------------------------------------------------- 
--Total 615 kB/s | 1.9 MB 00:03 --Running Transaction Check --Running Transaction Test --Transaction Test Succeeded --Running Transaction -- Installing : kernel-modules-extra-3.4.4-3.fc17.x86_64 1/3 -- Installing : gfs2-utils-3.1.4-3.fc17.x86_64 2/3 -- Installing : dlm-3.99.4-1.fc17.x86_64 3/3 -- Verifying : dlm-3.99.4-1.fc17.x86_64 1/3 -- Verifying : gfs2-utils-3.1.4-3.fc17.x86_64 2/3 -- Verifying : kernel-modules-extra-3.4.4-3.fc17.x86_64 3/3 -- --Installed: -- dlm.x86_64 0:3.99.4-1.fc17 -- gfs2-utils.x86_64 0:3.1.4-3.fc17 -- kernel-modules-extra.x86_64 0:3.4.4-3.fc17 -- --Complete! --..... -- --== Create a GFS2 Filesystem == -+and GFS2 are supported; here, we will use GFS2. - --[[GFS2_prep]] --=== Preparation === -+On both nodes, install the GFS2 command-line utilities and the -+Distributed Lock Manager (DLM) required by cluster filesystems: -+---- -+# yum install -y gfs2-utils dlm -+---- - --Before we do anything to the existing partition, we need to make sure it --is unmounted. We do this by telling the cluster to stop the WebFS resource. --This will ensure that other resources (in our case, Apache) using WebFS --are not only stopped, but stopped in the correct order. 
-+== Configure the Cluster for the DLM == - --ifdef::pcs[] --[source,C] -+The DLM needs to run on both nodes, so we'll start by creating a resource for -+it (using the *ocf:pacemaker:controld* resource script), and clone it: - ---- --# pcs resource disable WebFS --# pcs resource -- ClusterIP (ocf::heartbeat:IPaddr2) Started -- WebSite (ocf::heartbeat:apache) Stopped -+[root@pcmk-1 ~]# pcs cluster cib dlm_cfg -+[root@pcmk-1 ~]# pcs -f dlm_cfg resource create dlm ocf:pacemaker:controld op monitor interval=60s -+[root@pcmk-1 ~]# pcs -f dlm_cfg resource clone dlm clone-max=2 clone-node-max=1 -+[root@pcmk-1 ~]# pcs -f dlm_cfg resource show -+ ClusterIP (ocf::heartbeat:IPaddr2): Started -+ WebSite (ocf::heartbeat:apache): Started - Master/Slave Set: WebDataClone [WebData] - Masters: [ pcmk-2 ] - Slaves: [ pcmk-1 ] -- WebFS (ocf::heartbeat:Filesystem) Stopped -+ WebFS (ocf::heartbeat:Filesystem): Started -+ Clone Set: dlm-clone [dlm] -+ Stopped: [ pcmk-1 pcmk-2 ] - ---- --endif::[] - --ifdef::crmsh[] --[source,C] ------- --# crm resource disable WebFS --# crm_mon -1 --============ --Last updated: Tue Apr 3 14:07:36 2012 --Last change: Tue Apr 3 14:07:15 2012 via cibadmin on pcmk-1 -+Activate our new configuration, and see how the cluster responds: -+---- -+[root@pcmk-1 ~]# pcs cluster cib-push dlm_cfg -+CIB updated -+[root@pcmk-1 ~]# pcs status -+Cluster name: mycluster -+Last updated: Sat Dec 20 21:53:44 2014 -+Last change: Sat Dec 20 21:53:40 2014 - Stack: corosync --Current DC: pcmk-1 (1702537408) - partition with quorum --Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff --2 Nodes configured, unknown expected votes --5 Resources configured. 
--============ -+Current DC: pcmk-1 (1) - partition with quorum -+Version: 1.1.12-a9c8177 -+2 Nodes configured -+8 Resources configured -+ - - Online: [ pcmk-1 pcmk-2 ] - -- ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 -+Full list of resources: -+ -+ ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 -+ WebSite (ocf::heartbeat:apache): Started pcmk-2 - Master/Slave Set: WebDataClone [WebData] - Masters: [ pcmk-2 ] - Slaves: [ pcmk-1 ] ------- --endif::[] -+ WebFS (ocf::heartbeat:Filesystem): Started pcmk-2 -+ ipmi-fencing (stonith:fence_ipmilan): Started pcmk-1 -+ Clone Set: dlm-clone [dlm] -+ Started: [ pcmk-1 pcmk-2 ] - --[NOTE] --======= -+PCSD Status: -+ pcmk-1: Online -+ pcmk-2: Online - --Note that both Apache and WebFS have been stopped. -+Daemon Status: -+ corosync: active/disabled -+ pacemaker: active/disabled -+ pcsd: active/enabled -+---- - --======= -+[[GFS2_prep]] -+== Create and Populate GFS2 Filesystem == - --=== Create and Populate an GFS2 Partition === -+Before we do anything to the existing partition, we need to make sure it -+is unmounted. We do this by telling the cluster to stop the WebFS resource. -+This will ensure that other resources (in our case, Apache) using WebFS -+are not only stopped, but stopped in the correct order. - --Now that the cluster stack and integration pieces are running smoothly, --we can create an GFS2 partition. -+---- -+[root@pcmk-1 ~]# pcs resource disable WebFS -+[root@pcmk-1 ~]# pcs resource -+ ClusterIP (ocf::heartbeat:IPaddr2): Started -+ WebSite (ocf::heartbeat:apache): Stopped -+ Master/Slave Set: WebDataClone [WebData] -+ Masters: [ pcmk-2 ] -+ Slaves: [ pcmk-1 ] -+ WebFS (ocf::heartbeat:Filesystem): Stopped -+ Clone Set: dlm-clone [dlm] -+ Started: [ pcmk-1 pcmk-2 ] -+---- -+ -+You can see that both Apache and WebFS have been stopped, -+and that *pcmk-2* is the current master for the DRBD device. -+ -+Now we can create a new GFS2 filesystem on the DRBD device. 
- - [WARNING] - ========= -- - This will erase all previous content stored on the DRBD device. Ensure - you have a copy of any important data. -- - ========= - --We need to specify a number of additional parameters when creating a --GFS2 partition. -- --First we must use the -p option to specify that we want to use the the --Kernel's DLM. Next we use -j to indicate that it should reserve enough --space for two journals (one per node accessing the filesystem). -- --ifdef::pcs[] --Lastly, we use -t to specify the lock table name. The format for this --field is +clustername:fsname+. For the +fsname+, we need to use the same --value as specified in 'corosync.conf' for +cluster_name+. If you setup --corosync with the same cluster name we used in this tutorial, cluster --name will be 'mycluster'. If you are unsure what your cluster name is, --open up /etc/corosync/corosync.conf, or execute the command --'pcs cluster corosync pcmk-1' to view the corosync config. The cluster --name will be in the +totem+ block. --endif::[] -- --ifdef::crmsh[] --Lastly, we use -t to specify the lock table name. The format for this --field is +clustername:fsname+. For the +fsname+, we need to use the same --value as specified in 'corosync.conf' for +cluster_name+. Just pick --something unique and descriptive and add somewhere inside the +totem+ --block. For example: -- --..... --totem { -- version: 2 -- -- # cypto_cipher and crypto_hash: Used for mutual node authentication. -- # If you choose to enable this, then do remember to create a shared -- # secret with "corosync-keygen". -- crypto_cipher: none -- crypto_hash: none -- cluster_name: mycluster -- ... --..... -- --[IMPORTANT] --=========== --Do this on each node in the cluster and be sure to restart them before --continuing. --=========== --endif::[] -- - [IMPORTANT] - =========== --We must run the next command on whichever node last had '/dev/drbd' --mounted. 
Otherwise you will receive the message: -- -+Run the next command on whichever node has the DRBD Primary role. -+Otherwise, you will receive the message: - ----- - /dev/drbd1: Read-only file system - ----- - =========== - --[source,C] - ----- --# ssh pcmk-2 -- mkfs.gfs2 -p lock_dlm -j 2 -t mycluster:web /dev/drbd1 --This will destroy any data on /dev/drbd1. --It appears to contain: Linux rev 1.0 ext4 filesystem data, UUID=dc45fff3-c47a-4db2-96f7-a8049a323fe4 (extents) (large files) (huge files) -+[root@pcmk-2 ~]# mkfs.gfs2 -p lock_dlm -j 2 -t mycluster:web /dev/drbd1 -+It appears to contain an existing filesystem (ext4) -+This will destroy any data on /dev/drbd1 - Are you sure you want to proceed? [y/n]y - Device: /dev/drbd1 --Blocksize: 4096 --Device Size 0.97 GB (253935 blocks) --Filesystem Size: 0.97 GB (253932 blocks) -+Block size: 4096 -+Device size: 1.00 GB (262127 blocks) -+Filesystem size: 1.00 GB (262124 blocks) - Journals: 2 --Resource Groups: 4 --Locking Protocol: "lock_dlm" --Lock Table: "mycluster" --UUID: ed293a02-9eee-3fa3-ed1c-435ef1fd0116 -+Resource groups: 3 -+Locking protocol: "lock_dlm" -+Lock table: "mycluster:web" -+UUID: b2b30e6c-8890-33fa-a1ba-3c70edd4b5f0 - ----- - --ifdef::pcs[] --[source,C] ------ --# pcs cluster cib dlm_cfg --# pcs -f dlm_cfg resource create dlm ocf:pacemaker:controld op monitor interval=60s --# pcs -f dlm_cfg resource clone dlm clone-max=2 clone-node-max=1 --# pcs -f dlm_cfg resource show -- ClusterIP (ocf::heartbeat:IPaddr2) Started -- WebSite (ocf::heartbeat:apache) Stopped -- Master/Slave Set: WebDataClone [WebData] -- Masters: [ pcmk-2 ] -- Slaves: [ pcmk-1 ] -- WebFS (ocf::heartbeat:Filesystem) Stopped -- Clone Set: dlm-clone [dlm] -- Stopped: [ dlm:0 dlm:1 ] --# pcs cluster cib-push dlm_cfg --CIB updated --# pcs status -+The `mkfs.gfs2` command required a number of additional parameters: - --Last updated: Fri Sep 14 12:54:50 2012 --Last change: Fri Sep 14 12:54:43 2012 via cibadmin on pcmk-1 --Stack: corosync 
--Current DC: pcmk-1 (1) - partition with quorum --Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 --2 Nodes configured, unknown expected votes --7 Resources configured. -+* `-p lock_dlm` specifies that we want to use the -+kernel's DLM. - --Online: [ pcmk-1 pcmk-2 ] -- --Full list of resources: -- -- ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 -- WebSite (ocf::heartbeat:apache): Stopped -- Master/Slave Set: WebDataClone [WebData] -- Masters: [ pcmk-2 ] -- Slaves: [ pcmk-1 ] -- WebFS (ocf::heartbeat:Filesystem): Stopped -- Clone Set: dlm-clone [dlm] -- Started: [ pcmk-1 pcmk-2 ] ------ --endif::[] -- --ifdef::crmsh[] --[source,C] ------- --# crm --crm(live)# cib new dlm --INFO: dlm shadow CIB created --crm(dlm)# configure primitive dlm ocf:pacemaker:controld \ -- op monitor interval=60s --crm(dlm)# configure clone dlm_clone dlm meta clone-max=2 clone-node-max=1 --crm(dlm)# configure show --node $id="1702537408" pcmk-1 \ -- attributes standby="off" --node $id="1719314624" pcmk-2 --primitive ClusterIP ocf:heartbeat:IPaddr2 \ -- params ip="192.168.122.120" cidr_netmask="32" \ -- op monitor interval="30s" --primitive WebData ocf:linbit:drbd \ -- params drbd_resource="wwwdata" \ -- op monitor interval="60s" --primitive WebFS ocf:heartbeat:Filesystem \ -- params device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" fstype="ext4" \ -- meta target-role="Stopped" --primitive WebSite ocf:heartbeat:apache \ -- params configfile="/etc/httpd/conf/httpd.conf" \ -- op monitor interval="1min" --primitive dlm ocf:pacemaker:controld \ -- op monitor interval="60s" --ms WebDataClone WebData \ -- meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" --clone dlm_clone dlm \ -- meta clone-max="2" clone-node-max="1" --location prefer-pcmk-1 WebSite 50: pcmk-1 --colocation WebSite-with-WebFS inf: WebSite WebFS --colocation fs_on_drbd inf: WebFS WebDataClone:Master --colocation website-with-ip inf: WebSite ClusterIP --order 
WebFS-after-WebData inf: WebDataClone:promote WebFS:start --order WebSite-after-WebFS inf: WebFS WebSite --order apache-after-ip inf: ClusterIP WebSite --property $id="cib-bootstrap-options" \ -- dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ -- cluster-infrastructure="corosync" \ -- stonith-enabled="false" \ -- no-quorum-policy="ignore" \ -- last-lrm-refresh="1333446866" --rsc_defaults $id="rsc-options" \ -- resource-stickiness="100" --op_defaults $id="op-options" \ -- timeout="240s" --crm(dlm)# cib commit dlm --INFO: commited 'dlm' shadow CIB to the cluster --crm(dlm)# quit --bye --# crm_mon -1 --============ --Last updated: Wed Apr 4 01:15:11 2012 --Last change: Wed Apr 4 00:50:11 2012 via crmd on pcmk-1 --Stack: corosync --Current DC: pcmk-1 (1702537408) - partition with quorum --Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff --2 Nodes configured, unknown expected votes --7 Resources configured. --============ -- --Online: [ pcmk-1 pcmk-2 ] -+* `-j 2` indicates that the filesystem should reserve enough -+space for two journals (one for each node that will access the filesystem). - -- ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 -- Master/Slave Set: WebDataClone [WebData] -- Masters: [ pcmk-1 ] -- Slaves: [ pcmk-2 ] -- Clone Set: dlm_clone [dlm] -- Started: [ pcmk-1 pcmk-2 ] ------- --endif::[] -+* `-t mycluster:web` specifies the lock table name. The format for -+this field is +pass:[clustername:fsname]+. For -++pass:[clustername]+, we need to use the same -+value we specified originally with `pcs cluster setup --name` (which is also -+the value of *cluster_name* in +/etc/corosync/corosync.conf+). -+If you are unsure what your cluster name is, you can look in -++/etc/corosync/corosync.conf+ or execute the command -+`pcs cluster corosync pcmk-1 | grep cluster_name`. - --Then (re)populate the new filesystem with data (web pages). For now we'll --create another variation on our home page. 
-+Now we can (re-)populate the new filesystem with data -+(web pages). We'll create yet another variation on our home page. - --[source,C] - ----- --# mount /dev/drbd1 /mnt/ --# cat <<-END >/mnt/index.html -+[root@pcmk-2 ~]# mount /dev/drbd1 /mnt -+[root@pcmk-2 ~]# cat <<-END >/mnt/index.html - - My Test Site - GFS2 - - END --# umount /dev/drbd1 --# drbdadm verify wwwdata -+[root@pcmk-2 ~]# umount /dev/drbd1 -+[root@pcmk-2 ~]# drbdadm verify wwwdata - ----- - - == Reconfigure the Cluster for GFS2 == - -+With the WebFS resource stopped, let's update the configuration. - --ifdef::pcs[] -- --With the WebFS resource stopped, lets update the configuration. -- --[source,C] - ---- --# pcs resource show WebFS --Resource: WebFS -- device: /dev/drbd/by-res/wwwdata -- directory: /var/www/html -- fstype: ext4 -- target-role: Stopped -+[root@pcmk-1 ~]# pcs resource show WebFS -+ Resource: WebFS (class=ocf provider=heartbeat type=Filesystem) -+ Attributes: device=/dev/drbd1 directory=/var/www/html fstype=ext4 -+ Meta Attrs: target-role=Stopped -+ Operations: start interval=0s timeout=60 (WebFS-start-timeout-60) -+ stop interval=0s timeout=60 (WebFS-stop-timeout-60) -+ monitor interval=20 timeout=40 (WebFS-monitor-interval-20) - ---- - --The fstype option needs to be updated to gfs2 instead of ext4. -+The fstype option needs to be updated to *gfs2* instead of *ext4*. 
- --[source,C] - ---- --# pcs resource update WebFS fstype=gfs2 --# pcs resource show WebFS --Resource: WebFS -- device: /dev/drbd/by-res/wwwdata -- directory: /var/www/html -- fstype: gfs2 -- target-role: Stopped --CIB updated -+[root@pcmk-1 ~]# pcs resource update WebFS fstype=gfs2 -+[root@pcmk-1 ~]# pcs resource show WebFS -+ Resource: WebFS (class=ocf provider=heartbeat type=Filesystem) -+ Attributes: device=/dev/drbd1 directory=/var/www/html fstype=gfs2 -+ Meta Attrs: target-role=Stopped -+ Operations: start interval=0s timeout=60 (WebFS-start-timeout-60) -+ stop interval=0s timeout=60 (WebFS-stop-timeout-60) -+ monitor interval=20 timeout=40 (WebFS-monitor-interval-20) - ---- --endif::[] -- --ifdef::crmsh[] --[source,C] ------- --# crm --crm(live) # cib new GFS2 --INFO: GFS2 shadow CIB created --crm(GFS2) # configure delete WebFS --crm(GFS2) # configure primitive WebFS ocf:heartbeat:Filesystem params device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" fstype="gfs2" ------- -- --Now that we've recreated the resource, we also need to recreate all the --constraints that used it. This is because the shell will automatically --remove any constraints that referenced WebFS. 
-- --[source,C] ------- --crm(GFS2) # configure colocation WebSite-with-WebFS inf: WebSite WebFS --crm(GFS2) # configure colocation fs_on_drbd inf: WebFS WebDataClone:Master --crm(GFS2) # configure order WebFS-after-WebData inf: WebDataClone:promote WebFS:start --crm(GFS2) # configure order WebSite-after-WebFS inf: WebFS WebSite --crm(GFS2) # configure show --node pcmk-1 --node pcmk-2 --primitive WebData ocf:linbit:drbd \ -- params drbd_resource="wwwdata" \ -- op monitor interval="60s" --primitive WebFS ocf:heartbeat:Filesystem \ -- params device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" fstype="gfs2" --primitive WebSite ocf:heartbeat:apache \ -- params configfile="/etc/httpd/conf/httpd.conf" \ -- op monitor interval="1min" --primitive ClusterIP ocf:heartbeat:IPaddr2 \ -- params ip="192.168.122.101" cidr_netmask="32" \ -- op monitor interval="30s" --ms WebDataClone WebData \ -- meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" --colocation WebSite-with-WebFS inf: WebSite WebFS --colocation fs_on_drbd inf: WebFS WebDataClone:Master --colocation website-with-ip inf: WebSite ClusterIP --order WebFS-after-WebData inf: WebDataClone:promote WebFS:start --order WebSite-after-WebFS inf: WebFS WebSite --order apache-after-ip inf: ClusterIP WebSite --property $id="cib-bootstrap-options" \ -- dc-version="1.1.5-bdd89e69ba545404d02445be1f3d72e6a203ba2f" \ -- cluster-infrastructure="openais" \ -- expected-quorum-votes="2" \ -- stonith-enabled="false" \ -- no-quorum-policy="ignore" --rsc_defaults $id="rsc-options" \ -- resource-stickiness="100" ------- -- --Review the configuration before uploading it to the cluster, quitting the --shell and watching the cluster's response -- --[source,C] ------- --crm(GFS2) # cib commit GFS2 --INFO: commited 'GFS2' shadow CIB to the cluster --crm(GFS2) # quit --bye --# crm_mon --============ --Last updated: Thu Sep 3 20:49:54 2009 --Stack: openais --Current DC: pcmk-2 - partition with quorum 
--Version: 1.1.5-bdd89e69ba545404d02445be1f3d72e6a203ba2f --2 Nodes configured, 2 expected votes --6 Resources configured. --============ - --Online: [ pcmk-1 pcmk-2 ] -- --WebSite (ocf::heartbeat:apache): Started pcmk-2 --Master/Slave Set: WebDataClone -- Masters: [ pcmk-1 ] -- Slaves: [ pcmk-2 ] --ClusterIP (ocf::heartbeat:IPaddr): Started pcmk-2WebFS (ocf::heartbeat:Filesystem): Started pcmk-1 ------- --endif::[] -- --== Reconfigure Pacemaker for Active/Active == -- --Almost everything is in place. Recent versions of DRBD are capable of --operating in Primary/Primary mode and the filesystem we're using is --cluster aware. All we need to do now is reconfigure the cluster to take --advantage of this. -- --ifdef::pcs[] --This will involve a number of changes, so we'll want work with a --local cib file. -- --[source,C] -+GFS2 requires that DLM be running, so we also need to set up new colocation -+and ordering constraints for it: - ---- --# pcs cluster cib active_cfg -+[root@pcmk-1 ~]# pcs constraint colocation add WebFS with dlm-clone INFINITY -+[root@pcmk-1 ~]# pcs constraint order dlm-clone then WebFS - ---- --endif::[] - --ifdef::crmsh[] --This will involve a number of changes, so we'll again use interactive --mode. -- --[source,C] ------- --# crm --# cib new active ------- --endif::[] -+== Clone the IP address == - - There's no point making the services active on both locations if we can't --reach them, so lets first clone the IP address. Cloned IPaddr2 resources --use an iptables rule to ensure that each request only gets processed by one of --the two clone instances. The additional meta options tell the cluster how --many instances of the clone we want (one "request bucket" for each node) --and that if all other nodes fail, then the remaining node should hold all --of them. Otherwise the requests would be simply discarded. -+reach them both, so let's clone the IP address. 
- -+The *IPaddr2* resource agent has built-in intelligence for when it is configured -+as a clone. It will utilize a multicast MAC address to have the local switch -+send the relevant packets to all nodes in the cluster, together with *iptables -+clusterip* rules on the nodes so that any given packet will be grabbed by -+exactly one node. This will give us a simple but effective form of -+load-balancing requests between our two nodes. - --ifdef::pcs[] -+Let's start a new config, and clone our IP: - ---- --# pcs -f active_cfg resource clone ClusterIP \ -- globally-unique=true clone-max=2 clone-node-max=2 -+[root@pcmk-1 ~]# pcs cluster cib loadbalance_cfg -+[root@pcmk-1 ~]# pcs -f loadbalance_cfg resource clone ClusterIP \ -+ clone-max=2 clone-node-max=2 globally-unique=true - ---- - --Notice when the ClusterIP becomes a clone, the constraints -+* `clone-max=2` tells the resource agent to split packets this many ways. This -+should equal the number of nodes that can host the IP. -+* `clone-node-max=2` says that one node can run up to 2 instances -+of the clone. This should also equal the number of nodes that can -+host the IP, so that if any node goes down, another node can take over -+the failed node's "request bucket". Otherwise, requests intended for -+the failed node would be discarded. -+* `globally-unique=true` tells the cluster that one clone isn't identical -+to another (each handles a different "bucket"). This also tells the resource -+agent to insert *iptables* rules so each host only processes packets in its -+bucket(s). -+ -+Notice that when the ClusterIP becomes a clone, the constraints - referencing ClusterIP now reference the clone. This is - done automatically by pcs. 
--endif::[] -- --ifdef::pcs[] --[source,C] - ---- --# pcs -f active_cfg constraint -+[root@pcmk-1 ~]# pcs -f loadbalance_cfg constraint - Location Constraints: - Ordering Constraints: -- start ClusterIP-clone then start WebSite -- WebFS then WebSite -- promote WebDataClone then start WebFS -+ start ClusterIP-clone then start WebSite (kind:Mandatory) -+ promote WebDataClone then start WebFS (kind:Mandatory) -+ start WebFS then start WebSite (kind:Mandatory) -+ start dlm-clone then start WebFS (kind:Mandatory) - Colocation Constraints: -- WebSite with ClusterIP-clone -- WebFS with WebDataClone (with-rsc-role:Master) -- WebSite with WebFS -+ WebSite with ClusterIP-clone (score:INFINITY) -+ WebFS with WebDataClone (score:INFINITY) (with-rsc-role:Master) -+ WebSite with WebFS (score:INFINITY) -+ WebFS with dlm-clone (score:INFINITY) - ---- --endif::[] -- --ifdef::crmsh[] --[source,C] ------- --# configure clone WebIP ClusterIP \ -- meta globally-unique="true" clone-max="2" clone-node-max="2" ------- --endif::[] - --Now we must tell the ClusterIP how to decide which requests are --processed by which hosts. To do this we must specify the --clusterip_hash parameter. -+Now we must tell the resource how to decide which requests are -+processed by which hosts. To do this, we specify the *clusterip_hash* parameter. -+The value of *sourceip* means that the source IP address of incoming packets -+will be hashed; each node will process a certain range of hashes. - --ifdef::pcs[] --[source,C] - ---- --# pcs -f active_cfg resource update ClusterIP clusterip_hash=sourceip -+[root@pcmk-1 ~]# pcs -f loadbalance_cfg resource update ClusterIP clusterip_hash=sourceip - ---- --endif::[] - --ifdef::crmsh[] --Open the ClusterIP resource -- --[source,C] ------- --# configure edit ClusterIP -+Load our configuration to the cluster, and see how it responds. 
- ----- -+[root@pcmk-1 ~]# pcs cluster cib-push loadbalance_cfg -+CIB updated -+[root@pcmk-1 ~]# pcs status -+Cluster name: mycluster -+Last updated: Sat Dec 20 22:05:48 2014 -+Last change: Sat Dec 20 22:05:34 2014 -+Stack: corosync -+Current DC: pcmk-1 (1) - partition with quorum -+Version: 1.1.12-a9c8177 -+2 Nodes configured -+9 Resources configured - --And add the following to the params line - --..... --clusterip_hash="sourceip" --..... -+Online: [ pcmk-1 pcmk-2 ] - --So that the complete definition looks like: -+Full list of resources: - --..... --primitive ClusterIP ocf:heartbeat:IPaddr2 \ -- params ip="192.168.122.101" cidr_netmask="32" clusterip_hash="sourceip" \ -- op monitor interval="30s" --..... -+ WebSite (ocf::heartbeat:apache): Stopped -+ Master/Slave Set: WebDataClone [WebData] -+ Masters: [ pcmk-1 ] -+ Slaves: [ pcmk-2 ] -+ WebFS (ocf::heartbeat:Filesystem): Stopped -+ ipmi-fencing (stonith:fence_ipmilan): Started pcmk-1 -+ Clone Set: dlm-clone [dlm] -+ Started: [ pcmk-1 pcmk-2 ] -+ Clone Set: ClusterIP-clone [ClusterIP] (unique) -+ ClusterIP:0 (ocf::heartbeat:IPaddr2): Started pcmk-1 -+ ClusterIP:1 (ocf::heartbeat:IPaddr2): Started pcmk-2 - --Here is the full transcript -+PCSD Status: -+ pcmk-1: Online -+ pcmk-2: Online - --[source,C] ------- --# crm crm(live) --# cib new active --INFO: active shadow CIB created --crm(active) # configure clone WebIP ClusterIP \ -- meta globally-unique="true" clone-max="2" clone-node-max="2" --crm(active) # configure shownode pcmk-1 --node pcmk-2 --primitive WebData ocf:linbit:drbd \ -- params drbd_resource="wwwdata" \ -- op monitor interval="60s" --primitive WebFS ocf:heartbeat:Filesystem \ -- params device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" fstype="gfs2" --primitive WebSite ocf:heartbeat:apache \ -- params configfile="/etc/httpd/conf/httpd.conf" \ -- op monitor interval="1min" --primitive ClusterIP ocf:heartbeat:IPaddr2 \ -- params ip="192.168.122.101" cidr_netmask="32" clusterip_hash="sourceip" 
\ -- op monitor interval="30s" --ms WebDataClone WebData \ -- meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" --clone WebIP ClusterIP \ -- meta globally-unique="true" clone-max="2" clone-node-max="2" --colocation WebSite-with-WebFS inf: WebSite WebFS --colocation fs_on_drbd inf: WebFS WebDataClone:Master --colocation website-with-ip inf: WebSite WebIPorder WebFS-after-WebData inf: WebDataClone:promote WebFS:start --order WebSite-after-WebFS inf: WebFS WebSiteorder apache-after-ip inf: WebIP WebSite --property $id="cib-bootstrap-options" \ -- dc-version="1.1.5-bdd89e69ba545404d02445be1f3d72e6a203ba2f" \ -- cluster-infrastructure="openais" \ -- expected-quorum-votes="2" \ -- stonith-enabled="false" \ -- no-quorum-policy="ignore" --rsc_defaults $id="rsc-options" \ -- resource-stickiness="100" -+Daemon Status: -+ corosync: active/disabled -+ pacemaker: active/disabled -+ pcsd: active/enabled - ----- - --Notice how any constraints that referenced ClusterIP have been updated --to use WebIP instead. This is an additional benefit of using the crm --shell. --endif::[] -+If desired, you can demonstrate that all request buckets are working -+by using a tool such as `arping` from several source hosts -+to see which host responds to each. - --Next we need to convert the filesystem and Apache resources into --clones. -+== Clone the Filesystem and Apache Resources == - --ifdef::pcs[] -+Now that we have a cluster filesystem ready to go, -+and our nodes can load-balance requests to a shared IP address, -+we can configure the cluster so both nodes mount the filesystem -+and respond to web requests. -+ -+Clone the filesystem and Apache resources in a new configuration. - Notice how pcs automatically updates the relevant constraints again. 
--[source,C] - ---- --# pcs -f active_cfg resource clone WebFS --# pcs -f active_cfg resource clone WebSite --# pcs -f active_cfg constraint -+[root@pcmk-1 ~]# pcs cluster cib active_cfg -+[root@pcmk-1 ~]# pcs -f active_cfg resource clone WebFS -+[root@pcmk-1 ~]# pcs -f active_cfg resource clone WebSite -+[root@pcmk-1 ~]# pcs -f active_cfg constraint - Location Constraints: - Ordering Constraints: -- start ClusterIP-clone then start WebSite-clone -- WebFS-clone then WebSite-clone -- promote WebDataClone then start WebFS-clone -+ start ClusterIP-clone then start WebSite-clone (kind:Mandatory) -+ promote WebDataClone then start WebFS-clone (kind:Mandatory) -+ start WebFS-clone then start WebSite-clone (kind:Mandatory) -+ start dlm-clone then start WebFS-clone (kind:Mandatory) - Colocation Constraints: -- WebSite-clone with ClusterIP-clone -- WebFS-clone with WebDataClone (with-rsc-role:Master) -- WebSite-clone with WebFS-clone -+ WebSite-clone with ClusterIP-clone (score:INFINITY) -+ WebFS-clone with WebDataClone (score:INFINITY) (with-rsc-role:Master) -+ WebSite-clone with WebFS-clone (score:INFINITY) -+ WebFS-clone with dlm-clone (score:INFINITY) - ---- --endif::[] - --ifdef::crmsh[] --Again, the shell will automatically update any relevant --constraints. -+Tell the cluster that it is now allowed to promote both instances to be DRBD -+Primary (aka. master). - --[source,C] - ----- --crm(active) # configure clone WebFSClone WebFS --crm(active) # configure clone WebSiteClone WebSite -+[root@pcmk-1 ~]# pcs -f active_cfg resource update WebDataClone master-max=2 - ----- --endif::[] -- --The last step is to tell the cluster that it is now allowed to promote --both instances to be Primary (aka. Master). 
- --ifdef::pcs[] --[source,C] ------- --# pcs -f active_cfg resource update WebDataClone master-max=2 ------- --endif::[] -- --ifdef::crmsh[] --[source,C] ------- --crm(active) # configure edit WebDataClone -+Finally, load our configuration to the cluster, and re-enable the WebFS resource -+(which we disabled earlier). - ----- -- --Change master-max to 2 -- --[source,C] ------- --crm(active) # configure show --node pcmk-1 --node pcmk-2 --primitive WebData ocf:linbit:drbd \ -- params drbd_resource="wwwdata" \ -- op monitor interval="60s" --primitive WebFS ocf:heartbeat:Filesystem \ -- params device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" fstype="gfs2" --primitive WebSite ocf:heartbeat:apache \ -- params configfile="/etc/httpd/conf/httpd.conf" \ -- op monitor interval="1min" --primitive ClusterIP ocf:heartbeat:IPaddr2 \ -- params ip="192.168.122.101" cidr_netmask="32" clusterip_hash="sourceip" \ -- op monitor interval="30s" --ms WebDataClone WebData \ -- meta master-max="2" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" --clone WebFSClone WebFSclone WebIP ClusterIP \ -- meta globally-unique="true" clone-max="2" clone-node-max="2" --clone WebSiteClone WebSitecolocation WebSite-with-WebFS inf: WebSiteClone WebFSClone --colocation fs_on_drbd inf: WebFSClone WebDataClone:Master --colocation website-with-ip inf: WebSiteClone WebIP --order WebFS-after-WebData inf: WebDataClone:promote WebFSClone:start --order WebSite-after-WebFS inf: WebFSClone WebSiteClone --order apache-after-ip inf: WebIP WebSiteClone --property $id="cib-bootstrap-options" \ -- dc-version="1.1.5-bdd89e69ba545404d02445be1f3d72e6a203ba2f" \ -- cluster-infrastructure="openais" \ -- expected-quorum-votes="2" \ -- stonith-enabled="false" \ -- no-quorum-policy="ignore" --rsc_defaults $id="rsc-options" \ -- resource-stickiness="100" ------- --endif::[] -- --Review the configuration before uploading it to the cluster, quitting the --shell and watching the cluster's response -- 
--ifdef::pcs[] --[source,C] ------- --# pcs cluster cib-push active_cfg --# pcs resource enable WebFS -+[root@pcmk-1 ~]# pcs cluster cib-push active_cfg -+CIB updated -+[root@pcmk-1 ~]# pcs resource enable WebFS - ----- - --After all the processes are started the status should look --similar to this. -- --[source,C] -+After all the processes are started, the status should look similar to this. - ----- --# pcs resource -+[root@pcmk-1 ~]# pcs resource - Master/Slave Set: WebDataClone [WebData] -- Masters: [ pcmk-2 pcmk-1 ] -+ Masters: [ pcmk-1 pcmk-2 ] - Clone Set: dlm-clone [dlm] -- Started: [ pcmk-2 pcmk-1 ] -+ Started: [ pcmk-1 pcmk-2 ] - Clone Set: ClusterIP-clone [ClusterIP] (unique) -- ClusterIP:0 (ocf::heartbeat:IPaddr2) Started -- ClusterIP:1 (ocf::heartbeat:IPaddr2) Started -+ ClusterIP:0 (ocf::heartbeat:IPaddr2): Started -+ ClusterIP:1 (ocf::heartbeat:IPaddr2): Started - Clone Set: WebFS-clone [WebFS] - Started: [ pcmk-1 pcmk-2 ] - Clone Set: WebSite-clone [WebSite] - Started: [ pcmk-1 pcmk-2 ] - ----- --endif::[] -- --ifdef::crmsh[] --[source,C] ------- --crm(active) # cib commit active --INFO: commited 'active' shadow CIB to the cluster --crm(active) # quit --bye --# crm_mon --============ --Last updated: Thu Sep 3 21:37:27 2009 --Stack: openais --Current DC: pcmk-2 - partition with quorum --Version: 1.1.5-bdd89e69ba545404d02445be1f3d72e6a203ba2f --2 Nodes configured, 2 expected votes --6 Resources configured. --============ -- --Online: [ pcmk-1 pcmk-2 ] - --Master/Slave Set: WebDataClone -- Masters: [ pcmk-1 pcmk-2 ] --Clone Set: WebIP Started: [ pcmk-1 pcmk-2 ] --Clone Set: WebFSClone Started: [ pcmk-1 pcmk-2 ] --Clone Set: WebSiteClone Started: [ pcmk-1 pcmk-2 ] --Clone Set: dlm_clone Started: [ pcmk-1 pcmk-2 ] ------- --endif::[] -+== Test Failover == - --=== Testing Recovery === -+Testing failover is left as an exercise for the reader. 
-+For example, you can put one node into standby mode, -+use `pcs status` to confirm that its ClusterIP clone was -+moved to the other node, and use `arping` to verify that -+packets are not being lost from any source host. - - [NOTE] --======= --TODO: Put one node into standby to demonstrate failover --======= -+==== -+You may find that when a failed node rejoins the cluster, -+both ClusterIP clones stay on one node, due to the -+resource stickiness. While this works fine, it effectively eliminates -+load-balancing and returns the cluster to an active-passive setup again. -+You can avoid this by disabling stickiness for the IP address resource: -+---- -+[root@pcmk-1 ~]# pcs resource meta ClusterIP resource-stickiness=0 -+---- -+==== -diff --git a/doc/Clusters_from_Scratch/en-US/Ch-Active-Passive.txt b/doc/Clusters_from_Scratch/en-US/Ch-Active-Passive.txt -index c91647b..eae49ea 100644 ---- a/doc/Clusters_from_Scratch/en-US/Ch-Active-Passive.txt -+++ b/doc/Clusters_from_Scratch/en-US/Ch-Active-Passive.txt -@@ -1,88 +1,75 @@ --= Creating an Active/Passive Cluster = -+= Create an Active/Passive Cluster = - --== Exploring the Existing Configuration == -+== Explore the Existing Configuration == - - When Pacemaker starts up, it automatically records the number and details --of the nodes in the cluster as well as which stack is being used and the -+of the nodes in the cluster, as well as which stack is being used and the - version of Pacemaker being used. - --This is what the base configuration should look like. 
-+The first few lines of output should look like this: - --ifdef::pcs[] --[source,C] - ---- --# pcs status --Last updated: Fri Sep 14 10:12:01 2012 --Last change: Fri Sep 14 09:51:55 2012 via crmd on pcmk-2 -+[root@pcmk-1 ~]# pcs status -+Cluster name: mycluster -+WARNING: no stonith devices and stonith-enabled is not false -+Last updated: Tue Dec 16 16:15:29 2014 -+Last change: Tue Dec 16 15:49:47 2014 - Stack: corosync --Current DC: pcmk-1 (1) - partition with quorum --Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 --2 Nodes configured, unknown expected votes --0 Resources configured. -+Current DC: pcmk-2 (2) - partition with quorum -+Version: 1.1.12-a9c8177 -+2 Nodes configured -+0 Resources configured - --Online: [ pcmk-1 pcmk-2 ] - --Full list of resources: ------ --endif::[] -- --ifdef::crmsh[] --[source,C] ------ --# crm configure show --node $id="1702537408" pcmk-1 --node $id="1719314624" pcmk-2 --property $id="cib-bootstrap-options" \ -- dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ -- cluster-infrastructure="corosync" -+Online: [ pcmk-1 pcmk-2 ] - ---- --endif::[] -- --ifdef::pcs[] - --For those that are not of afraid of XML, you can see the raw cluster --configuration and status by using the +pcs cluster cib+ command. -+For those who are not afraid of XML, you can see the raw cluster -+configuration and status by using the `pcs cluster cib` command. - - .The last XML you'll see in this document - ====== --[source,C] - ---- --# pcs cluster cib -+[root@pcmk-1 ~]# pcs cluster cib - ---- - [source,XML] - ---- -- -+ - - - -- -+ -+ - -+ - - - -- -- -+ -+ - - - - - -- -+ - - - - -+ - - - - -- -+ - - - - -+ - - - -@@ -91,150 +78,88 @@ configuration and status by using the +pcs cluster cib+ command. - - ---- - ====== --endif::[] -- --ifdef::crmsh[] --For those that are not of afraid of XML, you can see the raw configuration by appending "xml" to the previous command. 
- --.The last XML you'll see in this document --====== --[source,C] ------ --# crm configure show xml ------ --[source,XML] ------ -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- ------ --====== --endif::[] -- --Before we make any changes, its a good idea to check the validity of -+Before we make any changes, it's a good idea to check the validity of - the configuration. - --[source,C] - ---- --# crm_verify -L -V -+[root@pcmk-1 ~]# crm_verify -L -V - error: unpack_resources: Resource start-up disabled since no STONITH resources have been defined - error: unpack_resources: Either configure some or disable STONITH with the stonith-enabled option - error: unpack_resources: NOTE: Clusters with shared data need STONITH to ensure data integrity - Errors found during check: config not valid -- -V may provide more details - ---- - - As you can see, the tool has found some errors. - --In order to guarantee the safety of your data -+In order to guarantee the safety of your data, - footnote:[If the data is corrupt, there is little point in continuing to make it available] --, the default for STONITH -+the default for STONITH - footnote:[A common node fencing mechanism. Used to ensure data integrity by powering off "bad" nodes] --in Pacemaker is +enabled+. However it also knows when no STONITH configuration has been -+in Pacemaker is *enabled*. However, it also knows when no STONITH configuration has been - supplied and reports this as a problem (since the cluster would not be - able to make progress if a situation requiring node fencing arose). - --For now, we will disable this feature and configure it later in the --Configuring STONITH section. It is important to note that the use of --STONITH is highly encouraged, turning it off tells the cluster to --simply pretend that failed nodes are safely powered off. Some vendors --will even refuse to support clusters that have it disabled. -+We will disable this feature for now and configure it later. 
- --To disable STONITH, we set the _stonith-enabled_ cluster option to --false. -+To disable STONITH, set the *stonith-enabled* cluster option to -+false: - --ifdef::pcs[] --[source,C] - ---- --# pcs property set stonith-enabled=false --# crm_verify -L -+[root@pcmk-1 ~]# pcs property set stonith-enabled=false -+[root@pcmk-1 ~]# crm_verify -L - ---- --endif::[] -- --ifdef::crmsh[] --[source,C] ------ --# crm configure property stonith-enabled=false --# crm_verify -L ------ --endif::[] - - With the new cluster option set, the configuration is now valid. - - [WARNING] - ========= -- --The use of stonith-enabled=false is completely inappropriate for a --production cluster. We use it here to defer the discussion of its --configuration which can differ widely from one installation to the --next. See <<_what_is_stonith>> for information on why STONITH is important -+The use of `stonith-enabled=false` is completely inappropriate for a -+production cluster. It tells the cluster to simply pretend that failed nodes -+are safely powered off. Some vendors will refuse to support clusters that have -+STONITH disabled. -+ -+We disable STONITH here only to defer the discussion of its -+configuration, which can differ widely from one installation to the -+next. See <<_what_is_stonith>> for information on why STONITH is important - and details on how to configure it. -- - ========= - --== Adding a Resource == -+== Add a Resource == - --The first thing we should do is configure an IP address. Regardless of --where the cluster service(s) are running, we need a consistent address --to contact them on. Here I will choose and add 192.168.122.120 as the --floating address, give it the imaginative name ClusterIP and tell the --cluster to check that its running every 30 seconds. -+Our first resource will be a unique IP address that the cluster can bring up on -+either node. Regardless of where any cluster service(s) are running, end -+users need a consistent address to contact them on. 
Here, I will choose -+192.168.122.120 as the floating address, give it the imaginative name ClusterIP -+and tell the cluster to check whether it is running every 30 seconds. - -- --[IMPORTANT] -+[WARNING] - =========== --The chosen address must not be one already associated with --a physical node -+The chosen address must not already be in use on the network. -+Do not reuse an IP address one of the nodes already has configured. - =========== - --//// --No syntax highlighting here to avoid line munging with source,C --//// --ifdef::pcs[] - ---- --# pcs resource create ClusterIP ocf:heartbeat:IPaddr2 \ -- ip=192.168.0.120 cidr_netmask=32 op monitor interval=30s -+[root@pcmk-1 ~]# pcs resource create ClusterIP ocf:heartbeat:IPaddr2 \ -+ ip=192.168.122.120 cidr_netmask=32 op monitor interval=30s - ---- --endif::[] - --ifdef::crmsh[] ------ --# crm configure primitive ClusterIP ocf:heartbeat:IPaddr2 \ -- params ip=192.168.122.120 cidr_netmask=32 \ -- op monitor interval=30s ------ --endif::[] -+Another important piece of information here is *ocf:heartbeat:IPaddr2*. -+This tells Pacemaker three things about the resource you want to add: -+ -+* The first field (*ocf* in this case) is the standard to which the resource -+script conforms and where to find it. - --The other important piece of information here is ocf:heartbeat:IPaddr2. -+* The second field (*heartbeat* in this case) is standard-specific; for OCF -+resources, it tells the cluster which OCF namespace the resource script is in. - --This tells Pacemaker three things about the resource you want to --add. The first field, ocf, is the standard to which the resource --script conforms to and where to find it. The second field is specific --to OCF resources and tells the cluster which namespace to find the --resource script in, in this case heartbeat. The last field indicates --the name of the resource script. -+* The third field (*IPaddr2* in this case) is the name of the resource script. 
- --ifdef::pcs[] --To obtain a list of the available resource standards (the ocf part of --ocf:heartbeat:IPaddr2), run -+To obtain a list of the available resource standards (the *ocf* part of -+*ocf:heartbeat:IPaddr2*), run: - --[source,C] - ---- --# pcs resource standards -+[root@pcmk-1 ~]# pcs resource standards - ocf - lsb - service -@@ -242,24 +167,20 @@ systemd - stonith - ---- - --To obtain a list of the available ocf resource providers (the heartbeat --part of ocf:heartbeat:IPaddr2), run -+To obtain a list of the available OCF resource providers (the *heartbeat* -+part of *ocf:heartbeat:IPaddr2*), run: - --[source,C] - ---- --# pcs resource providers -+[root@pcmk-1 ~]# pcs resource providers - heartbeat --linbit - pacemaker --redhat - ---- - - Finally, if you want to see all the resource agents available for --a specific ocf provider (the IPaddr2 part of ocf:heartbeat:IPaddr2), run -+a specific OCF provider (the *IPaddr2* part of *ocf:heartbeat:IPaddr2*), run: - --[source,C] - ---- --# pcs resource agents ocf:heartbeat -+[root@pcmk-1 ~]# pcs resource agents ocf:heartbeat - AoEtarget - AudibleAlarm - CTDB -@@ -273,411 +194,232 @@ IPaddr2 - . - . - . 
--symlink --syslog-ng - tomcat -+varnish - vmware -+zabbixserver - ---- --endif::[] - --ifdef::crmsh[] -+Now, verify that the IP resource has been added, and display the cluster's -+status to see that it is now active: - --To obtain a list of the available resource classes, run -- --[source,C] ------ --# crm ra classes --heartbeat --lsb --ocf / heartbeat pacemaker --stonith - ---- -- --To then find all the OCF resource agents provided by Pacemaker and --Heartbeat, run -- --[source,C] ------ --# crm ra list ocf pacemaker --ClusterMon Dummy HealthCPU HealthSMART Stateful SysInfo --SystemHealth controld o2cb ping pingd --# crm ra list ocf heartbeat --AoEtarget AudibleAlarm CTDB ClusterMon --Delay Dummy EvmsSCC Evmsd --Filesystem ICP IPaddr IPaddr2 --IPsrcaddr IPv6addr LVM LinuxSCSI --MailTo ManageRAID ManageVE Pure-FTPd --Raid1 Route SAPDatabase SAPInstance --SendArp ServeRAID SphinxSearchDaemon Squid --Stateful SysInfo VIPArip VirtualDomain --WAS WAS6 WinPopup Xen --Xinetd anything apache conntrackd --db2 drbd eDir88 ethmonitor --exportfs fio iSCSILogicalUnit iSCSITarget --ids iscsi jboss ldirectord --lxc mysql mysql-proxy nfsserver --nginx oracle oralsnr pgsql --pingd portblock postfix proftpd --rsyncd scsi2reservation sfex symlink --syslog-ng tomcat vmware ------ --endif::[] -- --Now verify that the IP resource has been added and display the cluster's --status to see that it is now active. -- --ifdef::pcs[] --[source,C] ------ --# pcs status -- --Last updated: Fri Sep 14 10:17:00 2012 --Last change: Fri Sep 14 10:15:48 2012 via cibadmin on pcmk-1 -+[root@pcmk-1 ~]# pcs status -+Cluster name: mycluster -+Last updated: Tue Dec 16 17:44:40 2014 -+Last change: Tue Dec 16 17:44:26 2014 - Stack: corosync - Current DC: pcmk-1 (1) - partition with quorum --Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 --2 Nodes configured, unknown expected votes --1 Resources configured. 
-+Version: 1.1.12-a9c8177 -+2 Nodes configured -+1 Resources configured -+ - - Online: [ pcmk-1 pcmk-2 ] - - Full list of resources: - -- ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 ------ --endif::[] -- --ifdef::crmsh[] --[source,C] ------ --# crm configure show --node $id="1702537408" pcmk-1 --node $id="1719314624" pcmk-2 --primitive ClusterIP ocf:heartbeat:IPaddr2 \ -- params ip="192.168.122.120" cidr_netmask="32" \ -- op monitor interval="30s" --property $id="cib-bootstrap-options" \ -- dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ -- cluster-infrastructure="corosync" \ -- stonith-enabled="false" --# crm_mon -1 --============ --Last updated: Tue Apr 3 09:56:50 2012 --Last change: Tue Apr 3 09:54:37 2012 via cibadmin on pcmk-1 --Stack: corosync --Current DC: pcmk-1 (1702537408) - partition with quorum --Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff --2 Nodes configured, unknown expected votes --1 Resources configured. --============ -+ ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 - --Online: [ pcmk-1 pcmk-2 ] -+PCSD Status: -+ pcmk-1: Online -+ pcmk-2: Online - -- ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 -+Daemon Status: -+ corosync: active/disabled -+ pacemaker: active/disabled -+ pcsd: active/enabled - ---- --endif::[] - - == Perform a Failover == - --Being a high-availability cluster, we should test failover of our new --resource before moving on. -+Since our ultimate goal is high availability, we should test failover of -+our new resource before moving on. - - First, find the node on which the IP address is running. 
- --ifdef::pcs[] --[source,C] - ---- --# pcs status -- --Last updated: Fri Sep 14 10:17:00 2012 --Last change: Fri Sep 14 10:15:48 2012 via cibadmin on pcmk-1 -+[root@pcmk-1 ~]# pcs status -+Cluster name: mycluster -+Last updated: Tue Dec 16 17:44:40 2014 -+Last change: Tue Dec 16 17:44:26 2014 - Stack: corosync - Current DC: pcmk-1 (1) - partition with quorum --Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 --2 Nodes configured, unknown expected votes --1 Resources configured. -+Version: 1.1.12-a9c8177 -+2 Nodes configured -+1 Resources configured -+ - - Online: [ pcmk-1 pcmk-2 ] - - Full list of resources: - -- ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 -+ ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 - ---- --endif::[] - --ifdef::crmsh[] --[source,C] -+You can see that the status of the *ClusterIP* resource -+is *Started* on a particular node (in this example, *pcmk-1*). -+Shut down Pacemaker and Corosync on that machine to trigger a failover. -+ - ---- --# crm resource status ClusterIP --resource ClusterIP is running on: pcmk-1 -+[root@pcmk-1 ~]# pcs cluster stop pcmk-1 -+Stopping Cluster... - ---- --endif::[] - --Shut down Pacemaker and Corosync on that machine. -+[NOTE] -+====== -+A cluster command such as +pcs cluster stop pass:[nodename]+ can be run -+from any node in the cluster, not just the affected node. -+====== - --ifdef::pcs[] --[source,C] -+Verify that pacemaker and corosync are no longer running: - ---- --#pcs cluster stop pcmk-1 --Stopping Cluster... -+[root@pcmk-1 ~]# pcs status -+Error: cluster is not currently running on this node - ---- - --Once Corosync is no longer running, go to the other node and check the --cluster status. -+Go to the other node, and check the cluster status. 
- --[source,C] - ---- --# pcs status -- --Last updated: Fri Sep 14 10:31:01 2012 --Last change: Fri Sep 14 10:15:48 2012 via cibadmin on pcmk-1 -+[root@pcmk-2 ~]# pcs status -+Cluster name: mycluster -+Last updated: Wed Dec 17 10:30:56 2014 -+Last change: Tue Dec 16 17:44:26 2014 - Stack: corosync --Current DC: pcmk-2 (2) - partition WITHOUT quorum --Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 --2 Nodes configured, unknown expected votes --1 Resources configured. -+Current DC: pcmk-2 (2) - partition with quorum -+Version: 1.1.12-a9c8177 -+2 Nodes configured -+1 Resources configured -+ - - Online: [ pcmk-2 ] - OFFLINE: [ pcmk-1 ] - - Full list of resources: - -- ClusterIP (ocf::heartbeat:IPaddr2): Stopped ------ --endif::[] -- --ifdef::crmsh[] --[source,C] ------ --# ssh pcmk-1 -- service pacemaker stop --# ssh pcmk-1 -- service corosync stop ------ -+ ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 - --Once Corosync is no longer running, go to the other node and check the --cluster status with crm_mon. -+PCSD Status: -+ pcmk-1: Online -+ pcmk-2: Online - --[source,C] ------ --# crm_mon -1 --============ --Last updated: Tue Apr 3 10:01:28 2012 --Last change: Tue Apr 3 09:54:39 2012 via cibadmin on pcmk-1 --Stack: corosync --Current DC: pcmk-2 (1719314624) - partition WITHOUT quorum --Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff --2 Nodes configured, unknown expected votes --1 Resources configured. --============ -- --Online: [ pcmk-2 ] --OFFLINE: [ pcmk-1 ] -+Daemon Status: -+ corosync: active/disabled -+ pacemaker: active/disabled -+ pcsd: active/enabled - ---- --endif::[] - --There are three things to notice about the cluster's current --state. The first is that, as expected, +pcmk-1+ is now offline. However --we can also see that +ClusterIP+ isn't running anywhere! 
-+Notice that *pcmk-1* is *OFFLINE* for cluster purposes (its *PCSD* is still -+*Online*, allowing it to receive `pcs` commands, but it is not participating in -+the cluster). - -+Also notice that *ClusterIP* is now running on pcmk-2 -- failover happened -+automatically, and no errors are reported. - --=== Quorum and Two-Node Clusters === -- --This is because the cluster no longer has quorum, as can be seen by --the text "partition WITHOUT quorum" in the status output. In order --to reduce the possibility of data corruption, Pacemaker's default --behavior is to stop all resources if the cluster does not have quorum. -- --A cluster is said to have quorum when more than half the known or --expected nodes are online, or for the mathematically inclined, --whenever the following equation is true: -- -+[IMPORTANT] -+.Quorum -+==== -+If a cluster splits into two (or more) groups of nodes that can no longer -+communicate with each other (a.k.a. _partitions_), _quorum_ is used to prevent -+resources from starting on more nodes than desired, which would risk -+data corruption. -+ -+A cluster has quorum when more than half of all known nodes are online in -+the same partition, or for the mathematically inclined, whenever the following -+equation is true: - .... - total_nodes < 2 * active_nodes - .... - --Therefore a two-node cluster only has quorum when both nodes are --running, which is no longer the case for our cluster. This would --normally make the creation of a two-node cluster pointless --footnote:[Actually some would argue that two-node clusters are always pointless, but that is an argument for another time] --, however it is possible to control how Pacemaker behaves when quorum --is lost. In particular, we can tell the cluster to simply ignore --quorum altogether.
-- --ifdef::pcs[] --[source,C] ------ --# pcs property set no-quorum-policy=ignore --# pcs property --dc-version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 --cluster-infrastructure: corosync --stonith-enabled: false --no-quorum-policy: ignore ------ --endif::[] -- --ifdef::crmsh[] --[source,C] ------ --# crm configure property no-quorum-policy=ignore --# crm configure show --node $id="1702537408" pcmk-1 --node $id="1719314624" pcmk-2 --primitive ClusterIP ocf:heartbeat:IPaddr2 \ -- params ip="192.168.122.120" cidr_netmask="32" \ -- op monitor interval="30s" --property $id="cib-bootstrap-options" \ -- dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ -- cluster-infrastructure="corosync" \ -- stonith-enabled="false" \ -- no-quorum-policy="ignore" ------ --endif::[] -- --After a few moments, the cluster will start the IP address on the --remaining node. Note that the cluster still does not have quorum. -- --ifdef::pcs[] --[source,C] ------ --# pcs status --Last updated: Fri Sep 14 10:38:11 2012 --Last change: Fri Sep 14 10:37:53 2012 via cibadmin on pcmk-2 --Stack: corosync --Current DC: pcmk-2 (2) - partition WITHOUT quorum --Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 --2 Nodes configured, unknown expected votes --1 Resources configured. -- --Online: [ pcmk-2 ] --OFFLINE: [ pcmk-1 ] -- --Full list of resources: -- -- ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 ------ --endif::[] -- --ifdef::crmsh[] --[source,C] ------ --# crm_mon -1 --============ --Last updated: Tue Apr 3 10:02:46 2012 --Last change: Tue Apr 3 10:02:08 2012 via cibadmin on pcmk-2 --Stack: corosync --Current DC: pcmk-2 (1719314624) - partition WITHOUT quorum --Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff --2 Nodes configured, unknown expected votes --1 Resources configured. 
--============ -- --Online: [ pcmk-2 ] --OFFLINE: [ pcmk-1 ] -- -- ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 ------ --endif::[] -- --Now simulate node recovery by restarting the cluster stack on +pcmk-1+ and --check the cluster's status. Note, if you get an authentication error with --the 'pcs cluster start pcmk-1' command, you must authenticate on the node --using the 'pcs cluster auth pcmk pcmk-1 pcmk-2' command discussed earlier. -- --ifdef::pcs[] --[source,C] ------ --# pcs cluster start pcmk-1 --Starting Cluster... --# pcs status -- --Last updated: Fri Sep 14 10:42:56 2012 --Last change: Fri Sep 14 10:37:53 2012 via cibadmin on pcmk-2 -+For example, if a 5-node cluster split into 3- and 2-node partitions, -+the 3-node partition would have quorum and could continue serving resources. -+If a 6-node cluster split into two 3-node partitions, neither partition -+would have quorum; pacemaker's default behavior in such cases is to -+stop all resources, in order to prevent data corruption. -+ -+Two-node clusters are a special case. By the above definition, -+a two-node cluster would only have quorum when both nodes are -+running. This would make the creation of a two-node cluster pointless, -+footnote:[Some would argue that two-node clusters are always pointless, but that is an argument for another time] -+but corosync has the ability to treat two-node clusters as if only one node -+is required for quorum. -+ -+The `pcs cluster setup` command will automatically configure *two_node: 1* -+in +corosync.conf+, so a two-node cluster will "just work". -+ -+If you are using a different cluster shell, you will have to configure -++corosync.conf+ appropriately yourself. If you are using older versions of -+corosync, you will have to ignore quorum at the pacemaker level, using `pcs -+property set no-quorum-policy=ignore` (or the equivalent command if you are -+using a different cluster shell).
-+==== -+ -+Now, simulate node recovery by restarting the cluster stack on *pcmk-1*, and -+check the cluster's status. -+ -+---- -+[root@pcmk-1 ~]# pcs cluster start pcmk-1 -+pcmk-1: Starting Cluster... -+[root@pcmk-1 ~]# pcs status -+Cluster name: mycluster -+Last updated: Wed Dec 17 10:50:11 2014 -+Last change: Tue Dec 16 17:44:26 2014 - Stack: corosync - Current DC: pcmk-2 (2) - partition with quorum --Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 --2 Nodes configured, unknown expected votes --1 Resources configured. -+Version: 1.1.12-a9c8177 -+2 Nodes configured -+1 Resources configured -+ - - Online: [ pcmk-1 pcmk-2 ] - - Full list of resources: - -- ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 ------ --endif::[] -- --ifdef::crmsh[] --[source,C] ------ --# service corosync start --Starting Corosync Cluster Engine (corosync): [ OK ] --# service pacemaker start --Starting Pacemaker Cluster Manager: [ OK ] --# crm_mon --============ --Last updated: Fri Aug 28 15:32:13 2009 --Stack: openais --Current DC: pcmk-2 - partition with quorum --Version: 1.1.5-bdd89e69ba545404d02445be1f3d72e6a203ba2f --2 Nodes configured, 2 expected votes --1 Resources configured. --============ --Online: [ pcmk-1 pcmk-2 ] -+ ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 -+ -+PCSD Status: -+ pcmk-1: Online -+ pcmk-2: Online - --ClusterIP (ocf::heartbeat:IPaddr): Started pcmk-2 -+Daemon Status: -+ corosync: active/disabled -+ pacemaker: active/disabled -+ pcsd: active/enabled - ---- --endif::[] - - [NOTE] - ====== --In the dark days, the cluster may have moved the IP back to its --original location (+pcmk-1+). Usually this is no longer the case. -+With older versions of pacemaker, the cluster might move the IP back to its -+original location (*pcmk-1*). Usually, this is no longer the case. 
- ====== - --=== Prevent Resources from Moving after Recovery === -+== Prevent Resources from Moving after Recovery == - - In most circumstances, it is highly desirable to prevent healthy - resources from being moved around the cluster. Moving resources almost --always requires a period of downtime. For complex services like Oracle -+always requires a period of downtime. For complex services such as - databases, this period can be quite long. - --To address this, Pacemaker has the concept of resource stickiness --which controls how much a service prefers to stay running where it -+To address this, Pacemaker has the concept of resource _stickiness_, -+which controls how strongly a service prefers to stay running where it - is. You may like to think of it as the "cost" of any downtime. By - default, Pacemaker assumes there is zero cost associated with moving - resources and will do so to achieve "optimal" --footnote:[It should be noted that Pacemaker's definition of --optimal may not always agree with that of a human's. The order in which --Pacemaker processes lists of resources and nodes creates implicit --preferences in situations where the administrator has not explicitly --specified them] -+footnote:[Pacemaker's definition of optimal may not always agree with that of a -+human's. The order in which Pacemaker processes lists of resources and nodes -+creates implicit preferences in situations where the administrator has not -+explicitly specified them.] - resource placement. We can specify a different stickiness for every - resource, but it is often sufficient to change the default. 
- --ifdef::pcs[] --[source,C] - ---- --# pcs resource defaults resource-stickiness=100 --# pcs resource defaults -+[root@pcmk-1 ~]# pcs resource defaults resource-stickiness=100 -+[root@pcmk-1 ~]# pcs resource defaults - resource-stickiness: 100 - ---- --endif::[] -- --ifdef::crmsh[] --[source,C] ------ --# crm configure rsc_defaults resource-stickiness=100 --# crm configure show --node $id="1702537408" pcmk-1 --node $id="1719314624" pcmk-2 --primitive ClusterIP ocf:heartbeat:IPaddr2 \ -- params ip="192.168.122.120" cidr_netmask="32" \ -- op monitor interval="30s" --property $id="cib-bootstrap-options" \ -- dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ -- cluster-infrastructure="corosync" \ -- stonith-enabled="false" \ -- no-quorum-policy="ignore" --rsc_defaults $id="rsc-options" \ -- resource-stickiness="100" ------ --endif::[] -+ -+[NOTE] -+====== -+Earlier versions of pcs, such as the one shipped with Fedora 20, -+require that `rsc` be added after `resource` in the above commands. -+====== -diff --git a/doc/Clusters_from_Scratch/en-US/Ch-Apache.txt b/doc/Clusters_from_Scratch/en-US/Ch-Apache.txt -index 71777db..cbb1669 100644 ---- a/doc/Clusters_from_Scratch/en-US/Ch-Apache.txt -+++ b/doc/Clusters_from_Scratch/en-US/Ch-Apache.txt -@@ -1,324 +1,171 @@ --= Apache - Adding More Services = -+= Add Apache as a Cluster Service = - --== Forward == - Now that we have a basic but functional active/passive two-node cluster, - we're ready to add some real services. We're going to start with Apache --because its a feature of many clusters and relatively simple to -+because it is a feature of many clusters and relatively simple to - configure. - --== Installation == -+== Install Apache == - - Before continuing, we need to make sure Apache is installed on both - hosts. We also need the wget tool in order for the cluster to be able to check - the status of the Apache server. - --[source,C] -+---- - # yum install -y httpd wget -+---- - --..... 
--Loaded plugins: langpacks, presto, refresh-packagekit --fedora/metalink | 2.6 kB 00:00 --updates/metalink | 3.2 kB 00:00 --updates-testing/metalink | 41 kB 00:00 --Resolving Dependencies ----> Running transaction check -----> Package httpd.x86_64 0:2.2.22-3.fc17 will be installed ----> Processing Dependency: httpd-tools = 2.2.22-3.fc17 for package: httpd-2.2.22-3.fc17.x86_64 ----> Processing Dependency: apr-util-ldap for package: httpd-2.2.22-3.fc17.x86_64 ----> Processing Dependency: libaprutil-1.so.0()(64bit) for package: httpd-2.2.22-3.fc17.x86_64 ----> Processing Dependency: libapr-1.so.0()(64bit) for package: httpd-2.2.22-3.fc17.x86_64 ----> Running transaction check -----> Package apr.x86_64 0:1.4.6-1.fc17 will be installed -----> Package apr-util.x86_64 0:1.4.1-2.fc17 will be installed -----> Package apr-util-ldap.x86_64 0:1.4.1-2.fc17 will be installed -----> Package httpd-tools.x86_64 0:2.2.22-3.fc17 will be installed ----> Finished Dependency Resolution -- --Dependencies Resolved -- --===================================================================================== -- Package Arch Version Repository Size --===================================================================================== --Installing: -- httpd x86_64 2.2.22-3.fc17 updates-testing 823 k -- wget x86_64 1.13.4-2.fc17 fedora 495 k --Installing for dependencies: -- apr x86_64 1.4.6-1.fc17 fedora 99 k -- apr-util x86_64 1.4.1-2.fc17 fedora 78 k -- apr-util-ldap x86_64 1.4.1-2.fc17 fedora 17 k -- httpd-tools x86_64 2.2.22-3.fc17 updates-testing 74 k -- --Transaction Summary --===================================================================================== --Install 1 Package (+4 Dependent packages) -- --Total download size: 1.1 M --Installed size: 3.5 M --Downloading Packages: --(1/6): apr-1.4.6-1.fc17.x86_64.rpm | 99 kB 00:00 --(2/6): apr-util-1.4.1-2.fc17.x86_64.rpm | 78 kB 00:00 --(3/6): apr-util-ldap-1.4.1-2.fc17.x86_64.rpm | 17 kB 00:00 --(4/6): httpd-2.2.22-3.fc17.x86_64.rpm 
| 823 kB 00:01 --(5/6): httpd-tools-2.2.22-3.fc17.x86_64.rpm | 74 kB 00:00 --(6/6): wget-1.13.4-2.fc17.x86_64.rpm | 495 kB 00:01 --------------------------------------------------------------------------------------- --Total 238 kB/s | 1.1 MB 00:04 --Running Transaction Check --Running Transaction Test --Transaction Test Succeeded --Running Transaction -- Installing : apr-1.4.6-1.fc17.x86_64 1/6 -- Installing : apr-util-1.4.1-2.fc17.x86_64 2/6 -- Installing : apr-util-ldap-1.4.1-2.fc17.x86_64 3/6 -- Installing : httpd-tools-2.2.22-3.fc17.x86_64 4/6 -- Installing : httpd-2.2.22-3.fc17.x86_64 5/6 -- Installing : wget-1.13.4-2.fc17.x86_64 6/6 -- Verifying : apr-util-ldap-1.4.1-2.fc17.x86_64 1/6 -- Verifying : httpd-tools-2.2.22-3.fc17.x86_64 2/6 -- Verifying : apr-util-1.4.1-2.fc17.x86_64 3/6 -- Verifying : apr-1.4.6-1.fc17.x86_64 4/6 -- Verifying : httpd-2.2.22-3.fc17.x86_64 5/6 -- Verifying : wget-1.13.4-2.fc17.x86_64 6/6 -- --Installed: -- httpd.x86_64 0:2.2.22-3.fc17 wget.x86_64 0:1.13.4-2.fc17 -- --Dependency Installed: -- apr.x86_64 0:1.4.6-1.fc17 apr-util.x86_64 0:1.4.1-2.fc17 -- apr-util-ldap.x86_64 0:1.4.1-2.fc17 httpd-tools.x86_64 0:2.2.22-3.fc17 -- --Complete! --..... -- --== Preparation == -- --First we need to create a page for Apache to serve up. On Fedora the --default Apache docroot is /var/www/html, so we'll create an index file --there. -- --[source,C] ------- --# cat <<-END >/var/www/html/index.html -- -- My Test Site - pcmk-1 -- --END ------- -+[IMPORTANT] -+==== -+Do *not* enable the httpd service. Services that are intended to -+be managed via the cluster software should never be managed by the OS. -+ -+It is often useful, however, to manually start the service, verify that -+it works, then stop it again, before adding it to the cluster. This -+allows you to resolve any non-cluster-related problems before continuing. -+Since this is a simple example, we'll skip that step here. 
-+==== - --For the moment, we will simplify things by serving up only a static site --and manually sync the data between the two nodes. So run the command --again on pcmk-2. -+== Create Website Documents == -+ -+We need to create a page for Apache to serve. On Fedora, the -+default Apache document root is /var/www/html, so we'll create an index file -+there. For the moment, we will simplify things by serving a static site -+and manually synchronizing the data between the two nodes, so run this command -+on both nodes: - --[source,C] - ----- --[root@pcmk-2 ~]# cat <<-END >/var/www/html/index.html -+# cat <<-END >/var/www/html/index.html - <html> -- <body>My Test Site - pcmk-2</body> -+ <body>My Test Site - $(hostname)</body> - </html> -- END -+END - ----- - - == Enable the Apache status URL == - - In order to monitor the health of your Apache instance, and recover it if - it fails, the resource agent used by Pacemaker assumes the server-status --URL is available. Look for the following in '/etc/httpd/conf/httpd.conf' --and make sure it is not disabled or commented out: -+URL is available. On both nodes, enable the URL with: - --[source,Apache Configuration] ------- --<Location /server-status> -- SetHandler server-status -- Order deny,allow -- Deny from all -- Allow from 127.0.0.1 --</Location> ------- -+---- -+# cat <<-END >/etc/httpd/conf.d/status.conf -+ <Location /server-status> -+ SetHandler server-status -+ Order deny,allow -+ Deny from all -+ Allow from 127.0.0.1 -+ </Location> -+END -+---- -+ -+[NOTE] -+====== -+If you are using a different operating system or an earlier version of Fedora, -+server-status may already be enabled or may be configurable in a different -+location. -+====== - --== Update the Configuration == -+== Configure the Cluster == - --At this point, Apache is ready to go, all that needs to be done is to --add it to the cluster. Lets call the resource WebSite.
We need to use --an OCF script called apache in the heartbeat namespace --footnote:[Compare the key used here ocf:heartbeat:apache with the one we used earlier for the IP address: ocf:heartbeat:IPaddr2] --, the only required parameter is the path to the main Apache --configuration file and we'll tell the cluster to check once a --minute that apache is still running. -+At this point, Apache is ready to go, and all that needs to be done is to -+add it to the cluster. Let's call the resource WebSite. We need to use -+an OCF resource script called apache in the heartbeat namespace. -+footnote:[Compare the key used here, *ocf:heartbeat:apache*, with the one we -+used earlier for the IP address, *ocf:heartbeat:IPaddr2*] -+The script's only required parameter is the path to the main Apache -+configuration file, and we'll tell the cluster to check once a -+minute that Apache is still running. - --ifdef::pcs[] --//// --source,C doesn't deal well with \'s --//// ------- --pcs resource create WebSite ocf:heartbeat:apache \ -+---- -+[root@pcmk-1 ~]# pcs resource create WebSite ocf:heartbeat:apache \ - configfile=/etc/httpd/conf/httpd.conf \ -- statusurl="http://localhost/server-status" op monitor interval=1min ------- -+ statusurl="http://localhost/server-status" \ -+ op monitor interval=1min -+---- - --By default, the operation timeout for all resource's start, stop, and monitor --operations is 20 seconds. In many cases this timeout period is less than --the advised timeout period. For the purposes of this tutorial, we will --adjust the global operation timeout default to 240 seconds. -+By default, the operation timeout for all resources' start, stop, and monitor -+operations is 20 seconds. In many cases, this timeout period is less than -+a particular resource's advised timeout period. For the purposes of this -+tutorial, we will adjust the global operation timeout default to 240 seconds. 
- --[source,C] ------- --# pcs resource op defaults timeout=240s --# pcs resource op defaults -+---- -+[root@pcmk-1 ~]# pcs resource op defaults timeout=240s -+[root@pcmk-1 ~]# pcs resource op defaults - timeout: 240s ------- -- --endif::[] -- --ifdef::crmsh[] --[source,Bash] ------- --# crm configure primitive WebSite ocf:heartbeat:apache \ -- params configfile=/etc/httpd/conf/httpd.conf \ -- statusurl="http://localhost/server-status" \ -- op monitor interval=1min --WARNING: WebSite: default timeout 20s for start is smaller than the advised 40s --WARNING: WebSite: default timeout 20s for stop is smaller than the advised 60s ------- -- --The easiest way resolve this, is to change the default: -+---- - --[source,Bash] ------- --# crm configure op_defaults timeout=240s --# crm configure show --node $id="1702537408" pcmk-1 --node $id="1719314624" pcmk-2 --primitive ClusterIP ocf:heartbeat:IPaddr2 \ -- params ip="192.168.122.120" cidr_netmask="32" \ -- op monitor interval="30s" --primitive WebSite ocf:heartbeat:apache \ -- params configfile="/etc/httpd/conf/httpd.conf" \ -- op monitor interval="1min" --property $id="cib-bootstrap-options" \ -- dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ -- cluster-infrastructure="corosync" \ -- stonith-enabled="false" \ -- no-quorum-policy="ignore" --rsc_defaults $id="rsc-options" \ -- resource-stickiness="100" --op_defaults $id="op-options" \ -- timeout="240s" ------- --endif::[] -+[NOTE] -+====== -+In a production cluster, it is usually better to adjust each resource's -+start, stop, and monitor timeouts to values that are appropriate to -+the behavior observed in your environment, rather than adjust -+the global default. -+====== - --After a short delay, we should see the cluster start apache -+After a short delay, we should see the cluster start Apache. 
- --ifdef::pcs[] --[source,C] - ----- --# pcs status -- --Last updated: Fri Sep 14 10:51:27 2012 --Last change: Fri Sep 14 10:50:46 2012 via crm_attribute on pcmk-1 -+[root@pcmk-1 ~]# pcs status -+Cluster name: mycluster -+Last updated: Wed Dec 17 12:40:41 2014 -+Last change: Wed Dec 17 12:40:05 2014 - Stack: corosync - Current DC: pcmk-2 (2) - partition with quorum --Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 --2 Nodes configured, unknown expected votes --2 Resources configured. -+Version: 1.1.12-a9c8177 -+2 Nodes configured -+2 Resources configured -+ - - Online: [ pcmk-1 pcmk-2 ] - - Full list of resources: - -- ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 -- WebSite (ocf::heartbeat:apache): Started pcmk-1 ------- --endif::[] -- --ifdef::crmsh[] --[source,C] ------- --# crm_mon -1 --============ --Last updated: Tue Apr 3 11:54:29 2012 --Last change: Tue Apr 3 11:54:26 2012 via crmd on pcmk-1 --Stack: corosync --Current DC: pcmk-1 (1702537408) - partition with quorum --Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff --2 Nodes configured, unknown expected votes --2 Resources configured. --============ -+ ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 -+ WebSite (ocf::heartbeat:apache): Started pcmk-1 - --Online: [ pcmk-1 pcmk-2 ] -+PCSD Status: -+ pcmk-1: Online -+ pcmk-2: Online - --ClusterIP (ocf:heartbeat:IPaddr2): Started pcmk-2 --WebSite (ocf:heartbeat:apache): Started pcmk-1 -+Daemon Status: -+ corosync: active/disabled -+ pacemaker: active/disabled -+ pcsd: active/enabled - ----- --endif::[] - - Wait a moment, the WebSite resource isn't running on the same host as our - IP address! - --ifdef::pcs[] - [NOTE] - ====== - If, in the `pcs status` output, you see the WebSite resource has - failed to start, then you've likely not enabled the status URL correctly. --You can check if this is the problem by running: -- --.... --wget http://127.0.0.1/server-status --.... 
-- --If you see +Connection refused+ in the output, then this is indeed the --problem. Check to ensure that +Allow from 127.0.0.1+ is present for --the +<Location /server-status>+ block. -- --====== --endif::[] -- --ifdef::crmsh[] --[NOTE] --====== --If, in the `crm_mon` output, you see: -- --.... --Failed actions: -- WebSite_start_0 (node=pcmk-2, call=301, rc=1, status=complete): unknown error --.... -- --Then you've likely not enabled the status URL correctly. --You can check if this is the problem by running: -+You can check whether this is the problem by running: - - .... --wget http://127.0.0.1/server-status -+wget -O - http://127.0.0.1/server-status - .... - --If you see +Connection refused+ in the output, then this is indeed the --problem. Check to ensure that +Allow from 127.0.0.1+ is present for --the +<Location /server-status>+ block. -+If you see *Connection refused* in the output, then this is likely the -+problem. Ensure that *Allow from 127.0.0.1* is present for -+the *<Location /server-status>* block. - - ====== --endif::[] - --== Ensuring Resources Run on the Same Host == -+== Ensure Resources Run on the Same Host == - - To reduce the load on any one machine, Pacemaker will generally try to --spread the configured resources across the cluster nodes. However we -+spread the configured resources across the cluster nodes. However, we - can tell the cluster that two resources are related and need to run on --the same host (or not at all). Here we instruct the cluster that -+the same host (or not at all). Here, we instruct the cluster that - WebSite can only run on the host that ClusterIP is active on. - --ifdef::pcs[] --To achieve this we use a colocation constraint that indicates it is -+To achieve this, we use a _colocation constraint_ that indicates it is - mandatory for WebSite to run on the same node as ClusterIP. The - "mandatory" part of the colocation constraint is indicated by using a - score of INFINITY. The INFINITY score also means that if ClusterIP is not - active anywhere, WebSite will not be permitted to run.
--endif::[] -- --ifdef::crmsh[] --For the constraint, we need a name (choose something descriptive like --website-with-ip), indicate that its mandatory (so that if ClusterIP is --not active anywhere, WebSite will not be permitted to run anywhere --either) by specifying a score of INFINITY and finally list the two --resources. --endif::[] - - [NOTE] - ======= -@@ -328,258 +175,143 @@ anywhere. - - [IMPORTANT] - =========== -- - Colocation constraints are "directional", in that they imply certain - things about the order in which the two resources will have a location --chosen. In this case we're saying +WebSite+ needs to be placed on the --same machine as +ClusterIP+, this implies that we must know the --location of +ClusterIP+ before choosing a location for +WebSite+. -- -+chosen. In this case, we're saying that *WebSite* needs to be placed on the -+same machine as *ClusterIP*, which implies that the cluster must know the -+location of *ClusterIP* before choosing a location for *WebSite*. - =========== - --ifdef::pcs[] --[source,C] - ----- --# pcs constraint colocation add WebSite ClusterIP INFINITY --# pcs constraint -+[root@pcmk-1 ~]# pcs constraint colocation add WebSite with ClusterIP INFINITY -+[root@pcmk-1 ~]# pcs constraint - Location Constraints: - Ordering Constraints: - Colocation Constraints: -- WebSite with ClusterIP --# pcs status -- --Last updated: Fri Sep 14 11:00:44 2012 --Last change: Fri Sep 14 11:00:25 2012 via cibadmin on pcmk-1 -+ WebSite with ClusterIP (score:INFINITY) -+[root@pcmk-1 ~]# pcs status -+Cluster name: mycluster -+Last updated: Wed Dec 17 13:57:58 2014 -+Last change: Wed Dec 17 13:57:22 2014 - Stack: corosync - Current DC: pcmk-2 (2) - partition with quorum --Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 --2 Nodes configured, unknown expected votes --2 Resources configured. 
-+Version: 1.1.12-a9c8177 -+2 Nodes configured -+2 Resources configured -+ - - Online: [ pcmk-1 pcmk-2 ] - - Full list of resources: - -- ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 -- WebSite (ocf::heartbeat:apache): Started pcmk-2 ------- --endif::[] -+ ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 -+ WebSite (ocf::heartbeat:apache): Started pcmk-2 - --ifdef::crmsh[] --[source,C] ------- --# crm configure colocation website-with-ip INFINITY: WebSite ClusterIP --# crm configure show --node $id="1702537408" pcmk-1 --node $id="1719314624" pcmk-2 --primitive ClusterIP ocf:heartbeat:IPaddr2 \ -- params ip="192.168.122.120" cidr_netmask="32" \ -- op monitor interval="30s" --primitive WebSite ocf:heartbeat:apache \ -- params configfile="/etc/httpd/conf/httpd.conf" \ -- op monitor interval="1min" --colocation website-with-ip inf: WebSite ClusterIP --property $id="cib-bootstrap-options" \ -- dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ -- cluster-infrastructure="corosync" \ -- stonith-enabled="false" \ -- no-quorum-policy="ignore" \ -- last-lrm-refresh="1333446866" --rsc_defaults $id="rsc-options" \ -- resource-stickiness="100" --op_defaults $id="op-options" \ -- timeout="240s" --# crm_mon -1 --============ --Last updated: Tue Apr 3 11:57:13 2012 --Last change: Tue Apr 3 11:56:10 2012 via cibadmin on pcmk-1 --Stack: corosync --Current DC: pcmk-2 (1719314624) - partition with quorum --Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff --2 Nodes configured, unknown expected votes --2 Resources configured. 
--============ -- --Online: [ pcmk-1 pcmk-2 ] -+PCSD Status: -+ pcmk-1: Online -+ pcmk-2: Online - -- ClusterIP (ocf:heartbeat:IPaddr2): Started pcmk-2 -- WebSite (ocf:heartbeat:apache): Started pcmk-2 -+Daemon Status: -+ corosync: active/disabled -+ pacemaker: active/disabled -+ pcsd: active/enabled - ----- --endif::[] - --== Controlling Resource Start/Stop Ordering == -+== Ensure Resources Start and Stop in Order == -+ -+Like many services, Apache can be configured to bind to specific -+IP addresses on a host or to the wildcard IP address. If Apache -+binds to the wildcard, it doesn't matter whether an IP address -+is added before or after Apache starts; Apache will respond on -+that IP just the same. However, if Apache binds only to certain IP -+address(es), the order matters: If the address is added after Apache -+starts, Apache won't respond on that address. - --When Apache starts, it binds to the available IP addresses. It doesn't --know about any addresses we add afterwards, so not only do they need to --run on the same node, but we need to make sure ClusterIP is already --active before we start WebSite. We do this by adding an ordering --constraint. -+To be sure our WebSite responds regardless of Apache's address configuration, -+we need to make sure ClusterIP not only runs on the same node, -+but starts before WebSite. A colocation constraint only ensures the -+resources run together, not the order in which they are started and stopped. - --ifdef::pcs[] --By default all order constraints are mandatory constraints unless --otherwise configured. This means that the recovery of ClusterIP will --also trigger the recovery of WebSite. -+We do this by adding an ordering constraint. By default, all order constraints -+are mandatory, which means that the recovery of ClusterIP will also trigger the -+recovery of WebSite. 
- --[source,C] - ----- --# pcs constraint order ClusterIP then WebSite -+[root@pcmk-1 ~]# pcs constraint order ClusterIP then WebSite - Adding ClusterIP WebSite (kind: Mandatory) (Options: first-action=start then-action=start) --# pcs constraint -+[root@pcmk-1 ~]# pcs constraint - Location Constraints: - Ordering Constraints: -- start ClusterIP then start WebSite -+ start ClusterIP then start WebSite (kind:Mandatory) - Colocation Constraints: -- WebSite with ClusterIP ------- --endif::[] -- --ifdef::crmsh[] -- --We need to give it a name (choose something descriptive like --apache-after-ip), indicate that its mandatory (so that any recovery for --ClusterIP will also trigger recovery of WebSite) and list the two --resources in the order we need them to start. -- --[source,C] ------- --# crm configure order apache-after-ip mandatory: ClusterIP WebSite --# crm configure show --node $id="1702537408" pcmk-1 --node $id="1719314624" pcmk-2 --primitive ClusterIP ocf:heartbeat:IPaddr2 \ -- params ip="192.168.122.120" cidr_netmask="32" \ -- op monitor interval="30s" --primitive WebSite ocf:heartbeat:apache \ -- params configfile="/etc/httpd/conf/httpd.conf" \ -- op monitor interval="1min" --colocation website-with-ip inf: WebSite ClusterIP --order apache-after-ip inf: ClusterIP WebSite --property $id="cib-bootstrap-options" \ -- dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ -- cluster-infrastructure="corosync" \ -- stonith-enabled="false" \ -- no-quorum-policy="ignore" \ -- last-lrm-refresh="1333446866" --rsc_defaults $id="rsc-options" \ -- resource-stickiness="100" --op_defaults $id="op-options" \ -- timeout="240s" -+ WebSite with ClusterIP (score:INFINITY) - ----- --endif::[] - --== Specifying a Preferred Location == -+== Prefer One Node Over Another == - - Pacemaker does not rely on any sort of hardware symmetry between nodes, - so it may well be that one machine is more powerful than the other. 
In --such cases it makes sense to host the resources there if it is available. --To do this we create a location constraint. -+such cases, it makes sense to host the resources on the more powerful node if -+it is available. To do this, we create a location constraint. - --ifdef::pcs[] - In the location constraint below, we are saying the WebSite resource --prefers the node pcmk-1 with a score of 50. The score here indicates --how badly we'd like the resource to run somewhere. -+prefers the node pcmk-1 with a score of 50. Here, the score indicates -+how badly we'd like the resource to run at this location. - --[source,C] - ----- --# pcs constraint location WebSite prefers pcmk-1=50 --# pcs constraint -+[root@pcmk-1 ~]# pcs constraint location WebSite prefers pcmk-1=50 -+[root@pcmk-1 ~]# pcs constraint - Location Constraints: - Resource: WebSite - Enabled on: pcmk-1 (score:50) - Ordering Constraints: -- start ClusterIP then start WebSite -+ start ClusterIP then start WebSite (kind:Mandatory) - Colocation Constraints: -- WebSite with ClusterIP --# pcs status --Last updated: Fri Sep 14 11:06:37 2012 --Last change: Fri Sep 14 11:06:26 2012 via cibadmin on pcmk-1 -+ WebSite with ClusterIP (score:INFINITY) -+[root@pcmk-1 ~]# pcs status -+Cluster name: mycluster -+Last updated: Wed Dec 17 14:11:49 2014 -+Last change: Wed Dec 17 14:11:20 2014 - Stack: corosync - Current DC: pcmk-2 (2) - partition with quorum --Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 --2 Nodes configured, unknown expected votes --2 Resources configured. 
-+Version: 1.1.12-a9c8177 -+2 Nodes configured -+2 Resources configured -+ - - Online: [ pcmk-1 pcmk-2 ] - - Full list of resources: - -- ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 -- WebSite (ocf::heartbeat:apache): Started pcmk-2 ------- --endif::[] -+ ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 -+ WebSite (ocf::heartbeat:apache): Started pcmk-2 - --ifdef::crmsh[] --Again we give it a descriptive name (prefer-pcmk-1), specify the resource we --want to run there (WebSite), how badly we'd like it to run there (we'll use --50 for now, but in a two-node situation almost any value above 0 will do) and --the host's name. -+PCSD Status: -+ pcmk-1: Online -+ pcmk-2: Online - --[source,C] ------- --# crm configure location prefer-pcmk-1 WebSite 50: pcmk-1 --WARNING: prefer-pcmk-1: referenced node pcmk-1 does not exist -+Daemon Status: -+ corosync: active/disabled -+ pacemaker: active/disabled -+ pcsd: active/enabled - ----- - --This warning should be ignored. -- --[source,C] ------- --# crm configure show --node $id="1702537408" pcmk-1 --node $id="1719314624" pcmk-2 --primitive ClusterIP ocf:heartbeat:IPaddr2 \ -- params ip="192.168.122.120" cidr_netmask="32" \ -- op monitor interval="30s" --primitive WebSite ocf:heartbeat:apache \ -- params configfile="/etc/httpd/conf/httpd.conf" \ -- op monitor interval="1min" --location prefer-pcmk-1 WebSite 50: pcmk-1 --colocation website-with-ip inf: WebSite ClusterIP --order apache-after-ip inf: ClusterIP WebSite --property $id="cib-bootstrap-options" \ -- dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ -- cluster-infrastructure="corosync" \ -- stonith-enabled="false" \ -- no-quorum-policy="ignore" \ -- last-lrm-refresh="1333446866" --rsc_defaults $id="rsc-options" \ -- resource-stickiness="100" --op_defaults $id="op-options" \ -- timeout="240s" --# crm_mon -1 --============ --Last updated: Tue Apr 3 12:02:14 2012 --Last change: Tue Apr 3 11:59:42 2012 via cibadmin on pcmk-1 --Stack: corosync 
--Current DC: pcmk-2 (1719314624) - partition with quorum --Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff --2 Nodes configured, unknown expected votes --2 Resources configured. --============ -- --Online: [ pcmk-1 pcmk-2 ] -- -- ClusterIP (ocf:heartbeat:IPaddr2): Started pcmk-2 -- WebSite (ocf:heartbeat:apache): Started pcmk-2 ------- --endif::[] -- - Wait a minute, the resources are still on pcmk-2! - --Even though we now prefer pcmk-1 over pcmk-2, that preference is -+Even though WebSite now prefers to run on pcmk-1, that preference is - (intentionally) less than the resource stickiness (how much we - preferred not to have unnecessary downtime). - --To see the current placement scores, you can use a tool called crm_simulate -+To see the current placement scores, you can use a tool called crm_simulate. - --[source,C] - ---- --# crm_simulate -sL -+[root@pcmk-1 ~]# crm_simulate -sL -+ - Current cluster status: - Online: [ pcmk-1 pcmk-2 ] - -- ClusterIP (ocf:heartbeat:IPaddr2): Started pcmk-2 -- WebSite (ocf:heartbeat:apache): Started pcmk-2 -+ ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 -+ WebSite (ocf::heartbeat:apache): Started pcmk-2 - - Allocation scores: - native_color: ClusterIP allocation score on pcmk-1: 50 -@@ -590,206 +322,108 @@ native_color: WebSite allocation score on pcmk-2: 100 - Transition Summary: - ---- - --== Manually Moving Resources Around the Cluster == -+== Move Resources Manually == - --ifdef::pcs[] - There are always times when an administrator needs to override the --cluster and force resources to move to a specific location. By --updating our previous location constraint with a score of INFINITY, --WebSite will be forced to move to pcmk-1. -+cluster and force resources to move to a specific location. In this example, -+we will force the WebSite to move to pcmk-1 by -+updating our previous location constraint with a score of INFINITY. 
- --[source,C] - ----- --# pcs constraint location WebSite prefers pcmk-1=INFINITY --# pcs constraint --full -+[root@pcmk-1 ~]# pcs constraint location WebSite prefers pcmk-1=INFINITY -+[root@pcmk-1 ~]# pcs constraint - Location Constraints: - Resource: WebSite -- Enabled on: pcmk-1 (score:INFINITY) (id:location-WebSite-pcmk-1-INFINITY) -+ Enabled on: pcmk-1 (score:INFINITY) - Ordering Constraints: -- start ClusterIP then start WebSite (Mandatory) (id:order-ClusterIP-WebSite-mandatory) -+ start ClusterIP then start WebSite (kind:Mandatory) - Colocation Constraints: -- WebSite with ClusterIP (INFINITY) (id:colocation-WebSite-ClusterIP-INFINITY) --# pcs status -- --Last updated: Fri Sep 14 11:16:26 2012 --Last change: Fri Sep 14 11:16:18 2012 via cibadmin on pcmk-1 -+ WebSite with ClusterIP (score:INFINITY) -+[root@pcmk-1 ~]# pcs status -+Cluster name: mycluster -+Last updated: Wed Dec 17 14:19:34 2014 -+Last change: Wed Dec 17 14:18:37 2014 - Stack: corosync - Current DC: pcmk-2 (2) - partition with quorum --Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 --2 Nodes configured, unknown expected votes --2 Resources configured. -+Version: 1.1.12-a9c8177 -+2 Nodes configured -+2 Resources configured -+ - - Online: [ pcmk-1 pcmk-2 ] - - Full list of resources: - -- ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 -- WebSite (ocf::heartbeat:apache): Started pcmk-1 ------- --endif::[] -+ ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 -+ WebSite (ocf::heartbeat:apache): Started pcmk-1 - --ifdef::crmsh[] --There are always times when an administrator needs to override the --cluster and force resources to move to a specific location. Underneath we --use location constraints like the one we created above, happily you don't --need to care. Just provide the name of the resource and the intended --location, we'll do the rest. 
-+PCSD Status: -+ pcmk-1: Online -+ pcmk-2: Online - --[source,C] ------- --# crm resource move WebSite pcmk-1 --# crm_mon -1 --============ --Last updated: Tue Apr 3 12:03:41 2012 --Last change: Tue Apr 3 12:03:37 2012 via crm_resource on pcmk-1 --Stack: corosync --Current DC: pcmk-2 (1719314624) - partition with quorum --Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff --2 Nodes configured, unknown expected votes --2 Resources configured. --============ -- --Online: [ pcmk-1 pcmk-2 ] -- -- ClusterIP (ocf:heartbeat:IPaddr2): Started pcmk-1 -- WebSite (ocf:heartbeat:apache): Started pcmk-1 ------- -- --Notice how the colocation rule we created has ensured that ClusterIP was also moved to pcmk-1. --For the curious, we can see the effect of this command by examining the configuration -- --[source,C] ------- --# crm configure show --node $id="1702537408" pcmk-1 --node $id="1719314624" pcmk-2 --primitive ClusterIP ocf:heartbeat:IPaddr2 \ -- params ip="192.168.122.120" cidr_netmask="32" \ -- op monitor interval="30s" --primitive WebSite ocf:heartbeat:apache \ -- params configfile="/etc/httpd/conf/httpd.conf" \ -- op monitor interval="1min" --location cli-prefer-WebSite WebSite \ -- rule $id="cli-prefer-rule-WebSite" inf: #uname eq pcmk-1 --location prefer-pcmk-1 WebSite 50: pcmk-1 --colocation website-with-ip inf: WebSite ClusterIP --order apache-after-ip inf: ClusterIP WebSite --property $id="cib-bootstrap-options" \ -- dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ -- cluster-infrastructure="corosync" \ -- stonith-enabled="false" \ -- no-quorum-policy="ignore" \ -- last-lrm-refresh="1333446866" --rsc_defaults $id="rsc-options" \ -- resource-stickiness="100" --op_defaults $id="op-options" \ -- timeout="240s" -+Daemon Status: -+ corosync: active/disabled -+ pacemaker: active/disabled -+ pcsd: active/enabled - ----- - --The automated constraint used to move the resources to +pcmk-1+ is the --line beginning with +location 
cli-prefer-WebSite+. --endif::[] -- --=== Giving Control Back to the Cluster === -- --Once we've finished whatever activity that required us to move the --resources to pcmk-1, in our case nothing, we can then allow the cluster --to resume normal operation with the unmove command. Since we previously -+Once we've finished whatever activity required us to move the -+resources to pcmk-1 (in our case nothing), we can then allow the cluster -+to resume normal operation by removing the new constraint. Since we previously - configured a default stickiness, the resources will remain on pcmk-1. - --ifdef::pcs[] --[source,C] -+First, use the `--full` option to get the constraint's ID: - ----- --# pcs constraint all -+[root@pcmk-1 ~]# pcs constraint --full - Location Constraints: - Resource: WebSite - Enabled on: pcmk-1 (score:INFINITY) (id:location-WebSite-pcmk-1-INFINITY) - Ordering Constraints: -- start ClusterIP then start WebSite (Mandatory) (id:order-ClusterIP-WebSite-mandatory) --Colocation Constraints: -- WebSite with ClusterIP (INFINITY) (id:colocation-WebSite-ClusterIP-INFINITY) --# pcs constraint remove location-WebSite-pcmk-1-INFINITY --# pcs constraint --Location Constraints: --Ordering Constraints: -- start ClusterIP then start WebSite -+ start ClusterIP then start WebSite (kind:Mandatory) (id:order-ClusterIP-WebSite-mandatory) - Colocation Constraints: -- WebSite with ClusterIP -+ WebSite with ClusterIP (score:INFINITY) (id:colocation-WebSite-ClusterIP-INFINITY) - ----- --endif::[] - --ifdef::crmsh[] --[source,C] -+Then remove the desired constraint using its ID: - ----- --# crm resource unmove WebSite --# crm configure show --node $id="1702537408" pcmk-1 --node $id="1719314624" pcmk-2 --primitive ClusterIP ocf:heartbeat:IPaddr2 \ -- params ip="192.168.122.120" cidr_netmask="32" \ -- op monitor interval="30s" --primitive WebSite ocf:heartbeat:apache \ -- params configfile="/etc/httpd/conf/httpd.conf" \ -- op monitor interval="1min" --location prefer-pcmk-1
WebSite 50: pcmk-1 --colocation website-with-ip inf: WebSite ClusterIP --order apache-after-ip inf: ClusterIP WebSite --property $id="cib-bootstrap-options" \ -- dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ -- cluster-infrastructure="corosync" \ -- stonith-enabled="false" \ -- no-quorum-policy="ignore" \ -- last-lrm-refresh="1333446866" --rsc_defaults $id="rsc-options" \ -- resource-stickiness="100" --op_defaults $id="op-options" \ -- timeout="240s" -+[root@pcmk-1 ~]# pcs constraint remove location-WebSite-pcmk-1-INFINITY -+[root@pcmk-1 ~]# pcs constraint -+Location Constraints: -+Ordering Constraints: -+ start ClusterIP then start WebSite (kind:Mandatory) -+Colocation Constraints: -+ WebSite with ClusterIP (score:INFINITY) - ----- --endif::[] - --Note that the constraint is now gone. If we check the cluster --status, we can also see that as expected the resources are still active -+Note that the location constraint is now gone. If we check the cluster -+status, we can also see that (as expected) the resources are still active - on pcmk-1. - --ifdef::pcs[] --[source,C] - ----- - # pcs status -- --Last updated: Fri Sep 14 11:57:12 2012 --Last change: Fri Sep 14 11:57:03 2012 via cibadmin on pcmk-1 -+Cluster name: mycluster -+Last updated: Wed Dec 17 14:25:21 2014 -+Last change: Wed Dec 17 14:24:29 2014 - Stack: corosync - Current DC: pcmk-2 (2) - partition with quorum --Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 --2 Nodes configured, unknown expected votes --2 Resources configured. 
-+Version: 1.1.12-a9c8177 -+2 Nodes configured -+2 Resources configured -+ - - Online: [ pcmk-1 pcmk-2 ] - - Full list of resources: - -- ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 -- WebSite (ocf::heartbeat:apache): Started pcmk-1 ------- --endif::[] -+ ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 -+ WebSite (ocf::heartbeat:apache): Started pcmk-1 - --ifdef::crmsh[] --[source,C] ------- --# crm_mon --============ --Last updated: Tue Apr 3 12:05:08 2012 --Last change: Tue Apr 3 12:03:37 2012 via crm_resource on pcmk-1 --Stack: corosync --Current DC: pcmk-2 (1719314624) - partition with quorum --Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff --2 Nodes configured, unknown expected votes --2 Resources configured. --============ -- --Online: [ pcmk-1 pcmk-2 ] -+PCSD Status: -+ pcmk-1: Online -+ pcmk-2: Online - -- ClusterIP (ocf:heartbeat:IPaddr2): Started pcmk-1 -- WebSite (ocf:heartbeat:apache): Started pcmk-1 -+Daemon Status: -+ corosync: active/disabled -+ pacemaker: active/disabled -+ pcsd: active/enabled - ----- --endif::[] -diff --git a/doc/Clusters_from_Scratch/en-US/Ch-Installation.txt b/doc/Clusters_from_Scratch/en-US/Ch-Installation.txt -index cf47602..1c2303b 100644 ---- a/doc/Clusters_from_Scratch/en-US/Ch-Installation.txt -+++ b/doc/Clusters_from_Scratch/en-US/Ch-Installation.txt -@@ -1,46 +1,46 @@ - = Installation = - --== OS Installation == -+== Install the OS == - - Detailed instructions for installing Fedora are available at --http://docs.fedoraproject.org/en-US/Fedora/20/html/Installation_Guide/ in a number of --languages. The abbreviated version is as follows... -+http://docs.fedoraproject.org/en-US/Fedora/21/html/Installation_Guide/ in a number of -+languages. The abbreviated version is as follows: - --Point your browser to http://fedoraproject.org/en/get-fedora-all, --locate the +Install Media+ section and download the install DVD that --matches your hardware. 
-+Point your browser to https://getfedora.org/, -+choose a flavor (Server is an appropriate choice), -+and download the installation image appropriate to your hardware. - --Burn the disk image to a DVD --footnote:[http://docs.fedoraproject.org/en-US/Fedora/20/html/Burning_ISO_images_to_disc/index.html] -+Burn the installation image to a DVD or USB drive -+footnote:[http://docs.fedoraproject.org/en-US/Fedora/21/html/Installation_Guide/sect-preparing-boot-media.html] - and boot from it, or use the image to boot a virtual machine. - --After clicking through the welcome screen, select your language, --keyboard layout --footnote:[http://docs.fedoraproject.org/en-US/Fedora/20/html/Installation_Guide/language-selection-x86.html] -+After starting the installation, select your language and keyboard layout at -+the welcome screen. -+footnote:[http://docs.fedoraproject.org/en-US/Fedora/21/html/Installation_Guide/sect-installation-graphical-mode.html] - --At this point you get a chance to tweak the default installation options. -+At this point, you get a chance to tweak the default installation options. - --In the +Network Configuration+ section you'll want to: -+In the *NETWORK & HOSTNAME* section you'll want to: - --- Assign your machine a host name -+- Assign your machine a host name. - I happen to control the clusterlabs.org domain name, so I will use -- that here. --- Assign a fixed IP address -+ pcmk-1.clusterlabs.org here. -+- Assign a fixed IPv4 address. In this example, I'll use 192.168.122.101. - - [IMPORTANT] - =========== - Do not accept the default network settings. --Cluster machines should never obtain an IP address via DHCP. -+Cluster machines should never obtain an IP address via DHCP, because -+DHCP's periodic address renewal will interfere with corosync. - --If you miss this step, this can easily be configured after installation. You will have --to navigate to +system settings+ and select +network+. 
From there you can select -+If you miss this step during installation, it can easily be fixed later. You will have -+to navigate to *system settings* and select *network*. From there, you can select - what device to configure. - =========== - --In the +Software Selection+ section (try saying that 10 times --quickly), choose +Minimal Install+ so that we see everything that gets --installed. Don't enable updates yet, we'll do that (and install any --extra software we need) later. -+In the *Software Selection* section (try saying that 10 times -+quickly), leave all *Add-Ons* unchecked so that we see everything that gets -+installed. We'll install any extra software we need later. - - [IMPORTANT] - =========== -@@ -49,95 +49,96 @@ By default Fedora uses LVM for partitioning which allows us to - dynamically change the amount of space allocated to a given partition. - - However, by default it also allocates all free space to the +/+ --(aka. +root+) partition which cannot be dynamically _reduced_ in size --(dynamic increases are fine by-the-way). -+(aka. *root*) partition, which cannot be dynamically _reduced_ in size -+(dynamic increases are fine, by the way). - - So if you plan on following the DRBD or GFS2 portions of this guide, --you should reserve at least 1Gb of space on each machine from which to --create a shared volume. To do so, enter the +Installation --Destination+ section where you are be given an opportunity to reduce --the size of the +root+ partition (after chosing which hard drive you --wish to install to). -+you should reserve at least 1GiB of space on each machine from which to -+create a shared volume. To do so, enter the *Installation -+Destination* section where you are given an opportunity to reduce -+the size of the *root* partition (after choosing which hard drive you -+wish to install to).
If you want the reserved space to be available -+within an LVM volume group, be sure to select *Modify...* next to -+the volume group name and change the *Size policy:* to *Fixed* -+or *As large as possible*. - - =========== - - It is highly recommended to enable NTP on your cluster nodes. Doing so - ensures all nodes agree on the current time and makes reading log files --significantly easier. You can do this in the +Date & Time+ section. --footnote:[http://docs.fedoraproject.org/en-US/Fedora/20/html/Installation_Guide/s1-timezone-x86.html] -+significantly easier. You can do this in the *DATE & TIME* section. -+footnote:[http://docs.fedoraproject.org/en-US/Fedora/21/html/Installation_Guide/sect-installation-gui-date-and-time.html] - -- -- --Once the node reboots, you'll see a (possibly mangled) login prompt on --the console. Login using +root+ and the password you created earlier. -+Once you've completed the installation, set a root password as instructed. -+For the purposes of this document, it is not necessary to create any additional -+users. After the node reboots, you'll see a (possibly mangled) login prompt on -+the console. Login using *root* and the password you created earlier. - - image::images/Console.png["Initial Console",align="center",scaledwidth="65%"] - - [NOTE] - ====== - --From here on in we're going to be working exclusively from the terminal. -+From here on, we're going to be working exclusively from the terminal. - - ====== - --== Post Installation Tasks == -+== Configure the OS == - --=== Networking === -+=== Verify Networking === - --Check the machine has the static IP address you configured earlier -+Ensure that the machine has the static IP address you configured earlier. 
- --[source,C] - ----- --# ip addr --1: lo: mtu 16436 qdisc noqueue state UNKNOWN -+[root@pcmk-1 ~]# ip addr -+1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default - link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 - inet 127.0.0.1/8 scope host lo - inet6 ::1/128 scope host - valid_lft forever preferred_lft forever --2: eth0: mtu 1500 qdisc pfifo_fast state UP qlen 1000 -+2: eth0: mtu 1500 qdisc pfifo_fast state UP group default qlen 1000 - link/ether 52:54:00:d7:d6:08 brd ff:ff:ff:ff:ff:ff - inet 192.168.122.101/24 brd 192.168.122.255 scope global eth0 -+ valid_lft forever preferred_lft forever - inet6 fe80::5054:ff:fed7:d608/64 scope link - valid_lft forever preferred_lft forever - ----- - - [NOTE] - ===== --If you ever need to change the node's IP address from the command line follow these instructions: -+If you ever need to change the node's IP address from the command line, follow these instructions: - - .... --# manually edit /etc/sysconfig/network-scripts/ifcfg-${device} --# nmcli dev disconnect ${device} --# nmcli con reload ${device} --# nmcli con up ${device} -+[root@pcmk-1 ~]# vim /etc/sysconfig/network-scripts/ifcfg-${device} # manually edit as desired -+[root@pcmk-1 ~]# nmcli dev disconnect ${device} -+[root@pcmk-1 ~]# nmcli con reload ${device} -+[root@pcmk-1 ~]# nmcli con up ${device} - .... - --This makes +NetworkManager+ aware that a change was made on the config file. -+This makes *NetworkManager* aware that a change was made on the config file. 
- - ===== - --Next, check the routes are ok: -+Next, ensure that the routes are as expected: - --[source,C] - ----- - [root@pcmk-1 ~]# ip route --default via 192.168.122.1 dev eth0 -+default via 192.168.122.1 dev eth0 proto static metric 1024 - 192.168.122.0/24 dev eth0 proto kernel scope link src 192.168.122.101 - ----- - --If there is no line beginning with +default via+, then you may need to add a line such as -+If there is no line beginning with *default via*, then you may need to add a line such as - - [source,Bash] - GATEWAY=192.168.122.1 - --to '/etc/sysconfig/network' and restart the network. -+to +/etc/sysconfig/network+ and restart the network. - --Now check for connectivity to the outside world. Start small by --testing if we can read the gateway we configured. -+Now, check for connectivity to the outside world. Start small by -+testing whether we can reach the gateway we configured. - --[source,C] - ----- --# ping -c 1 192.168.122.1 -+[root@pcmk-1 ~]# ping -c 1 192.168.122.1 - PING 192.168.122.1 (192.168.122.1) 56(84) bytes of data. - 64 bytes from 192.168.122.1: icmp_req=1 ttl=64 time=0.249 ms - -@@ -146,11 +147,10 @@ PING 192.168.122.1 (192.168.122.1) 56(84) bytes of data. - rtt min/avg/max/mdev = 0.249/0.249/0.249/0.000 ms - ----- - --Now try something external, choose a location you know will be available. -+Now try something external; choose a location you know should be available. - --[source,C] - ----- --# ping -c 1 www.google.com -+[root@pcmk-1 ~]# ping -c 1 www.google.com - PING www.l.google.com (173.194.72.106) 56(84) bytes of data. - 64 bytes from tf-in-f106.1e100.net (173.194.72.106): icmp_req=1 ttl=41 time=167 ms - -@@ -159,15 +159,14 @@ PING www.l.google.com (173.194.72.106) 56(84) bytes of data. 
- rtt min/avg/max/mdev = 167.618/167.618/167.618/0.000 ms - ----- - --=== Leaving the Console === -+=== Login Remotely === - --The console isn't a very friendly place to work from, we will now -+The console isn't a very friendly place to work from, so we will now - switch to accessing the machine remotely via SSH where we can --use copy&paste etc. -+use copy and paste, etc. - --First we check we can see the newly installed at all: -+From another host, check whether we can see the new host at all: - --[source,C] - ----- - beekhof@f16 ~ # ping -c 1 192.168.122.101 - PING 192.168.122.101 (192.168.122.101) 56(84) bytes of data. -@@ -178,9 +177,8 @@ PING 192.168.122.101 (192.168.122.101) 56(84) bytes of data. - rtt min/avg/max/mdev = 1.012/1.012/1.012/0.000 ms - ----- - --Next we login via SSH -+Next, login as root via SSH. - --[source,C] - ----- - beekhof@f16 ~ # ssh -l root 192.168.122.11 - root@192.168.122.11's password: -@@ -188,7 +186,14 @@ Last login: Fri Mar 30 19:41:19 2012 from 192.168.122.1 - [root@pcmk-1 ~]# - ----- - --=== Security Shortcuts === -+=== Apply Updates === -+ -+Apply any package updates released since your installation image was created: -+---- -+[root@pcmk-1 ~]# yum update -+---- -+ -+=== Disable Security During Testing === - - To simplify this guide and focus on the aspects directly connected to - clustering, we will now disable the machine's firewall and SELinux -@@ -196,64 +201,74 @@ installation. - - [WARNING] - =========== --Both of these actions create significant security issues --and should not be performed on machines that will be exposed to the --outside world. -+These actions create significant security issues and should not be performed on -+machines that will be exposed to the outside world. - =========== - --[IMPORTANT] --=========== -+//// - TODO: Create an Appendix that deals with (at least) re-enabling the firewall. 
-+//// -+ -+---- -+[root@pcmk-1 ~]# setenforce 0 -+[root@pcmk-1 ~]# sed -i.bak "s/SELINUX=enforcing/SELINUX=permissive/g" /etc/selinux/config -+[root@pcmk-1 ~]# systemctl disable firewalld.service -+[root@pcmk-1 ~]# systemctl stop firewalld.service -+[root@pcmk-1 ~]# iptables --flush -+---- -+ -+[NOTE] - =========== -+If you are using Fedora 17 or earlier or are using the iptables -+service for your firewall, the commands would be: - --[source,C] - ---- --# setenforce 0 --# sed -i.bak "s/SELINUX=enforcing/SELINUX=permissive/g" /etc/selinux/config --# systemctl disable iptables.service --# rm '/etc/systemd/system/basic.target.wants/iptables.service' --# systemctl stop iptables.service -+[root@pcmk-1 ~]# setenforce 0 -+[root@pcmk-1 ~]# sed -i.bak "s/SELINUX=enforcing/SELINUX=permissive/g" /etc/selinux/config -+[root@pcmk-1 ~]# systemctl disable iptables.service -+[root@pcmk-1 ~]# rm -f /etc/systemd/system/basic.target.wants/iptables.service -+[root@pcmk-1 ~]# systemctl stop iptables.service -+[root@pcmk-1 ~]# iptables --flush - ---- -+=========== - --=== Short Node Names === -+ -+=== Use Short Node Names === - - During installation, we filled in the machine's fully qualified domain --name (FQDN) which can be rather long when it appears in cluster logs and -+name (FQDN), which can be rather long when it appears in cluster logs and - status output. See for yourself how the machine identifies itself: - (((Nodes, short name))) - --[source,C] - ---- --# uname -n -+[root@pcmk-1 ~]# uname -n - pcmk-1.clusterlabs.org --# dnsdomainname -+[root@pcmk-1 ~]# dnsdomainname - clusterlabs.org - ---- - (((Nodes, Domain name (Query)))) - - The output from the second command is fine, but we really don't need the - domain name included in the basic host details. To address this, we need --to use the +hostnamectl+ tool to strip off the domain name. --[source,C] -+to use the `hostnamectl` tool to strip off the domain name. 
- ---- --# hostnamectl set-hostname $(uname -n | sed s/\\..*//)' -+[root@pcmk-1 ~]# hostnamectl set-hostname $(uname -n | sed s/\\..*//) - ---- - (((Nodes, Domain name (Remove from host name)))) - - Now check the machine is using the correct names - --[source,C] - ---- --# uname -n -+[root@pcmk-1 ~]# uname -n - pcmk-1 --# dnsdomainname -+[root@pcmk-1 ~]# dnsdomainname - clusterlabs.org - ---- - - If it concerns you that the shell prompt has not been updated, simply - log out and back in again. - --== Before You Continue == -+== Repeat for Second Node == - - Repeat the Installation steps so far, so that you have two Fedora - nodes ready to have the cluster software installed. -@@ -261,13 +276,14 @@ nodes ready to have the cluster software installed. - For the purposes of this document, the additional node is called - pcmk-2 with address 192.168.122.102. - --=== Finalize Networking === -+== Configure Communication Between Nodes == -+ -+=== Configure Host Name Resolution === - - Confirm that you can communicate between the two new nodes: - --[source,C] - ---- --# ping -c 3 192.168.122.102 -+[root@pcmk-1 ~]# ping -c 3 192.168.122.102 - PING 192.168.122.102 (192.168.122.102) 56(84) bytes of data. - 64 bytes from 192.168.122.102: icmp_seq=1 ttl=64 time=0.343 ms - 64 bytes from 192.168.122.102: icmp_seq=2 ttl=64 time=0.402 ms -@@ -280,21 +296,19 @@ rtt min/avg/max/mdev = 0.343/0.434/0.558/0.092 ms - - Now we need to make sure we can communicate with the machines by their - name. If you have a DNS server, add additional entries for the two --machines. Otherwise, you'll need to add the machines to '/etc/hosts' . --Below are the entries for my cluster nodes: -+machines. Otherwise, you'll need to add the machines to +/etc/hosts+ -+on both nodes. 
Below are the entries for my cluster nodes: - --[source,C] - ---- --# grep pcmk /etc/hosts -+[root@pcmk-1 ~]# grep pcmk /etc/hosts - 192.168.122.101 pcmk-1.clusterlabs.org pcmk-1 - 192.168.122.102 pcmk-2.clusterlabs.org pcmk-2 - ---- - - We can now verify the setup by again using ping: - --[source,C] - ---- --# ping -c 3 pcmk-2 -+[root@pcmk-1 ~]# ping -c 3 pcmk-2 - PING pcmk-2.clusterlabs.org (192.168.122.101) 56(84) bytes of data. - 64 bytes from pcmk-1.clusterlabs.org (192.168.122.101): icmp_seq=1 ttl=64 time=0.164 ms - 64 bytes from pcmk-1.clusterlabs.org (192.168.122.101): icmp_seq=2 ttl=64 time=0.475 ms -@@ -316,16 +330,15 @@ without being prompted. - - [WARNING] - ========= --Unprotected SSH keys, those without a password, are not recommended for servers exposed to the outside world. -+Unprotected SSH keys (those without a password) are not recommended for servers exposed to the outside world. - We use them here only to simplify the demo. - ========= - - Create a new key and allow anyone with that key to log in: - - .Creating and Activating a new SSH Key --[source,C] - ---- --# ssh-keygen -t dsa -f ~/.ssh/id_dsa -N "" -+[root@pcmk-1 ~]# ssh-keygen -t dsa -f ~/.ssh/id_dsa -N "" - Generating public/private dsa key pair. - Your identification has been saved in /root/.ssh/id_dsa. - Your public key has been saved in /root/.ssh/id_dsa.pub. -@@ -345,17 +358,16 @@ The key's randomart image is: - | | - +-----------------+ - --# cp .ssh/id_dsa.pub .ssh/authorized_keys -+[root@pcmk-1 ~]# cp ~/.ssh/id_dsa.pub ~/.ssh/authorized_keys - ---- - (((Creating and Activating a new SSH Key))) - --Install the key on the other nodes and test that you can now run commands --remotely, without being prompted -+Install the key on the other node and test that you can now run commands -+remotely, without being prompted. 
- - .Installing the SSH Key on Another Host --[source,C] - ---- --# scp -r .ssh pcmk-2: -+[root@pcmk-1 ~]# scp -r ~/.ssh pcmk-2: - The authenticity of host 'pcmk-2 (192.168.122.102)' can't be established. - RSA key fingerprint is b1:2b:55:93:f1:d9:52:2b:0f:f2:8a:4e:ae:c6:7c:9a. - Are you sure you want to continue connecting (yes/no)? yes -@@ -364,276 +376,152 @@ id_dsa.pub 100% 616 0.6KB/s 00:00 - id_dsa 100% 672 0.7KB/s 00:00 - known_hosts 100% 400 0.4KB/s 00:00 - authorized_keys 100% 616 0.6KB/s 00:00 --# ssh pcmk-2 -- uname -n -+[root@pcmk-1 ~]# ssh pcmk-2 -- uname -n - pcmk-2 --# - ---- - --== Cluster Software Installation == -+== Install the Cluster Software == - --=== Install the Cluster Software === -+Fedora 17 and later comes with everything you need, so simply fire up a shell -+on both nodes and run the following to install pacemaker and command-line -+cluster management software: - --Since version 12, Fedora comes with recent versions of everything you --need, so simply fire up a shell on all your nodes and run: -- --[source,C] - ---- --[ALL] # yum install -y pacemaker pcs -+# yum install -y pacemaker pcs psmisc - ---- - --Now install the cluster software on the second node. -- --ifdef::pcs[] --=== Install the Cluster Management Software === --The pcs cli command coupled with the pcs daemon creates a cluster --management system capable of managing all aspects of the cluster stack --across all nodes from a single location. -- --[source,C] ------ --[ALL] # yum install -y pcs ------ -+[IMPORTANT] -+=========== -+This document will show commands that need to be executed on both nodes -+with a simple `#` prompt. Be sure to run them on each node individually. -+=========== - --Make sure to install the pcs packages on both nodes. --endif::[] -+[NOTE] -+=========== -+This document uses pcs for cluster management. Other alternatives, -+such as crmsh, are available, but their syntax -+will differ from the examples used here. 
-+=========== - --== Setup == -+== Configure the Cluster Software == - --ifdef::pcs[] - === Enable pcs Daemon === - - Before the cluster can be configured, the pcs daemon must be started and enabled --to boot on startup on each node. This daemon works with the pcs cli command to manage --syncing the corosync configuration across all the nodes in the cluster. -+to start at boot time on each node. This daemon works with the pcs command-line interface -+to manage synchronizing the corosync configuration across all nodes in the cluster. - --Start and enable the daemon by issuing the following commands on each node. -+Start and enable the daemon by issuing the following commands on each node: - --[source,C] - ---- - # systemctl start pcsd.service - # systemctl enable pcsd.service - ---- - --Now we need a way for `pcs` to talk to itself on other nodes in the --cluster. This is necessary in order to perform tasks such as syncing --the corosync config, or starting/stopping the cluster on remote nodes -+The installed packages will create a *hacluster* user with a disabled password. -+While this is fine for running `pcs` commands locally, -+the account needs a login password in order to perform such tasks as syncing -+the corosync configuration, or starting and stopping the cluster on other nodes. - --While `pcs` can be used locally without setting up these user --accounts, this tutorial will make use of these remote access commands, --so we will set a password for the 'hacluster' user. Its probably best --if password is consistent across all the nodes. 
-+This tutorial will make use of such commands, -+so now we will set a password for the *hacluster* user, using the same password -+on both nodes: - --As 'root', run: -- --[source,C] - ---- - # passwd hacluster - password: - ---- - -+[NOTE] -+=========== - Alternatively, to script this process or set the password on a --different machine to the one you're logged into, you can use -+different machine from the one you're logged into, you can use - the `--stdin` option for `passwd`: - --[source,C] ------ --# ssh pcmk-2 -- 'echo redhat1 | passwd --stdin hacluster' ------ -- --endif::[] -- --ifdef::crmsh[] -- --=== Preparation - Multicast === -- --Choose a port number and --http://en.wikipedia.org/wiki/Multicast[multi-cast] address. --http://en.wikipedia.org/wiki/Multicast_address[] -- --Be sure that the values you chose do not conflict with any existing --clusters you might have. For this document, I have chosen port '4000' --and used '239.255.1.1' as the multi-cast address. -- --endif::[] -- --=== Notes on Multicast Address Assignment === -- --There are several subtle points that often deserve consideration when --choosing/assigning multicast addresses for corosync. --footnote:[This information is borrowed from, the now defunct, http://web.archive.org/web/20101211210054/http://29west.com/docs/THPM/multicast-address-assignment.html] -- --. Avoid '224.0.0.x' --+ --Traffic to addresses of the form '224.0.0.x' is often flooded to all --switch ports. This address range is reserved for link-local uses. Many --routing protocols assume that all traffic within this range will be --received by all routers on the network. Hence (at least all Cisco) --switches flood traffic within this range. The flooding behavior --overrides the normal selective forwarding behavior of a --multicast-aware switch (e.g. IGMP snooping, CGMP, etc.). -- --. Watch for '32:1' overlap --+ --32 non-contiguous IP multicast addresses are mapped onto each Ethernet --multicast address. 
A receiver that joins a single IP multicast group --implicitly joins 31 others due to this overlap. Of course, filtering --in the operating system discards undesired multicast traffic from --applications, but NIC bandwidth and CPU resources are nonetheless --consumed discarding it. The overlap occurs in the 5 high-order bits, --so it's best to use the 23 low-order bits to make distinct multicast --streams unique. For example, IP multicast addresses in the range --'239.0.0.0' to '239.127.255.255' all map to unique Ethernet multicast --addresses. However, IP multicast address '239.128.0.0' maps to the --same Ethernet multicast address as '239.0.0.0', '239.128.0.1' maps to --the same Ethernet multicast address as '239.0.0.1', etc. -- --. Avoid 'x.0.0.y' and 'x.128.0.y' --+ --Combining the above two considerations, it's best to avoid using IP --multicast addresses of the form 'x.0.0.y' and 'x.128.0.y' since they --all map onto the range of Ethernet multicast addresses that are --flooded to all switch ports. -- --. Watch for address assignment conflicts --+ --http://www.iana.org/[IANA] administers --http://www.iana.org/assignments/multicast-addresses[Internet multicast --addresses]. Potential conflicts with Internet multicast address --assignments can be avoided by using --http://www.ietf.org/rfc/rfc3180.txt[GLOP addressing] --(http://en.wikipedia.org/wiki/Autonomous_system_%28Internet%29[AS] --required) or http://www.ietf.org/rfc/rfc2365.txt[administratively --scoped] addresses. Such addresses can be safely used on a network --connected to the Internet without fear of conflict with multicast --sources originating on the Internet. Administratively scoped addresses --are roughly analogous to the unicast address space for --http://www.ietf.org/rfc/rfc1918.txt[private internets]. Site-local --multicast addresses are of the form '239.255.x.y', but can grow down --to '239.252.x.y' if needed. 
Organization-local multicast addresses are --of the form '239.192-251.x.y', but can grow down to '239.x.y.z' if --needed. -- --For a more detailed treatment (57 pages!), see --http://www.cisco.com/en/US/tech/tk828/technologies_white_paper09186a00802d4643.shtml[Cisco's --Guidelines for Enterprise IP Multicast Address Allocation] paper. -- --=== Configuring Corosync === -- --ifdef::pcs[] -- --In the past, at this point in the tutorial an explanation of how to --configure and propagate corosync's /etc/corosync.conf file would be --necessary. Using pcs with the pcs daemon greatly simplifies this --process by generating 'corosync.conf' across all the nodes in the --cluster with a single command. The only thing required to achieve --this is to authenticate as the pcs user 'hacluster' on one of the --nodes in the cluster, and then issue the 'pcs cluster setup' command --with a list of all the node names in the cluster. -- --[source,C] ------ --# pcs cluster auth pcmk-1 pcmk-2 -+---- -+[root@pcmk-1 ~]# ssh pcmk-2 -- 'echo redhat1 | passwd --stdin hacluster' -+---- -+=========== -+ -+=== Configure Corosync === -+ -+On either node, use `pcs cluster auth` to authenticate as the *hacluster* user: -+ -+---- -+[root@pcmk-1 ~]# pcs cluster auth pcmk-1 pcmk-2 - Username: hacluster - Password: - pcmk-1: Authorized - pcmk-2: Authorized -- --# pcs cluster setup --name mycluster pcmk-1 pcmk-2 --pcmk-1: Succeeded --pcmk-2: Succeeded - ---- - --That's it. Corosync is configured across the cluster. If you --received an authorization error for either of those commands, make --sure you setup the 'hacluster' user account and password on every node --in the cluster with the same password. -- --endif::[] -- --ifdef::crmsh[] -- - [IMPORTANT] - =========== --The instructions below only apply for a machine with a single NIC. If you --have a more complicated setup, you should edit the configuration --manually. 
--=========== -+The version of pcs shipped with Fedora 21 will bind only to -+the host's IPv6 address in some circumstances. If you get errors -+with `pcs cluster auth`, add this line before the first *server.run* line in -++/usr/lib/pcsd/ssl.rb+ to bind to IPv4 only: - --[source,C] - ---- --# export ais_port=4000 --# export ais_mcast=239.255.1.1 -+webrick_options[:BindAddress] = '0.0.0.0' - ---- - --Next we automatically determine the hosts address. By not using the full --address, we make the configuration suitable to be copied to other nodes. -- --[source,Bash] -+And restart pcsd: - ---- --export ais_addr=`ip addr | grep "inet " | tail -n 1 | awk '{print $4}' | sed s/255/0/g` -+[root@pcmk-1 ~]# systemctl restart pcsd - ---- - --Display and verify the configuration options -+This is a temporary workaround that will get removed if the pcsd -+package is later updated. -+=========== - --[source,Bash] -+Next, use `pcs cluster setup` to generate and synchronize the corosync -+configuration: - ---- --# env | grep ais_ --ais_mcast=239.255.1.1 --ais_port=4000 --ais_addr=192.168.122.0 -+[root@pcmk-1 ~]# pcs cluster setup --name mycluster pcmk-1 pcmk-2 -+Shutting down pacemaker/corosync services... -+Redirecting to /bin/systemctl stop pacemaker.service -+Redirecting to /bin/systemctl stop corosync.service -+Killing any remaining services... -+Removing all cluster configuration files... -+pcmk-1: Succeeded -+pcmk-2: Succeeded - ---- - --Once you're happy with the chosen values, update the Corosync --configuration -+If you received an authorization error for either of those commands, make -+sure you configured the *hacluster* user account on each node -+with the same password. 
- --[source,C] ------ --# cp /etc/corosync/corosync.conf.example /etc/corosync/corosync.conf --# sed -i.bak "s/.*mcastaddr:.*/mcastaddr:\ $ais_mcast/g" /etc/corosync/corosync.conf --# sed -i.bak "s/.*mcastport:.*/mcastport:\ $ais_port/g" /etc/corosync/corosync.conf --# sed -i.bak "s/.*\tbindnetaddr:.*/bindnetaddr:\ $ais_addr/g" /etc/corosync/corosync.conf ------ -+[NOTE] -+====== -+Early versions of pcs, such as the one shipped with Fedora 20 and earlier, -+require that `--name` be omitted from the above command. - --Lastly, you'll need to enable quorum -+If using a different cluster shell such as crmsh rather than pcs, you must -+manually create a corosync.conf and copy it to all nodes. - --[source,Bash] ------- --cat << END >> /etc/corosync/corosync.conf --quorum { -- provider: corosync_votequorum -- expected_votes: 2 --} --END ------- -- --endif::[] -+The pcs command will configure corosync to use UDP unicast transport; if you -+choose to use multicast instead, choose a multicast address carefully. -+footnote:[For some subtle issues, see the now-defunct http://web.archive.org/web/20101211210054/http://29west.com/docs/THPM/multicast-address-assignment.html or the more detailed treatment in -+http://www.cisco.com/c/dam/en/us/support/docs/ip/ip-multicast/ipmlt_wp.pdf[Cisco's -+Guidelines for Enterprise IP Multicast Address Allocation] paper.] -+====== - - The final /etc/corosync.conf configuration on each node should look - something like the sample in Appendix B, Sample Corosync Configuration. - -- --[IMPORTANT] --=========== --Pacemaker used to obtain membership and quorum from a custom Corosync plugin. --This plugin also had the capability to start Pacemaker automatically when Corosync was started. -- --Neither behavior is possible with Corosync 2.0 and beyond as support for plugins was removed. --Instead, Pacemaker must be started as a separate service. 
-- --Also, since Pacemaker made use of the plugin for message routing, a node using the plugin (Corosync prior to 2.0) cannot talk to one that isn't (Corosync 2.0+). --Rolling upgrades between these versions are therefor not possible and an alternate strategy footnote:[http://www.clusterlabs.org/doc/en-US/Pacemaker/1.1/html/Pacemaker_Explained/ap-upgrade.html] must be used. --=========== -- --ifdef::crmsh[] --=== Propagate the Configuration === -- --Now we need to copy the changes so far to the other node: -- --[source,C] ------ --# for f in /etc/corosync/corosync.conf /etc/hosts; do scp $f pcmk-2:$f ; done --corosync.conf 100% 1528 1.5KB/s 00:00 --hosts 100% 281 0.3KB/s 00:00 --# ------ --endif::[] -+[NOTE] -+====== -+With versions of Corosync before 2.0, Pacemaker could obtain membership and -+quorum from a custom Corosync plugin. This plugin also had the capability to -+start Pacemaker automatically when Corosync was started. -+Neither behavior is possible with Corosync 2.0 and later, as support for -+plugins was removed. -+ -+Because Pacemaker made use of the plugin for message routing, a cluster node -+using an older Corosync cannot talk to one using Corosync 2.0 or later. -+Rolling upgrades between these versions are therefore not possible, and an -+alternate strategy -+footnote:[http://www.clusterlabs.org/doc/en-US/Pacemaker/1.1/html/Pacemaker_Explained/ap-upgrade.html] -+must be used. -+====== -diff --git a/doc/Clusters_from_Scratch/en-US/Ch-Intro.txt b/doc/Clusters_from_Scratch/en-US/Ch-Intro.txt -index a3a0329..7ed4f80 100644 ---- a/doc/Clusters_from_Scratch/en-US/Ch-Intro.txt -+++ b/doc/Clusters_from_Scratch/en-US/Ch-Intro.txt -@@ -7,7 +7,7 @@ resources. The redundancy of multiple machines is used to guard - against failures of many types. - - This document will walk through the installation and setup of simple --clusters using the Fedora distribution, version 20. -+clusters using the &DISTRO; distribution, version &DISTRO_VERSION;. 
- - The clusters described here will use Pacemaker and Corosync to provide - resource management and messaging. Required packages and modifications -@@ -23,146 +23,4 @@ under its control. - When more in depth information is required and for real world usage, - please refer to the http://www.clusterlabs.org/doc/[Pacemaker Explained] manual. - --== What Is Pacemaker? == -- --Pacemaker is a cluster resource manager. -- --It achieves maximum availability for your cluster services --(aka. resources) by detecting and recovering from node and --resource-level failures by making use of the messaging and membership --capabilities provided by your preferred cluster infrastructure (either --http://www.corosync.org/[Corosync] or --http://linux-ha.org/wiki/Heartbeat[Heartbeat]). -- --Pacemaker's key features include: -- -- * Detection and recovery of node and service-level failures -- * Storage agnostic, no requirement for shared storage -- * Resource agnostic, anything that can be scripted can be clustered -- * Supports STONITH for ensuring data integrity -- * Supports large and small clusters -- * Supports both quorate and resource driven clusters -- * Supports practically any redundancy configuration -- * Automatically replicated configuration that can be updated from any node -- * Ability to specify cluster-wide service ordering, colocation and anti-colocation -- * Support for advanced service types -- ** Clones: for services which need to be active on multiple nodes -- ** Multi-state: for services with multiple modes (eg. master/slave, primary/secondary) -- * Unified, scriptable, cluster management tools. -- --== Pacemaker Architecture == -- --At the highest level, the cluster is made up of three pieces: -- -- * Non-cluster aware components. These pieces -- include the resources themselves, scripts that start, stop and -- monitor them, and also a local daemon that masks the differences -- between the different standards these scripts implement. -- -- * Resource management. 
Pacemaker provides the brain that processes -- and reacts to events regarding the cluster. These events include -- nodes joining or leaving the cluster; resource events caused by -- failures, maintenance, scheduled activities; and other -- administrative actions. Pacemaker will compute the ideal state of -- the cluster and plot a path to achieve it after any of these -- events. This may include moving resources, stopping nodes and even -- forcing them offline with remote power switches. -- -- * Low level infrastructure. Projects like Corosync, CMAN and -- Heartbeat provide reliable messaging, membership and quorum -- information about the cluster. -- --When combined with Corosync, Pacemaker also supports popular open --source cluster filesystems. --footnote:[ --Even though Pacemaker also supports Heartbeat, the filesystems need to --use the stack for messaging and membership and Corosync seems to be --what they're standardizing on. -- --Technically it would be possible for them to support Heartbeat as --well, however there seems little interest in this. --] -- --Due to past standardization within the cluster filesystem community, --they make use of a common distributed lock manager which makes use of --Corosync for its messaging and membership capabilities (which nodes --are up/down) and Pacemaker for fencing services. -- --.The Pacemaker Stack --image::images/pcmk-stack.png["The Pacemaker stack",width="10cm",height="7.5cm",align="center"] -- --=== Internal Components === -- --Pacemaker itself is composed of five key components: -- -- * CIB (aka. Cluster Information Base) -- * CRMd (aka. Cluster Resource Management daemon) -- * LRMd (aka. Local Resource Management daemon) -- * PEngine (aka. 
PE or Policy Engine) -- * STONITHd -- --.Internal Components --image::images/pcmk-internals.png["Subsystems of a Pacemaker cluster",align="center",scaledwidth="65%"] -- --The CIB uses XML to represent both the cluster's configuration and --current state of all resources in the cluster. The contents of the CIB --are automatically kept in sync across the entire cluster and are used --by the PEngine to compute the ideal state of the cluster and how it --should be achieved. -- --This list of instructions is then fed to the DC (Designated --Controller). Pacemaker centralizes all cluster decision making by --electing one of the CRMd instances to act as a master. Should the --elected CRMd process, or the node it is on, fail... a new one is --quickly established. -- --The DC carries out the PEngine's instructions in the required order by --passing them to either the LRMd (Local Resource Management daemon) or --CRMd peers on other nodes via the cluster messaging infrastructure --(which in turn passes them on to their LRMd process). -- --The peer nodes all report the results of their operations back to the --DC and, based on the expected and actual results, will either execute --any actions that needed to wait for the previous one to complete, or --abort processing and ask the PEngine to recalculate the ideal cluster --state based on the unexpected results. -- --In some cases, it may be necessary to power off nodes in order to --protect shared data or complete resource recovery. For this Pacemaker --comes with STONITHd. -- --STONITH is an acronym for Shoot-The-Other-Node-In-The-Head and is --usually implemented with a remote power switch. -- --In Pacemaker, STONITH devices are modeled as resources (and configured --in the CIB) to enable them to be easily monitored for failure, however --STONITHd takes care of understanding the STONITH topology such that --its clients simply request a node be fenced and it does the rest. 
-- --== Types of Pacemaker Clusters == -- --Pacemaker makes no assumptions about your environment, this allows it --to support practically any --http://en.wikipedia.org/wiki/High-availability_cluster#Node_configurations[redundancy --configuration] including Active/Active, Active/Passive, N+1, N+M, --N-to-1 and N-to-N. -- --.Active/Passive Redundancy --image::images/pcmk-active-passive.png["Active/Passive Redundancy",width="10cm",height="7.5cm",align="center"] -- --Two-node Active/Passive clusters using Pacemaker and DRBD are a --cost-effective solution for many High Availability situations. -- --.Shared Failover --image::images/pcmk-shared-failover.png["Shared Failover",width="10cm",height="7.5cm",align="center"] -- --By supporting many nodes, Pacemaker can dramatically reduce hardware --costs by allowing several active/passive clusters to be combined and --share a common backup node -- --.N to N Redundancy --image::images/pcmk-active-active.png["N to N Redundancy",width="10cm",height="7.5cm",align="center"] -- --When shared storage is available, every node can potentially be used --for failover. Pacemaker can even run multiple copies of services to --spread out the workload. -+include::../../shared/en-US/pacemaker-intro.txt[] -diff --git a/doc/Clusters_from_Scratch/en-US/Ch-Shared-Storage.txt b/doc/Clusters_from_Scratch/en-US/Ch-Shared-Storage.txt -index cc2cec6..b5c87f5 100644 ---- a/doc/Clusters_from_Scratch/en-US/Ch-Shared-Storage.txt -+++ b/doc/Clusters_from_Scratch/en-US/Ch-Shared-Storage.txt -@@ -1,148 +1,63 @@ --= Replicated Storage with DRBD = -+= Replicate Storage Using DRBD = - --== Background == - Even if you're serving up static websites, having to manually synchronize - the contents of that website to all the machines in the cluster is not - ideal. For dynamic websites, such as a wiki, it's not even an option. Not --everyone care afford network-attached storage but somehow the data needs --to be kept in sync. 
Enter DRBD which can be thought of as network based --RAID-1. See http://www.drbd.org/ for more details. -+everyone can afford network-attached storage, but somehow the data needs -+to be kept in sync. -+ -+Enter DRBD, which can be thought of as network-based RAID-1. -+footnote:[See http://www.drbd.org/ for details.] - - == Install the DRBD Packages == - --Since its inclusion in the upstream 2.6.33 kernel, everything needed --to use DRBD has shiped with Fedora since version 13. All you need to --do is install it: -+DRBD itself is included in the upstream kernel, -+footnote:[Since version 2.6.33] -+but we do need some utilities to use it effectively. On both nodes, run: - --[source,C] -+---- - # yum install -y drbd-pacemaker drbd-udev -+---- - --..... --Loaded plugins: langpacks, presto, refresh-packagekit --Resolving Dependencies ----> Running transaction check -----> Package drbd-pacemaker.x86_64 0:8.3.11-5.fc17 will be installed ----> Processing Dependency: drbd-utils = 8.3.11-5.fc17 for package: drbd-pacemaker-8.3.11-5.fc17.x86_64 -----> Package drbd-udev.x86_64 0:8.3.11-5.fc17 will be installed ----> Running transaction check -----> Package drbd-utils.x86_64 0:8.3.11-5.fc17 will be installed ----> Finished Dependency Resolution -- --Dependencies Resolved -- --====================================================================================== -- Package Arch Version Repository Size --====================================================================================== --Installing: -- drbd-pacemaker x86_64 8.3.11-5.fc17 updates-testing 22 k -- drbd-udev x86_64 8.3.11-5.fc17 updates-testing 6.4 k --Installing for dependencies: -- drbd-utils x86_64 8.3.11-5.fc17 updates-testing 183 k -- --Transaction Summary --====================================================================================== --Install 2 Packages (+1 Dependent package) -- --Total download size: 212 k --Installed size: 473 k --Downloading Packages: --(1/3):
drbd-pacemaker-8.3.11-5.fc17.x86_64.rpm | 22 kB 00:00 --(2/3): drbd-udev-8.3.11-5.fc17.x86_64.rpm | 6.4 kB 00:00 --(3/3): drbd-utils-8.3.11-5.fc17.x86_64.rpm | 183 kB 00:00 ---------------------------------------------------------------------------------------- --Total 293 kB/s | 212 kB 00:00 --Running Transaction Check --Running Transaction Test --Transaction Test Succeeded --Running Transaction -- Installing : drbd-utils-8.3.11-5.fc17.x86_64 1/3 -- Installing : drbd-pacemaker-8.3.11-5.fc17.x86_64 2/3 -- Installing : drbd-udev-8.3.11-5.fc17.x86_64 3/3 -- Verifying : drbd-pacemaker-8.3.11-5.fc17.x86_64 1/3 -- Verifying : drbd-udev-8.3.11-5.fc17.x86_64 2/3 -- Verifying : drbd-utils-8.3.11-5.fc17.x86_64 3/3 -- --Installed: -- drbd-pacemaker.x86_64 0:8.3.11-5.fc17 drbd-udev.x86_64 0:8.3.11-5.fc17 -- --Dependency Installed: -- drbd-utils.x86_64 0:8.3.11-5.fc17 -- --Complete! --..... -- --== Configure DRBD == -- --Before we configure DRBD, we need to set aside some disk for it to use. -- --=== Create A Partition for DRBD === -+== Allocate a Disk Volume for DRBD == - --If you have more than 1Gb free, feel free to use it. For this guide --however, 1Gb is plenty of space for a single html file and sufficient for --later holding the GFS2 metadata. -+DRBD will need its own block device on each node. This can be -+a physical disk partition or logical volume, of whatever size -+you need for your data. For this document, we will use a -+1GiB logical volume, which is more than sufficient for a single HTML file and -+(later) GFS2 metadata. 
- --[source,C] - ---- --# vgdisplay | grep -e Name -e Free -- VG Name vg_pcmk1 -- Free PE / Size 31 / 992.00 MiB --# lvs -- LV VG Attr LSize Pool Origin Data% Move Log Copy% Convert -- lv_root vg_pcmk1 -wi-ao-- 8.56g -- lv_swap vg_pcmk1 -wi-ao-- 960.00m --# lvcreate -n drbd-demo -L 1G vg_pcmk1 -+[root@pcmk-1 ~]# vgdisplay | grep -e Name -e Free -+ VG Name fedora-server_pcmk-1 -+ Free PE / Size 511 / 2.00 GiB -+[root@pcmk-1 ~]# lvcreate --name drbd-demo --size 1G fedora-server_pcmk-1 - Logical volume "drbd-demo" created --# lvs -- LV VG Attr LSize Pool Origin Data% Move Log Copy% Convert -- drbd-demo vg_pcmk1 -wi-a--- 1.00G -- lv_root vg_pcmk1 -wi-ao-- 8.56g -- lv_swap vg_pcmk1 -wi-ao-- 960.00m -+[root@pcmk-1 ~]# lvs -+ LV VG Attr LSize Pool Origin Data% Meta% Move Log Cpy%Sync Convert -+ drbd-demo fedora-server_pcmk-1 -wi-a----- 1.00g -+ root fedora-server_pcmk-1 -wi-ao---- 5.00g -+ swap fedora-server_pcmk-1 -wi-ao---- 1.00g - ---- - --Repeat this on the second node, be sure to use the same size partition. -+Repeat this on the second node, making sure to use the same size. 
- --[source,C] - ---- --# ssh pcmk-2 -- lvs --LV VG Attr LSize Origin Snap% Move Log Copy% Convert -- lv_root vg_pcmk1 -wi-ao-- 8.56g -- lv_swap vg_pcmk1 -wi-ao-- 960.00m --# ssh pcmk-2 -- lvcreate -n drbd-demo -L 1G vg_pcmk1 -+[root@pcmk-1 ~]# ssh pcmk-2 -- lvcreate --name drbd-demo --size 1G fedora-server_pcmk-2 - Logical volume "drbd-demo" created --# ssh pcmk-2 -- lvs --LV VG Attr LSize Origin Snap% Move Log Copy% Convert -- drbd-demo vg_pcmk1 -wi-a--- 1.00G -- lv_root vg_pcmk1 -wi-ao-- 8.56g -- lv_swap vg_pcmk1 -wi-ao-- 960.00m - ---- - --=== Write the DRBD Config === -+== Configure DRBD == - - There is no series of commands for building a DRBD configuration, so simply --copy the configuration below to /etc/drbd.conf -- --Detailed information on the directives used in this configuration (and --other alternatives) is available from --http://www.drbd.org/users-guide/ch-configure.html -- --[WARNING] --========= -- --Be sure to use the names and addresses of your nodes if they differ from --the ones used in this guide. -- --========= -+run this on both nodes to use this sample configuration: - --.... --global { -- usage-count yes; --} --common { -- protocol C; --} -+---- -+# cat <<END >/etc/drbd.d/wwwdata.res - resource wwwdata { -+ protocol C; - meta-disk internal; -- device /dev/drbd1; -+ device /dev/drbd1; - syncer { - verify-alg sha1; - } -@@ -150,341 +65,285 @@ resource wwwdata { - allow-two-primaries; - } - on pcmk-1 { -- disk /dev/vg_pcmk1/drbd-demo; -+ disk /dev/fedora-server_pcmk-1/drbd-demo; - address 192.168.122.101:7789; - } - on pcmk-2 { -- disk /dev/vg_pcmk1/drbd-demo; -+ disk /dev/fedora-server_pcmk-2/drbd-demo; - address 192.168.122.102:7789; - } - } --.... -+END -+---- -+ -+[IMPORTANT] -+========= -+Edit the file to use the hostnames, IP addresses and logical volume paths -+of your nodes if they differ from the ones used in this guide.
-+========= - - [NOTE] - ======= -+Detailed information on the directives used in this configuration (and -+other alternatives) is available at -+http://www.drbd.org/users-guide/ch-configure.html - --TODO: Explain the reason for the allow-two-primaries option -- -+The *allow-two-primaries* option would not normally be used in -+an active/passive cluster. We are adding it here for the convenience -+of changing to an active/active cluster later. - ======= - --=== Initialize and Load DRBD === -+== Initialize DRBD == -+ -+With the configuration in place, we can now get DRBD running. - --With the configuration in place, we can now perform the DRBD --initialization -+These commands create the local metadata for the DRBD resource, -+ensure the DRBD kernel module is loaded, and bring up the DRBD resource. -+Run them on one node: - --[source,C] - ---- - # drbdadm create-md wwwdata --Writing meta data... - initializing activity log --NOT initialized bitmap -+NOT initializing bitmap -+Writing meta data... - New drbd meta data block successfully created. --success -+# modprobe drbd -+# drbdadm up wwwdata - ---- - --Now load the DRBD kernel module and confirm that everything is sane -+We can confirm DRBD's status on this node: - --[source,C] - ---- --# modprobe drbd --# drbdadm up wwwdata - # cat /proc/drbd --version: 8.3.11 (api:88/proto:86-96) --srcversion: 0D2B62DEDB020A425130935 -+version: 8.4.5 (api:1/proto:86-101) -+srcversion: 153833F4A69E341D3F3E707 - -- 1: cs:Connected ro:Secondary/Secondary ds:Inconsistent/Inconsistent C r----- -- ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:1015740 -+ 1: cs:WFConnection ro:Secondary/Unknown ds:Inconsistent/DUnknown C r----s -+ ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:1048508 - ---- - --Repeat on the second node -+Because we have not yet initialized the data, this node's data -+is marked as *Inconsistent*. 
Because we have not yet initialized -+the second node, the local state is *WFConnection* (waiting for connection), -+and the partner node's status is marked as *Unknown*. -+ -+Now, repeat the above commands on the second node. This time, -+when we check the status, it shows: - --[source,C] - ---- --# ssh pcmk-2 -- drbdadm --force create-md wwwdata --Writing meta data... --initializing activity log --NOT initialized bitmap --New drbd meta data block successfully created. --success --# ssh pcmk-2 -- modprobe drbd --WARNING: Deprecated config file /etc/modprobe.conf, all config files belong into /etc/modprobe.d/. --# ssh pcmk-2 -- drbdadm up wwwdata --# ssh pcmk-2 -- cat /proc/drbd --version: 8.3.11 (api:88/proto:86-96) --srcversion: 0D2B62DEDB020A425130935 -+# cat /proc/drbd -+version: 8.4.5 (api:1/proto:86-101) -+srcversion: 153833F4A69E341D3F3E707 - - 1: cs:Connected ro:Secondary/Secondary ds:Inconsistent/Inconsistent C r----- -- ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:1015740 -+ ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:1048508 - ---- - --Now we need to tell DRBD which set of data to use. Since both sides --contain garbage, we can run the following on pcmk-1: -+You can see the state has changed to *Connected*, meaning the two DRBD nodes -+are communicating properly, and both nodes are in *Secondary* role -+with *Inconsistent* data. -+ -+To make the data consistent, we need to tell DRBD which node should be -+considered to have the correct data. 
In this case, since we are creating -+a new resource, both have garbage, so we'll just pick pcmk-1 -+and run this command on it: - --[source,C] - ---- --# drbdadm -- --overwrite-data-of-peer primary wwwdata --# cat /proc/drbd --version: 8.3.11 (api:88/proto:86-96) --srcversion: 0D2B62DEDB020A425130935 -+[root@pcmk-1 ~]# drbdadm primary --force wwwdata -+---- -+ -+[NOTE] -+====== -+In DRBD 8.3 and earlier, the equivalent command is: -+---- -+[root@pcmk-1 ~]# drbdadm -- --overwrite-data-of-peer primary wwwdata -+---- -+====== -+ -+If we check the status immediately, we'll see something like this: -+---- -+[root@pcmk-1 ~]# cat /proc/drbd -+version: 8.4.5 (api:1/proto:86-101) -+srcversion: 153833F4A69E341D3F3E707 - - 1: cs:SyncSource ro:Primary/Secondary ds:UpToDate/Inconsistent C r----- -- ns:8064 nr:0 dw:0 dr:8728 al:0 bm:0 lo:0 pe:1 ua:0 ap:0 ep:1 wo:f oos:1007804 -- [>....................] sync'ed: 0.9% (1007804/1015740)K -- finish: 0:12:35 speed: 1,320 (1,320) K/sec -+ ns:2872 nr:0 dw:0 dr:3784 al:0 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:1045636 -+ [>....................] sync'ed: 0.4% (1045636/1048508)K -+ finish: 0:10:53 speed: 1,436 (1,436) K/sec - ---- - --After a while, the sync should finish and you'll see: -+We can see that this node has the *Primary* role, the partner node has -+the *Secondary* role, this node's data is now considered *UpToDate*, -+the partner node's data is still *Inconsistent*, and a progress bar -+shows how far along the partner node is in synchronizing the data. 
- --[source,C] -+After a while, the sync should finish, and you'll see something like: - ---- --# cat /proc/drbd --version: 8.3.11 (api:88/proto:86-96) --srcversion: 0D2B62DEDB020A425130935 -+[root@pcmk-1 ~]# cat /proc/drbd -+version: 8.4.5 (api:1/proto:86-101) -+srcversion: 153833F4A69E341D3F3E707 - - 1: cs:Connected ro:Primary/Secondary ds:UpToDate/UpToDate C r----- -- ns:1015740 nr:0 dw:0 dr:1016404 al:0 bm:62 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:0 -+ ns:1048508 nr:0 dw:0 dr:1049420 al:0 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:0 - ---- - --pcmk-1 is now in the Primary state which allows it to be written to. --Which means it's a good point at which to create a filesystem and populate --it with some data to serve up via our WebSite resource. -+Both sets of data are now *UpToDate*, and we can proceed to creating -+and populating a filesystem for our WebSite resource's documents. - -+== Populate the DRBD Disk == - --=== Populate DRBD with Data === -+On the node with the primary role (pcmk-1 in this example), -+create a filesystem on the DRBD device: - --[source,C] - ---- --# mkfs.ext4 /dev/drbd1 --mke2fs 1.42 (29-Nov-2011) --Filesystem label= --OS type: Linux --Block size=4096 (log=2) --Fragment size=4096 (log=2) --Stride=0 blocks, Stripe width=0 blocks --63488 inodes, 253935 blocks --12696 blocks (5.00%) reserved for the super user --First data block=0 --Maximum filesystem blocks=260046848 --8 block groups --32768 blocks per group, 32768 fragments per group --7936 inodes per group --Superblock backups stored on blocks: -+[root@pcmk-1 ~]# mkfs.ext4 /dev/drbd1 -+mke2fs 1.42.11 (09-Jul-2014) -+Creating filesystem with 262127 4k blocks and 65536 inodes -+Filesystem UUID: 26879260-9077-4d6d-ad69-7d31d3d8d8d4 -+Superblock backups stored on blocks: - 32768, 98304, 163840, 229376 - --Allocating group tables: done --Writing inode tables: done -+Allocating group tables: done -+Writing inode tables: done - Creating journal (4096 blocks): done - Writing superblocks and filesystem 
accounting information: done - ---- - --Now mount the newly created filesystem so we can create our index file -+[NOTE] -+==== -+In this example, we create an ext4 filesystem with no special options. -+In a production environment, you should choose a filesystem type and -+options that are suitable for your application. -+==== -+ -+Mount the newly created filesystem, populate it with our web document, -+then unmount it (the cluster will handle mounting and unmounting it later): - --[source,C] - ---- --# mount /dev/drbd1 /mnt/ --# cat <<-END >/mnt/index.html -+[root@pcmk-1 ~]# mount /dev/drbd1 /mnt -+[root@pcmk-1 ~]# cat <<-END >/mnt/index.html - -- My Test Site - drbd -+ My Test Site - DRBD - - END --# umount /dev/drbd1 -+[root@pcmk-1 ~]# umount /dev/drbd1 - ---- - --== Configure the Cluster for DRBD == -- --ifdef::pcs[] -+== Configure the Cluster for the DRBD device == - --One handy feature pcs has is the ability to queue up several changes -+One handy feature `pcs` has is the ability to queue up several changes - into a file and commit those changes atomically. To do this, start by --populating the file with the current raw xml config from the cib. This --can be done using the following command. -+populating the file with the current raw XML config from the CIB. - --[source,C] - ---- - # pcs cluster cib drbd_cfg - ---- - --Now using the pcs -f option, make changes to the configuration saved --in the drbd_cfg file. These changes will not be seen by the cluster until --the drbd_cfg file is pushed into the live cluster's cib later on. -+Using the `pcs -f` option, make changes to the configuration saved -+in the +drbd_cfg+ file. These changes will not be seen by the cluster until -+the +drbd_cfg+ file is pushed into the live cluster's CIB later. - --//// --source,C doesn't do well with \'s --//// -+Here, we create a cluster resource for the DRBD device, and an additional _clone_ -+resource to allow the resource to run on both nodes at the same time. 
- - ---- --# pcs -f drbd_cfg resource create WebData ocf:linbit:drbd \ -+[root@pcmk-1 ~]# pcs -f drbd_cfg resource create WebData ocf:linbit:drbd \ - drbd_resource=wwwdata op monitor interval=60s --# pcs -f drbd_cfg resource master WebDataClone WebData \ -+[root@pcmk-1 ~]# pcs -f drbd_cfg resource master WebDataClone WebData \ - master-max=1 master-node-max=1 clone-max=2 clone-node-max=1 \ - notify=true ------ --[source,C] ------ --# pcs -f drbd_cfg resource show -- ClusterIP (ocf::heartbeat:IPaddr2) Started -- WebSite (ocf::heartbeat:apache) Started -+[root@pcmk-1 ~]# pcs -f drbd_cfg resource show -+ ClusterIP (ocf::heartbeat:IPaddr2): Started -+ WebSite (ocf::heartbeat:apache): Started - Master/Slave Set: WebDataClone [WebData] -- Stopped: [ WebData:0 WebData:1 ] -+ Stopped: [ pcmk-1 pcmk-2 ] - ---- - --After you are satisfied with all the changes, you can commit all --the changes at once by pushing the drbd_cfg file into the live --cib. -+After you are satisfied with all the changes, you can commit -+them all at once by pushing the drbd_cfg file into the live CIB. - --[source,C] - ---- --# pcs cluster cib-push drbd_cfg -+[root@pcmk-1 ~]# pcs cluster cib-push drbd_cfg - CIB updated -+---- - --# pcs status -+[NOTE] -+==== -+Early versions of `pcs` required `push cib` in place of `cib-push` above. -+==== - --Last updated: Fri Sep 14 12:19:49 2012 --Last change: Fri Sep 14 12:19:13 2012 via cibadmin on pcmk-1 -+Let's see what the cluster did with the new configuration: -+---- -+[root@pcmk-1 ~]# pcs status -+Cluster name: mycluster -+Last updated: Wed Dec 17 16:39:43 2014 -+Last change: Wed Dec 17 16:39:30 2014 - Stack: corosync - Current DC: pcmk-2 (2) - partition with quorum --Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 --2 Nodes configured, unknown expected votes --4 Resources configured. 
-+Version: 1.1.12-a9c8177 -+2 Nodes configured -+4 Resources configured -+ - - Online: [ pcmk-1 pcmk-2 ] - - Full list of resources: - -- ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 -- WebSite (ocf::heartbeat:apache): Started pcmk-1 -+ ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 -+ WebSite (ocf::heartbeat:apache): Started pcmk-1 - Master/Slave Set: WebDataClone [WebData] - Masters: [ pcmk-1 ] - Slaves: [ pcmk-2 ] ------ --endif::[] -- --ifdef::crmsh[] --One handy feature of the crm shell is that you can use it in --interactive mode to make several changes atomically. -- --First we launch the shell. The prompt will change to indicate you're --in interactive mode. -- --[source,C] ------ --# crm --crm(live) # ------ -- --Next we must create a working copy of the current configuration. This is --where all our changes will go. The cluster will not see any of them until --we say it's ok. Notice again how the prompt changes, this time to indicate --that we're no longer looking at the live cluster. -- --[source,C] ------ --cib crm(live) # cib new drbd --INFO: drbd shadow CIB created --crm(drbd) # ------ -- --Now we can create our DRBD clone and display the revised configuration. 
-- --[source,C] ------ --crm(drbd) # configure primitive WebData ocf:linbit:drbd params drbd_resource=wwwdata \ -- op monitor interval=60s --crm(drbd) # configure ms WebDataClone WebData meta master-max=1 master-node-max=1 \ -- clone-max=2 clone-node-max=1 notify=true --crm(drbd) # configure show --node $id="1702537408" pcmk-1 --node $id="1719314624" pcmk-2 --primitive ClusterIP ocf:heartbeat:IPaddr2 \ -- params ip="192.168.122.120" cidr_netmask="32" \ -- op monitor interval="30s" --primitive WebData ocf:linbit:drbd \ -- params drbd_resource="wwwdata" \ -- op monitor interval="60s" --primitive WebSite ocf:heartbeat:apache \ -- params configfile="/etc/httpd/conf/httpd.conf" \ -- op monitor interval="1min" --ms WebDataClone WebData \ -- meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" --location prefer-pcmk-1 WebSite 50: pcmk-1 --colocation website-with-ip inf: WebSite ClusterIP --order apache-after-ip inf: ClusterIP WebSite --property $id="cib-bootstrap-options" \ -- dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ -- cluster-infrastructure="corosync" \ -- stonith-enabled="false" \ -- no-quorum-policy="ignore" \ -- last-lrm-refresh="1333446866" --rsc_defaults $id="rsc-options" \ -- resource-stickiness="100" --op_defaults $id="op-options" \ -- timeout="240s" ------ -- --Once we're happy with the changes, we can tell the cluster to start using --them and use crm_mon to check everything is functioning. -- --[source,C] ------ --crm(drbd) # cib commit drbd --INFO: commited 'drbd' shadow CIB to the cluster --crm(drbd) # quit --bye --# crm_mon -1 --============ --Last updated: Tue Apr 3 13:50:01 2012 --Last change: Tue Apr 3 13:49:46 2012 via crm_shadow on pcmk-1 --Stack: corosync --Current DC: pcmk-1 (1702537408) - partition with quorum --Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff --2 Nodes configured, unknown expected votes --4 Resources configured. 
--============ - --Online: [ pcmk-1 pcmk-2 ] -+PCSD Status: -+ pcmk-1: Online -+ pcmk-2: Online - -- ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 -- WebSite (ocf::heartbeat:apache): Started pcmk-1 -- Master/Slave Set: WebDataClone [WebData] -- Masters: [ pcmk-1 ] -- Slaves: [ pcmk-2 ] -+Daemon Status: -+ corosync: active/disabled -+ pacemaker: active/disabled -+ pcsd: active/enabled - ---- --endif::[] - --[NOTE] --======= -+We can see that *WebDataClone* (our DRBD device) is running as master (DRBD's -+primary role) on *pcmk-1* and slave (DRBD's secondary role) on *pcmk-2*. - --TODO: Include details on adding a second DRBD resource -+[IMPORTANT] -+==== -+The resource agent should load the DRBD module when needed if it's not already -+loaded. If that does not happen, configure your operating system to load the -+module at boot time. For Fedora 21, you would run this on both nodes: -+---- -+# echo drbd >/etc/modules-load.d/drbd.conf -+---- -+==== - --======= -+== Configure the Cluster for the Filesystem == - --Now that DRBD is functioning we can configure a Filesystem resource to --use it. In addition to the filesystem's definition, we also need to -+Now that we have a working DRBD device, we need to mount its filesystem. -+ -+In addition to defining the filesystem, we also need to - tell the cluster where it can be located (only on the DRBD Primary) - and when it is allowed to start (after the Primary was promoted). - --ifdef::pcs[] --We are going to take a shortcut when creating the resource this time though. --Instead of explicitly saying we want the 'ocf:heartbeat:Filesystem' script, we --are only going to ask for 'Filesystem'. We can do this because we know there is only --one resource script named 'Filesystem' available to pacemaker, and that pcs is smart --enough to fill in the 'ocf:heartbeat' portion for us correctly in the configuration. 
--If there were multiple 'Filesystem' scripts from different ocf providers, we would need --to specify the exact one we wanted to use. -+We are going to take a shortcut when creating the resource this time. -+Instead of explicitly saying we want the *ocf:heartbeat:Filesystem* script, we -+are only going to ask for *Filesystem*. We can do this because we know there is only -+one resource script named *Filesystem* available to Pacemaker, and that `pcs` is smart -+enough to fill in the *ocf:heartbeat:* portion for us correctly in the configuration. -+If there were multiple *Filesystem* scripts from different OCF providers, we would need -+to specify the exact one we wanted. - --Once again we will queue up our changes to a file and then push the -+Once again, we will queue our changes to a file and then push the - new configuration to the cluster as the final step. - - ---- --# pcs cluster cib fs_cfg --# pcs -f fs_cfg resource create WebFS Filesystem \ -- device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" \ -+[root@pcmk-1 ~]# pcs cluster cib fs_cfg -+[root@pcmk-1 ~]# pcs -f fs_cfg resource create WebFS Filesystem \ -+ device="/dev/drbd1" directory="/var/www/html" \ - fstype="ext4" ------ --[source,C] ------ --# pcs -f fs_cfg constraint colocation add WebFS WebDataClone INFINITY with-rsc-role=Master --# pcs -f fs_cfg constraint order promote WebDataClone then start WebFS -+[root@pcmk-1 ~]# pcs -f fs_cfg constraint colocation add WebFS with WebDataClone INFINITY with-rsc-role=Master -+[root@pcmk-1 ~]# pcs -f fs_cfg constraint order promote WebDataClone then start WebFS - Adding WebDataClone WebFS (kind: Mandatory) (Options: first-action=promote then-action=start) - ---- - -@@ -492,287 +351,159 @@ We also need to tell the cluster that Apache needs to run on the same - machine as the filesystem and that it must be active before Apache can - start.
- --[source,C] - ---- --# pcs -f fs_cfg constraint colocation add WebSite WebFS INFINITY --# pcs -f fs_cfg constraint order WebFS then WebSite -+[root@pcmk-1 ~]# pcs -f fs_cfg constraint colocation add WebSite with WebFS INFINITY -+[root@pcmk-1 ~]# pcs -f fs_cfg constraint order WebFS then WebSite -+Adding WebFS WebSite (kind: Mandatory) (Options: first-action=start then-action=start) - ---- - --Now review the updated configuration. -+Review the updated configuration. - --[source,C] - ---- --# pcs -f fs_cfg constraint -+[root@pcmk-1 ~]# pcs -f fs_cfg constraint - Location Constraints: - Ordering Constraints: -- start ClusterIP then start WebSite -- WebFS then WebSite -- promote WebDataClone then start WebFS -+ start ClusterIP then start WebSite (kind:Mandatory) -+ promote WebDataClone then start WebFS (kind:Mandatory) -+ start WebFS then start WebSite (kind:Mandatory) - Colocation Constraints: -- WebSite with ClusterIP -- WebFS with WebDataClone (with-rsc-role:Master) -- WebSite with WebFS -- --# pcs -f fs_cfg resource show -- ClusterIP (ocf::heartbeat:IPaddr2) Started -- WebSite (ocf::heartbeat:apache) Started -+ WebSite with ClusterIP (score:INFINITY) -+ WebFS with WebDataClone (score:INFINITY) (with-rsc-role:Master) -+ WebSite with WebFS (score:INFINITY) -+[root@pcmk-1 ~]# pcs -f fs_cfg resource show -+ ClusterIP (ocf::heartbeat:IPaddr2): Started -+ WebSite (ocf::heartbeat:apache): Started - Master/Slave Set: WebDataClone [WebData] - Masters: [ pcmk-1 ] - Slaves: [ pcmk-2 ] -- WebFS (ocf::heartbeat:Filesystem) Stopped ------ -- --endif::[] -- --ifdef::crmsh[] --Once again we'll use the shell's interactive mode -- --[source,C] ------ --# crm --crm(live) # cib new fs --INFO: fs shadow CIB created --crm(fs) # configure primitive WebFS ocf:heartbeat:Filesystem \ -- params device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" fstype="ext4" --crm(fs) # configure colocation fs_on_drbd inf: WebFS WebDataClone:Master --crm(fs) # configure order WebFS-after-WebData 
inf: WebDataClone:promote WebFS:start -+ WebFS (ocf::heartbeat:Filesystem): Stopped - ---- - --We also need to tell the cluster that Apache needs to run on the same --machine as the filesystem and that it must be active before Apache can --start. -- --[source,C] ------ --crm(fs) # configure colocation WebSite-with-WebFS inf: WebSite WebFS --crm(fs) # configure order WebSite-after-WebFS inf: WebFS WebSite ------ -- --Time to review the updated configuration: -- --[source,C] ------ --crm(fs) # configure show --node $id="1702537408" pcmk-1 --node $id="1719314624" pcmk-2 --primitive ClusterIP ocf:heartbeat:IPaddr2 \ -- params ip="192.168.122.120" cidr_netmask="32" \ -- op monitor interval="30s" --primitive WebData ocf:linbit:drbd \ -- params drbd_resource="wwwdata" \ -- op monitor interval="60s" --primitive WebFS ocf:heartbeat:Filesystem \ -- params device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" fstype="ext4" --primitive WebSite ocf:heartbeat:apache \ -- params configfile="/etc/httpd/conf/httpd.conf" \ -- op monitor interval="1min" --ms WebDataClone WebData \ -- meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" --location prefer-pcmk-1 WebSite 50: pcmk-1 --colocation WebSite-with-WebFS inf: WebSite WebFS --colocation fs_on_drbd inf: WebFS WebDataClone:Master --colocation website-with-ip inf: WebSite ClusterIP --order WebFS-after-WebData inf: WebDataClone:promote WebFS:start --order WebSite-after-WebFS inf: WebFS WebSite --order apache-after-ip inf: ClusterIP WebSite --property $id="cib-bootstrap-options" \ -- dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ -- cluster-infrastructure="corosync" \ -- stonith-enabled="false" \ -- no-quorum-policy="ignore" \ -- last-lrm-refresh="1333446866" --rsc_defaults $id="rsc-options" \ -- resource-stickiness="100" --op_defaults $id="op-options" \ -- timeout="240s" ------ --endif::[] -- --After reviewing the new configuration, we again upload it and watch the 
-+After reviewing the new configuration, upload it and watch the - cluster put it into effect. - --ifdef::pcs[] --[source,C] - ---- --# pcs cluster cib-push fs_cfg --CIB updated --# pcs status -- Last updated: Fri Aug 10 12:47:01 2012 -+[root@pcmk-1 ~]# pcs cluster cib-push fs_cfg -+[root@pcmk-1 ~]# pcs status -+Cluster name: mycluster -+Last updated: Wed Dec 17 17:02:45 2014 -+Last change: Wed Dec 17 17:02:42 2014 -+Stack: corosync -+Current DC: pcmk-2 (2) - partition with quorum -+Version: 1.1.12-a9c8177 -+2 Nodes configured -+5 Resources configured - -- Last change: Fri Aug 10 12:46:55 2012 via cibadmin on pcmk-1 -- Stack: corosync -- Current DC: pcmk-1 (1) - partition with quorum -- Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 -- 2 Nodes configured, unknown expected votes -- 5 Resources configured. - - Online: [ pcmk-1 pcmk-2 ] - - Full list of resources: - -- ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 -- WebSite (ocf::heartbeat:apache): Started pcmk-1 -+ ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 -+ WebSite (ocf::heartbeat:apache): Started pcmk-1 - Master/Slave Set: WebDataClone [WebData] - Masters: [ pcmk-1 ] - Slaves: [ pcmk-2 ] -- WebFS (ocf::heartbeat:Filesystem): Started pcmk-1 ------ --endif::[] -+ WebFS (ocf::heartbeat:Filesystem): Started pcmk-1 - --ifdef::crmsh[] --[source,C] ------ --crm(fs) # cib commit fs --INFO: commited 'fs' shadow CIB to the cluster --crm(fs) # quit --bye --# crm_mon -1 --============ --Last updated: Tue Apr 3 13:52:21 2012 --Last change: Tue Apr 3 13:52:06 2012 via crm_shadow on pcmk-1 --Stack: corosync --Current DC: pcmk-1 (1702537408) - partition with quorum --Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff --2 Nodes configured, unknown expected votes --5 Resources configured. 
--============ -- --Online: [ pcmk-1 pcmk-2 ] -+PCSD Status: -+ pcmk-1: Online -+ pcmk-2: Online - -- ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 -- WebSite (ocf::heartbeat:apache): Started pcmk-1 -- Master/Slave Set: WebDataClone [WebData] -- Masters: [ pcmk-1 ] -- Slaves: [ pcmk-2 ] -- WebFS (ocf::heartbeat:Filesystem): Started pcmk-1 -+Daemon Status: -+ corosync: active/disabled -+ pacemaker: active/disabled -+ pcsd: active/enabled - ---- --endif::[] - --=== Testing Migration === -+== Test Cluster Failover == - --We could shut down the active node again, but another way to safely --simulate recovery is to put the node into what is called "standby --mode". Nodes in this state tell the cluster that they are not allowed --to run resources. Any resources found active there will be moved --elsewhere. This feature can be particularly useful when updating the --resources' packages. -+Previously, we used `pcs cluster stop pcmk-1` to stop all cluster -+services on *pcmk-1*, failing over the cluster resources, but there is another -+way to safely simulate node failure. - --Put the local node into standby mode and observe the cluster move all --the resources to the other node. Note also that the node's status will -+We can put the node into _standby mode_. Nodes in this state continue to -+run corosync and pacemaker but are not allowed to run resources. Any resources -+found active there will be moved elsewhere. This feature can be particularly -+useful when performing system administration tasks such as updating packages -+used by cluster resources. -+ -+Put the active node into standby mode, and observe the cluster move all -+the resources to the other node. The node's status will - change to indicate that it can no longer host resources. 
- --ifdef::pcs[] --[source,C] - ---- --# pcs cluster standby pcmk-1 --# pcs status -- --Last updated: Fri Sep 14 12:41:12 2012 --Last change: Fri Sep 14 12:41:08 2012 via crm_attribute on pcmk-1 -+[root@pcmk-1 ~]# pcs cluster standby pcmk-1 -+[root@pcmk-1 ~]# pcs status -+Cluster name: mycluster -+Last updated: Wed Dec 17 17:14:05 2014 -+Last change: Wed Dec 17 17:14:02 2014 - Stack: corosync --Current DC: pcmk-1 (1) - partition with quorum --Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 --2 Nodes configured, unknown expected votes --5 Resources configured. -+Current DC: pcmk-2 (2) - partition with quorum -+Version: 1.1.12-a9c8177 -+2 Nodes configured -+5 Resources configured -+ - - Node pcmk-1 (1): standby - Online: [ pcmk-2 ] - - Full list of resources: - --ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 --WebSite (ocf::heartbeat:apache): Started pcmk-2 -+ ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 -+ WebSite (ocf::heartbeat:apache): Started pcmk-2 - Master/Slave Set: WebDataClone [WebData] - Masters: [ pcmk-2 ] -- Stopped: [ WebData:1 ] --WebFS (ocf::heartbeat:Filesystem): Started pcmk-2 ------ --endif::[] -+ Stopped: [ pcmk-1 ] -+ WebFS (ocf::heartbeat:Filesystem): Started pcmk-2 - --ifdef::crmsh[] --[source,C] ------ --# crm node standby --# crm_mon -1 --============ --Last updated: Tue Apr 3 13:59:14 2012 --Last change: Tue Apr 3 13:52:36 2012 via crm_attribute on pcmk-1 --Stack: corosync --Current DC: pcmk-1 (1702537408) - partition with quorum --Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff --2 Nodes configured, unknown expected votes --5 Resources configured. 
--============ -- --Node pcmk-1 (1702537408): standby --Online: [ pcmk-2 ] -+PCSD Status: -+ pcmk-1: Online -+ pcmk-2: Online - --ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 --WebSite (ocf::heartbeat:apache): Started pcmk-2 -- Master/Slave Set: WebDataClone [WebData] -- Masters: [ pcmk-2 ] -- Stopped: [ WebData:1 ] --WebFS (ocf::heartbeat:Filesystem): Started pcmk-2 -+Daemon Status: -+ corosync: active/disabled -+ pacemaker: active/disabled -+ pcsd: active/enabled - ---- --endif::[] - - Once we've done everything we needed to on pcmk-1 (in this case nothing, - we just wanted to see the resources move), we can allow the node to be a - full cluster member again. - --ifdef::pcs[] --[source,C] - ---- --# pcs cluster unstandby pcmk-1 --# pcs status -- --Last updated: Fri Sep 14 12:43:02 2012 --Last change: Fri Sep 14 12:42:57 2012 via crm_attribute on pcmk-1 -+[root@pcmk-1 ~]# pcs cluster unstandby pcmk-1 -+[root@pcmk-1 ~]# pcs status -+Cluster name: mycluster -+Last updated: Wed Dec 17 17:15:36 2014 -+Last change: Wed Dec 17 17:15:33 2014 - Stack: corosync --Current DC: pcmk-1 (1) - partition with quorum --Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 --2 Nodes configured, unknown expected votes --5 Resources configured. 
-+Current DC: pcmk-2 (2) - partition with quorum -+Version: 1.1.12-a9c8177 -+2 Nodes configured -+5 Resources configured -+ - - Online: [ pcmk-1 pcmk-2 ] - - Full list of resources: - -- ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 -- WebSite (ocf::heartbeat:apache): Started pcmk-2 -+ ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 -+ WebSite (ocf::heartbeat:apache): Started pcmk-2 - Master/Slave Set: WebDataClone [WebData] - Masters: [ pcmk-2 ] - Slaves: [ pcmk-1 ] -- WebFS (ocf::heartbeat:Filesystem): Started pcmk-2 ------ --endif::[] -- --ifdef::crmsh[] --[source,C] ------ --# crm node online --# crm_mon -1 --============ --Last updated: Tue Apr 3 14:00:06 2012 --Last change: Tue Apr 3 14:00:00 2012 via crm_attribute on pcmk-1 --Stack: corosync --Current DC: pcmk-1 (1702537408) - partition with quorum --Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff --2 Nodes configured, unknown expected votes --5 Resources configured. --============ -+ WebFS (ocf::heartbeat:Filesystem): Started pcmk-2 - --Online: [ pcmk-1 pcmk-2 ] -+PCSD Status: -+ pcmk-1: Online -+ pcmk-2: Online - -- ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 -- WebSite (ocf::heartbeat:apache): Started pcmk-2 -- Master/Slave Set: WebDataClone [WebData] -- Masters: [ pcmk-2 ] -- Slaves: [ pcmk-1 ] -- WebFS (ocf::heartbeat:Filesystem): Started pcmk-2 -+Daemon Status: -+ corosync: active/disabled -+ pacemaker: active/disabled -+ pcsd: active/enabled - ---- --endif::[] -- --Notice that our resource stickiness settings prevent the services from --migrating back to pcmk-1. - -+Notice that *pcmk-1* is back to the *Online* state, and that the cluster resources -+stay where they are due to our resource stickiness settings configured earlier. 
-diff --git a/doc/Clusters_from_Scratch/en-US/Ch-Stonith.txt b/doc/Clusters_from_Scratch/en-US/Ch-Stonith.txt -index 9518fc2..0ad6c2e 100644 ---- a/doc/Clusters_from_Scratch/en-US/Ch-Stonith.txt -+++ b/doc/Clusters_from_Scratch/en-US/Ch-Stonith.txt -@@ -1,29 +1,27 @@ - = Configure STONITH = - --== What Is STONITH == -+== What is STONITH? == - --STONITH is an acronym for Shoot-The-Other-Node-In-The-Head and it --protects your data from being corrupted by rogue nodes or concurrent --access. -+STONITH (Shoot The Other Node In The Head, a.k.a. fencing) protects your data from -+being corrupted by rogue nodes or unintended concurrent access. - --Just because a node is unresponsive, this doesn't mean it isn't -+Just because a node is unresponsive doesn't mean it has stopped - accessing your data. The only way to be 100% sure that your data is --safe, is to use STONITH so we can be certain that the node is truly --offline, before allowing the data to be accessed from another node. -- -+safe is to use STONITH to ensure that the node is truly -+offline before allowing the data to be accessed from another node. - - STONITH also has a role to play in the event that a clustered service - cannot be stopped. In this case, the cluster uses STONITH to force the - whole node offline, thereby making it safe to start the service - elsewhere. - --== What STONITH Device Should You Use == -+== Choose a STONITH Device == - --It is crucial that the STONITH device can allow the cluster to --differentiate between a node failure and a network one. -+It is crucial that your STONITH device can allow the cluster to -+differentiate between a node failure and a network failure. - - The biggest mistake people make in choosing a STONITH device is to --use remote power switch (such as many on-board IMPI controllers) that -+use a remote power switch (such as many on-board IPMI controllers) that - shares power with the node it controls.
In such cases, the cluster - cannot be sure if the node is really offline, or active and suffering - from a network fault. -@@ -31,278 +29,112 @@ from a network fault. - Likewise, any device that relies on the machine being active (such as - SSH-based "devices" used during testing) are inappropriate. - --== Configuring STONITH == -- --ifdef::pcs[] --. Find the correct driver: +pcs stonith list+ -+== Configure the Cluster for STONITH == - --. Find the parameters associated with the device: +pcs stonith describe + -+. Configure the STONITH device itself to be able to fence your nodes and accept -+ fencing requests. - --. Create a local config to make changes to +pcs cluster cib stonith_cfg+ -+. Install the STONITH agent(s). To see what packages are available, run `yum -+ search fence-agents fence-virt`. Be sure to install the package(s) on all -+ cluster nodes. - --. Create the fencing resource using +pcs -f stonith_cfg stonith create -- [stonith device options]+ -+. Find the correct STONITH agent script: `pcs stonith list` - --. Set stonith-enable to true. +pcs -f stonith_cfg property set stonith-enabled=true+ --endif::[] -+. Find the parameters associated with the device: +pcs stonith describe pass:[agent_name]+ - --ifdef::crmsh[] --. Find the correct driver: +stonith_admin --list-installed+ -+. Create a local copy of the CIB: `pcs cluster cib stonith_cfg` - --. Since every device is different, the parameters needed to configure -- it will vary. To find out the parameters associated with the device, -- run: +stonith_admin --metadata --agent type+ -+. Create the fencing resource: +pcs -f stonith_cfg stonith create pass:[stonith_id -+ stonith_device_type [stonith_device_options]]+ - -- The output should be XML formatted text containing additional -- parameter descriptions. We will endevor to make the output more -- friendly in a later version. -- --. 
Enter the shell crm Create an editable copy of the existing -- configuration +cib new stonith+ Create a fencing resource containing a -- primitive resource with a class of stonith, a type of type and a -- parameter for each of the values returned in step 2: +configure -- primitive ...+ --endif::[] -+. Enable STONITH in the cluster: `pcs -f stonith_cfg property set stonith-enabled=true` - - . If the device does not know how to fence nodes based on their uname, -- you may also need to set the special +pcmk_host_map+ parameter. See -- +man stonithd+ for details. -+ you may also need to set the special *pcmk_host_map* parameter. See -+ `man stonithd` for details. - --. If the device does not support the list command, you may also need -- to set the special +pcmk_host_list+ and/or +pcmk_host_check+ -- parameters. See +man stonithd+ for details. -+. If the device does not support the *list* command, you may also need -+ to set the special *pcmk_host_list* and/or *pcmk_host_check* -+ parameters. See `man stonithd` for details. - - . If the device does not expect the victim to be specified with the -- port parameter, you may also need to set the special -- +pcmk_host_argument+ parameter. See +man stonithd+ for details. -- --ifdef::crmsh[] --. Upload it into the CIB from the shell: +cib commit stonith+ --endif::[] -+ *port* parameter, you may also need to set the special -+ *pcmk_host_argument* parameter. See `man stonithd` for details. - --ifdef::pcs[] --. Commit the new configuration. +pcs cluster cib-push stonith_cfg+ --endif::[] -+. Commit the new configuration: `pcs cluster cib-push stonith_cfg` - --. Once the stonith resource is running, you can test it by executing: -- +stonith_admin --reboot nodename+. Although you might want to stop the -- cluster on that machine first. -+. 
Once the STONITH resource is running, test it (you might want to stop -+ the cluster on that machine first): +stonith_admin --reboot pass:[nodename]+ - - == Example == - --Assuming we have an chassis containing four nodes and an IPMI device --active on 10.0.0.1, then we would chose the fence_ipmilan driver in step --2 and obtain the following list of parameters -+For this example, assume we have a chassis containing four nodes -+and an IPMI device active on 10.0.0.1. Following the steps above -+would go something like this: - --.Obtaining a list of STONITH Parameters -+Step 1: Configure the IP address, authentication credentials, etc. in the IPMI device itself. - --ifdef::pcs[] --[source,C] -+Step 2: Install the *fence-agents-ipmilan* package on both nodes. -+ -+Step 3: Choose the *fence_ipmilan* STONITH agent. -+ -+Step 4: Obtain the agent's possible parameters: - ---- --# pcs stonith describe fence_ipmilan -+[root@pcmk-1 ~]# pcs stonith describe fence_ipmilan - Stonith options for: fence_ipmilan -- auth: IPMI Lan Auth type (md5, password, or none) -- ipaddr: IPMI Lan IP to talk to -- passwd: Password (if required) to control power on IPMI device -- passwd_script: Script to retrieve password (if required) -- lanplus: Use Lanplus -- login: Username/Login (if required) to control power on IPMI device -- action: Operation to perform. Valid operations: on, off, reboot, status, list, diag, monitor or metadata -- timeout: Timeout (sec) for IPMI operation -+ ipport: TCP/UDP port to use for connection with device -+ inet6_only: Forces agent to use IPv6 addresses only -+ ipaddr (required): IP Address or Hostname -+ passwd_script: Script to retrieve password -+ method: Method to fence (onoff|cycle) -+ inet4_only: Forces agent to use IPv4 addresses only -+ passwd: Login password or passphrase -+ lanplus: Use Lanplus to improve security of connection -+ auth: IPMI Lan Auth type. 
- cipher: Ciphersuite to use (same as ipmitool -C parameter) -- method: Method to fence (onoff or cycle) -- power_wait: Wait X seconds after on/off operation -- delay: Wait X seconds before fencing is started - privlvl: Privilege level on IPMI device -+ action (required): Fencing Action -+ login: Login Name - verbose: Verbose mode -+ debug: Write debug information to given file -+ version: Display version information and exit -+ help: Display help and exit -+ power_wait: Wait X seconds after issuing ON/OFF -+ login_timeout: Wait X seconds for cmd prompt after login -+ power_timeout: Test X seconds for status change after ON/OFF -+ delay: Wait X seconds before fencing is started -+ ipmitool_path: Path to ipmitool binary -+ shell_timeout: Wait X seconds for cmd prompt after issuing command -+ retry_on: Count of attempts to retry power on -+ sudo: Use sudo (without password) when calling 3rd party sotfware. -+ stonith-timeout: How long to wait for the STONITH action (reboot, on, off) to complete per a stonith device. -+ priority: The priority of the stonith resource. Devices are tried in order of highest priority to lowest. -+ pcmk_host_map: A mapping of host names to ports numbers for devices that do not support host names. -+ pcmk_host_list: A list of machines controlled by this device (Optional unless pcmk_host_check=static-list). -+ pcmk_host_check: How to determine which machines are controlled by the device. - ---- --endif::[] -- --ifdef::crmsh[] --[source,C] ------ --# stonith_admin --metadata -a fence_ipmilan ------ --[source,XML] ------ -- -- -- --fence_ipmilan is an I/O Fencing agent which can be used with machines controlled by IPMI. This agent calls support software using ipmitool (http://ipmitool.sf.net/). 
-- --To use fence_ipmilan with HP iLO 3 you have to enable lanplus option (lanplus / -P) and increase wait after operation to 4 seconds (power_wait=4 / -T 4) -- -- -- -- -- IPMI Lan Auth type (md5, password, or none) -- -- -- -- -- IPMI Lan IP to talk to -- -- -- -- -- Password (if required) to control power on IPMI device -- -- -- -- -- Script to retrieve password (if required) -- -- -- -- -- Use Lanplus -- -- -- -- -- Username/Login (if required) to control power on IPMI device -- -- -- -- -- Operation to perform. Valid operations: on, off, reboot, status, list, diag, monitor or metadata -- -- -- -- -- Timeout (sec) for IPMI operation -- -- -- -- -- Ciphersuite to use (same as ipmitool -C parameter) -- -- -- -- -- Method to fence (onoff or cycle) -- -- -- -- -- Wait X seconds after on/off operation -- -- -- -- -- Wait X seconds before fencing is started -- -- -- -- -- Verbose mode -- -- -- -- -- -- -- -- -- -- -- -- -- ------ --endif::[] - --from which we would create a STONITH resource fragment that might look --like this -+Step 5: `pcs cluster cib stonith_cfg` - --.Sample STONITH Resource --ifdef::pcs[] -+Step 6: Here are example parameters for creating our STONITH resource: - ---- --# pcs cluster cib stonith_cfg --# pcs -f stonith_cfg stonith create impi-fencing fence_ipmilan \ -+# pcs -f stonith_cfg stonith create ipmi-fencing fence_ipmilan \ - pcmk_host_list="pcmk-1 pcmk-2" ipaddr=10.0.0.1 login=testuser \ - passwd=acd123 op monitor interval=60s ------ --[source,C] ------ - # pcs -f stonith_cfg stonith -- impi-fencing (stonith:fence_ipmilan) Stopped -+ ipmi-fencing (stonith:fence_ipmilan): Stopped - ---- --endif::[] -- --ifdef::crmsh[] --[source,C] ------ --# crm crm(live)# cib new stonith --INFO: stonith shadow CIB created --crm(stonith)# configure primitive impi-fencing stonith::fence_ipmilan \ -- params pcmk_host_list="pcmk-1 pcmk-2" ipaddr=10.0.0.1 login=testuser passwd=abc123 \ -- op monitor interval="60s" ------ --endif::[] -- --And finally, since we 
disabled it earlier, we need to re-enable STONITH. --At this point we should have the following configuration. - --ifdef::pcs[] --[source,C] -+Steps 7-10: Enable STONITH in the cluster: - ---- - # pcs -f stonith_cfg property set stonith-enabled=true - # pcs -f stonith_cfg property --dc-version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 --cluster-infrastructure: corosync --no-quorum-policy: ignore --stonith-enabled: true -+Cluster Properties: -+ cluster-infrastructure: corosync -+ cluster-name: mycluster -+ dc-version: 1.1.12-a9c8177 -+ have-watchdog: false -+ stonith-enabled: true - ---- --endif::[] - --Now push the configuration into the cluster. -- --ifdef::pcs[] --[source,C] ------ --# pcs cluster cib-push stonith_cfg ------ --endif::[] -- --ifdef::crmsh[] --[source,C] ------ --crm(stonith)# configure property stonith-enabled="true" --crm(stonith)# configure shownode pcmk-1 --node pcmk-2 --primitive WebData ocf:linbit:drbd \ -- params drbd_resource="wwwdata" \ -- op monitor interval="60s" --primitive WebFS ocf:heartbeat:Filesystem \ -- params device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" fstype="gfs2" --primitive WebSite ocf:heartbeat:apache \ -- params configfile="/etc/httpd/conf/httpd.conf" \ -- op monitor interval="1min" --primitive ClusterIP ocf:heartbeat:IPaddr2 \ -- params ip="192.168.122.101" cidr_netmask="32" clusterip_hash="sourceip" \ -- op monitor interval="30s"primitive ipmi-fencing stonith::fence_ipmilan \ params pcmk_host_list="pcmk-1 pcmk-2" ipaddr=10.0.0.1 login=testuser passwd=abc123 \ op monitor interval="60s"ms WebDataClone WebData \ -- meta master-max="2" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" --clone WebFSClone WebFS --clone WebIP ClusterIP \ -- meta globally-unique="true" clone-max="2" clone-node-max="2" --clone WebSiteClone WebSite --colocation WebSite-with-WebFS inf: WebSiteClone WebFSClone --colocation fs_on_drbd inf: WebFSClone WebDataClone:Master --colocation website-with-ip inf: 
WebSiteClone WebIP --order WebFS-after-WebData inf: WebDataClone:promote WebFSClone:start --order WebSite-after-WebFS inf: WebFSClone WebSiteClone --order apache-after-ip inf: WebIP WebSiteClone --property $id="cib-bootstrap-options" \ -- dc-version="1.1.5-bdd89e69ba545404d02445be1f3d72e6a203ba2f" \ -- cluster-infrastructure="openais" \ -- expected-quorum-votes="2" \ -- stonith-enabled="true" \ -- no-quorum-policy="ignore" --rsc_defaults $id="rsc-options" \ -- resource-stickiness="100" --crm(stonith)# cib commit stonithINFO: commited 'stonith' shadow CIB to the cluster --crm(stonith)# quit --bye ------ --endif::[] -+Step 11: `pcs cluster cib-push stonith_cfg` -diff --git a/doc/Clusters_from_Scratch/en-US/Ch-Tools.txt b/doc/Clusters_from_Scratch/en-US/Ch-Tools.txt -index 04de80c..f3bcd8c 100644 ---- a/doc/Clusters_from_Scratch/en-US/Ch-Tools.txt -+++ b/doc/Clusters_from_Scratch/en-US/Ch-Tools.txt -@@ -1,6 +1,6 @@ - = Pacemaker Tools = - --== Using Pacemaker Tools == -+== Simplify administration using a cluster shell == - - In the dark past, configuring Pacemaker required the administrator to - read and write XML. In true UNIX style, there were also a number of -@@ -12,16 +12,14 @@ command-line shells (and GUIs) that hide all the messy XML - scaffolding. - - These shells take all the individual aspects required for managing and --configuring a cluster, and packs them into one simple to use command -+configuring a cluster, and pack them into one simple-to-use command - line tool. - - They even allow you to queue up several changes at once and commit - them atomically. - - There are currently two command-line shells that people use, `pcs` and --`crmsh`. This edition of Clusters from Scratch is based on --+{cli_name}+. Start by taking some time to familiarize yourself with --what it can do. -+`crmsh`. This edition of Clusters from Scratch is based on `pcs`.
- - [NOTE] - =========== -@@ -30,19 +28,21 @@ does differ, so make sure you read the version of this guide that - corresponds to the software installed on your system. - =========== - --ifdef::pcs[] -- - [IMPORTANT] - =========== - Since `pcs` has the ability to manage all aspects of the cluster (both - corosync and pacemaker), it requires a specific cluster stack to be in --use, (corosync 2.0 with votequorum + Pacemaker version >= 1.1.8). -+use: corosync 2.0 or later with votequorum plus Pacemaker 1.1.8 or later. - =========== - --[source,C] --# pcs -+== Explore pcs == -+ -+Start by taking some time to familiarize yourself with -+what `pcs` can do. - --..... -+---- -+[root@pcmk-1 ~]# pcs -+Usage: pcs [-f file] [-h] [commands]... - Control and configure pacemaker and corosync. - - Options: -@@ -59,18 +59,16 @@ Commands: - property Set pacemaker properties - status View cluster status - config Print full cluster configuration --..... -+---- - --As you can see, the different aspects of cluster management are broken --up into categories: resource, cluster, stonith, property, constraint, -+As you can see, the different aspects of cluster management are separated -+into categories: resource, cluster, stonith, property, constraint, - and status. To discover the functionality available in each of these --categories, one can issue the command 'pcs help'. Below -+categories, one can issue the command +pcs pass:[category] help+. Below - is an example of all the options available under the status category. - --[source,C] --# pcs status help -- --..... -+---- -+[root@pcmk-1 ~]# pcs status help - Usage: pcs status [commands]... - View current cluster and resource status - Commands: -@@ -100,64 +98,22 @@ Commands: - - xml - View xml version of status (output from crm_mon -r -1 -X) -+---- - --..... 
-- --Additionally, if you are interested in the Pacemaker version and --supported cluster stack(s) available with your current Pacemaker --installation, the pacemakerd --features option is available to you. -- -- --[source,C] --# pacemakerd --features -- -------------------- --sys::[pacemakerd --features] -------------------- -- --[NOTE] --====== --If the SNMP and/or email options are not listed, then Pacemaker was not --built to support them. This may be by the choice of your distribution or --the required libraries may not have been available. Please contact --whoever supplied you with the packages for more details. --====== -- --endif::[] -- --ifdef::crmsh[] --pass:[# crm --help] -- --The primary tool for monitoring the status of the cluster is crm_mon --(also available as crm status). It can be run in a variety of modes --and has a number of output options. To find out about any of the tools --that come with Pacemaker, simply invoke them with the --help option or --consult the included man pages. Both sets of output are created from --the tool, and so will always be in sync with each other and the tool --itself. -- --Additionally, the Pacemaker version and supported cluster stack(s) are --available via the --feature option to pacemakerd. -- --[source,C] --# pacemakerd --features -- -------------------- --sys::[pacemakerd --features] -------------------- -- --[source,C] --# crm_mon --help -+Additionally, if you are interested in the version and -+supported cluster stack(s) available with your Pacemaker -+installation, run: - -------------------- --sys::[crm_mon --help] -------------------- -+---- -+[root@pcmk-1 ~]# pacemakerd --features -+Pacemaker 1.1.12 (Build: a9c8177) -+ Supporting v3.0.9: generated-manpages agent-manpages ascii-docs publican-docs ncurses libqb-logging libqb-ipc upstart systemd nagios corosync-native atomic-attrd acls -+---- - - [NOTE] - ====== - If the SNMP and/or email options are not listed, then Pacemaker was not --built to support them. 
This may be by the choice of your distribution or -+built to support them. This may be by the choice of your distribution, or - the required libraries may not have been available. Please contact - whoever supplied you with the packages for more details. - ====== --endif::[] -diff --git a/doc/Clusters_from_Scratch/en-US/Ch-Verification.txt b/doc/Clusters_from_Scratch/en-US/Ch-Verification.txt -index 530e37b..4961763 100644 ---- a/doc/Clusters_from_Scratch/en-US/Ch-Verification.txt -+++ b/doc/Clusters_from_Scratch/en-US/Ch-Verification.txt -@@ -1,27 +1,26 @@ --= Verify Cluster Installation = -+= Start and Verify Cluster = - --ifdef::pcs[] - == Start the Cluster == - - Now that corosync is configured, it is time to start the cluster. - The command below will start corosync and pacemaker on both nodes - in the cluster. If you are issuing the start command from a different --node than the one you ran the 'pcs cluster auth' command on earlier, you --must authenticate on current node you are logged into before you will -+node than the one you ran the `pcs cluster auth` command on earlier, you -+must authenticate on the current node you are logged into before you will - be allowed to start the cluster. - --[source,C] - ---- --# pcs cluster start --all -+[root@pcmk-1 ~]# pcs cluster start --all - pcmk-1: Starting Cluster... - pcmk-2: Starting Cluster... - ---- - --An alternative to using the 'pcs cluster startall' command --is to issue either of the below commands on each node in the --cluster by hand. -+[NOTE] -+====== -+An alternative to using the `pcs cluster start --all` command -+is to issue either of the below command sequences on each node in the -+cluster separately: - --[source,C] - ---- - # pcs cluster start - Starting Cluster... -@@ -29,65 +28,46 @@ Starting Cluster... 
- - or - --[source,C] - ---- - # systemctl start corosync.service - # systemctl start pacemaker.service - ---- -+====== - --endif::[] -+[IMPORTANT] -+==== -+In this example, we are not enabling the corosync and pacemaker services -+to start at boot. If a cluster node fails or is rebooted, you will need to run -++pcs cluster start pass:[nodename]+ (or `--all`) to start the cluster on it. -+While you could enable the services to start at boot, requiring a manual -+start of cluster services gives you the opportunity to do a post-mortem investigation -+of a node failure before returning it to the cluster. -+==== - - == Verify Corosync Installation == - --ifdef::crmsh[] --Start Corosync on the first node -+First, use `corosync-cfgtool` to check whether cluster communication is happy: - --[source,C] - ---- --# systemctl start corosync.service ------ --endif::[] -- --The first thing to check is if cluster communication is happy, for --that we use `corosync-cfgtool`. -- --ifdef::crmsh[] --[source,C] ------ --# corosync-cfgtool -s --Printing ring status. --Local node ID 1702537408 --RING ID 0 -- id = 192.168.122.101 -- status = ring 0 active with no faults ------ --endif::[] -- --ifdef::pcs[] --[source,C] ------ --# corosync-cfgtool -s -+[root@pcmk-1 ~]# corosync-cfgtool -s - Printing ring status. - Local node ID 1 - RING ID 0 - id = 192.168.122.101 - status = ring 0 active with no faults - ---- --endif::[] - - We can see here that everything appears normal with our fixed IP --address, not a 127.0.0.x loopback address, listed as the +id+ and +no --faults+ for the status. -+address (not a 127.0.0.x loopback address) listed as the *id*, and *no -+faults* for the status. - - If you see something different, you might want to start by checking - the node's network, firewall and selinux configurations. 
- --Next we check the membership and quorum APIs: -+Next, check the membership and quorum APIs: - --ifdef::pcs[] --[source,C] - ---- --# corosync-cmapctl | grep members -+[root@pcmk-1 ~]# corosync-cmapctl | grep members - runtime.totem.pg.mrp.srp.members.1.ip (str) = r(0) ip(192.168.122.101) - runtime.totem.pg.mrp.srp.members.1.join_count (u32) = 1 - runtime.totem.pg.mrp.srp.members.1.status (str) = joined -@@ -95,7 +75,7 @@ runtime.totem.pg.mrp.srp.members.2.ip (str) = r(0) ip(192.168.122.102) - runtime.totem.pg.mrp.srp.members.2.join_count (u32) = 1 - runtime.totem.pg.mrp.srp.members.2.status (str) = joined - --# pcs status corosync -+[root@pcmk-1 ~]# pcs status corosync - Membership information - -------------------------- - Nodeid Votes Name -@@ -104,81 +84,15 @@ Membership information - ---- - - You should see both nodes have joined the cluster. --endif::[] -- --ifdef::crmsh[] --[source,C] ------ --# corosync-cmapctl | grep members --runtime.totem.pg.mrp.srp.members.1702537408.ip (str) = r(0) ip(192.168.122.101) --runtime.totem.pg.mrp.srp.members.1702537408.join_count (u32) = 1 --runtime.totem.pg.mrp.srp.members.1702537408.status (str) = joined -- --# corosync-quorumtool -l --Membership information -- -------------------------- -- Nodeid Votes Name --1702537408 1 pcmk-1 ------ -- --The node see's itself in both locations which is a good sign. -- --If the node list is empty when you call `corosync-quorumtool`, then --you've not correctly quorum in 'corosync.conf'. -- --With everything looking healthy, we start Corosync on the second node --and run the same communications check. -- --[source,C] ------ --# ssh pcmk-2 -- systemctl start corosync.service --# ssh pcmk-2 -- corosync-cfgtool -s --Printing ring status. --Local node ID 1719314624 --RING ID 0 -- id = 192.168.122.102 -- status = ring 0 active with no faults ------ -- --Everything appears to look ok from +pcmk-2+, time to re-run the --membership and quorum checks to see if it shows up there too. 
-- --Again, if you see something different to the above, check for the --usual suspects: network, firewall and selinux. -- --[source,C] ------ --# corosync-cmapctl | grep members --runtime.totem.pg.mrp.srp.members.1702537408.ip (str) = r(0) ip(192.168.122.101) --runtime.totem.pg.mrp.srp.members.1702537408.join_count (u32) = 1 --runtime.totem.pg.mrp.srp.members.1702537408.status (str) = joined --runtime.totem.pg.mrp.srp.members.1719314624.ip (str) = r(0) ip(192.168.122.102) --runtime.totem.pg.mrp.srp.members.1719314624.join_count (u32) = 1 --runtime.totem.pg.mrp.srp.members.1719314624.status (str) = joined -- --# corosync-quorumtool -l -- --Membership information -- -------------------------- -- Nodeid Votes Name --1702537408 1 pcmk-1 --1719314624 1 pcmk-2 ------ --endif::[] -- --All good! - - == Verify Pacemaker Installation == - -- --ifdef::pcs[] --Now that we have confirmed that Corosync is functional we can check -+Now that we have confirmed that Corosync is functional, we can check - the rest of the stack. Pacemaker has already been started, so verify --the necessary processes are running. -+the necessary processes are running: - --[source,C] - ---- --# ps axf -+[root@pcmk-1 ~]# ps axf - PID TTY STAT TIME COMMAND - 2 ? S 0:00 [kthreadd] - ...lots of processes... -@@ -192,102 +106,49 @@ the necessary processes are running. - 28053 ? Ss 0:00 \_ /usr/libexec/pacemaker/crmd - ---- - --If that looks ok, check the pcs status output. 
-+If that looks OK, check the `pcs status` output: - --[source,C] - ---- --# pcs status --Last updated: Fri Sep 14 09:52:25 2012 --Last change: Fri Sep 14 09:51:55 2012 via crmd on pcmk-2 -+[root@pcmk-1 ~]# pcs status -+Cluster name: mycluster -+WARNING: no stonith devices and stonith-enabled is not false -+Last updated: Tue Dec 16 16:15:29 2014 -+Last change: Tue Dec 16 15:49:47 2014 - Stack: corosync - Current DC: pcmk-2 (2) - partition with quorum --Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 --2 Nodes configured, unknown expected votes --0 Resources configured. -+Version: 1.1.12-a9c8177 -+2 Nodes configured -+0 Resources configured -+ - - Online: [ pcmk-1 pcmk-2 ] - - Full list of resources: ------ -- --Next, check for any ERRORs during startup - there shouldn't be any. -- --[source,C] ------ --# grep -i error /var/log/messages ------ -- --Repeat these checks on the other node. The results should be the same. -- --endif::[] -- --ifdef::crmsh[] --Now that we have confirmed that Corosync is functional we can check --the rest of the stack. Start Pacemaker and check the necessary --processes have been started. -- --[source,C] ------ --# systemctl start pacemaker.service --# ps axf -- PID TTY STAT TIME COMMAND -- 2 ? S 0:00 [kthreadd] --...lots of processes... --28019 ? Ssl 0:03 /usr/sbin/corosync --28047 ? Ss 0:00 /usr/sbin/pacemakerd -f --28048 ? Ss 0:00 \_ /usr/libexec/pacemaker/cib --28049 ? Ss 0:00 \_ /usr/libexec/pacemaker/stonithd --28050 ? Ss 0:00 \_ /usr/lib64/heartbeat/lrmd --28051 ? Ss 0:00 \_ /usr/libexec/pacemaker/attrd --28052 ? Ss 0:00 \_ /usr/libexec/pacemaker/pengine --28053 ? Ss 0:00 \_ /usr/libexec/pacemaker/crmd ------ - --If that looks ok, check the logs and crm_mon. 
- --[source,C] ------ --# grep pacemakerd /var/log/messages | grep -e get_cluster_type -e read_config --Apr 3 09:19:32 pcmk-1 pacemakerd[28047]: info: get_cluster_type: Detected an active 'corosync' cluster --Apr 3 09:19:32 pcmk-1 pacemakerd[28047]: info: read_config: Reading configure for stack: corosync --# crm_mon -1 --============ --Last updated: Tue Apr 3 09:21:37 2012 --Last change: Tue Apr 3 09:19:54 2012 via crmd on pcmk-1 --Stack: corosync --Current DC: pcmk-1 (1702537408) - partition with quorum --Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff --1 Nodes configured, unknown expected votes --0 Resources configured. --============ -+PCSD Status: -+ pcmk-1: Online -+ pcmk-2: Online - --Online: [ pcmk-1 ] -+Daemon Status: -+ corosync: active/disabled -+ pacemaker: active/disabled -+ pcsd: active/enabled - ---- - --Next, check for any ERRORs during startup - there shouldn't be any. -- --[source,C] -+Finally, ensure there are no startup errors (aside from messages relating -+to not having STONITH configured, which are OK at this point): - ---- --# grep -i error /var/log/messages -+[root@pcmk-1 ~]# journalctl | grep -i error - ---- - --Repeat on the other node and display the cluster's status. -- --[source,C] -+[NOTE] -+====== -+Other operating systems will report startup errors in other locations. -+For example, on Fedora 19 and earlier, the command would be: - ---- --# ssh pcmk-2 -- systemctl start pacemaker.service --# crm_mon -1 --============ --Last updated: Tue Apr 3 09:26:23 2012 --Last change: Tue Apr 3 09:26:21 2012 via crmd on pcmk-1 --Stack: corosync --Current DC: pcmk-1 (1702537408) - partition with quorum --Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff --2 Nodes configured, unknown expected votes --0 Resources configured. --============ -- --Online: [ pcmk-1 pcmk-2 ] -+[root@pcmk-1 ~]# grep -i error /var/log/messages - ---- -+====== - --endif::[] -+Repeat these checks on the other node. The results should be the same. 
-diff --git a/doc/Clusters_from_Scratch/en-US/Clusters_from_Scratch.ent b/doc/Clusters_from_Scratch/en-US/Clusters_from_Scratch.ent -index ecc8c13..5a675eb 100644 ---- a/doc/Clusters_from_Scratch/en-US/Clusters_from_Scratch.ent -+++ b/doc/Clusters_from_Scratch/en-US/Clusters_from_Scratch.ent -@@ -1,6 +1,6 @@ - - -- -+ - - -- -+ -diff --git a/doc/Clusters_from_Scratch/en-US/Clusters_from_Scratch.xml b/doc/Clusters_from_Scratch/en-US/Clusters_from_Scratch.xml -index a9b2936..4586c9e 100644 ---- a/doc/Clusters_from_Scratch/en-US/Clusters_from_Scratch.xml -+++ b/doc/Clusters_from_Scratch/en-US/Clusters_from_Scratch.xml -@@ -13,8 +13,8 @@ - - - -- - -+ - - - -diff --git a/doc/Clusters_from_Scratch/en-US/Preface.xml b/doc/Clusters_from_Scratch/en-US/Preface.xml -index edea678..678b160 100644 ---- a/doc/Clusters_from_Scratch/en-US/Preface.xml -+++ b/doc/Clusters_from_Scratch/en-US/Preface.xml -@@ -3,11 +3,11 @@ - - %BOOK_ENTITIES; - ]> -- -- Preface -- -- -- -- -+Preface -+ -+ -+ -+ -+ -+ - -- -diff --git a/doc/Clusters_from_Scratch/en-US/Revision_History.xml b/doc/Clusters_from_Scratch/en-US/Revision_History.xml -index 19dd319..03d367e 100644 ---- a/doc/Clusters_from_Scratch/en-US/Revision_History.xml -+++ b/doc/Clusters_from_Scratch/en-US/Revision_History.xml -@@ -4,6 +4,7 @@ - %BOOK_ENTITIES; - ]> - -+ - Revision History - - -@@ -49,6 +50,18 @@ - DavidVosseldvossel@redhat.com - Updated for pcs - -+ -+ 8-0 -+ Mon Jan 05 2015 -+ KenGaillotkgaillot@redhat.com -+ Updated for Fedora 21 -+ -+ -+ 8-1 -+ Thu Jan 08 2015 -+ KenGaillotkgaillot@redhat.com -+ Minor corrections, plus use include file for intro -+ - - - -diff --git a/doc/Clusters_from_Scratch/ro-RO/Ch-Apache.po b/doc/Clusters_from_Scratch/ro-RO/Ch-Apache.po -index 6032e87..228390d 100644 ---- a/doc/Clusters_from_Scratch/ro-RO/Ch-Apache.po -+++ b/doc/Clusters_from_Scratch/ro-RO/Ch-Apache.po -@@ -205,7 +205,7 @@ msgstr "Actualizarea Configurației" - #. 
Tag: para - #, fuzzy, no-c-format - msgid "At this point, Apache is ready to go, all that needs to be done is to add it to the cluster. Lets call the resource WebSite. We need to use an OCF script called apache in the heartbeat namespace Compare the key used here ocf:heartbeat:apache with the one we used earlier for the IP address: ocf:heartbeat:IPaddr2 , the only required parameter is the path to the main Apache configuration file and we’ll tell the cluster to check once a minute that apache is still running." --msgstr "La acest punct, Apache este gata de pornire, tot ce trebuie făcut este să îl adăugăm la cluster. Să denumim resursa WebSite. Avem nevoie să folosim un script OCF numit apache din namespace-ul Comparați cheia folosită aici, ocf:heartbeart:apache, cu cea folosită anterior pentru adresa IP: ocf:heartbeat:IPaddr2 heartbeat, singurul parametru necesar este calea către fișierul principal de configurare al Apache și vom spune clusterului să verifice o dată pe minut că apache încă funcționează." -+msgstr "La acest punct, Apache este gata de pornire, tot ce trebuie făcut este să îl adăugăm la cluster. Să denumim resursa WebSite. Avem nevoie să folosim un script OCF numit apache din namespace-ul Comparați cheia folosită aici, ocf:heartbeat:apache, cu cea folosită anterior pentru adresa IP: ocf:heartbeat:IPaddr2 heartbeat, singurul parametru necesar este calea către fișierul principal de configurare al Apache și vom spune clusterului să verifice o dată pe minut că apache încă funcționează." - - #. Tag: screen - #, no-c-format -diff --git a/doc/Clusters_from_Scratch/zh-CN/Ch-Active-Passive.po b/doc/Clusters_from_Scratch/zh-CN/Ch-Active-Passive.po -index f9cc723..daefc41 100644 ---- a/doc/Clusters_from_Scratch/zh-CN/Ch-Active-Passive.po -+++ b/doc/Clusters_from_Scratch/zh-CN/Ch-Active-Passive.po -@@ -379,7 +379,7 @@ msgstr "当有半数以上的节点在线时,这个集群就认为自己拥有 - #. 
Tag: literallayout - #, no-c-format - msgid "total_nodes < 2 * active_nodes" --msgstr "总节点数 < 2 * 活跃节点数" -+msgstr "" - - #. Tag: para - #, no-c-format -diff --git a/doc/Makefile.am b/doc/Makefile.am -index 8798365..eae7729 100644 ---- a/doc/Makefile.am -+++ b/doc/Makefile.am -@@ -25,10 +25,14 @@ ascii = crm_fencing.txt acls.txt - docbook = Pacemaker_Explained Clusters_from_Scratch Pacemaker_Remote - doc_DATA = README.hb2openais $(ascii) $(generated_docs) - -+# rsync destination for www targets -+RSYNC_DEST = root@www.clusterlabs.org:/var/www/html/doc/ -+ - publican_docs = - generated_docs = - generated_mans = - -+ASCIIDOC_CLI_TYPE := pcs - DOCBOOK_FORMATS := html-desktop - DOCBOOK_LANGS := en-US - DOTs = $(wildcard */en-US/images/*.dot) -@@ -70,24 +74,37 @@ EXTRA_DIST = $(docbook:%=%.xml) - $(AM_V_ASCII)$(ASCIIDOC) --unsafe --backend=xhtml11 $< - - -+SHARED_TXT=$(wildcard shared/en-US/*.txt) -+SHARED_XML=$(SHARED_TXT:%.txt=%.xml) -+ -+ -+CFS_SHARED_TXT=$(addprefix shared/en-US/,pacemaker-intro.txt) -+CFS_SHARED_XML=$(CFS_SHARED_TXT:%.txt=%.xml) - CFS_TXT=$(wildcard Clusters_from_Scratch/en-US/*.txt) - CFS_XML=$(CFS_TXT:%.txt=%.xml) - -+$(CFS_XML): $(CFS_SHARED_XML) -+ - # We have to hardcode the book name - # With '%' the test for 'newness' fails --Clusters_from_Scratch.build: $(PNGS) $(wildcard Clusters_from_Scratch/en-US/*.xml) $(CFS_XML) -+Clusters_from_Scratch.build: $(PNGS) $(wildcard Clusters_from_Scratch/en-US/*.xml) $(CFS_XML) $(CFS_SHARED_XML) - $(PCMK_V) @echo Building $(@:%.build=%) because of $? 
- rm -rf $(@:%.build=%)/publish/* - $(AM_V_PUB)cd $(@:%.build=%) && RPM_BUILD_DIR="" $(PUBLICAN) build --publish --langs=$(DOCBOOK_LANGS) --formats=$(DOCBOOK_FORMATS) $(PCMK_quiet) - rm -rf $(@:%.build=%)/tmp - touch $@ - -+ -+PE_SHARED_TXT=$(addprefix shared/en-US/,pacemaker-intro.txt) -+PE_SHARED_XML=$(PE_SHARED_TXT:%.txt=%.xml) - PE_TXT=$(wildcard Pacemaker_Explained/en-US/*.txt) - PE_XML=$(PE_TXT:%.txt=%.xml) - -+$(PE_XML): $(PE_SHARED_XML) -+ - # We have to hardcode the book name - # With '%' the test for 'newness' fails --Pacemaker_Explained.build: $(PNGS) $(wildcard Pacemaker_Explained/en-US/*.xml) $(PE_XML) -+Pacemaker_Explained.build: $(PNGS) $(wildcard Pacemaker_Explained/en-US/*.xml) $(PE_XML) $(PE_SHARED_XML) - $(PCMK_V) @echo Building $(@:%.build=%) because of $? - rm -rf $(@:%.build=%)/publish/* - $(AM_V_PUB)cd $(@:%.build=%) && RPM_BUILD_DIR="" $(PUBLICAN) build --publish --langs=$(DOCBOOK_LANGS) --formats=$(DOCBOOK_FORMATS) $(PCMK_quiet) -@@ -126,7 +143,6 @@ docbook_build = $(docbook:%=%.build) - - all-local: $(docbook_build) */publican.cfg - --#install-data-local: all-local - install-data-local: all-local - for book in $(docbook); do \ - filelist=`find $$book/publish/* -print`; \ -@@ -153,18 +169,11 @@ brand: $(BRAND_PNGS) $(wildcard publican-clusterlabs/en-US/*.xml) - pdf: - make DOCBOOK_FORMATS="pdf" ASCIIDOC_CLI_TYPE=$(ASCIIDOC_CLI_TYPE) all-local - --# Make sure www-(pcs|crmsh) happen in serial --www: -- make www-pcs -- make www-crmsh -- make $(generated_docs) $(ascii) -- rsync -rtz --progress $(generated_docs) $(ascii) $(asciiman) root@www.clusterlabs.org:/var/www/html/doc/ -+www: clean-local $(generated_docs) $(ascii) -+ make www-cli -+ rsync -rtz --progress $(generated_docs) $(ascii) $(asciiman) $(RSYNC_DEST) - --www-crmsh: -- make ASCIIDOC_CLI_TYPE=crmsh clean-local www-cli -- --www-pcs: -- make ASCIIDOC_CLI_TYPE=pcs clean-local www-cli -+www-pcs: www-cli - - www-cli: - for book in $(docbook); do \ -@@ -181,14 +190,12 @@ if BUILD_DOCBOOK - 
mv $$book/publish/$$lang/Pacemaker/$(PACKAGE_SERIES)-$(ASCIIDOC_CLI_TYPE)/epub/$$book/Pacemaker-1.1{-$(ASCIIDOC_CLI_TYPE),}-$$book-$$lang.epub; \ - mv $$book/publish/$$lang/Pacemaker/$(PACKAGE_SERIES)-$(ASCIIDOC_CLI_TYPE)/pdf/$$book/Pacemaker-1.1{-$(ASCIIDOC_CLI_TYPE),}-$$book-$$lang.pdf; \ - done; \ -- rsync -rtz --progress $$book/publish/* root@www.clusterlabs.org:/var/www/html/doc/; \ -+ rsync -rtz --progress $$book/publish/* $(RSYNC_DEST); \ - sed -i.sed 's@version:.*@version: $(PACKAGE_SERIES)@' $$book/publican.cfg; \ - done - endif - - clean-local: -- -rm -rf $(generated_docs) $(generated_mans) $(docbook_build) $(CFS_XML) $(PE_XML) $(PR_XML) -+ -rm -rf $(generated_docs) $(generated_mans) $(docbook_build) -+ -rm -rf $(SHARED_XML) $(CFS_XML) $(PE_XML) $(PR_XML) - for book in $(docbook); do rm -rf $$book/tmp $$book/publish; done -- --foo: -- rm -f $(CFS_XML) -diff --git a/doc/Pacemaker_Explained/en-US/Ap-FAQ.txt b/doc/Pacemaker_Explained/en-US/Ap-FAQ.txt -new file mode 100644 -index 0000000..954df04 ---- /dev/null -+++ b/doc/Pacemaker_Explained/en-US/Ap-FAQ.txt -@@ -0,0 +1,74 @@ -+[appendix] -+ -+[[ap-faq]] -+== FAQ == -+ -+ -+[qanda] -+Why is the Project Called Pacemaker?:: -+ indexterm:[Pacemaker] -+ First of all, the reason it's not called the CRM is because of the abundance -+ of terms footnote:[http://en.wikipedia.org/wiki/CRM] that are commonly -+ abbreviated to those three letters. The Pacemaker name came from Kham, -+ footnote:[http://khamsouk.souvanlasy.com/] a good friend of Pacemaker -+ developer Andrew Beekhof's, and was originally used by a Java GUI that Beekhof -+ was prototyping in early 2007. Alas, other commitments prevented the GUI from -+ progressing much and, when it came time to choose a name for this project, -+ Lars Marowsky-Bree suggested it was an even better fit for an independent CRM. -+ The idea stems from the analogy between the role of this software and that of -+ the little device that keeps the human heart pumping. 
Pacemaker monitors the -+ cluster and intervenes when necessary to ensure the smooth operation of the -+ services it provides. -+ There were a number of other names (and acronyms) tossed around, but suffice to -+ say "Pacemaker" was the best. -+ -+Why was the Pacemaker Project Created?:: -+ -+ The decision was made to spin off the CRM into its own project after the 2.1.3 -+ Heartbeat release in order to: -+ -+ * support both the Corosync and Heartbeat cluster stacks equally -+ * decouple the release cycles of two projects at very different stages of their life-cycles -+ * foster clearer package boundaries, thus leading to better and more stable interfaces -+ -+What Messaging Layers are Supported?:: -+ indexterm:[Messaging Layers] -+ -+ * http://www.corosync.org/[Corosync] -+ * http://linux-ha.org/[Heartbeat] -+ -+Can I Choose Which Messaging Layer to Use at Run Time?:: -+ -+ Yes. The CRM will automatically detect which messaging layer started it and behave accordingly. -+ -+Can I Have a Mixed Heartbeat-Corosync Cluster?:: -+ -+ No. -+ -+[[q-messaging-layer]] Which Messaging Layer Should I Choose?:: -+ indexterm:[Cluster,Choosing Between Heartbeat and Corosync] -+ indexterm:[Cluster Stack,Corosync] indexterm:[Corosync] -+ indexterm:[Cluster Stack,Heartbeat] indexterm:[Heartbeat] -+ You can choose from multiple messaging layers, including -+ heartbeat, corosync 1 (with or without CMAN), and corosync 2. -+ Corosync 2 is the current state of the art due to its -+ more advanced features and better support for pacemaker, -+ but often the best choice is to use whatever comes with -+ your Linux distribution, and follow the distribution's -+ setup instructions. -+ -+Where Can I Get Pre-built Packages?:: -+ -+ Most major Linux distributions have pacemaker packages in their standard -+ package repositories. See the http://clusterlabs.org/wiki/Install[Install wiki -+ page] for details.
-+ -+What Versions of Pacemaker Are Supported?:: -+ -+ Some Linux distributions (such as Red Hat Enterprise Linux and SUSE Linux -+ Enterprise) offer technical support for their customers; contact them -+ for details of such support. -+ For help within the community (mailing lists, IRC, etc.) from Pacemaker developers -+ and users, refer to the http://clusterlabs.org/wiki/Releases[Releases wiki page] -+ for an up-to-date list of versions considered to be supported by the project. -+ When seeking assistance, please try to ensure you have one of these versions. -diff --git a/doc/Pacemaker_Explained/en-US/Ap-FAQ.xml b/doc/Pacemaker_Explained/en-US/Ap-FAQ.xml -deleted file mode 100644 -index 9e6d69f..0000000 ---- a/doc/Pacemaker_Explained/en-US/Ap-FAQ.xml -+++ /dev/null -@@ -1,95 +0,0 @@ -- -- FAQ -- -- -- History -- -- -- Why is the Project Called PacemakernamingPacemaker? -- -- -- First of all, the reason its not called the CRM is because of the abundance of terms that are commonly abbreviated to those three letters. -- -- The Pacemaker name came from Kham, a good friend of mine, and was originally used by a Java GUI that I was prototyping in early 2007. -- Alas other commitments have prevented the GUI from progressing much and, when it came time to choose a name for this project, Lars suggested it was an even better fit for an independent CRM. -- -- -- The idea stems from the analogy between the role of this software and that of the little device that keeps the human heart pumping. -- Pacemaker monitors the cluster and intervenes when necessary to ensure the smooth operation of the services it provides. -- -- There were a number of other names (and acronyms) tossed around, but suffice to say "Pacemaker" was the best -- -- -- -- -- Why was the Pacemaker Project Created? 
-- -- -- The decision was made to spin-off the CRM into its own project after the 2.1.3 Heartbeat release in order to -- -- support both the Corosync and Heartbeat cluster stacks equally -- decouple the release cycles of two projects at very different stages of their life-cycles -- foster the clearer package boundaries, thus leading to -- better and more stable interfaces -- -- -- -- -- -- Setup -- -- -- What Messaging Layers Messaging Layers are Supported? -- -- -- -- Corosync () -- Heartbeat () -- -- -- -- -- -- Can I Choose which Messaging Layer to use at Run Time? -- -- -- Yes. The CRM will automatically detect which started it and behave accordingly. -- -- -- -- -- Can I Have a Mixed Heartbeat-Corosync Cluster? -- -- -- No. -- -- -- -- -- Which Messaging Layer Should I Choose? -- -- -- This is discussed in . -- -- -- -- -- Where Can I Get Pre-built Packages? -- -- -- Official packages for most major .rpm and based distributions are available from the ClusterLabs Website. -- For Debian packages, building from source and details on using the above repositories, see our installation page. -- -- -- -- -- What Versions of Pacemaker Are Supported? -- -- -- Please refer to the Releases page for an up-to-date list of versions supported directly by the project. -- When seeking assistance, please try to ensure you have one of these versions. -- -- -- -- -- -diff --git a/doc/Pacemaker_Explained/en-US/Ap-Install.txt b/doc/Pacemaker_Explained/en-US/Ap-Install.txt -index 7eb587b..c34cb14 100644 ---- a/doc/Pacemaker_Explained/en-US/Ap-Install.txt -+++ b/doc/Pacemaker_Explained/en-US/Ap-Install.txt -@@ -1,43 +1,118 @@ - [appendix] - --[[ap-install]] --== Installation == -- --[WARNING] --The following text may no longer be accurate in some places. 
-- --=== Choosing a Cluster Stack === --indexterm:[Cluster,Choosing Between Heartbeat and Corosync] --indexterm:[Cluster Stack,Corosync] indexterm:[Corosync] --indexterm:[Cluster Stack,Heartbeat] indexterm:[Heartbeat] -- --Ultimately the choice of cluster stack is a personal decision that --must be made in the context of you or your company's needs and --strategic direction. Pacemaker currently functions equally well with --both stacks. -- --Here are some factors that may influence the decision: -- --* SUSE/Novell, Red Hat and Oracle are all putting their collective -- weight behind the Corosync cluster stack. --* Using Corosync gives your applications access to the following -- additional cluster services --** distributed locking service --** extended virtual synchronization service --** cluster closed process group service --* It is likely that Pacemaker, at some point in the future, will make -- use of some of these additional services not provided by Heartbeat -- -+== Installing == -+ -+=== Installing the Software === -+ -+Most major Linux distributions have pacemaker packages in their standard -+package repositories, or the software can be built from source code. -+See the http://clusterlabs.org/wiki/Install[Install wiki page] for details. -+ -+See <> -+for information about choosing a messaging layer. -+ - === Enabling Pacemaker === - --==== For Corosync ==== -+==== Enabling Pacemaker For Corosync 2._x_ ==== -+ -+High-level cluster management tools are available that can configure -+corosync for you. This document focuses on the lower-level details -+if you want to configure corosync yourself. -+ -+Corosync configuration is normally located in -++/etc/corosync/corosync.conf+. 
-+ -+.Corosync 2._x_ configuration file for two nodes *myhost1* and *myhost2* -+==== -+---- -+totem { -+version: 2 -+secauth: off -+cluster_name: mycluster -+transport: udpu -+} -+ -+nodelist { -+ node { -+ ring0_addr: myhost1 -+ nodeid: 1 -+ } -+ node { -+ ring0_addr: myhost2 -+ nodeid: 2 -+ } -+} -+ -+quorum { -+provider: corosync_votequorum -+two_node: 1 -+} -+ -+logging { -+to_syslog: yes -+} -+---- -+==== -+ -+.Corosync 2._x_ configuration file for three nodes *myhost1*, *myhost2* and *myhost3* -+==== -+---- -+totem { -+version: 2 -+secauth: off -+cluster_name: mycluster -+transport: udpu -+} -+ -+nodelist { -+ node { -+ ring0_addr: myhost1 -+ nodeid: 1 -+ } -+ node { -+ ring0_addr: myhost2 -+ nodeid: 2 -+ } -+ node { -+ ring0_addr: myhost3 -+ nodeid: 3 -+ } -+} -+ -+quorum { -+provider: corosync_votequorum - --The Corosync configuration is normally located in --'/etc/corosync/corosync.conf' and an example for a machine with an --address of +1.2.3.4+ in a cluster communicating on port 1234 (without --peer authentication and message encryption) is shown below. -+} -+ -+logging { -+to_syslog: yes -+} -+---- -+==== -+ -+In the above examples, the +totem+ section defines what protocol version and -+options (including encryption) to use, -+footnote:[ -+Please consult the Corosync website (http://www.corosync.org/) and -+documentation for details on enabling encryption and peer authentication for -+the cluster. -+] -+and gives the cluster a unique name (+mycluster+ in these examples). -+ -+The +nodelist+ section lists the nodes in this cluster. (See <> -+for how this affects pacemaker.) -+ -+The +quorum+ section defines how the cluster uses quorum. -+The important thing is that two-node clusters must be handled specially, -+so +two_node: 1+ must be defined for two-node clusters (and only for two-node -+clusters). - --.An example Corosync configuration file -+The +logging+ section should be self-explanatory.
-+ -+==== Enabling Pacemaker For Corosync 1._x_ ==== -+ -+.Corosync 1._x_ configuration file for a cluster with all nodes on the +192.0.2.0/24+ network -+==== - [source,XML] - ------- - totem { -@@ -46,7 +121,7 @@ peer authentication and message encryption) is shown below. - threads: 0 - interface { - ringnumber: 0 -- bindnetaddr: 1.2.3.4 -+ bindnetaddr: 192.0.2.0 - mcastaddr: 239.255.1.1 - mcastport: 1234 - } -@@ -59,37 +134,33 @@ peer authentication and message encryption) is shown below. - amf { - mode: disabled - } -- - ------- -+==== - --The logging should be mostly obvious and the amf section refers to the --Availability Management Framework and is not covered in this document. -+With corosync 1._x_, the +totem+ section contains the protocol version and -+options as with 2._x_. However, nodes are also listed here, -+in the +interface+ section. The +bindnetaddr+ option is usually the _network_ -+address, thus allowing the same configuration file to be used on all nodes. -+IPv4 or IPv6 addresses can be used with corosync. - --The interesting part of the configuration is the totem section. This --is where we define how the node can communicate with the rest of the --cluster and what protocol version and options (including encryption --footnote:[ --Please consult the Corosync website (http://www.corosync.org/) and documentation for details on enabling encryption and peer authentication for the cluster. --] --) it should use. Beginners are encouraged to use the values shown and --modify the interface section based on their network. -+The +amf+ section refers to the Availability Management Framework and -+is not covered in this document. - --It is also possible to configure Corosync for an IPv6 based --environment. Simply configure +bindnetaddr+ and +mcastaddr+ with their --IPv6 equivalents, eg. 
-+The above corosync configuration is enough for corosync to operate by itself, -+but corosync 1._x_ additionally needs to be told when it is being -+used in conjunction with Pacemaker. This can be accomplished -+in one of two ways: - --.Example options for an IPv6 environment --[source,Bash] --------- -- bindnetaddr: fec0::1:a800:4ff:fe00:20 -- mcastaddr: ff05::1 --------- -+* Via the CMAN software provided with Red Hat Enterprise Linux 6 and its derivatives -+* Via the pacemaker corosync plugin - --To tell Corosync to use the Pacemaker cluster manager, add the --following fragment to a functional Corosync configuration and restart --the cluster. -+To use CMAN, consult its documentation. - --.Configuration fragment for enabling Pacemaker under Corosync -+To use the pacemaker corosync plugin, add the following fragment to the -+corosync configuration and restart the cluster. -+ -+.Corosync 1._x_ configuration fragment to enable Pacemaker plugin -+==== - [source,XML] - ------- - aisexec { -@@ -101,6 +172,7 @@ service { - ver: 0 - } - ------- -+==== - - The cluster needs to be run as root so that its child processes (the - +lrmd+ in particular) have sufficient privileges to perform the -@@ -110,10 +182,16 @@ an IP address or start apache is of little use. - The second directive is the one that actually instructs the cluster to - run Pacemaker. - --==== For Heartbeat ==== -+==== Enabling Pacemaker For Heartbeat ==== -+ -+See the heartbeat documentation for how to set up a +ha.cf+ configuration file.
- --Add the following to a functional _ha.cf_ configuration file and restart Heartbeat: -+To enable the use of pacemaker with heartbeat, add the following to a -+functional +ha.cf+ configuration file and restart Heartbeat: - --.Configuration fragment for enabling Pacemaker under Heartbeat --[source,Bash] -+.Heartbeat configuration fragment to enable Pacemaker -+==== -+---- - crm respawn -+---- -+==== -diff --git a/doc/Pacemaker_Explained/en-US/Ap-LSB.txt b/doc/Pacemaker_Explained/en-US/Ap-LSB.txt -index 62d1d16..479ac55 100644 ---- a/doc/Pacemaker_Explained/en-US/Ap-LSB.txt -+++ b/doc/Pacemaker_Explained/en-US/Ap-LSB.txt -@@ -1,76 +1,81 @@ - [appendix] - - [[ap-lsb]] --== init-Script LSB Compliance == -+== Init Script LSB Compliance == - --The relevant part of --http://refspecs.freestandards.org/LSB_3.1.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html[LSB spec] -+The relevant part of the -+http://refspecs.linuxfoundation.org/lsb.shtml[LSB specifications] - includes a description of all the return codes listed here. - --Assuming +some_service+ is configured correctly and currently not --active, the following sequence will help you determine if it is LSB --compatible: -+Assuming `some_service` is configured correctly and currently -+inactive, the following sequence will help you determine if it is -+LSB-compatible: - - . Start (stopped): - + --[source,C] -+---- - # /etc/init.d/some_service start ; echo "result: $?" -+---- - + - .. Did the service start? -- .. Did the command print result: 0 (in addition to the regular output)? -+ .. Did the command print *result: 0* (in addition to its usual output)? - + - . Status (running): - + --[source,C] -+---- - # /etc/init.d/some_service status ; echo "result: $?" -+---- - + - .. Did the script accept the command? - .. Did the script indicate the service was running? -- .. Did the command print result: 0 (in addition to the regular output)? -+ .. Did the command print *result: 0* (in addition to its usual output)? - + - . 
Start (running): - + --[source,C] -+---- - # /etc/init.d/some_service start ; echo "result: $?" -+---- - + - .. Is the service still running? -- .. Did the command print result: 0 (in addition to the regular output)? -+ .. Did the command print *result: 0* (in addition to its usual output)? - + - . Stop (running): - + --[source,C] -+---- - # /etc/init.d/some_service stop ; echo "result: $?" -+---- - + - .. Was the service stopped? -- .. Did the command print result: 0 (in addition to the regular output)? -+ .. Did the command print *result: 0* (in addition to its usual output)? - + - . Status (stopped): - + --[source,C] -+---- - # /etc/init.d/some_service status ; echo "result: $?" -+---- - + - .. Did the script accept the command? - .. Did the script indicate the service was not running? -- .. Did the command print result: 3 (in addition to the regular output)? -+ .. Did the command print *result: 3* (in addition to its usual output)? - + - . Stop (stopped): - + --[source,C] -+---- - # /etc/init.d/some_service stop ; echo "result: $?" -+---- - + - .. Is the service still stopped? -- .. Did the command print result: 0 (in addition to the regular output)? -+ .. Did the command print *result: 0* (in addition to its usual output)? - + - . Status (failed): - + --This step is not readily testable and relies on manual inspection of the script. -+.. This step is not readily testable and relies on manual inspection of the script. - + - The script can use one of the error codes (other than 3) listed in the - LSB spec to indicate that it is active but failed. This tells the - cluster that before moving the resource to another node, it needs to - stop it on the existing one first. - -- - If the answer to any of the above questions is no, then the script is --not LSB compliant. Your options are then to either fix the script or -+not LSB-compliant. Your options are then to either fix the script or - write an OCF agent based on the existing script. 
-diff --git a/doc/Pacemaker_Explained/en-US/Ap-OCF.txt b/doc/Pacemaker_Explained/en-US/Ap-OCF.txt -index 4edccdd..25a9b72 100644 ---- a/doc/Pacemaker_Explained/en-US/Ap-OCF.txt -+++ b/doc/Pacemaker_Explained/en-US/Ap-OCF.txt -@@ -6,22 +6,24 @@ - === Location of Custom Scripts === - - indexterm:[OCF Resource Agents] --OCF Resource Agents are found in '/usr/lib/ocf/resource.d/+provider+'. -+OCF Resource Agents are found in +/usr/lib/ocf/resource.d/pass:[provider]+ - - When creating your own agents, you are encouraged to create a new --directory under _/usr/lib/ocf/resource.d/_ so that they are not --confused with (or overwritten by) the agents shipped with Heartbeat. -+directory under +/usr/lib/ocf/resource.d/+ so that they are not -+confused with (or overwritten by) the agents shipped by existing providers. - --So, for example, if you chose the provider name of bigCorp and wanted --a new resource named bigApp, you would create a script called --_/usr/lib/ocf/resource.d/bigCorp/bigApp_ and define a resource: -+So, for example, if you choose the provider name of bigCorp and want -+a new resource named bigApp, you would create a resource agent called -++/usr/lib/ocf/resource.d/bigCorp/bigApp+ and define a resource: - - [source,XML] -+---- - -+---- - - === Actions === - --All OCF Resource Agents are required to implement the following actions -+All OCF resource agents are required to implement the following actions. - - .Required Actions for OCF Agents - [width="95%",cols="3m,3,7",options="header",align="center"] -@@ -60,21 +62,21 @@ NOTE: The monitor script should test the state of the resource on the local mach - indexterm:[meta-data,OCF Action] - indexterm:[OCF,Action,meta-data] - --NOTE: This is *not* performed as root. -+NOTE: This is _not_ performed as root. - - |validate-all - |Verify the supplied parameters --|Exit with 0 if parameters are valid, 2 if not valid, 6 if resource is not configured. 
-+|Return 0 if parameters are valid, 2 if not valid, and 6 if resource is not configured. - indexterm:[validate-all,OCF Action] - indexterm:[OCF,Action,validate-all] - - |========================================================= - --Additional requirements (not part of the OCF specs) are placed on --agents that will be used for advanced concepts like -+Additional requirements (not part of the OCF specification) are placed on -+agents that will be used for advanced concepts such as - <> and <> resources. - --.Optional Actions for OCF Agents -+.Optional Actions for OCF Resource Agents - [width="95%",cols="2m,6,3",options="header",align="center"] - |========================================================= - -@@ -83,19 +85,19 @@ agents that will be used for advanced concepts like - |Instructions - - |promote --|Promote the local instance of a multi-state resource to the master/primary state. -+|Promote the local instance of a multi-state resource to the master (primary) state. - |Return 0 on success - indexterm:[promote,OCF Action] - indexterm:[OCF,Action,promote] - - |demote --|Demote the local instance of a multi-state resource to the slave/secondary state. -+|Demote the local instance of a multi-state resource to the slave (secondary) state. - |Return 0 on success - indexterm:[demote,OCF Action] - indexterm:[OCF,Action,demote] - - |notify --|Used by the cluster to send the agent pre and post notification -+|Used by the cluster to send the agent pre- and post-notification - events telling the resource what has happened and will happen. - |Must not fail. Must exit with 0 - indexterm:[notify,OCF Action] -@@ -103,19 +105,21 @@ agents that will be used for advanced concepts like - - |========================================================= - --One action specified in the OCF specs is not currently used by the cluster: -+One action specified in the OCF specs, +recover+, is not currently used by the -+cluster. 
It is intended to be a variant of the +start+ action that tries to -+recover a resource locally. - --* +recover+ - a variant of the +start+ action, this should try to -- recover a resource locally. -- --Remember to use indexterm:[ocf-tester]`ocf-tester` to verify that your --new agent complies with the OCF standard properly. -+[IMPORTANT] -+==== -+If you create a new OCF resource agent, use indexterm:[ocf-tester]`ocf-tester` -+to verify that the agent complies with the OCF standard properly. -+==== - - === How are OCF Return Codes Interpreted? === - - The first thing the cluster does is to check the return code against - the expected result. If the result does not match the expected value, --then the operation is considered to have failed and recovery action is -+then the operation is considered to have failed, and recovery action is - initiated. - - There are three types of failure recovery: -@@ -141,22 +145,24 @@ indexterm:[hard,OCF error] - indexterm:[OCF,error,hard] - - |fatal --|A non-transient error that will be common to all cluster nodes (eg. a bad configuration was specified) -+|A non-transient error that will be common to all cluster nodes (e.g. a bad configuration was specified) - |Stop the resource and prevent it from being started on any cluster node - indexterm:[fatal,OCF error] - indexterm:[OCF,error,fatal] - - |========================================================= - --Assuming an action is considered to have failed, the following table --outlines the different OCF return codes and the type of recovery the --cluster will initiate when it is received. -- - [[s-ocf-return-codes]] - === OCF Return Codes === - -+The following table outlines the different OCF return codes and the type of -+recovery the cluster will initiate when a failure code is received. -+Although counterintuitive, even actions that return 0 -+(aka. +OCF_SUCCESS+) can be considered to have failed, if 0 was not -+the expected return value. 
-+ - .OCF Return Codes and their Recovery Types --[width="95%",cols="2m,5^m,6<,1m",options="header",align="center"] -+[width="95%",cols="1m,4>). - * Recurring actions that return +OCF_ERR_UNIMPLEMENTED+ -- do not cause any type of recovery -+ do not cause any type of recovery. -diff --git a/doc/Pacemaker_Explained/en-US/Ap-Samples.txt b/doc/Pacemaker_Explained/en-US/Ap-Samples.txt -index 7f4f057..4494c18 100644 ---- a/doc/Pacemaker_Explained/en-US/Ap-Samples.txt -+++ b/doc/Pacemaker_Explained/en-US/Ap-Samples.txt -@@ -8,7 +8,7 @@ - ======= - [source,XML] - ------- -- -+ - - - -@@ -22,24 +22,19 @@ - - === Simple === - --.Simple Configuration - 2 nodes, some cluster options and a resource -+.A simple configuration with two nodes, some cluster options and a resource - ======= - [source,XML] - ------- -- -+ - - -- -- -+ -+ -+ -+ -+ - -- -- -- -- -- -- -- - - - -@@ -49,46 +44,50 @@ - - - -- -- -+ -+ - - - - - - -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ - - - - ------- - ======= - --In this example, we have one resource (an IP address) that we check -+In the above example, we have one resource (an IP address) that we check - every five minutes and will run on host +c001n01+ until either the - resource fails 10 times or the host shuts down. - -- - === Advanced Configuration === - --.Advanced configuration - groups and clones with stonith -+.An advanced configuration with groups, clones and STONITH - ======= - [source,XML] - ------- -- -+ - - -- -- -- -+ -+ -+ -+ -+ - -- -- -- -- -- -- -- - - - -@@ -99,8 +98,8 @@ resource fails 10 times or the host shuts down. - - - -- -- -+ -+ - - - -@@ -135,6 +134,17 @@ resource fails 10 times or the host shuts down. 
- - -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ - - - -diff --git a/doc/Pacemaker_Explained/en-US/Ap-Upgrade-Config.txt b/doc/Pacemaker_Explained/en-US/Ap-Upgrade-Config.txt -index db51b25..7f1eb06 100644 ---- a/doc/Pacemaker_Explained/en-US/Ap-Upgrade-Config.txt -+++ b/doc/Pacemaker_Explained/en-US/Ap-Upgrade-Config.txt -@@ -1,18 +1,14 @@ - [appendix] - --== Upgrading the Configuration from 0.6 == -+== Upgrading the Configuration == -+ -+This process was originally written for the upgrade from 0.6.'x' to 1.'y', -+but the concepts should apply for any upgrade involving a change in -+the XML schema version. - --=== Preparation === - indexterm:[Upgrading the Configuration] - indexterm:[Configuration,Upgrading] - --indexterm:[Download,DTD] --indexterm:[DTD,Download] -- --Download the latest --http://hg.clusterlabs.org/pacemaker/stable-1.0/file-raw/tip/xml/crm.dtd[DTD] --and ensure your configuration validates. -- - === Perform the upgrade === - - ==== Upgrade the software ==== -@@ -23,11 +19,11 @@ Refer to the appendix: <> - - As XML is not the friendliest of languages, it is common for cluster - administrators to have scripted some of their activities. In such --cases, it is likely that those scripts will not work with the new 1.0 -+cases, it is likely that those scripts will not work with the new XML - syntax. - - In order to support such environments, it is actually possible to --continue using the old 0.6 syntax. -+continue using the old XML syntax. - - The downside is, however, that not all the new features will be - available and there is a performance impact since the cluster must do -@@ -36,26 +32,23 @@ while using the old syntax is possible, it is not advisable to - continue using it indefinitely. - - Even if you wish to continue using the old syntax, it is advisable to --follow the upgrade procedure to ensure that the cluster is able to use --your existing configuration (since it will perform much the same task --internally). 
-+follow the upgrade procedure (except for the last step) to ensure that the -+cluster is able to use your existing configuration (since it will perform much -+the same task internally). - - . Create a shadow copy to work with - + --[source,C] - ----- - # crm_shadow --create upgrade06 - ----- - . Verify the configuration is valid indexterm:[Configuration,Verify]indexterm:[Verify,Configuration] - + --[source,C] - ----- - # crm_verify --live-check - ----- - . Fix any errors or warnings - . Perform the upgrade: - + --[source,C] - ----- - # cibadmin --upgrade - ----- -@@ -66,29 +59,28 @@ internally). - The most common reason is ID values being repeated or invalid. Pacemaker 1.0 is much stricter regarding this type of validation. - ] - + --If the result of the transformation is invalid, you may see a number of errors from the validation library. If these are not helpful, visit http://clusterlabs.org/wiki/Validation_FAQ and/or try the procedure described below under <> -+If the result of the transformation is invalid, you may see a number of errors -+from the validation library. If these are not helpful, visit the -+http://clusterlabs.org/wiki/Validation_FAQ[Validation FAQ wiki page] and/or try -+the procedure described below under <> - + - . Check the changes - + --[source,C] - ----- - # crm_shadow --diff - ----- - + - If at this point there is anything about the upgrade that you wish to fine-tune (for example, to change some of the automatic IDs) now is the time to do so. Since the shadow configuration is not in use by the cluster, it is safe to edit the file manually: - + --[source,C] - ----- - # crm_shadow --edit - ----- - + --This will open the configuration in your favorite editor (whichever is specified by the standard +$EDITOR+ environment variable) -+This will open the configuration in your favorite editor (whichever is -+specified by the standard *$EDITOR* environment variable) - + --. Preview how the cluster will react -+. 
Preview how the cluster will react: - + --Test what the cluster will do when you upload the new configuration --+ --[source,C] - ------ - # crm_simulate --live-check --save-dotfile upgrade06.dot -S - # graphviz upgrade06.dot -@@ -99,38 +91,38 @@ happy with any that are scheduled. If the output contains actions you - do not expect (possibly due to changes to the score calculations), you - may need to make further manual changes. See - <> for further details on how to interpret --the output of `crm_simulate` -+the output of `crm_simulate` and `graphviz`. - + - . Upload the changes - + --[source,C] - ----- - # crm_shadow --commit upgrade06 --force - ----- --If this step fails, something really strange has occurred. You should report a bug. -++ -+In the unlikely event this step fails, please report a bug. - - [[s-upgrade-config-manual]] - ==== Manually Upgrading the Configuration ==== - - indexterm:[Configuration,Upgrade manually] --It is also possible to perform the configuration upgrade steps manually. To do this -+It is also possible to perform the configuration upgrade steps manually: - --Locate the 'upgrade06.xsl' conversion script or download the latest --version from --https://github.com/ClusterLabs/pacemaker/tree/master/xml/upgrade06.xsl[Git] -+. Locate the +upgrade06.xsl+ conversion script provided with the source code -+ (the https://github.com/ClusterLabs/pacemaker/tree/master/xml/upgrade06.xsl[latest version] is available via -+ git). - - . Convert the XML blob: indexterm:[XML,Convert] - + --[source,C] - ----- - # xsltproc /path/to/upgrade06.xsl config06.xml > config10.xml - ----- - + --. Locate the 'pacemaker.rng' script. -+. Locate the +pacemaker.rng+ script. - . 
Check the XML validity: indexterm:[Validate Configuration]indexterm:[Configuration,Validate XML] - + --[source,C] -+---- - # xmllint --relaxng /path/to/pacemaker.rng config10.xml -+---- - - The advantage of this method is that it can be performed without the - cluster running and any validation errors should be more informative -diff --git a/doc/Pacemaker_Explained/en-US/Ap-Upgrade.txt b/doc/Pacemaker_Explained/en-US/Ap-Upgrade.txt -index dc14d71..66f5cc5 100644 ---- a/doc/Pacemaker_Explained/en-US/Ap-Upgrade.txt -+++ b/doc/Pacemaker_Explained/en-US/Ap-Upgrade.txt -@@ -1,18 +1,14 @@ - [appendix] - - [[ap-upgrade]] --== Upgrading Cluster Software -+== Upgrading Cluster Software == - --=== Version Compatibility === -+There will always be an upgrade path from any pacemaker 1._x_ -+release to any other 1._y_ release. - --When releasing newer versions we take care to make sure we are --backwards compatible with older versions. While you will always be --able to upgrade from version x to x+1, in order to continue to produce --high quality software it may occasionally be necessary to drop --compatibility with older versions. -- --There will always be an upgrade path from any series-2 release to any --other series-2 release. -+Consult the documentation for your messaging layer -+(Heartbeat or Corosync) to see whether upgrading them to a -+newer version is also supported. - - There are three approaches to upgrading your cluster software: - -@@ -21,10 +17,10 @@ There are three approaches to upgrading your cluster software: - * Disconnect and Reattach - - Each method has advantages and disadvantages, some of which are listed --in the table below, and you should chose the one most appropriate to -+in the table below, and you should choose the one most appropriate to - your needs. 
- --.Summary of Upgrade Methodologies -+.Upgrade Methods - [width="95%",cols="6*",options="header",align="center"] - |========================================================= - -@@ -36,11 +32,7 @@ your needs. - |Allows change of cluster stack type - indexterm:[Cluster,Switching between Stacks] - indexterm:[Changing Cluster Stack] --footnote:[ --For example, switching from Heartbeat to Corosync. Consult the --Heartbeat or Corosync documentation to see if upgrading them to a --newer version is also supported. --] -+footnote:[For example, switching from Heartbeat to Corosync.] - - |Shutdown - indexterm:[Upgrade,Shutdown] -@@ -73,53 +65,51 @@ indexterm:[Reattach Upgrade] - - === Complete Cluster Shutdown === - --In this scenario one shuts down all cluster nodes and resources and --upgrades all the nodes before restarting the cluster. -+In this scenario, one shuts down all cluster nodes and resources, -+then upgrades all the nodes before restarting the cluster. - --==== Procedure ==== - . On each node: --.. Shutdown the cluster stack (Heartbeat or Corosync) --.. Upgrade the Pacemaker software. -- This may also include upgrading the cluster stack and/or the -- underlying operating system. -+.. Shutdown the cluster software (pacemaker and the messaging layer). -+.. Upgrade the Pacemaker software. This may also include upgrading the -+ messaging layer and/or the underlying operating system. - .. Check the configuration manually or with the `crm_verify` tool if available. - . On each node: --.. Start the cluster stack. -- This can be either Corosync or Heartbeat and does not need to be -- the same as the previous cluster stack. -+.. Start the cluster software. -+ The messaging layer can be either Corosync or Heartbeat and does not need to -+ be the same one as before the upgrade. - - === Rolling (node by node) === - --In this scenario each node is removed from the cluster, upgraded and then brought back online until all nodes are running the newest version.
-- --[IMPORTANT] --=========== --This method is currently broken between Pacemaker 0.6.x and 1.0.x. -+In this scenario, each node is removed from the cluster, upgraded and then -+brought back online until all nodes are running the newest version. - --Measures have been put into place to ensure rolling upgrades always --work for versions after 1.0.0. Please try one of the other upgrade --strategies. Detach/Reattach is a particularly good option for most --people. --=========== -- --==== Procedure ==== -+Rolling upgrades should always be possible for pacemaker versions -+1.0.0 and later. - - On each node: --. Shutdown the cluster stack (Heartbeat or Corosync) -+ -+. Put the node into standby mode, and wait for any active resources -+ to be moved cleanly to another node. -+. Shutdown the cluster software (pacemaker and the messaging layer) on the node. - . Upgrade the Pacemaker software. This may also include upgrading the -- cluster stack and/or the underlying operating system. --.. On the first node, check the configuration manually or with the -- `crm_verify` tool if available. --.. Start the cluster stack. --+ --This must be the same type of cluster stack (Corosync or Heartbeat) --that the rest of the cluster is using. Upgrading Corosync/Heartbeat --may also be possible, please consult the documentation for those --projects to see if the two versions will be compatible. --+ --.. Repeat for each node in the cluster. -+ messaging layer and/or the underlying operating system. -+. If this is the first node to be upgraded, check the configuration manually -+ or with the `crm_verify` tool if available. -+. Start the messaging layer. -+ This must be the same messaging layer (Corosync or Heartbeat) -+ that the rest of the cluster is using. Upgrading the messaging layer -+ may also be possible; consult the documentation for those -+ projects to see whether the two versions will be compatible. 
- --==== Version Compatibility ==== -+[NOTE] -+==== -+Rolling upgrades were not always possible with older heartbeat and -+pacemaker versions. The table below shows which versions were -+compatible during rolling upgrades. Rolling upgrades that cross compatibility -+boundaries must be performed in multiple steps (for example, -+upgrading heartbeat 2.0.6 to heartbeat 2.1.3, and then upgrading again -+to pacemaker 0.6.6). Rolling upgrades from pacemaker 0._x_ to 1._y_ are not -+possible. - - .Version Compatibility Table - [width="95%",cols="2*",options="header",align="center"] -@@ -147,70 +137,57 @@ projects to see if the two versions will be compatible. - |None. Use an alternate upgrade strategy. - - |========================================================= -- --==== Crossing Compatibility Boundaries ==== -- --Rolling upgrades that cross compatibility boundaries must be preformed --in multiple steps. For example, to perform a rolling update from --Heartbeat 2.0.1 to Pacemaker 0.6.6 one must: -- --. Perform a rolling upgrade from Heartbeat 2.0.1 to Heartbeat 2.0.4 --. Perform a rolling upgrade from Heartbeat 2.0.4 to Heartbeat 2.1.3 --. Perform a rolling upgrade from Heartbeat 2.1.3 to Pacemaker 0.6.6 -+==== - - === Disconnect and Reattach === - --A variant of a complete cluster shutdown, but the resources are left --active and get re-detected when the cluster is restarted. -- --==== Procedure ==== -+The reattach method is a variant of a complete cluster shutdown, where the -+resources are left active and get re-detected when the cluster is restarted. - --. Tell the cluster to stop managing services. -+. Tell the cluster to stop managing services. This is required to allow the -+ services to remain active after the cluster shuts down. - + --This is required to allow the services to remain active after the --cluster shuts down. --+ --[source,C] -+---- - # crm_attribute -t crm_config -n is-managed-default -v false --+ -+---- -+ - . 
For any resource that has a value for +is-managed+, make sure it is --set to +false+ (so that the cluster will not stop it) -+set to +false+ so that the cluster will not stop it (replacing $rsc_id -+appropriately): - + --[source,C] -+---- - # crm_resource -t primitive -r $rsc_id -p is-managed -v false --+ -+---- -+ - . On each node: --.. Shutdown the cluster stack (Heartbeat or Corosync) --.. Upgrade the cluster stack program - This may also include upgrading --the underlying operating system. -+.. Shutdown the cluster software (pacemaker and the messaging layer). -+.. Upgrade the Pacemaker software. This may also include upgrading the -+ messaging layer and/or the underlying operating system. - . Check the configuration manually or with the `crm_verify` tool if available. - . On each node: --.. Start the cluster stack. --+ --This can be either Corosync or Heartbeat and does not need to be the --same as the previous cluster stack. --+ -+.. Start the cluster software. The messaging layer can be either Corosync or -+ Heartbeat and does not need to be the same one as before the upgrade. -+ - . Verify that the cluster re-detected all resources correctly. - . Allow the cluster to resume managing resources again: - + --[source,C] -+---- - # crm_attribute -t crm_config -n is-managed-default -v true --+ --. For any resource that has a value for +is-managed+ reset it to -+---- -+ -+. For any resource that has a value for +is-managed+, reset it to - +true+ (so the cluster can recover the service if it fails) if - desired: - + --[source,C] -+---- - # crm_resource -t primitive -r $rsc_id -p is-managed -v true -+---- - -- --==== Notes ==== -+[NOTE] -+The oldest version of the CRM to support this upgrade type was in Heartbeat 2.0.4. - - [IMPORTANT] - =========== - Always check your existing configuration is still compatible with the - version you are installing before starting the cluster. 
- =========== -- --[NOTE] --The oldest version of the CRM to support this upgrade type was in Heartbeat 2.0.4 -diff --git a/doc/Pacemaker_Explained/en-US/Book_Info.xml b/doc/Pacemaker_Explained/en-US/Book_Info.xml -index 93f1783..56b9d9b 100644 ---- a/doc/Pacemaker_Explained/en-US/Book_Info.xml -+++ b/doc/Pacemaker_Explained/en-US/Book_Info.xml -@@ -6,24 +6,19 @@ - An A-Z guide to Pacemaker's Configuration Options - Pacemaker - 1.1 -- 1 -+ -+ 5 - 0 - - - The purpose of this document is to definitively explain the concepts used to configure Pacemaker. -- To achieve this, it will focus exclusively on the XML syntax used to configure the CIB. -- -- -- For those that are allergic to XML, there exist several unified shells -- and GUIs for Pacemaker. However these tools will not be covered at all -- in this document -- I hope, however, that the concepts explained here make the functionality of these tools more easily understood. -- , precisely because they hide the XML. -- -- -- Additionally, this document is NOT a step-by-step how-to guide for configuring a specific clustering scenario. -- Although such guides exist, the purpose of this document is to provide an understanding of the building blocks that can be used to construct any type of Pacemaker cluster. -- Try the Clusters from Scratch document instead. -+ To achieve this, it will focus exclusively on the XML syntax used to configure Pacemaker's -+ Cluster Information Base (CIB). - - - -diff --git a/doc/Pacemaker_Explained/en-US/Ch-Advanced-Options.txt b/doc/Pacemaker_Explained/en-US/Ch-Advanced-Options.txt -index ab9a089..d1cf176 100644 ---- a/doc/Pacemaker_Explained/en-US/Ch-Advanced-Options.txt -+++ b/doc/Pacemaker_Explained/en-US/Ch-Advanced-Options.txt -@@ -12,64 +12,74 @@ variables and runs the same commands as when working on a cluster - node. 
- - .Environment Variables Used to Connect to Remote Instances of the CIB --[width="95%",cols="1m,2<",options="header",align="center"] -+[width="95%",cols="1m,1,3<",options="header",align="center"] - |========================================================= - - |Environment Variable -+|Default - |Description - - |CIB_user -+|$USER - |The user to connect as. Needs to be part of the +hacluster+ group on -- the target host. Defaults to _$USER_. -+ the target host. - indexterm:[Environment Variable,CIB_user] - - |CIB_passwd -+| - |The user's password. Read from the command line if unset. - indexterm:[Environment Variable,CIB_passwd] - - |CIB_server --|The host to contact. Defaults to _localhost_. -+|localhost -+|The host to contact - indexterm:[Environment Variable,CIB_server] - - |CIB_port -+| - |The port on which to contact the server; required. - indexterm:[Environment Variable,CIB_port] - - |CIB_encrypted --|Encrypt network traffic; defaults to _true_. -+|TRUE -+|Whether to encrypt network traffic - indexterm:[Environment Variable,CIB_encrypted] - - |========================================================= - --So, if +c001n01+ is an active cluster node and is listening on +1234+ --for connections, and +someguy+ is a member of the +hacluster+ group, --then the following would prompt for +someguy+'s password and return -+So, if *c001n01* is an active cluster node and is listening on port 1234 -+for connections, and *someuser* is a member of the *hacluster* group, -+then the following would prompt for *someuser*'s password and return - the cluster's current configuration: - --[source,C] --# export CIB_port=1234; export CIB_server=c001n01; export CIB_user=someguy; -+---- -+# export CIB_port=1234; export CIB_server=c001n01; export CIB_user=someuser; - # cibadmin -Q -+---- - - For security reasons, the cluster does not listen for remote - connections by default. 
If you wish to allow remote access, you need - to set the +remote-tls-port+ (encrypted) or +remote-clear-port+ --(unencrypted) top-level options (ie., those kept in the cib tag, like -+(unencrypted) CIB properties (i.e., those kept in the +cib+ tag, like - +num_updates+ and +epoch+). - --.Extra top-level CIB options for remote access --[width="95%",cols="1m,2<",options="header",align="center"] -+.Extra top-level CIB properties for remote access -+[width="95%",cols="1m,1,3<",options="header",align="center"] - |========================================================= - - |Field -+|Default - |Description - - |remote-tls-port --|Listen for encrypted remote connections on this port. Default: _none_ -+|_none_ -+|Listen for encrypted remote connections on this port. - indexterm:[remote-tls-port,Remote Connection Option] - indexterm:[Remote Connection,Option,remote-tls-port] - - |remote-clear-port --|Listen for plaintext remote connections on this port. Default: _none_ -+|_none_ -+|Listen for plaintext remote connections on this port. - indexterm:[remote-clear-port,Remote Connection Option] - indexterm:[Remote Connection,Option,remote-clear-port] - -@@ -82,25 +92,25 @@ to set the +remote-tls-port+ (encrypted) or +remote-clear-port+ - By default, recurring actions are scheduled relative to when the - resource started. So if your resource was last started at 14:32 and - you have a backup set to be performed every 24 hours, then the backup --will always run at in the middle of the business day - hardly -+will always run in the middle of the business day -- hardly - desirable. - --To specify a date/time that the operation should be relative to, set -+To specify a date and time that the operation should be relative to, set - the operation's +interval-origin+. The cluster uses this point to - calculate the correct +start-delay+ such that the operation will occur - at _origin + (interval * N)_.
- --So, if the operation's interval is 24h, it's interval-origin is set to --+02:00+ and it is currently +14:32+, then the cluster would initiate -+So, if the operation's interval is 24h, its interval-origin is set to -+02:00 and it is currently 14:32, then the cluster would initiate - the operation with a start delay of 11 hours and 28 minutes. If the --resource is moved to another node before 2am, then the operation is of --course cancelled. -+resource is moved to another node before 2am, then the operation is -+cancelled. - --The value specified for interval and +interval-origin+ can be any -+The value specified for +interval+ and +interval-origin+ can be any - date/time conforming to the - http://en.wikipedia.org/wiki/ISO_8601[ISO8601 standard]. By way of - example, to specify an operation that would run on the first Monday of --2009 and every Monday after that you would add: -+2009 and every Monday after that, you would add: - - .Specifying a Base for Recurring Action Intervals - ===== -@@ -112,51 +122,58 @@ example, to specify an operation that would run on the first Monday of - indexterm:[Moving,Resources] - indexterm:[Resource,Moving] - --=== Manual Intervention === -+=== Moving Resources Manually === - - There are primarily two occasions when you would want to move a --resource from it's current location: when the whole node is under -+resource from its current location: when the whole node is under - maintenance, and when a single resource needs to be moved. - -+==== Standby Mode ==== -+ - Since everything eventually comes down to a score, you could create - constraints for every resource to prevent them from running on one --node. While the configuration can seem convoluted at times, not even -+node. While pacemaker configuration can seem convoluted at times, not even - we would require this of administrators. 
- --Instead one can set a special node attribute which tells the cluster -+Instead, one can set a special node attribute which tells the cluster - "don't let anything run here". There is even a helpful tool to help - query and set it, called `crm_standby`. To check the standby status --of the current machine, simply run: -+of the current machine, run: - --[source,C] --# crm_standby --get-value -+---- -+# crm_standby -G -+---- - --A value of +true+ indicates that the node is _NOT_ able to host any --resources, while a value of +false+ says that it _CAN_. -+A value of +on+ indicates that the node is _not_ able to host any -+resources, while a value of +off+ says that it _can_. - - You can also check the status of other nodes in the cluster by --specifying the `--node-uname` option: -+specifying the `--node` option: -+ -+---- -+# crm_standby -G --node sles-2 -+---- - --[source,C] --# crm_standby --get-value --node-uname sles-2 -+To change the current node's standby status, use `-v` instead of `-G`: - --To change the current node's standby status, use `--attr-value` --instead of `--get-value`. -+---- -+# crm_standby -v on -+---- - --[source,C] --# crm_standby --attr-value -+Again, you can change another host's value by supplying a hostname with `--node`. - --Again, you can change another host's value by supplying a host name with `--node-uname`. -+==== Moving One Resource ==== - --When only one resource is required to move, we do this by creating --location constraints. However, once again we provide a user friendly -+When only one resource is required to move, we could do this by creating -+location constraints. However, once again we provide a user-friendly - shortcut as part of the `crm_resource` command, which creates and --modifies the extra constraints for you. If +Email+ was running on -+modifies the extra constraints for you. 
If +Email+ were running on - +sles-1+ and you wanted it moved to a specific location, the command - would look something like: - --[source,C] -+---- - # crm_resource -M -r Email -H sles-2 -+---- - - Behind the scenes, the tool will create the following location constraint: - -@@ -166,31 +183,35 @@ Behind the scenes, the tool will create the following location constraint: - It is important to note that subsequent invocations of `crm_resource - -M` are not cumulative. So, if you ran these commands - --[source,C] -+---- - # crm_resource -M -r Email -H sles-2 - # crm_resource -M -r Email -H sles-3 -+---- - - then it is as if you had never performed the first command. - - To allow the resource to move back again, use: - --[source,C] -+---- - # crm_resource -U -r Email -+---- - - Note the use of the word _allow_. The resource can move back to its - original location but, depending on +resource-stickiness+, it might - stay where it is. To be absolutely certain that it moves back to - +sles-1+, move it there before issuing the call to `crm_resource -U`: - --[source,C] -+---- - # crm_resource -M -r Email -H sles-1 - # crm_resource -U -r Email -+---- - - Alternatively, if you only care that the resource should be moved from --its current location, try -+its current location, try: - --[source,C] --# crm_resource -M -r Email` -+---- -+# crm_resource -B -r Email -+---- - - Which will instead create a negative constraint, like - -@@ -206,8 +227,8 @@ where every other cluster node is no longer available! - In some cases, such as when +resource-stickiness+ is set to - +INFINITY+, it is possible that you will end up with the problem - described in <>. The tool can detect --some of these cases and deals with them by also creating both a --positive and negative constraint. Eg. -+some of these cases and deals with them by creating both -+positive and negative constraints. E.g. 
- - +Email+ prefers +sles-1+ with a score of +-INFINITY+ - -@@ -218,34 +239,40 @@ which has the same long-term consequences as discussed earlier. - [[s-failure-migration]] - === Moving Resources Due to Failure === - -+Normally, if a running resource fails, pacemaker will try to start -+it again on the same node. However if a resource fails repeatedly, -+it is possible that there is an underlying problem on that node, and you -+might desire trying a different node in such a case. - --New in 1.0 is the concept of a migration threshold. -+Pacemaker allows you to set your preference via the +migration-threshold+ -+resource option. - footnote:[ - The naming of this option was perhaps unfortunate as it is easily --confused with true migration, the process of moving a resource from -+confused with live migration, the process of moving a resource from - one node to another without stopping it. Xen virtual guests are the - most common example of resources that can be migrated in this manner. - ] - --Simply define +migration-threshold=N+ for a resource and it will --migrate to a new node after N failures. There is no threshold defined -+Simply define +migration-threshold=pass:[N]+ for a resource and it will -+migrate to a new node after 'N' failures. There is no threshold defined - by default. To determine the resource's current failure status and --limits, use `crm_mon --failcounts`. -+limits, run `crm_mon --failcounts`. - --By default, once the threshold has been reached, this node will no -+By default, once the threshold has been reached, the troublesome node will no - longer be allowed to run the failed resource until the administrator - manually resets the resource's failcount using `crm_failcount` (after --hopefully first fixing the failure's cause). However it is possible --to expire them by setting the resource's +failure-timeout+ option. -+hopefully first fixing the failure's cause). 
Alternatively, it is possible -+to expire failures by setting the +failure-timeout+ option for the resource. - --So a setting of +migration-threshold=2+ and +failure-timeout=60s+ -+For example, a setting of +migration-threshold=2+ and +failure-timeout=60s+ - would cause the resource to move to a new node after 2 failures, and --allow it to move back (depending on the stickiness and constraint --scores) after one minute. -+allow it to move back (depending on stickiness and constraint scores) after one -+minute. - --There are two exceptions to the migration threshold concept; they --occur when a resource either fails to start or fails to stop. Start --failures cause the failcount to be set to +INFINITY+ and thus always -+There are two exceptions to the migration threshold concept: -+when a resource either fails to start or fails to stop. -+ -+Start failures cause the failcount to be set to +INFINITY+ and thus always - cause the resource to move immediately. - - Stop failures are slightly different and crucial. If a resource fails -@@ -256,39 +283,37 @@ to start the resource elsewhere, but will try to stop it again after - the failure timeout. - - [IMPORTANT] --Please read <> before enabling this option. -+Please read <> to understand how timeouts work -+before configuring a +failure-timeout+. - - === Moving Resources Due to Connectivity Changes === - --Setting up the cluster to move resources when external connectivity is --lost is a two-step process. -- --==== Tell Pacemaker to monitor connectivity ==== -+You can configure the cluster to move resources when external connectivity is -+lost in two steps. - -+==== Tell Pacemaker to Monitor Connectivity ==== - --To do this, you need to add a +ping+ resource to the cluster. The --+ping+ resource uses the system utility of the same name to a test if -+First, add an *ocf:pacemaker:ping* resource to the cluster.
The -+*ping* resource uses the system utility of the same name to test whether a - list of machines (specified by DNS hostname or IPv4/IPv6 address) are --reachable and uses the results to maintain a node attribute normally --called +pingd+. -+reachable and uses the results to maintain a node attribute called +pingd+ -+by default. - footnote:[ --The attribute name is customizable; that allows multiple ping groups to be defined. -+The attribute name is customizable, in order to allow multiple ping groups to be defined. - ] - - [NOTE] --Older versions of Heartbeat required users to add ping nodes to _ha.cf_ - this is no longer required. -- --[IMPORTANT] - =========== --Older versions of Pacemaker used a custom binary called 'pingd' for --this functionality; this is now deprecated in favor of 'ping'. -+Older versions of Heartbeat required users to add ping nodes to +ha.cf+, but -+this is no longer required. - --If your version of Pacemaker does not contain the ping agent, you can --download the latest version from -+Older versions of Pacemaker used a different agent *ocf:pacemaker:pingd* which -+is now deprecated in favor of *ping*. If your version of Pacemaker does not -+contain the *ping* resource agent, download the latest version from - https://github.com/ClusterLabs/pacemaker/tree/master/extra/resources/ping - =========== - --Normally the resource will run on all cluster nodes, which means that -+Normally, the ping resource should run on all cluster nodes, which means that - you'll need to create a clone. A template for this can be found below - along with a description of the most interesting parameters. - -@@ -347,24 +372,21 @@ how to deal with the connectivity status that +ocf:pacemaker:ping+ is - recording.
- =========== - --==== Tell Pacemaker how to interpret the connectivity data ==== -+==== Tell Pacemaker How to Interpret the Connectivity Data ==== - --[NOTE] -+[IMPORTANT] - ====== --Before reading the following, please make sure you have read and --understood <> above. -+Before attempting the following, make sure you understand -+<>. - ====== - --There are a number of ways to use the connectivity data provided by --Heartbeat. The most common setup is for people to have a single ping --node, to prevent the cluster from running a resource on any --unconnected node. -+There are a number of ways to use the connectivity data. - --//// --TODO: is the idea that only nodes that can reach eg. the router should have active resources? --//// -+The most common setup is for people to have a single ping -+target (e.g. the service network's default gateway), to prevent the cluster -+from running a resource on any unconnected node. - --.Don't run on unconnected nodes -+.Don't run a resource on unconnected nodes - ===== - [source,XML] - ------- -@@ -376,23 +398,29 @@ TODO: is the idea that only nodes that can reach eg. the router should have acti - ------- - ===== - --A more complex setup is to have a number of ping nodes configured. -+A more complex setup is to have a number of ping targets configured. - You can require the cluster to only run resources on nodes that can - connect to all (or a minimum subset) of them. - --.Run only on nodes connected to three or more ping nodes; this assumes +multiplier+ is set to 1000: -+.Run only on nodes connected to three or more ping targets. - ===== - [source,XML] - ------- -+ -+... -+ -+... -+ -+... - - -- -+ - - - ------- - ===== - --Instead you can tell the cluster only to _prefer_ nodes with the best -+Alternatively, you can tell the cluster only to _prefer_ nodes with the best - connectivity. Just be sure to set +multiplier+ to a value higher than - that of +resource-stickiness+ (and don't set either of them to - +INFINITY+). 
-@@ -411,11 +439,11 @@ that of +resource-stickiness+ (and don't set either of them to - - It is perhaps easier to think of this in terms of the simple - constraints that the cluster translates it into. For example, if --+sles-1+ is connected to all 5 ping nodes but +sles-2+ is only --connected to 2, then it would be as if you instead had the following -+*sles-1* is connected to all five ping nodes but *sles-2* is only -+connected to two, then it would be as if you instead had the following - constraints in your configuration: - --.How the cluster translates the pingd constraint -+.How the cluster translates the above location constraint - ===== - [source,XML] - ------- -@@ -447,24 +475,29 @@ three (again assuming that +multiplier+ is set to 1000). - ------- - ===== - --=== Resource Migration === -+=== Migrating Resources === -+ -+Normally, when the cluster needs to move a resource, it fully restarts -+the resource (i.e. stops the resource on the current node -+and starts it on the new node). - --Some resources, such as Xen virtual guests, are able to move to --another location without loss of state. We call this resource --migration; this is different from the normal practice of stopping the --resource on the first machine and starting it elsewhere. -+However, some types of resources, such as Xen virtual guests, are able to move to -+another location without loss of state (often referred to as live migration -+or hot migration). In pacemaker, this is called resource migration. -+Pacemaker can be configured to migrate a resource when moving it, -+rather than restarting it. - --Not all resources are able to migrate, see the Migration Checklist -+Not all resources are able to migrate; see the Migration Checklist - below, and those that can, won't do so in all situations. 
--Conceptually there are two requirements from which the other -+Conceptually, there are two requirements from which the other - prerequisites follow: - --* the resource must be active and healthy at the old location -+* The resource must be active and healthy at the old location; and - * everything required for the resource to run must be available on -- both the old and new locations -+ both the old and new locations. - --The cluster is able to accommodate both push and pull migration models --by requiring the resource agent to support two new actions: -+The cluster is able to accommodate both 'push' and 'pull' migration models -+by requiring the resource agent to support two special actions: - +migrate_to+ (performed on the current location) and +migrate_from+ - (performed on the destination). - -@@ -477,30 +510,34 @@ Conversely for pull, the +migrate_to+ action is practically empty and - +migrate_from+ does most of the work, extracting the relevant resource - state from the old location and activating it. - --There is no wrong or right way to implement migration for your --service, as long as it works. -- --==== Migration Checklist ==== -+There is no wrong or right way for a resource agent to implement migration, -+as long as it works. - -+.Migration Checklist - * The resource may not be a clone. - * The resource must use an OCF style agent. - * The resource must not be in a failed or degraded state. --* The resource must not, directly or indirectly, depend on any -- primitive or group resources. --* The resource must support two new actions: +migrate_to+ and -- +migrate_from+, and advertise them in its metadata. -+* The resource agent must support +migrate_to+ and -+ +migrate_from+ actions, and advertise them in its metadata. - * The resource must have the +allow-migrate+ meta-attribute set to - +true+ (which is not the default). - --//// --TODO: how can a KVM with DRBD migrate? 
--//// -+If an otherwise migratable resource depends on another resource -+via an ordering constraint, there are special situations in which it will be -+restarted rather than migrated. -+ -+For example, if the resource depends on a clone, and at the time the resource -+needs to be moved, the clone has instances that are stopping and instances -+that are starting, then the resource will be restarted. -+The Policy Engine is not yet able to model this -+situation correctly and so takes the safer (if less optimal) path. -+ -+In pacemaker 1.1.11 and earlier, a migratable resource will be restarted -+when moving if it directly or indirectly depends on 'any' primitive or group -+resources. - --If the resource depends on a clone, and at the time the resource needs --to be move, the clone has instances that are stopping and instances --that are starting, then the resource will be moved in the traditional --manner. The Policy Engine is not yet able to model this situation --correctly and so takes the safe (yet less optimal) path. -+Even in newer versions, if a migratable resource depends on a non-migratable -+resource, and both need to be moved, the migratable resource will be restarted. - - [[s-reusing-config-elements]] - == Reusing Rules, Options and Sets of Operations == -@@ -574,17 +611,17 @@ The same principle applies for +meta_attributes+ and - == Reloading Services After a Definition Change == - - The cluster automatically detects changes to the definition of --services it manages. However, the normal response is to stop the -+services it manages. The normal response is to stop the - service (using the old definition) and start it again (with the new - definition). This works well, but some services are smarter and can - be told to use a new set of options without restarting. - --To take advantage of this capability, your resource agent must: -+To take advantage of this capability, the resource agent must: - - . 
Accept the +reload+ operation and perform any required actions. -- _The steps required here depend completely on your application!_ -+ _The actions here depend completely on your application!_ - + --.The DRBD Agent's Control logic for Supporting the +reload+ Operation -+.The DRBD agent's logic for supporting +reload+ - ===== - [source,Bash] - ------- -@@ -657,13 +694,13 @@ Any parameter with the +unique+ set to 0 is eligible to be used in this way. - - Once these requirements are satisfied, the cluster will automatically - know to reload the resource (instead of restarting) when a non-unique --fields changes. -+field changes. - - [NOTE] - ====== --The metadata is re-read when the resource is started. This may mean --that the resource will be restarted the first time, even though you --changed a parameter with +unique=0+ -+Metadata will not be re-read unless the resource needs to be started. This may -+mean that the resource will be restarted the first time, even though you -+changed a parameter with +unique=0+. - ====== - - [NOTE] -diff --git a/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt b/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt -index 8262578..4060201 100644 ---- a/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt -+++ b/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt -@@ -8,21 +8,21 @@ indexterm:[Resources,Groups] - - One of the most common elements of a cluster is a set of resources - that need to be located together, start sequentially, and stop in the --reverse order. To simplify this configuration we support the concept -+reverse order. To simplify this configuration, we support the concept - of groups. - --.An example group -+.A group of two primitive resources - ====== - [source,XML] - ------- - - - -- -+ - - - -- -+ - ------- - ====== - -@@ -53,7 +53,7 @@ The group above is logically equivalent to writing: - - - -- -+ - - - -@@ -81,7 +81,7 @@ mount, an IP address, and an application that uses them. 
- |Description - - |id --|Your name for the group -+|A unique name for the group - indexterm:[id,Group Resource Property] - indexterm:[Resource,Group Property,id] - -@@ -89,27 +89,27 @@ mount, an IP address, and an application that uses them. - - === Group Options === - --Options inherited from <> resources: --+priority, target-role, is-managed+ -+Groups inherit the +priority+, +target-role+, and +is-managed+ properties -+from primitive resources. See <> for information about -+those properties. - - === Group Instance Attributes === - --Groups have no instance attributes, however any that are set here will --be inherited by the group's children. -+Groups have no instance attributes. However, any that are set for the group -+object will be inherited by the group's children. - - === Group Contents === - --Groups may only contain a collection of --<> cluster resources. To refer to --the child of a group resource, just use the child's id instead of the --group's. -+Groups may only contain a collection of cluster resources (see -+<>). To refer to a child of a group resource, just use -+the child's +id+ instead of the group's. - - === Group Constraints === - --Although it is possible to reference the group's children in --constraints, it is usually preferable to use the group's name instead. -+Although it is possible to reference a group's children in -+constraints, it is usually preferable to reference the group itself. - --.Example constraints involving groups -+.Some constraints involving groups - ====== - [source,XML] - ------- -@@ -136,32 +136,32 @@ current location with a score of 500. - indexterm:[Clone Resources] - indexterm:[Resources,Clones] - --Clones were initially conceived as a convenient way to start N --instances of an IP resource and have them distributed throughout the -+Clones were initially conceived as a convenient way to start multiple -+instances of an IP address resource and have them distributed throughout the - cluster for load balancing. 
They have turned out to quite useful for --a number of purposes including integrating with Red Hat's DLM, the --fencing subsystem, and OCFS2. -+a number of purposes including integrating with the Distributed Lock Manager -+(used by many cluster filesystems), the fencing subsystem, and OCFS2. - - You can clone any resource, provided the resource agent supports it. - - Three types of cloned resources exist: - - * Anonymous --* Globally Unique -+* Globally unique - * Stateful - --Anonymous clones are the simplest type. These resources behave -+'Anonymous' clones are the simplest. These behave - completely identically everywhere they are running. Because of this, --there can only be one copy of an anonymous clone active per machine. -+there can be only one copy of an anonymous clone active per machine. - --Globally unique clones are distinct entities. A copy of the clone -+'Globally unique' clones are distinct entities. A copy of the clone - running on one machine is not equivalent to another instance on --another node. Nor would any two copies on the same node be -+another node, nor would any two copies on the same node be - equivalent. - --Stateful clones are covered later in <>. -+'Stateful' clones are covered later in <>. - --.An example clone -+.A clone of an LSB resource - ====== - [source,XML] - ------- -@@ -184,7 +184,7 @@ Stateful clones are covered later in <>. - |Description - - |id --|Your name for the clone -+|A unique name for the clone - indexterm:[id,Clone Property] - indexterm:[Clone,Property,id] - -@@ -195,49 +195,54 @@ Stateful clones are covered later in <>. 
- Options inherited from <> resources: - +priority, target-role, is-managed+ - --.Clone specific configuration options --[width="95%",cols="3m,5<",options="header",align="center"] -+.Clone-specific configuration options -+[width="95%",cols="1m,1,3<",options="header",align="center"] - |========================================================= - - |Field -+|Default - |Description - - |clone-max --|How many copies of the resource to start. Defaults to the number of -- nodes in the cluster. -+|number of nodes in cluster -+|How many copies of the resource to start - indexterm:[clone-max,Clone Option] - indexterm:[Clone,Option,clone-max] - - |clone-node-max --|How many copies of the resource can be started on a single node; -- default _1_. -+|1 -+|How many copies of the resource can be started on a single node - indexterm:[clone-node-max,Clone Option] - indexterm:[Clone,Option,clone-node-max] - - |notify -+|true - |When stopping or starting a copy of the clone, tell all the other -- copies beforehand and when the action was successful. Allowed values: -- _false_, +true+ -+ copies beforehand and again when the action was successful. Allowed values: -+ +false+, +true+ - indexterm:[notify,Clone Option] - indexterm:[Clone,Option,notify] - - |globally-unique -+|false - |Does each copy of the clone perform a different function? Allowed -- values: _false_, +true+ -+ values: +false+, +true+ - indexterm:[globally-unique,Clone Option] - indexterm:[Clone,Option,globally-unique] - - |ordered -+|false - |Should the copies be started in series (instead of in -- parallel). Allowed values: _false_, +true+ -+ parallel)? Allowed values: +false+, +true+ - indexterm:[ordered,Clone Option] - indexterm:[Clone,Option,ordered] - - |interleave --|Changes the behavior of ordering constraints (between clones/masters) -- so that instances can start/stop as soon as their peer instance has -- (rather than waiting for every instance of the other clone -- has). 
Allowed values: _false_, +true+ -+|false -+|If this clone depends on another clone via an ordering constraint, -+is it allowed to start after the local instance of the other clone -+starts, rather than wait for all instances of the other clone to start? -+Allowed values: +false+, +true+ - indexterm:[interleave,Clone Option] - indexterm:[Clone,Option,interleave] - -@@ -250,7 +255,7 @@ will be inherited by the clone's children. - - === Clone Contents === - --Clones must contain exactly one group or one regular resource. -+Clones must contain exactly one primitive or group resource. - - [WARNING] - You should never reference the name of a clone's child. -@@ -261,26 +266,10 @@ If you think you need to do this, you probably need to re-evaluate your design. - In most cases, a clone will have a single copy on each active cluster - node. If this is not the case, you can indicate which nodes the - cluster should preferentially assign copies to with resource location --constraints. These constraints are written no differently to those --for regular resources except that the clone's id is used. -- --Ordering constraints behave slightly differently for clones. In the --example below, +apache-stats+ will wait until all copies of the clone --that need to be started have done so before being started itself. --Only if _no_ copies can be started +apache-stats+ will be prevented --from being active. Additionally, the clone will wait for --+apache-stats+ to be stopped before stopping the clone. -+constraints. These constraints are written no differently from those -+for primitive resources except that the clone's +id+ is used. - --Colocation of a regular (or group) resource with a clone means that --the resource can run on any machine with an active copy of the clone. --The cluster will choose a copy based on where the clone is running and --the resource's own location preferences. -- --Colocation between clones is also possible. 
In such cases, the set of --allowed locations for the clone is limited to nodes on which the clone --is (or will be) active. Allocation is then performed as normally. -- --.Example constraints involving clones -+.Some constraints involving clones - ====== - [source,XML] - ------- -@@ -292,6 +281,24 @@ is (or will be) active. Allocation is then performed as normally. - ------- - ====== - -+Ordering constraints behave slightly differently for clones. In the -+example above, +apache-stats+ will wait until all copies of +apache-clone+ -+that need to be started have done so before being started itself. -+Only if _no_ copies can be started will +apache-stats+ be prevented -+from being active. Additionally, the clone will wait for -++apache-stats+ to be stopped before stopping itself. -+ -+Colocation of a primitive or group resource with a clone means that -+the resource can run on any machine with an active copy of the clone. -+The cluster will choose a copy based on where the clone is running and -+the resource's own location preferences. -+ -+Colocation between clones is also possible. If one clone +A+ is colocated -+with another clone +B+, the set of allowed locations for +A+ is limited to -+nodes on which +B+ is (or will be) active. Placement is then performed -+normally. -+ -+[[s-clone-stickiness]] - === Clone Stickiness === - - indexterm:[resource-stickiness,Clones] -@@ -302,6 +309,16 @@ will use a value of 1. Being a small value, it causes minimal - disturbance to the score calculations of other resources but is enough - to prevent Pacemaker from needlessly moving copies around the cluster. - -+[NOTE] -+==== -+For globally unique clones, this may result in multiple instances of the -+clone staying on a single node, even after another eligible node becomes -+active (for example, after being put into standby mode then made active again). 
-+If you do not want this behavior, specify a +resource-stickiness+ of 0 -+for the clone temporarily and let the cluster adjust, then set it back -+to 1 if you want the default behavior to apply again. -+==== -+ - === Clone Resource Agent Requirements === - - Any resource can be used as an anonymous clone, as it requires no -@@ -310,27 +327,27 @@ do so depends on your resource and its resource agent. - - Globally unique clones do require some additional support in the - resource agent. In particular, it must only respond with --+${OCF_SUCCESS}+ if the node has that exact instance active. All -++$\{OCF_SUCCESS}+ if the node has that exact instance active. All - other probes for instances of the clone should result in --+${OCF_NOT_RUNNING}+. Unless of course they are failed, in which case --they should return one of the other OCF error codes. -++$\{OCF_NOT_RUNNING}+ (or one of the other OCF error codes if -+they are failed). - --Copies of a clone are identified by appending a colon and a numerical --offset, eg. +apache:2+. -+Individual instances of a clone are identified by appending a colon and a -+numerical offset, e.g. +apache:2+. - - Resource agents can find out how many copies there are by examining - the +OCF_RESKEY_CRM_meta_clone_max+ environment variable and which - copy it is by examining +OCF_RESKEY_CRM_meta_clone+. - --You should not make any assumptions (based on --+OCF_RESKEY_CRM_meta_clone+) about which copies are active. In -+The resource agent must not make any assumptions (based on -++OCF_RESKEY_CRM_meta_clone+) about which numerical instances are active. In - particular, the list of active copies will not always be an unbroken - sequence, nor always start at 0. - - ==== Clone Notifications ==== - - Supporting notifications requires the +notify+ action to be --implemented. Once supported, the notify action will be passed a -+implemented. 
If supported, the notify action will be passed a - number of extra variables which, when combined with additional - context, can be used to calculate the current state of the cluster and - what is about to happen to it. -@@ -397,13 +414,13 @@ what is about to happen to it. - The variables come in pairs, such as - +OCF_RESKEY_CRM_meta_notify_start_resource+ and - +OCF_RESKEY_CRM_meta_notify_start_uname+ and should be treated as an --array of whitespace separated elements. -+array of whitespace-separated elements. - - Thus in order to indicate that +clone:0+ will be started on +sles-1+, - +clone:2+ will be started on +sles-3+, and +clone:3+ will be started - on +sles-2+, the cluster would set - --.Example notification variables -+.Notification variables - ====== - [source,Bash] - ------- -@@ -453,12 +470,13 @@ OCF_RESKEY_CRM_meta_notify_start_uname="sles-1 sles-3 sles-2" - indexterm:[Multi-state Resources] - indexterm:[Resources,Multi-state] - --Multi-state resources are a specialization of Clone resources; please --ensure you understand the section on clones before continuing! They --allow the instances to be in one of two operating modes; these are --called +Master+ and +Slave+, but can mean whatever you wish them to --mean. The only limitation is that when an instance is started, it --must come up in the +Slave+ state. -+Multi-state resources are a specialization of clone resources; please -+ensure you understand <> before continuing! -+ -+Multi-state resources allow the instances to be in one of two operating modes -+(called 'roles'). The roles are called 'master' and 'slave', but can mean -+whatever you wish them to mean. The only limitation is that when an instance is -+started, it must come up in the slave role. 
- - === Multi-state Properties === - -@@ -485,22 +503,24 @@ Options inherited from <> resources: - +clone-max+, +clone-node-max+, +notify+, +globally-unique+, +ordered+, - +interleave+ - --.Multi-state specific resource configuration options --[width="95%",cols="3m,5<",options="header",align="center"] -+.Multi-state-specific resource configuration options -+[width="95%",cols="1m,1,3<",options="header",align="center"] - |========================================================= - - |Field -+|Default - |Description - - |master-max --|How many copies of the resource can be promoted to +master+ status; -- default 1. -+|1 -+|How many copies of the resource can be promoted to the +master+ role - indexterm:[master-max,Multi-State Option] - indexterm:[Multi-State,Option,master-max] - - |master-node-max --|How many copies of the resource can be promoted to +master+ status on -- a single node; default 1. -+|1 -+|How many copies of the resource can be promoted to the +master+ role on -+ a single node - indexterm:[master-node-max,Multi-State Option] - indexterm:[Multi-State,Option,master-node-max] - -@@ -509,11 +529,11 @@ Options inherited from <> resources: - === Multi-state Instance Attributes === - - Multi-state resources have no instance attributes; however, any that --are set here will be inherited by master's children. -+are set here will be inherited by a master's children. - - === Multi-state Contents === - --Masters must contain exactly one group or one regular resource. -+Masters must contain exactly one primitive or group resource. - - [WARNING] - You should never reference the name of a master's child. -@@ -521,21 +541,12 @@ If you think you need to do this, you probably need to re-evaluate your design. - - === Monitoring Multi-State Resources === - --The normal type of monitor actions are not sufficient to monitor a --multi-state resource in the +Master+ state. 
To detect failures of the --+Master+ instance, you need to define an additional monitor action --with +role="Master"+. -+The usual monitor actions are insufficient to monitor a multi-state resource, -+because pacemaker needs to verify not only that the resource is active, but -+also that its actual role matches its intended one. - --[IMPORTANT] --=========== --It is crucial that _every_ monitor operation has a different interval! -- --This is because Pacemaker currently differentiates between operations --only by resource and interval; so if eg. a master/slave resource has --the same monitor interval for both roles, Pacemaker would ignore the --role when checking the status - which would cause unexpected return --codes, and therefore unnecessary complications. --=========== -+Define two monitoring actions: the usual one will cover the slave role, -+and an additional one with +role="master"+ will cover the master role. - - .Monitoring both states of a multi-state resource - ====== -@@ -552,14 +563,23 @@ codes, and therefore unnecessary complications. - ------- - ====== - -+[IMPORTANT] -+=========== -+It is crucial that _every_ monitor operation has a different interval! -+Pacemaker currently differentiates between operations -+only by resource and interval; so if (for example) a master/slave resource had -+the same monitor interval for both roles, Pacemaker would ignore the -+role when checking the status -- which would cause unexpected return -+codes, and therefore unnecessary complications. -+=========== - - === Multi-state Constraints === - --In most cases, a multi-state resources will have a single copy on each -+In most cases, multi-state resources will have a single copy on each - active cluster node. If this is not the case, you can indicate which - nodes the cluster should preferentially assign copies to with resource --location constraints. These constraints are written no differently to --those for regular resources except that the master's id is used. 
-+location constraints. These constraints are written no differently from -+those for primitive resources except that the master's +id+ is used. - - When considering multi-state resources in constraints, for most - purposes it is sufficient to treat them as clones. The exception is -@@ -568,52 +588,50 @@ constraints) and +first-action+ and/or +then-action+ fields (for - ordering constraints) are used. - - .Additional constraint options relevant to multi-state resources --[width="95%",cols="3m,5<",options="header",align="center"] -+[width="95%",cols="1m,1,3<",options="header",align="center"] - |========================================================= - - |Field -+|Default - |Description - - |rsc-role -+|started - |An additional attribute of colocation constraints that specifies the -- role that +rsc+ must be in. Allowed values: _Started_, +Master+, -- +Slave+. -+ role that +rsc+ must be in. Allowed values: +started+, +master+, -+ +slave+. - indexterm:[rsc-role,Ordering Constraints] - indexterm:[Constraints,Ordering,rsc-role] - - |with-rsc-role -+|started - |An additional attribute of colocation constraints that specifies the -- role that +with-rsc+ must be in. Allowed values: _Started_, -- +Master+, +Slave+. -+ role that +with-rsc+ must be in. Allowed values: +started+, -+ +master+, +slave+. - indexterm:[with-rsc-role,Ordering Constraints] - indexterm:[Constraints,Ordering,with-rsc-role] - - |first-action -+|start - |An additional attribute of ordering constraints that specifies the - action that the +first+ resource must complete before executing the -- specified action for the +then+ resource. Allowed values: _start_, -+ specified action for the +then+ resource. Allowed values: +start+, - +stop+, +promote+, +demote+. 
- indexterm:[first-action,Ordering Constraints] - indexterm:[Constraints,Ordering,first-action] - - |then-action -+|value of +first-action+ - |An additional attribute of ordering constraints that specifies the - action that the +then+ resource can only execute after the - +first-action+ on the +first+ resource has completed. Allowed -- values: +start+, +stop+, +promote+, +demote+. Defaults to the value -- (specified or implied) of +first-action+. -+ values: +start+, +stop+, +promote+, +demote+. - indexterm:[then-action,Ordering Constraints] - indexterm:[Constraints,Ordering,then-action] - - |========================================================= - --In the example below, +myApp+ will wait until one of the database --copies has been started and promoted to master before being started --itself. Only if no copies can be promoted will +apache-stats+ be --prevented from being active. Additionally, the database will wait for --+myApp+ to be stopped before it is demoted. -- --.Example constraints involving multi-state resources -+.Constraints involving multi-state resources - ====== - [source,XML] - ------- -@@ -630,54 +648,60 @@ prevented from being active. Additionally, the database will wait for - ------- - ====== - --Colocation of a regular (or group) resource with a multi-state -+In the example above, +myApp+ will wait until one of the database -+copies has been started and promoted to master before being started -+itself on the same node. Only if no copies can be promoted will +myApp+ be -+prevented from being active. Additionally, the cluster will wait for -++myApp+ to be stopped before demoting the database. -+ -+Colocation of a primitive or group resource with a multi-state - resource means that it can run on any machine with an active copy of --the multi-state resource that is in the specified state (+Master+ or --+Slave+). 
In the example, the cluster will choose a location based on --where database is running as a +Master+, and if there are multiple --+Master+ instances it will also factor in +myApp+'s own location -+the multi-state resource that has the specified role (+master+ or -++slave+). In the example above, the cluster will choose a location based on -+where database is running as a +master+, and if there are multiple -++master+ instances it will also factor in +myApp+'s own location - preferences when deciding which location to choose. - - Colocation with regular clones and other multi-state resources is also - possible. In such cases, the set of allowed locations for the +rsc+ - clone is (after role filtering) limited to nodes on which the - +with-rsc+ multi-state resource is (or will be) in the specified role. --Allocation is then performed as-per-normal. -+Placement is then performed as normal. - --==== Using Multi-state Resources in Colocation/Ordering Sets ==== -+==== Using Multi-state Resources in Colocation Sets ==== - - .Additional colocation set options relevant to multi-state resources --[width="95%",cols="3m,5<",options="header",align="center"] -+[width="95%",cols="1m,1,6<",options="header",align="center"] - |========================================================= - - |Field -+|Default - |Description - - |role --|An additional attribute of colocation constraint sets that specifies the -- role that *all members of the set* must be in. Allowed values: _Started_, +Master+, -- +Slave+. -+|started -+|The role that 'all members' of the set must be in. Allowed values: +started+, +master+, -+ +slave+. - indexterm:[role,Ordering Constraints] - indexterm:[Constraints,Ordering,role] - - |========================================================= - - In the following example +B+'s master must be located on the same node as +A+'s master. --Additionally resources +C+ and +D+ must be located on the same node as +B+'s master. 
-+Additionally resources +C+ and +D+ must be located on the same node as +A+'s -+and +B+'s masters. - --.Colocate C and C with A and B's master instances -+.Colocate C and D with A's and B's master instances - ====== - [source,XML] - ------- - - -- -+ - - - -- -- -- -+ - - - -@@ -686,68 +710,68 @@ Additionally resources +C+ and +D+ must be located on the same node as +B+'s mas - ------- - ====== - -+==== Using Multi-state Resources in Ordering Sets ==== -+ - .Additional ordered set options relevant to multi-state resources --[width="95%",cols="3m,5<",options="header",align="center"] -+[width="95%",cols="1m,1,3<",options="header",align="center"] - |========================================================= - - |Field -+|Default - |Description - - |action --|An additional attribute of ordering constraint set that specifies the -- action that applies to *all members of the set*. Allowed -- values: +start+, +stop+, +promote+, +demote+. Defaults to the value -- (specified or implied) of +first-action+. -+|value of +first-action+ -+|An additional attribute of ordering constraint sets that specifies the -+ action that applies to 'all members' of the set. Allowed -+ values: +start+, +stop+, +promote+, +demote+. - indexterm:[action,Ordering Constraints] - indexterm:[Constraints,Ordering,action] - - |========================================================= - --In the following example +B+ cannot be promoted until +A+'s has been promoted. --Additionally resources +C+ and +D+ must wait until +A+ and +B+ have been promoted before they can start. -- --.Start C and C after first promoting A and B -+.Start C and D after first promoting A and B - ====== - [source,XML] - ------- - -- -- -+ -+ - - - -- -+ - - - -- -+ - - ------- - ====== - -+In the above example, +B+ cannot be promoted to a master role until +A+ has -+been promoted. Additionally, resources +C+ and +D+ must wait until +A+ and +B+ -+have been promoted before they can start. 
-+ - - === Multi-state Stickiness === - - indexterm:[resource-stickiness,Multi-State] --To achieve a stable allocation pattern, multi-state resources are --slightly sticky by default. If no value for +resource-stickiness+ is --provided, the multi-state resource will use a value of 1. Being a --small value, it causes minimal disturbance to the score calculations --of other resources but is enough to prevent Pacemaker from needlessly --moving copies around the cluster. -+As with regular clones, multi-state resources are -+slightly sticky by default. See <<s-clone-stickiness>> for details. - - === Which Resource Instance is Promoted === - --During the start operation, most Resource Agent scripts should call -+During the start operation, most resource agents should call - the `crm_master` utility. This tool automatically detects both the - resource and host and should be used to set a preference for being - promoted. Based on this, +master-max+, and +master-node-max+, the - instance(s) with the highest preference will be promoted. - --The other alternative is to create a location constraint that -+An alternative is to create a location constraint that - indicates which nodes are most preferred as masters. - --.Manually specifying which node should be promoted -+.Explicitly preferring node1 to be promoted to master - ====== - [source,XML] - ------- -@@ -759,27 +783,29 @@ indicates which nodes are most preferred as masters. - ------- - ====== - --=== Multi-state Resource Agent Requirements === -+=== Requirements for Multi-state Resource Agents === - - Since multi-state resources are an extension of cloned resources, all --the requirements of Clones are also requirements of multi-state --resources. Additionally, multi-state resources require two extra --actions: +demote+ and +promote+; these actions are responsible for -+the requirements for resource agents that support clones are also requirements -+for resource agents that support multi-state resources.
-+ -+Additionally, multi-state resources require two extra -+actions, +demote+ and +promote+, which are responsible for - changing the state of the resource. Like +start+ and +stop+, they --should return +OCF_SUCCESS+ if they completed successfully or a -+should return +$\{OCF_SUCCESS}+ if they completed successfully or a - relevant error code if they did not. - - The states can mean whatever you wish, but when the resource is --started, it must come up in the mode called +Slave+. From there the --cluster will then decide which instances to promote to +Master+. -+started, it must come up in the mode called +slave+. From there the -+cluster will decide which instances to promote to +master+. - --In addition to the Clone requirements for monitor actions, agents must -+In addition to the clone requirements for monitor actions, agents must - also _accurately_ report which state they are in. The cluster relies - on the agent to report its status (including role) accurately and does - not indicate to the agent what role it currently believes it to be in. - - .Role implications of OCF return codes --[width="95%",cols="5,3<",options="header",align="center"] -+[width="95%",cols="1,1<",options="header",align="center"] - |========================================================= - - |Monitor Return Code -@@ -806,15 +832,15 @@ not indicate to the agent what role it currently believes it to be in. - - |========================================================= - --=== Multi-state Notifications === -+==== Multi-state Notifications ==== - - Like clones, supporting notifications requires the +notify+ action to --be implemented. Once supported the notify action will be passed a -+be implemented. If supported, the notify action will be passed a - number of extra variables which, when combined with additional - context, can be used to calculate the current state of the cluster and - what is about to happen to it. 
- --.Environment variables supplied with Master notify actions footnote:[Emphasized variables are specific to +Master+ resources and all behave in the same manner as described for Clone resources.] -+.Environment variables supplied with multi-state notify actions footnote:[Emphasized variables are specific to +Master+ resources, and all behave in the same manner as described for Clone resources.] - [width="95%",cols="5,3<",options="header",align="center"] - |========================================================= - -@@ -913,7 +939,7 @@ what is about to happen to it. - - |========================================================= - --=== Multi-state - Proper Interpretation of Notification Environment Variables === -+==== Proper Interpretation of Multi-state Notification Environment Variables ==== - - - .Pre-notification (demote): -diff --git a/doc/Pacemaker_Explained/en-US/Ch-Basics.txt b/doc/Pacemaker_Explained/en-US/Ch-Basics.txt -index 6f73955..5134e69 100644 ---- a/doc/Pacemaker_Explained/en-US/Ch-Basics.txt -+++ b/doc/Pacemaker_Explained/en-US/Ch-Basics.txt -@@ -2,56 +2,67 @@ - - == Configuration Layout == - --The cluster is written using XML notation and divided into two main --sections: configuration and status. -- --The status section contains the history of each resource on each node --and based on this data, the cluster can construct the complete current --state of the cluster. The authoritative source for the status section --is the local resource manager (lrmd) process on each cluster node and --the cluster will occasionally repopulate the entire section. For this --reason it is never written to disk and administrators are advised --against modifying it in any way. -- --The configuration section contains the more traditional information --like cluster options, lists of resources and indications of where they --should be placed. The configuration section is the primary focus of --this document. 
-- --The configuration section itself is divided into four parts: -- -- * Configuration options (called +crm_config+) -- * Nodes -- * Resources -- * Resource relationships (called +constraints+) -+The cluster is defined by the Cluster Information Base (CIB), -+which uses XML notation. The simplest CIB, an empty one, looks like this: - - .An empty configuration - ====== - [source,XML] - ------- -- -- -- -- -- -- -- -- -- -+ -+ -+ -+ -+ -+ -+ -+ -+ - ------- - ====== - -+The empty configuration above contains the major sections that make up a CIB: -+ -+* +cib+: The entire CIB is enclosed with a +cib+ tag. Certain fundamental settings -+ are defined as attributes of this tag. -+ -+ ** +configuration+: This section -- the primary focus of this document -- -+ contains traditional configuration information such as what resources the -+ cluster serves and the relationships among them. -+ -+ *** +crm_config+: cluster-wide configuration options -+ *** +nodes+: the machines that host the cluster -+ *** +resources+: the services run by the cluster -+ *** +constraints+: indications of how resources should be placed -+ -+ ** +status+: This section contains the history of each resource on each node. -+ Based on this data, the cluster can construct the complete current -+ state of the cluster. The authoritative source for this section -+ is the local resource manager (lrmd process) on each cluster node, and -+ the cluster will occasionally repopulate the entire section. For this -+ reason, it is never written to disk, and administrators are advised -+ against modifying it in any way. -+ -+In this document, configuration settings will be described as 'properties' or 'options' -+based on how they are defined in the CIB: -+ -+* Properties are XML attributes of an XML element. -+* Options are name-value pairs expressed as +nvpair+ child elements of an XML element. 
-+ -+Normally you will use command-line tools that abstract the XML, so the -+distinction will be unimportant; both properties and options are -+cluster settings you can tweak. -+ - == The Current State of the Cluster == - - Before one starts to configure a cluster, it is worth explaining how - to view the finished product. For this purpose we have created the --`crm_mon` utility that will display the -+`crm_mon` utility, which will display the - current state of an active cluster. It can show the cluster status by - node or by resource and can be used in either single-shot or - dynamically-updating mode. There are also modes for displaying a list - of the operations performed (grouped by node and resource) as well as - information about failures. -- - - Using this tool, you can examine the state of the cluster for - irregularities and see how it responds when you cause or simulate -@@ -113,7 +124,7 @@ Details on all the available options can be obtained using the - ====== - - The DC (Designated Controller) node is where all the decisions are --made and if the current DC fails a new one is elected from the -+made, and if the current DC fails a new one is elected from the - remaining cluster nodes. The choice of DC is of no significance to an - administrator beyond the fact that its logs will generally be more - interesting. -@@ -122,27 +133,30 @@ interesting. - - There are three basic rules for updating the cluster configuration: - -- * Rule 1 - Never edit the cib.xml file manually. Ever. I'm not making this up. -+ * Rule 1 - Never edit the +cib.xml+ file manually. Ever. I'm not making this up. - * Rule 2 - Read Rule 1 again. - * Rule 3 - The cluster will notice if you ignored rules 1 & 2 and refuse to use the configuration. - --Now that it is clear how NOT to update the configuration, we can begin --to explain how you should. -+Now that it is clear how 'not' to update the configuration, we can begin -+to explain how you 'should'. 
-+ -+=== Editing the CIB Using XML === - - The most powerful tool for modifying the configuration is the --+cibadmin+ command which talks to a running cluster. With +cibadmin+, --the user can query, add, remove, update or replace any part of the --configuration; all changes take effect immediately, so there is no --need to perform a reload-like operation. -- -++cibadmin+ command. With +cibadmin+, you can query, add, remove, update -+or replace any part of the configuration. All changes take effect immediately, -+so there is no need to perform a reload-like operation. - --The simplest way of using cibadmin is to use it to save the current -+The simplest way of using `cibadmin` is to use it to save the current - configuration to a temporary file, edit that file with your favorite --text or XML editor and then upload the revised configuration. -+text or XML editor, and then upload the revised configuration. footnote:[This -+process might appear to risk overwriting changes that happen after the initial -+cibadmin call, but pacemaker will reject any update that is "too old". If the -+CIB is updated in some other fashion after the initial cibadmin, the second -+cibadmin will be rejected because the version number will be too low.] - - .Safely using an editor to modify the cluster configuration - ====== --[source,C] - -------- - # cibadmin --query > tmp.xml - # vi tmp.xml -@@ -152,16 +166,16 @@ text or XML editor and then upload the revised configuration. - - Some of the better XML editors can make use of a Relax NG schema to - help make sure any changes you make are valid. The schema describing --the configuration can normally be found in --'/usr/lib/heartbeat/pacemaker.rng' on most systems. -- -+the configuration can be found in +pacemaker.rng+, which may be -+deployed in a location such as +/usr/share/pacemaker+ or -++/usr/lib/heartbeat+ depending on your operating system and how you -+installed the software. 
- --If you only wanted to modify the resources section, you could instead --do -+If you want to modify just one section of the configuration, you can -+query and replace just that section to avoid modifying any others. - --.Safely using an editor to modify a subsection of the cluster configuration -+.Safely using an editor to modify only the resources section - ====== --[source,C] - -------- - # cibadmin --query --obj_type resources > tmp.xml - # vi tmp.xml -@@ -169,18 +183,15 @@ do - -------- - ====== - --to avoid modifying any other part of the configuration. -+=== Quickly Deleting Part of the Configuration === - --== Quickly Deleting Part of the Configuration == -- --Identify the object you wish to delete. Eg. run -+Identify the object you wish to delete by XML tag and id. For example, -+you might search the CIB for all STONITH-related configuration: - --.Searching for STONITH related configuration items -+.Searching for STONITH-related configuration items - ====== --[source,C] -+---- - # cibadmin -Q | grep stonith --[source,XML] ---------- - - - -@@ -191,34 +202,48 @@ Identify the object you wish to delete. Eg. run - - - ---------- -+---- - ====== - --Next identify the resource's tag name and id (in this case we'll --choose +primitive+ and +child_DoFencing+). Then simply execute: -+If you wanted to delete the +primitive+ tag with id +child_DoFencing+, -+you would run: - --[source,C] -+---- - # cibadmin --delete --crm_xml '' -+---- - --== Updating the Configuration Without Using XML == -+=== Updating the Configuration Without Using XML === - --Some common tasks can also be performed with one of the higher level --tools that avoid the need to read or edit XML. -+Most tasks can be performed with one of the other command-line -+tools provided with pacemaker, avoiding the need to read or edit XML. 
- --To enable stonith for example, one could run: -+To enable STONITH for example, one could run: - --[source,C] --# crm_attribute --attr-name stonith-enabled --attr-value true -+---- -+# crm_attribute --name stonith-enabled --update 1 -+---- - --Or, to see if +somenode+ is allowed to run resources, there is: -+Or, to check whether *somenode* is allowed to run resources, there is: - --[source,C] --# crm_standby --get-value --node-uname somenode -+---- -+# crm_standby --get-value --node somenode -+---- - --Or, to find the current location of +my-test-rsc+, one can use: -+Or, to find the current location of *my-test-rsc*, one can use: - --[source,C] -+---- - # crm_resource --locate --resource my-test-rsc -+---- -+ -+Examples of using these tools for specific cases will be given throughout this -+document where appropriate. -+ -+[NOTE] -+==== -+Old versions of pacemaker (1.0.3 and earlier) had different -+command-line tool syntax. If you are using an older version, -+check your installed manual pages for the proper syntax to use. -+==== - - [[s-config-sandboxes]] - == Making Configuration Changes in a Sandbox == -@@ -229,46 +254,46 @@ have created `crm_shadow` which creates a - "shadow" copy of the configuration and arranges for all the command - line tools to use it. - --To begin, simply invoke `crm_shadow` and give --it the name of a configuration to create footnote:[Shadow copies are --identified with a name, making it possible to have more than one.] ; --be sure to follow the simple on-screen instructions. -+To begin, simply invoke `crm_shadow --create` with -+the name of a configuration to create footnote:[Shadow copies are -+identified with a name, making it possible to have more than one.], -+and follow the simple on-screen instructions. - --WARNING: Read the above carefully, failure to do so could result in you --destroying the cluster's active configuration! 
-+[WARNING] -+==== -+Read this section and the on-screen instructions carefully; failure to do so could -+result in destroying the cluster's active configuration! -+==== - - - .Creating and displaying the active sandbox - ====== --[source,Bash] ---------- -- # crm_shadow --create test -- Setting up shadow instance -- Type Ctrl-D to exit the crm_shadow shell -- shadow[test]: -- shadow[test] # crm_shadow --which -- test ---------- -+---- -+# crm_shadow --create test -+Setting up shadow instance -+Type Ctrl-D to exit the crm_shadow shell -+shadow[test]: -+shadow[test] # crm_shadow --which -+test -+---- - ====== - - From this point on, all cluster commands will automatically use the - shadow copy instead of talking to the cluster's active configuration. --Once you have finished experimenting, you can either commit the --changes, or discard them as shown below. Again, be sure to follow the --on-screen instructions carefully. -+Once you have finished experimenting, you can either make the -+changes active via the `--commit` option, or discard them using the `--delete` -+option. Again, be sure to follow the on-screen instructions carefully! - -- - For a full list of `crm_shadow` options and --commands, invoke it with the --help option. -+commands, invoke it with the `--help` option. - --.Using a sandbox to make multiple changes atomically -+.Using a sandbox to make multiple changes atomically, discard them and verify the real configuration is untouched - ====== --[source,Bash] ---------- -+---- - shadow[test] # crm_failcount -G -r rsc_c001n01 - name=fail-count-rsc_c001n01 value=0 -- shadow[test] # crm_standby -v on -n c001n02 -- shadow[test] # crm_standby -G -n c001n02 -+ shadow[test] # crm_standby -v on -N c001n02 -+ shadow[test] # crm_standby -G -N c001n02 - name=c001n02 scope=nodes value=on - shadow[test] # cibadmin --erase --force - shadow[test] # cibadmin --query -@@ -286,7 +311,7 @@ commands, invoke it with the --help option. 
- Now type Ctrl-D to exit the crm_shadow shell - shadow[test] # exit - # crm_shadow --which -- No shadow instance provided -+ No active shadow configuration defined - # cibadmin -Q - -@@ -295,38 +320,41 @@ commands, invoke it with the --help option. - - - ---------- -+---- - ====== - --Making changes in a sandbox and verifying the real configuration is untouched -- - [[s-config-testing-changes]] - == Testing Your Configuration Changes == - - We saw previously how to make a series of changes to a "shadow" copy - of the configuration. Before loading the changes back into the --cluster (eg. `crm_shadow --commit mytest --force`), it is often --advisable to simulate the effect of the changes with +crm_simulate+, --eg. -+cluster (e.g. `crm_shadow --commit mytest --force`), it is often -+advisable to simulate the effect of the changes with +crm_simulate+. -+For example: - --[source,C] -+---- - # crm_simulate --live-check -VVVVV --save-graph tmp.graph --save-dotfile tmp.dot -+---- - -- --The tool uses the same library as the live cluster to show what it --would have done given the supplied input. It's output, in addition to -+This tool uses the same library as the live cluster to show what it -+would have done given the supplied input. Its output, in addition to - a significant amount of logging, is stored in two files +tmp.graph+ --and +tmp.dot+, both are representations of the same thing -- the -+and +tmp.dot+. Both files are representations of the same thing: the - cluster's response to your changes. - --In the graph file is stored the complete transition, containing a list --of all the actions, their parameters and their pre-requisites. --Because the transition graph is not terribly easy to read, the tool --also generates a Graphviz dot-file representing the same information. -+The graph file stores the complete transition from the existing cluster state -+to your desired new state, containing a list of all the actions, their -+parameters and their pre-requisites. 
Because the transition graph is not -+terribly easy to read, the tool also generates a Graphviz -+footnote:[Graph visualization software. See http://www.graphviz.org/ for details.] -+dot-file representing the same information. - --== Interpreting the Graphviz output == -+For information on the options supported by `crm_simulate`, use -+its `--help` option. -+ -+.Interpreting the Graphviz output - * Arrows indicate ordering dependencies -- * Dashed-arrows indicate dependencies that are not present in the transition graph -+ * Dashed arrows indicate dependencies that are not present in the transition graph - * Actions with a dashed border of any color do not form part of the transition graph - * Actions with a green border form part of the transition graph - * Actions with a red border are ones the cluster would like to execute but cannot run -@@ -341,24 +369,21 @@ also generates a Graphviz dot-file representing the same information. - - image::images/Policy-Engine-small.png["An example transition graph as represented by Graphviz",width="16cm",height="6cm",align="center"] - --In the above example, it appears that a new node, +pcmk-2+, has come --online and that the cluster is checking to make sure +rsc1+, +rsc2+ --and +rsc3+ are not already running there (Indicated by the --+*_monitor_0+ entries). Once it did that, and assuming the resources --were not active there, it would have liked to stop +rsc1+ and +rsc2+ --on +pcmk-1+ and move them to +pcmk-2+. However, there appears to be -+In the above example, it appears that a new node, *pcmk-2*, has come -+online and that the cluster is checking to make sure *rsc1*, *rsc2* -+and *rsc3* are not already running there (Indicated by the -+*rscN_monitor_0* entries). Once it did that, and assuming the resources -+were not active there, it would have liked to stop *rsc1* and *rsc2* -+on *pcmk-1* and move them to *pcmk-2*. 
However, there appears to be - some problem and the cluster cannot or is not permitted to perform the - stop actions which implies it also cannot perform the start actions. --For some reason the cluster does not want to start +rsc3+ anywhere. -- --For information on the options supported by `crm_simulate`, use --the `--help` option. -+For some reason the cluster does not want to start *rsc3* anywhere. - - === Complex Cluster Transition === - - image::images/Policy-Engine-big.png["Another, slightly more complex, transition graph that you're not expected to be able to read",width="16cm",height="20cm",align="center"] - --== Do I Need to Update the Configuration on all Cluster Nodes? == -+== Do I Need to Update the Configuration on All Cluster Nodes? == - - No. Any changes are immediately synchronized to the other active - members of the cluster. -diff --git a/doc/Pacemaker_Explained/en-US/Ch-Constraints.txt b/doc/Pacemaker_Explained/en-US/Ch-Constraints.txt -index 8498ce0..cd722ff 100644 ---- a/doc/Pacemaker_Explained/en-US/Ch-Constraints.txt -+++ b/doc/Pacemaker_Explained/en-US/Ch-Constraints.txt -@@ -1,4 +1,4 @@ --= Resource Constraints = -+= Resource Constraints = - - indexterm:[Resource,Constraints] - -@@ -9,19 +9,19 @@ Practically everything from moving a resource to deciding which - resource to stop in a degraded cluster is achieved by manipulating - scores in some way. - --Scores are calculated on a per-resource basis and any node with a -+Scores are calculated on a per-resource basis, and any node with a - negative score for a resource can't run that resource. After - calculating the scores for a resource, the cluster then chooses the - node with the highest one. - - === Infinity Math === - --+INFINITY+ is currently defined as 1,000,000 and addition/subtraction --with it follows these three basic rules: -+Pacemaker implements +INFINITY+ internally as a score of 1,000,000. 
-+Addition/subtraction with it follows these three basic rules: - - * Any value + +INFINITY+ = +INFINITY+ --* Any value - +INFINITY+ = -+INFINITY+ --* +INFINITY+ - +INFINITY+ = -+INFINITY+ -+* Any value - +INFINITY+ = +-INFINITY+ -+* +INFINITY+ - +INFINITY+ = +-INFINITY+ - - == Deciding Which Nodes a Resource Can Run On == - -@@ -30,43 +30,76 @@ indexterm:[Resource,Constraints,Location] - There are two alternative strategies for specifying which nodes a - resources can run on. One way is to say that by default they can run - anywhere and then create location constraints for nodes that are not --allowed. The other option is to have nodes "opt-in"... to start with -+allowed. The other option is to have nodes "opt-in" -- to start with - nothing able to run anywhere and selectively enable allowed nodes. -- --=== Options === - --.Options for Simple Location Constraints --[width="95%",cols="2m,5<",options="header",align="center"] -+Whether you should choose opt-in or opt-out depends on your -+personal preference and the make-up of your cluster. If most of your -+resources can run on most of the nodes, then an opt-out arrangement is -+likely to result in a simpler configuration. On the other hand, if -+most resources can only run on a small subset of nodes, an opt-in -+configuration might be simpler. -+ -+=== Location Properties === -+ -+.Properties for Simple Location Constraints -+[width="95%",cols="2m,1,5>). -+==== -+ -+=== Ordering Properties === -+ - .Properties of an Ordering Constraint --[width="95%",cols="2m,5<",options="header",align="center"] -+[width="95%",cols="1m,1,4 -- -- -+ -+ - - ------- - ====== - --Some additional information on ordering constraints can be found in --the document http://clusterlabs.org/doc/Ordering_Explained.pdf[Ordering Explained].
-+Because the above example lets +symmetrical+ default to TRUE, -++Webserver+ must be stopped before +Database+ can be stopped, -+and +Webserver+ should be stopped before +IP+ -+if they both need to be stopped. - - [[s-resource-colocation]] - == Placing Resources Relative to other Resources == -@@ -255,10 +289,11 @@ indexterm:[Resource,Location Relative to other Resources] - When the location of one resource depends on the location of another - one, we call this colocation. - --There is an important side-effect of creating a colocation constraint --between two resources: it affects the order in which resources are --assigned to a node. If you think about it, it's somewhat obvious. --You can't place A relative to B unless you know where B is. -+Colocation has an important side-effect: it affects the order in which -+resources are assigned to a node. -+footnote:['Not' the order in which they are started. For that, see -+<>.] -+Think about it: You can't place A relative to B unless you know where B is. - footnote:[ - While the human brain is sophisticated enough to read the constraint - in any order and choose the correct one depending on the situation, -@@ -266,19 +301,18 @@ the cluster is not quite so smart. Yet. - ] - - So when you are creating colocation constraints, it is important to --consider whether you should colocate A with B or B with A. -+consider whether you should colocate A with B, or B with A. - --Another thing to keep in mind is that, assuming A is collocated with --B, the cluster will also take into account A's preferences when -+Another thing to keep in mind is that, assuming A is colocated with -+B, the cluster will take into account A's preferences when - deciding which node to choose for B. - --For a detailed look at exactly how this occurs, see the --http://www.clusterlabs.org/mediawiki/images/6/61/Colocation_Explained.pdf[Colocation --Explained] document. 
-+For a detailed look at exactly how this occurs, see -+http://clusterlabs.org/doc/Colocation_Explained.pdf[Colocation Explained]. - --=== Options === -+=== Colocation Properties === - --.Properties of a Collocation Constraint -+.Properties of a Colocation Constraint - [width="95%",cols="2m,5<",options="header",align="center"] - |========================================================= - -@@ -291,22 +325,20 @@ indexterm:[id,Colocation Constraints] - indexterm:[Constraints,Colocation,id] - - |rsc --|The colocation source. If the constraint cannot be satisfied, the -- cluster may decide not to allow the resource to run at all. -+|The name of a resource that should be located relative to +with-rsc+. - indexterm:[rsc,Colocation Constraints] - indexterm:[Constraints,Colocation,rsc] - - |with-rsc --|The colocation target. The cluster will decide where to put this -- resource first and then decide where to put the resource in the +rsc+ -- field. -+|The name of the resource used as the colocation target. The cluster will -+decide where to put this resource first and then decide where to put +rsc+. - indexterm:[with-rsc,Colocation Constraints] - indexterm:[Constraints,Colocation,with-rsc] - - |score --|Positive values indicate the resource should run on the same -- node. Negative values indicate the resources should not run on the -- same node. Values of \+/- +INFINITY+ change "should" to "must". -+|Positive values indicate the resources should run on the same -+ node. Negative values indicate the resources should run on -+ different nodes. Values of \+/- +INFINITY+ change "should" to "must". - indexterm:[score,Colocation Constraints] - indexterm:[Constraints,Colocation,score] - -@@ -314,52 +346,61 @@ indexterm:[Constraints,Colocation,rsc] - - === Mandatory Placement === - --Mandatory placement occurs any time the constraint's score is -+Mandatory placement occurs when the constraint's score is - ++INFINITY+ or +-INFINITY+. 
In such cases, if the constraint can't be - satisfied, then the +rsc+ resource is not permitted to run. For - +score=INFINITY+, this includes cases where the +with-rsc+ resource is - not active. - --If you need +resource1+ to always run on the same machine as --+resource2+, you would add the following constraint: -+If you need resource +A+ to always run on the same machine as -+resource +B+, you would add the following constraint: - --.An example colocation constraint -+.Mandatory colocation constraint for two resources -+==== - [source,XML] -- -+ -+==== -+ -+Remember, because +INFINITY+ was used, if +B+ can't run on any -+of the cluster nodes (for whatever reason) then +A+ will not -+be allowed to run. Whether +A+ is running or not has no effect on +B+. - --Remember, because +INFINITY+ was used, if +resource2+ can't run on any --of the cluster nodes (for whatever reason) then +resource1+ will not --be allowed to run. -+Alternatively, you may want the opposite -- that +A+ 'cannot' -+run on the same machine as +B+. In this case, use -++score="-INFINITY"+. - --Alternatively, you may want the opposite... that +resource1+ cannot --run on the same machine as +resource2+. In this case use --+score="-INFINITY"+ -- --.An example anti-colocation constraint -+.Mandatory anti-colocation constraint for two resources -+==== - [source,XML] -- -+ -+==== - --Again, by specifying +-INFINTY+, the constraint is binding. So if the --only place left to run is where +resource2+ already is, then --+resource1+ may not run anywhere. -+Again, by specifying +-INFINITY+, the constraint is binding. So if the -+only place left to run is where +B+ already is, then -++A+ may not run anywhere. -+ -+As with +INFINITY+, +B+ can run even if +A+ is stopped. -+However, in this case +A+ also can run if +B+ is stopped, because it still -+meets the constraint of +A+ and +B+ not running on the same node. 
- - === Advisory Placement === - - If mandatory placement is about "must" and "must not", then advisory - placement is the "I'd prefer if" alternative. For constraints with - scores greater than +-INFINITY+ and less than +INFINITY+, the cluster --will try and accommodate your wishes but may ignore them if the -+will try to accommodate your wishes but may ignore them if the - alternative is to stop some of the cluster resources. -- - --Like in life, where if enough people prefer something it effectively -+As in life, where if enough people prefer something it effectively - becomes mandatory, advisory colocation constraints can combine with - other elements of the configuration to behave as if they were - mandatory. - --.An example advisory-only colocation constraint -+.Advisory colocation constraint for two resources -+==== - [source,XML] -- -+ -+==== - - [[s-resource-sets-ordering]] - == Ordering Sets of Resources == -@@ -381,7 +422,7 @@ ordered resources, such as: - - .Visual representation of the four resources' start order for the above constraints - image::images/resource-set.png["Ordered set",width="16cm",height="2.5cm",align="center"] -- -+ - === Ordered Set === - - To simplify this situation, there is an alternate format for ordering -@@ -412,10 +453,10 @@ In some tools +create set A B+ is *NOT* equivalent to +create A then B+. - - While the set-based format is not less verbose, it is significantly - easier to get right and maintain. It can also be expanded to allow --ordered sets of (un)ordered resources. In the example below, +rscA+ --and +rscB+ can both start in parallel, as can +rscC+ and +rscD+, --however +rscC+ and +rscD+ can only start once _both_ +rscA+ _and_ -- +rscB+ are active. -+ordered sets of (un)ordered resources. In the example below, +A+ -+and +B+ can both start in parallel, as can +C+ and +D+, -+however +C+ and +D+ can only start once _both_ +A+ _and_ -+ +B+ are active. 
- - .Ordered sets of unordered resources - ====== -@@ -435,7 +476,7 @@ however +rscC+ and +rscD+ can only start once _both_ +rscA+ _and_ - - ------- - ====== -- -+ - .Visual representation of the start order for two ordered sets of unordered resources - image::images/two-sets.png["Two ordered sets",width="13cm",height="7.5cm",align="center"] - -@@ -474,21 +515,21 @@ image::images/three-sets.png["Three ordered sets",width="16cm",height="7.5cm",al - - The unordered set logic discussed so far has all been "AND" logic. - To illustrate this take the 3 resource set figure in the previous section. --Those sets can be expressed, +(A and B) then (C) then (D) then (E and F)+ -+Those sets can be expressed, +(A and B) then \(C) then (D) then (E and F)+. - --Say for example we want change the first set, (A and B), to use "OR" logic --so the sets look like this, +(A or B) then (C) then (D) then (E and F)+. -+Say for example we want to change the first set, +(A and B)+, to use "OR" logic -+so the sets look like this: +(A or B) then \(C) then (D) then (E and F)+. - This functionality can be achieved through the use of the +require-all+ --option. By default this option is 'require-all=true' which is why the --"AND" logic is used by default. Changing +require-all=false+ means only one -+option. This option defaults to TRUE which is why the -+"AND" logic is used by default. Setting +require-all=false+ means only one - resource in the set needs to be started before continuing on to the next set. - --Note that the 'require-all=false' option only makes sense to use in conjunction --with unordered sets, 'sequential=false'. Think of it like this, 'sequential=false' -+Note that the +require-all=false+ option only makes sense to use in conjunction -+with unordered sets, +sequential=false+. 
Think of it like this, +sequential=false+ - modifies the set to be an unordered set that uses "AND" logic by default, by adding --'require-all=false' the unordered set's "AND" logic is flipped to "OR" logic. -++require-all=false+ the unordered set's "AND" logic is flipped to "OR" logic. - --.Resource Set "OR" logic. Three ordered sets, where the first set is internally unordered with "OR" logic. -+.Resource Set "OR" logic: Three ordered sets, where the first set is internally unordered with "OR" logic - ====== - [source,XML] - ------- -@@ -513,41 +554,45 @@ modifies the set to be an unordered set that uses "AND" logic by default, by add - - - [[s-resource-sets-colocation]] --== Collocating Sets of Resources == -+== Colocating Sets of Resources == - - Another common situation is for an administrator to create a set of --collocated resources. Previously this was possible either by defining --a resource group (See <>) which could not always --accurately express the design; or by defining each relationship as an --individual constraint, causing a constraint explosion as the number of --resources and combinations grew. -+colocated resources. -+ -+One way to do this would be to define a resource group (see -+<>), but that cannot always accurately express the desired -+state. -+ -+Another way would be to define each relationship as an individual constraint, -+but that causes a constraint explosion as the number of resources and -+combinations grow. An example of this approach: - --.A chain of collocated resources -+.Chain of colocated resources - ====== - [source,XML] - ------- - -- -- -- -+ -+ -+ - - ------- - ====== - - To make things easier, we allow an alternate form of colocation --constraints using +resource_sets+. Just like the expanded version, a --resource that can't be active also prevents any resource that must be --collocated with it from being active. For example, if +B+ was not --able to run, then both +C+ (and by inference +D+) must also remain --stopped. 
-+constraints using +resource_set+. As with the chained version, a -+resource that can't be active prevents any resource that must be -+colocated with it from being active. For example, if +C+ is not -+able to run, then both +B+ and by inference +A+ must also remain -+stopped. Here is an example +resource_set+: - --.The equivalent colocation chain expressed using +resource_sets+ -+.Equivalent colocation chain expressed using +resource_set+ - ====== - [source,XML] - ------- - - -- -+ - - - -@@ -561,38 +606,27 @@ stopped. - [WARNING] - ========= - Always pay attention to how your tools expose this functionality. --In some tools +create set A B+ is *NOT* equivalent to +create A with B+. -+In some tools +create set A B+ is 'not' equivalent to +create A with B+. - ========= - --.A group resource with the equivalent colocation rules --[source,XML] --------- -- -- -- -- -- -- --------- -- --This notation can also be used in this context to tell the cluster -+This notation can also be used to tell the cluster - that a set of resources must all be located with a common peer, but --have no dependencies on each other. In this scenario, unlike the --previous, +B would+ be allowed to remain active even if +A or+ +C+ (or -+have no dependencies on each other. In this scenario, unlike the -+previous, +B+ 'would' be allowed to remain active even if +A+ or +C+ (or - both) were inactive. - --.Using colocation sets to specify a common peer. -+.Using colocation sets to specify a common peer - ====== - [source,XML] - ------- - - -- -+ - - - - -- -+ - - - -@@ -600,30 +634,30 @@ both) were inactive. - ------- - ====== - --Of course there is no limit to the number and size of the sets used. --The only thing that matters is that in order for any member of set N --to be active, all the members of set N+1 must also be active (and --naturally on the same node); and if a set has +sequential="true"+, --then in order for member M to be active, member M+1 must also be --active. 
You can even specify the role in which the members of a set --must be in using the set's role attribute. -- --.A colocation chain where the members of the middle set have no inter-dependencies and the last has master status. -+There is no inherent limit to the number and size of the sets used. -+The only thing that matters is that in order for any member of one set -+in the constraint to be active, all members of sets listed after it must also -+be active (and naturally on the same node); and if a set has +sequential="true"+, -+then in order for one member of that set to be active, all members listed after it -+must also be active. You can even specify the role that the members of a set -+must be in using the set's +role+ attribute. -+ -+.A colocation chain where the members of the middle set have no interdependencies and the last has master status. - ====== - [source,XML] - ------- - - -- -+ - - - -- -+ - - - - -- -+ - - - -@@ -631,6 +665,6 @@ must be in using the set's role attribute. - - ------- - ====== -- -+ - .Visual representation of a colocation chain where the members of the middle set have no inter-dependencies - image::images/three-sets-complex.png["Colocation chain",width="16cm",height="9cm",align="center"] -diff --git a/doc/Pacemaker_Explained/en-US/Ch-Intro.txt b/doc/Pacemaker_Explained/en-US/Ch-Intro.txt -index fd05c81..e610651 100644 ---- a/doc/Pacemaker_Explained/en-US/Ch-Intro.txt -+++ b/doc/Pacemaker_Explained/en-US/Ch-Intro.txt -@@ -15,150 +15,9 @@ footnote:[I hope, however, that the concepts explained here make the functionali - Additionally, this document is NOT a step-by-step how-to guide for - configuring a specific clustering scenario. - --Although such guides exist, the purpose of this document is to provide --an understanding of the building blocks that can be used to construct --any type of Pacemaker cluster. -+Although such guides exist, -+footnote:[For example, see the http://www.clusterlabs.org/doc/[Clusters from Scratch] guide.]
-+the purpose of this document is to provide an understanding of the building -+blocks that can be used to construct any type of Pacemaker cluster. - --== What Is Pacemaker? == -- --Pacemaker is a cluster resource manager. -- --It achieves maximum availability for your cluster services --(aka. resources) by detecting and recovering from node and --resource-level failures by making use of the messaging and membership --capabilities provided by your preferred cluster infrastructure (either --http://www.corosync.org/[Corosync] or --http://linux-ha.org/wiki/Heartbeat[Heartbeat]). -- --Pacemaker's key features include: -- -- * Detection and recovery of node and service-level failures -- * Storage agnostic, no requirement for shared storage -- * Resource agnostic, anything that can be scripted can be clustered -- * Supports STONITH for ensuring data integrity -- * Supports large and small clusters -- * Supports both quorate and resource driven clusters -- * Supports practically any redundancy configuration -- * Automatically replicated configuration that can be updated from any node -- * Ability to specify cluster-wide service ordering, colocation and anti-colocation -- * Support for advanced service types -- ** Clones: for services which need to be active on multiple nodes -- ** Multi-state: for services with multiple modes (eg. master/slave, primary/secondary) -- * Unified, scriptable, cluster management tools. -- --== Pacemaker Architecture == -- --At the highest level, the cluster is made up of three pieces: -- -- * Non-cluster aware components. These pieces -- include the resources themselves, scripts that start, stop and -- monitor them, and also a local daemon that masks the differences -- between the different standards these scripts implement. -- -- * Resource management. Pacemaker provides the brain that processes -- and reacts to events regarding the cluster. 
These events include -- nodes joining or leaving the cluster; resource events caused by -- failures, maintenance, scheduled activities; and other -- administrative actions. Pacemaker will compute the ideal state of -- the cluster and plot a path to achieve it after any of these -- events. This may include moving resources, stopping nodes and even -- forcing them offline with remote power switches. -- -- * Low level infrastructure. Projects like Corosync, CMAN and -- Heartbeat provide reliable messaging, membership and quorum -- information about the cluster. -- --When combined with Corosync, Pacemaker also supports popular open --source cluster filesystems. --footnote:[ --Even though Pacemaker also supports Heartbeat, the filesystems need to --use the stack for messaging and membership and Corosync seems to be --what they're standardizing on. -- --Technically it would be possible for them to support Heartbeat as --well, however there seems little interest in this. --] -- --Due to past standardization within the cluster filesystem community, --they make use of a common distributed lock manager which makes use of --Corosync for its messaging and membership capabilities (which nodes --are up/down) and Pacemaker for fencing services. -- --.The Pacemaker Stack --image::images/pcmk-stack.png["The Pacemaker stack",width="10cm",height="7.5cm",align="center"] -- --=== Internal Components === -- --Pacemaker itself is composed of five key components: -- -- * CIB (aka. Cluster Information Base) -- * CRMd (aka. Cluster Resource Management daemon) -- * LRMd (aka. Local Resource Management daemon) -- * PEngine (aka. PE or Policy Engine) -- * STONITHd -- --.Internal Components --image::images/pcmk-internals.png["Subsystems of a Pacemaker cluster",align="center",scaledwidth="65%"] -- --The CIB uses XML to represent both the cluster's configuration and --current state of all resources in the cluster. 
The contents of the CIB --are automatically kept in sync across the entire cluster and are used --by the PEngine to compute the ideal state of the cluster and how it --should be achieved. -- --This list of instructions is then fed to the DC (Designated --Controller). Pacemaker centralizes all cluster decision making by --electing one of the CRMd instances to act as a master. Should the --elected CRMd process, or the node it is on, fail... a new one is --quickly established. -- --The DC carries out the PEngine's instructions in the required order by --passing them to either the LRMd (Local Resource Management daemon) or --CRMd peers on other nodes via the cluster messaging infrastructure --(which in turn passes them on to their LRMd process). -- --The peer nodes all report the results of their operations back to the --DC and, based on the expected and actual results, will either execute --any actions that needed to wait for the previous one to complete, or --abort processing and ask the PEngine to recalculate the ideal cluster --state based on the unexpected results. -- --In some cases, it may be necessary to power off nodes in order to --protect shared data or complete resource recovery. For this Pacemaker --comes with STONITHd. -- --STONITH is an acronym for Shoot-The-Other-Node-In-The-Head and is --usually implemented with a remote power switch. -- --In Pacemaker, STONITH devices are modeled as resources (and configured --in the CIB) to enable them to be easily monitored for failure, however --STONITHd takes care of understanding the STONITH topology such that --its clients simply request a node be fenced and it does the rest. -- --== Types of Pacemaker Clusters == -- --Pacemaker makes no assumptions about your environment, this allows it --to support practically any --http://en.wikipedia.org/wiki/High-availability_cluster#Node_configurations[redundancy --configuration] including Active/Active, Active/Passive, N+1, N+M, --N-to-1 and N-to-N. 
-- --.Active/Passive Redundancy --image::images/pcmk-active-passive.png["Active/Passive Redundancy",width="10cm",height="7.5cm",align="center"] -- --Two-node Active/Passive clusters using Pacemaker and DRBD are a --cost-effective solution for many High Availability situations. -- --.Shared Failover --image::images/pcmk-shared-failover.png["Shared Failover",width="10cm",height="7.5cm",align="center"] -- --By supporting many nodes, Pacemaker can dramatically reduce hardware --costs by allowing several active/passive clusters to be combined and --share a common backup node -- --.N to N Redundancy --image::images/pcmk-active-active.png["N to N Redundancy",width="10cm",height="7.5cm",align="center"] -- --When shared storage is available, every node can potentially be used --for failover. Pacemaker can even run multiple copies of services to --spread out the workload. -+include::../../shared/en-US/pacemaker-intro.txt[] -diff --git a/doc/Pacemaker_Explained/en-US/Ch-Multi-site-Clusters.txt b/doc/Pacemaker_Explained/en-US/Ch-Multi-site-Clusters.txt -index efd2f7a..5ca6cd0 100644 ---- a/doc/Pacemaker_Explained/en-US/Ch-Multi-site-Clusters.txt -+++ b/doc/Pacemaker_Explained/en-US/Ch-Multi-site-Clusters.txt -@@ -1,68 +1,61 @@ - = Multi-Site Clusters and Tickets = - --[[Multisite]] --== Abstract == - Apart from local clusters, Pacemaker also supports multi-site clusters. --That means you can have multiple, geographically dispersed sites with a --local cluster each. Failover between these clusters can be coordinated --by a higher level entity, the so-called `CTR (Cluster Ticket Registry)`. -- -+That means you can have multiple, geographically dispersed sites, each with a -+local cluster. Failover between these clusters can be coordinated -+manually by the administrator, or automatically by a higher-level entity called -+a 'Cluster Ticket Registry (CTR)'. 
- - == Challenges for Multi-Site Clusters == - - Typically, multi-site environments are too far apart to support --synchronous communication between the sites and synchronous data --replication. That leads to the following challenges: -+synchronous communication and data replication between the sites. -+That leads to significant challenges: - --- How to make sure that a cluster site is up and running? -+- How do we make sure that a cluster site is up and running? - --- How to make sure that resources are only started once? -+- How do we make sure that resources are only started once? - --- How to make sure that quorum can be reached between the different --sites and a split brain scenario can be avoided? -+- How do we make sure that quorum can be reached between the different -+sites and a split-brain scenario avoided? - --- How to manage failover between the sites? -+- How do we manage failover between sites? - --- How to deal with high latency in case of resources that need to be -+- How do we deal with high latency in case of resources that need to be - stopped? - - In the following sections, learn how to meet these challenges. - -- - == Conceptual Overview == - - Multi-site clusters can be considered as “overlay” clusters where - each cluster site corresponds to a cluster node in a traditional cluster. --The overlay cluster can be managed by a `CTR (Cluster Ticket Registry)` --mechanism. It guarantees that the cluster resources will be highly --available across different cluster sites. This is achieved by using --so-called `tickets` that are treated as failover domain between cluster -+The overlay cluster can be managed by a CTR in order to -+guarantee that any cluster resource will be active -+on no more than one cluster site. This is achieved by using -+'tickets' that are treated as failover domain between cluster - sites, in case a site should be down. 
- --The following list explains the individual components and mechanisms -+The following sections explain the individual components and mechanisms - that were introduced for multi-site clusters in more detail. - -+=== Ticket === - --=== Components and Concepts === -- --==== Ticket ==== -- --"Tickets" are, essentially, cluster-wide attributes. A ticket grants the -+Tickets are, essentially, cluster-wide attributes. A ticket grants the - right to run certain resources on a specific cluster site. Resources can --be bound to a certain ticket by `rsc_ticket` dependencies. Only if the --ticket is available at a site, the respective resources are started. -+be bound to a certain ticket by +rsc_ticket+ constraints. Only if the -+ticket is available at a site can the respective resources be started there. - Vice versa, if the ticket is revoked, the resources depending on that --ticket need to be stopped. -- --The ticket thus is similar to a 'site quorum'; i.e., the permission to --manage/own resources associated with that site. -+ticket must be stopped. - --(One can also think of the current `have-quorum` flag as a special, cluster-wide --ticket that is granted in case of node majority.) -+The ticket thus is similar to a 'site quorum', i.e. the permission to -+manage/own resources associated with that site. (One can also think of the -+current +have-quorum+ flag as a special, cluster-wide ticket that is granted in -+case of node majority.) - --These tickets can be granted/revoked either manually by administrators --(which could be the default for the classic enterprise clusters), or via --an automated `CTR` mechanism described further below. -+Tickets can be granted and revoked either manually by administrators -+(which could be the default for classic enterprise clusters), or via -+the automated CTR mechanism described below. - - A ticket can only be owned by one site at a time. Initially, none - of the sites has a ticket. 
Each ticket must be granted once by the cluster -@@ -70,58 +63,60 @@ administrator. - - The presence or absence of tickets for a site is stored in the CIB as a - cluster status. With regards to a certain ticket, there are only two states --for a site: `true` (the site has the ticket) or `false` (the site does -+for a site: +true+ (the site has the ticket) or +false+ (the site does - not have the ticket). The absence of a certain ticket (during the initial --state of the multi-site cluster) is also reflected by the value `false`. -+state of the multi-site cluster) is the same as the value +false+. - -+=== Dead Man Dependency === - --==== Dead Man Dependency ==== -- --A site can only activate the resources safely if it can be sure that the -+A site can only activate resources safely if it can be sure that the - other site has deactivated them. However after a ticket is revoked, it can - take a long time until all resources depending on that ticket are stopped - "cleanly", especially in case of cascaded resources. To cut that process --short, the concept of a `Dead Man Dependency` was introduced: -+short, the concept of a 'Dead Man Dependency' was introduced. - --- If the ticket is revoked from a site, the nodes that are hosting --dependent resources are fenced. This considerably speeds up the recovery --process of the cluster and makes sure that resources can be migrated more --quickly. -+If a dead man dependency is in force and a ticket is revoked from a site, the -+nodes that are hosting dependent resources are fenced. This considerably speeds -+up the recovery process of the cluster and makes sure that resources can be -+migrated more quickly. - --This can be configured by specifying a `loss-policy="fence"` in --`rsc_ticket` constraints. -+This can be configured by specifying a +loss-policy="fence"+ in -++rsc_ticket+ constraints.
- -+=== Cluster Ticket Registry === - --==== CTR (Cluster Ticket Registry) ==== -+A CTR is a coordinated group of network daemons that automatically handles -+granting, revoking, and timing out tickets (instead of the administrator -+revoking the ticket somewhere, waiting for everything to stop, and then -+granting it on the desired site). - --This is for those scenarios where the tickets management is supposed to --be automatic (instead of the administrator revoking the ticket somewhere, --waiting for everything to stop, and then granting it on the desired site). -+Pacemaker does not implement its own CTR, but interoperates with external -+software designed for that purpose (similar to how resource and fencing agents -+are not directly part of pacemaker). - --A `CTR` is a network daemon that handles granting, --revoking, and timing out "tickets". The participating clusters would run --the daemons that would connect to each other, exchange information on --their connectivity details, and vote on which site gets which ticket(s). -+Participating clusters run the CTR daemons, which connect to each other, exchange -+information about their connectivity, and vote on which site gets which -+tickets. - --A ticket would only be granted to a site once they can be sure that it --has been relinquished by the previous owner, which would need to be --implemented via a timer in most scenarios. If a site loses connection --to its peers, its tickets time out and recovery occurs. After the --connection timeout plus the recovery timeout has passed, the other sites --are allowed to re-acquire the ticket and start the resources again. -+A ticket is granted to a site only once the CTR is sure that the ticket -+has been relinquished by the previous owner, implemented via a timer in most -+scenarios. If a site loses connection to its peers, its tickets time out and -+recovery occurs.
After the connection timeout plus the recovery timeout has -+passed, the other sites are allowed to re-acquire the ticket and start the -+resources again. - - This can also be thought of as a "quorum server", except that it is not - a single quorum ticket, but several. - -+=== Configuration Replication === - --==== Configuration Replication ==== -- --As usual, the CIB is synchronized within each cluster, but it is not synchronized -+As usual, the CIB is synchronized within each cluster, but it is 'not' synchronized - across cluster sites of a multi-site cluster. You have to configure the resources - that will be highly available across the multi-site cluster for every site - accordingly. - - -+[[s-ticket-constraints]] - == Configuring Ticket Dependencies == - - The `rsc_ticket` constraint lets you specify the resources depending on a certain -@@ -130,67 +125,72 @@ what should happen to the respective resources if the ticket is revoked. - - The attribute `loss-policy` can have the following values: - --fence:: Fence the nodes that are running the relevant resources. -- --stop:: Stop the relevant resources. -+* +fence:+ Fence the nodes that are running the relevant resources. - --freeze:: Do nothing to the relevant resources. -+* +stop:+ Stop the relevant resources. - --demote:: Demote relevant resources that are running in master mode to slave mode. -+* +freeze:+ Do nothing to the relevant resources. - -+* +demote:+ Demote relevant resources that are running in master mode to slave mode. - --An example to configure a `rsc_ticket` constraint: - -+.Constraint that fences node if +ticketA+ is revoked -+==== - [source,XML] - ------- - - ------- -+==== - --This creates a constraint with the ID `rsc1-req-ticketA`. It defines that the --resource `rsc1` depends on `ticketA` and that the node running the resource should --be fenced in case `ticketA` is revoked. -+The example above creates a constraint with the ID +rsc1-req-ticketA+. 
It -+defines that the resource +rsc1+ depends on +ticketA+ and that the node running -+the resource should be fenced if +ticketA+ is revoked. - --If resource `rsc1` was a multi-state resource that can run in master or --slave mode, you may want to configure that only `rsc1's` master mode --depends on `ticketA`. With the following configuration, `rsc1` will be --demoted to slave mode if `ticketA` is revoked: -+If resource +rsc1+ were a multi-state resource (i.e. it could run in master or -+slave mode), you might want to configure that only master mode -+depends on +ticketA+. With the following configuration, +rsc1+ will be -+demoted to slave mode if +ticketA+ is revoked: - -+.Constraint that demotes +rsc1+ if +ticketA+ is revoked -+==== - [source,XML] - ------- - - ------- -+==== - --You can create more `rsc_ticket` constraints to let multiple resources --depend on the same ticket. -- --`rsc_ticket` also supports resource sets. So one can easily list all the --resources in one `rsc_ticket` constraint. For example: -+You can create multiple `rsc_ticket` constraints to let multiple resources -+depend on the same ticket. However, `rsc_ticket` also supports resource sets, -+so one can easily list all the resources in one `rsc_ticket` constraint instead. - -+.Ticket constraint for multiple resources -+==== - [source,XML] - ------- -- -- -- -- -- -- -- -- -- -- -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ - ------- -+==== - --In the example, there are two resource sets for listing the resources with --different `roles` in one `rsc_ticket` constraint. There's no dependency --between the two resource sets. And there's no dependency among the -+In the example above, there are two resource sets, so we can list resources -+with different roles in a single +rsc_ticket+ constraint. There's no dependency -+between the two resource sets, and there's no dependency among the - resources within a resource set. Each of the resources just depends on --`ticketA`. -++ticketA+. 
- --Referencing resource templates in `rsc_ticket` constraints, and even -+Referencing resource templates in +rsc_ticket+ constraints, and even - referencing them within resource sets, is also supported. - - If you want other resources to depend on further tickets, create as many --constraints as necessary with `rsc_ticket`. -+constraints as necessary with +rsc_ticket+. - - - == Managing Multi-Site Clusters == -@@ -198,43 +198,59 @@ constraints as necessary with `rsc_ticket`. - === Granting and Revoking Tickets Manually === - - You can grant tickets to sites or revoke them from sites manually. --Though if you want to re-distribute a ticket, you should wait for --the dependent resources to cleanly stop at the previous site before you --grant the ticket to another desired site. -+If you want to re-distribute a ticket, you should wait for -+the dependent resources to stop cleanly at the previous site before you -+grant the ticket to the new site. - - Use the `crm_ticket` command line tool to grant and revoke tickets. - - To grant a ticket to this site: --[source,C] - ------- - # crm_ticket --ticket ticketA --grant - ------- - - To revoke a ticket from this site: --[source,C] - ------- - # crm_ticket --ticket ticketA --revoke - ------- - - [IMPORTANT] - ==== --If you are managing tickets manually. Use the `crm_ticket` command with --great care as they cannot help verify if the same ticket is already -+If you are managing tickets manually, use the `crm_ticket` command with -+great care, because it cannot check whether the same ticket is already - granted elsewhere. -- - ==== - - - === Granting and Revoking Tickets via a Cluster Ticket Registry === - --==== Booth ==== --Booth is an implementation of `Cluster Ticket Registry` or so-called --`Cluster Ticket Manager`. -+We will use https://github.com/ClusterLabs/booth[Booth] here as an example of -+software that can be used with pacemaker as a Cluster Ticket Registry. 
Booth -+implements the -+http://en.wikipedia.org/wiki/Raft_%28computer_science%29[Raft] -+algorithm to guarantee the distributed consensus among different -+cluster sites, and manages the ticket distribution (and thus the failover -+process between sites). - --Booth is the instance managing the ticket distribution and thus, --the failover process between the sites of a multi-site cluster. Each of --the participating clusters and arbitrators runs a service, the boothd. --It connects to the booth daemons running at the other sites and -+Each of the participating clusters and 'arbitrators' runs the Booth daemon -+`boothd`. -+ -+An 'arbitrator' is the multi-site equivalent of a quorum-only node in a local -+cluster. If you have a setup with an even number of sites, -+you need an additional instance to reach consensus about decisions such -+as failover of resources across sites. In this case, add one or more -+arbitrators running at additional sites. Arbitrators are single machines -+that run a booth instance in a special mode. An arbitrator is especially -+important for a two-site scenario; otherwise there is no way for one site -+to distinguish between a network failure between it and the other site, and -+a failure of the other site. -+ -+The most common multi-site scenario is probably a multi-site cluster with two -+sites and a single arbitrator on a third site. However, technically, there are -+no limitations with regards to the number of sites and the number of -+arbitrators involved. -+ -+`Boothd` at each site connects to its peers running at the other sites and - exchanges connectivity details. Once a ticket is granted to a site, the - booth mechanism will manage the ticket automatically: If the site which - holds the ticket is out of service, the booth daemons will vote which -@@ -249,113 +265,68 @@ resources before will be treated according to the `loss-policy` you set - within the `rsc_ticket` constraint.
- - Before the booth can manage a certain ticket within the multi-site cluster, --you initially need to grant it to a site manually via `booth client` command. --After you have initially granted a ticket to a site, the booth mechanism -+you initially need to grant it to a site manually via the `booth` command-line -+tool. After you have initially granted a ticket to a site, `boothd` - will take over and manage the ticket automatically. - - [IMPORTANT] - ==== --The `booth client` command line tool can be used to grant, list, or --revoke tickets. The `booth client` commands work on any machine where --the booth daemon is running. -- --If you are managing tickets via `Booth`, only use `booth client` for manual --intervention instead of `crm_ticket`. That can make sure the same ticket -+The `booth` command-line tool can be used to grant, list, or -+revoke tickets and can be run on any machine where `boothd` is running. -+If you are managing tickets via Booth, use only `booth` for manual -+intervention, not `crm_ticket`. That ensures the same ticket - will only be owned by one cluster site at a time. - ==== - --Booth includes an implementation of --http://en.wikipedia.org/wiki/Paxos_algorithm['Paxos'] and 'Paxos Lease' --algorithm, which guarantees the distributed consensus among different --cluster sites. -- --[NOTE] --==== --`Arbitrator` -- --Each site runs one booth instance that is responsible for communicating --with the other sites. If you have a setup with an even number of sites, --you need an additional instance to reach consensus about decisions such --as failover of resources across sites. In this case, add one or more --arbitrators running at additional sites. Arbitrators are single machines --that run a booth instance in a special mode. As all booth instances --communicate with each other, arbitrators help to make more reliable --decisions about granting or revoking tickets. 
-- --An arbitrator is especially important for a two-site scenario: For example, --if site `A` can no longer communicate with site `B`, there are two possible --causes for that: -- --- `A` network failure between `A` and `B`. -- --- Site `B` is down. -- --However, if site `C` (the arbitrator) can still communicate with site `B`, --site `B` must still be up and running. -- --==== -- --===== Requirements ===== -- --- All clusters that will be part of the multi-site cluster must be based on Pacemaker. -- --- Booth must be installed on all cluster nodes and on all arbitrators that will --be part of the multi-site cluster. -+==== Booth Requirements ==== - --The most common scenario is probably a multi-site cluster with two sites and a --single arbitrator on a third site. However, technically, there are no limitations --with regards to the number of sites and the number of arbitrators involved. -+* All clusters that will be part of the multi-site cluster must be based on -+ Pacemaker. - --Nodes belonging to the same cluster site should be synchronized via NTP. However, --time synchronization is not required between the individual cluster sites. -+* Booth must be installed on all cluster nodes and on all arbitrators that will -+ be part of the multi-site cluster. - -+* Nodes belonging to the same cluster site should be synchronized via NTP. However, -+ time synchronization is not required between the individual cluster sites. 
- - === General Management of Tickets === - - Display the information of tickets: --[source,C] - ------- - # crm_ticket --info - ------- - - Or you can monitor them with: --[source,C] - ------- - # crm_mon --tickets - ------- - --Display the rsc_ticket constraints that apply to a ticket: --[source,C] -+Display the +rsc_ticket+ constraints that apply to a ticket: - ------- - # crm_ticket --ticket ticketA --constraints - ------- - --When you want to do maintenance or manual switch-over of a ticket, the --ticket could be revoked from the site for any reason, which would --trigger the loss-policies. If `loss-policy="fence"`, the dependent --resources could not be gracefully stopped/demoted, and even, other --unrelated resources could be impacted. -+When you want to do maintenance or manual switch-over of a ticket, -+revoking the ticket would trigger the loss policies. If -++loss-policy="fence"+, the dependent resources could not be gracefully -+stopped/demoted, and other unrelated resources could even be affected. - --The proper way is making the ticket `standby` first with: --[source,C] -+The proper way is making the ticket 'standby' first with: - ------- - # crm_ticket --ticket ticketA --standby - ------- - - Then the dependent resources will be stopped or demoted gracefully without --triggering the loss-policies. -+triggering the loss policies. 
- - If you have finished the maintenance and want to activate the ticket again, - you can run: --[source,C] - ------- - # crm_ticket --ticket ticketA --activate - ------- - - == For more information == - --`Multi-site Clusters` --http://doc.opensuse.org/products/draft/SLE-HA/SLE-ha-guide_sd_draft/cha.ha.geo.html -+* https://www.suse.com/documentation/sle-ha-geo-12/art_ha_geo_quick/data/art_ha_geo_quick.html[SUSE's Geo Clustering quick start] - --`Booth` --https://github.com/ClusterLabs/booth -+* https://github.com/ClusterLabs/booth[Booth] -diff --git a/doc/Pacemaker_Explained/en-US/Ch-Nodes.txt b/doc/Pacemaker_Explained/en-US/Ch-Nodes.txt -index 16bf13d..c7f1005 100644 ---- a/doc/Pacemaker_Explained/en-US/Ch-Nodes.txt -+++ b/doc/Pacemaker_Explained/en-US/Ch-Nodes.txt -@@ -27,28 +27,31 @@ to read an existing UUID or define a value before the cluster starts. - - Traditionally, Pacemaker required nodes to be referred to by the value - returned by `uname -n`. This can be problematic for services that --require the `uname -n` to be a specific value (ie. for a licence -+require the `uname -n` to be a specific value (e.g. for a licence - file). - --Since version 2.0.0 of Pacemaker, this requirement has been relaxed --for clusters using Corosync 2.0 or later. The name Pacemaker uses is: -+This requirement has been relaxed for clusters using Corosync 2.0 or later. -+The name Pacemaker uses is: - --. The value stored in 'corosync.conf' under +ring0_addr+ in the +nodelist+, if it does not contain an IP address; otherwise --. The value stored in 'corosync.conf' under +name+ in the +nodelist+; otherwise -+. The value stored in +corosync.conf+ under *ring0_addr* in the *nodelist*, if it does not contain an IP address; otherwise -+. The value stored in +corosync.conf+ under *name* in the *nodelist*; otherwise - . The value of `uname -n` - - Pacemaker provides the `crm_node -n` command which displays the name - used by a running cluster. 
- --If a Corosync nodelist is used, `crm_node --name-for-id $number` is also -+If a Corosync *nodelist* is used, `crm_node --name-for-id` pass:[number] is also - available to display the name used by the node with the corosync --+nodeid+ of '$number', for example: `crm_node --name-for-id 2`. -+*nodeid* of pass:[number], for example: `crm_node --name-for-id 2`. - - [[s-node-attributes]] --== Describing a Cluster Node == -+== Node Attributes == - - indexterm:[Node,attribute] --Beyond the basic definition of a node the administrator can also -+'Node attributes' are a special type of option (name-value pair) that -+applies to a node object. -+ -+Beyond the basic definition of a node, the administrator can - describe the node's attributes, such as how much RAM, disk, what OS or - kernel version it has, perhaps even its physical location. This - information can then be used by the cluster when deciding where to -@@ -60,47 +63,51 @@ when the cluster is running, using `crm_attribute`. - - Below is what the node's definition would look like if the admin ran the command: - --.The result of using crm_attribute to specify which kernel pcmk-1 is running -+.Result of using crm_attribute to specify which kernel pcmk-1 is running - ====== --[source,C] - ------- --# crm_attribute --type nodes --node-uname pcmk-1 --attr-name kernel --attr-value `uname -r` -+# crm_attribute --type nodes --node pcmk-1 --name kernel --update $(uname -r) - ------- - [source,XML] - ------- - - -- -+ - - - ------- - ====== --A simpler way to determine the current value of an attribute is to use `crm_attribute` command again: -+Rather than having to read the XML, a simpler way to determine the current -+value of an attribute is to use `crm_attribute` again: - --[source,C] --# crm_attribute --type nodes --node-uname pcmk-1 --attr-name kernel --get-value -+---- -+# crm_attribute --type nodes --node pcmk-1 --name kernel --query -+scope=nodes name=kernel value=3.10.0-123.13.2.el7.x86_64 -+---- - - By 
specifying `--type nodes` the admin tells the cluster that this - attribute is persistent. There are also transient attributes which - are kept in the status section which are "forgotten" whenever the node - rejoins the cluster. The cluster uses this area to store a record of --how many times a resource has failed on that node but administrators -+how many times a resource has failed on that node, but administrators - can also read and write to this section by specifying `--type status`. - --== Corosync == -+== Managing Nodes in a Corosync-Based Cluster == - - === Adding a New Corosync Node === - - indexterm:[Corosync,Add Cluster Node] - indexterm:[Add Cluster Node,Corosync] - --Adding a new node is as simple as installing Corosync and Pacemaker, --and copying '/etc/corosync/corosync.conf' and '/etc/corosync/authkey' (if --it exists) from an existing node. You may need to modify the --+mcastaddr+ option to match the new node's IP address. -+To add a new node: - --If a log message containing "Invalid digest" appears from Corosync, --the keys are not consistent between the machines. -+. Install Corosync and Pacemaker on the new host. -+. Copy +/etc/corosync/corosync.conf+ and +/etc/corosync/authkey+ (if it exists) -+ from an existing node. You may need to modify the *mcastaddr* option to match -+ the new node's IP address. -+. Start the cluster software on the new host. If a log message containing -+ "Invalid digest" appears from Corosync, the keys are not consistent between -+ the machines. - - === Removing a Corosync Node === - -@@ -108,26 +115,24 @@ indexterm:[Corosync,Remove Cluster Node] - indexterm:[Remove Cluster Node,Corosync] - - Because the messaging and membership layers are the authoritative --source for cluster nodes, deleting them from the CIB is not a reliable --solution. First one must arrange for corosync to forget about the --node (_pcmk-1_ in the example below). -- --On the host to be removed: -- --. 
Stop the cluster: `/etc/init.d/corosync stop` -- --Next, from one of the remaining active cluster nodes: -- --. Tell Pacemaker to forget about the removed host: -+source for cluster nodes, deleting them from the CIB is not a complete -+solution. First, one must arrange for corosync to forget about the -+node (*pcmk-1* in the example below). -+ -+. Stop the cluster on the host to be removed. How to do this will vary with -+ your operating system and installed versions of cluster software, for example, -+ `pcs cluster stop` if you are using pcs for cluster management, or -+ `service corosync stop` on a host using corosync 1.x with the pacemaker plugin. -+. From one of the remaining active cluster nodes, tell Pacemaker to forget -+ about the removed host, which will also delete the node from the CIB: - + --[source,C] -+---- - # crm_node -R pcmk-1 --+ --This includes deleting the node from the CIB -+---- - - [NOTE] - ====== --This proceedure only works for versions after 1.1.8 -+This procedure only works for pacemaker 1.1.8 and later. - ====== - - === Replacing a Corosync Node === -@@ -135,18 +140,14 @@ This proceedure only works for versions after 1.1.8 - indexterm:[Corosync,Replace Cluster Node] - indexterm:[Replace Cluster Node,Corosync] - --The five-step guide to replacing an existing cluster node: -- --. Make sure the old node is completely stopped --. Give the new machine the same hostname and IP address as the old one --. Install the cluster software :-) --. Copy '/etc/corosync/corosync.conf' and '/etc/corosync/authkey' (if it exists) to the new node --. Start the new cluster node -+To replace an existing cluster node: - --If a log message containing "Invalid digest" appears from Corosync, --the keys are not consistent between the machines. -+. Make sure the old node is completely stopped. -+. Give the new machine the same hostname and IP address as the old one. -+. Follow the procedure above for adding a node. 
- --== CMAN == -+//// -+== Managing Nodes in a CMAN-based Cluster == - - === Adding a New CMAN Node === - -@@ -157,20 +158,25 @@ indexterm:[Add Cluster Node,CMAN] - - indexterm:[CMAN,Remove Cluster Node] - indexterm:[Remove Cluster Node,CMAN] -+//// - --== Heartbeat == -+== Managing Nodes in a Heartbeat-based Cluster == - - === Adding a New Heartbeat Node === - - indexterm:[Heartbeat,Add Cluster Node] - indexterm:[Add Cluster Node,Heartbeat] - --Provided you specified +autojoin any+ in 'ha.cf', adding a new node is --as simple as installing heartbeat and copying 'ha.cf' and 'authkeys' --from an existing node. -+To add a new node: - --If you don't want to use +autojoin+, then after setting up 'ha.cf' and --'authkeys', you must use `hb_addnode` before starting the new node. -+. Install heartbeat and pacemaker on the new host. -+. Copy +ha.cf+ and +authkeys+ from an existing node. -+. If you do not use *autojoin any* in +ha.cf+, run: -++ -+---- -+hb_addnode $(uname -n) -+---- -+. Start the cluster software on the new node. - - === Removing a Heartbeat Node === - -@@ -178,43 +184,42 @@ indexterm:[Heartbeat,Remove Cluster Node] - indexterm:[Remove Cluster Node,Heartbeat] - - Because the messaging and membership layers are the authoritative --source for cluster nodes, deleting them from the CIB is not a reliable --solution. -- --First one must arrange for Heartbeat to forget about the node (pcmk-1 -+source for cluster nodes, deleting them from the CIB is not a complete -+solution. First, one must arrange for Heartbeat to forget about the node (pcmk-1 - in the example below). - --On the host to be removed: -- --. Stop the cluster: `/etc/init.d/corosync stop` -- --Next, from one of the remaining active cluster nodes: -- --. Tell Heartbeat the node should be removed -- --[source,C] --# hb_delnode pcmk-1 -- -+. On the host to be removed, stop the cluster: -++ -+---- -+service heartbeat stop -+---- -+. 
From one of the remaining active cluster nodes, tell Heartbeat the node -+should be removed: -++ -+---- -+hb_delnode pcmk-1 -+---- - . Tell Pacemaker to forget about the removed host: -- --[source,C] --# crm_node -R pcmk-1 -++ -+---- -+crm_node -R pcmk-1 -+---- - - [NOTE] - ====== --This proceedure only works for versions after 1.1.8 -+This procedure only works for pacemaker versions after 1.1.8. - ====== - - === Replacing a Heartbeat Node === - - indexterm:[Heartbeat,Replace Cluster Node] - indexterm:[Replace Cluster Node,Heartbeat] --The seven-step guide to replacing an existing cluster node: -- --. Make sure the old node is completely stopped --. Give the new machine the same hostname as the old one --. Go to an active cluster node and look up the UUID for the old node in '/var/lib/heartbeat/hostcache' --. Install the cluster software --. Copy 'ha.cf' and 'authkeys' to the new node --. On the new node, populate it's UUID using `crm_uuid -w` and the UUID from step 2 --. Start the new cluster node -+To replace an existing cluster node: -+ -+. Make sure the old node is completely stopped. -+. Give the new machine the same hostname as the old one. -+. Go to an active cluster node and look up the UUID for the old node in +/var/lib/heartbeat/hostcache+. -+. Install the cluster software. -+. Copy +ha.cf+ and +authkeys+ to the new node. -+. On the new node, populate its UUID using `crm_uuid -w` and the UUID obtained earlier. -+. Start the new cluster node. 
-diff --git a/doc/Pacemaker_Explained/en-US/Ch-Notifications.txt b/doc/Pacemaker_Explained/en-US/Ch-Notifications.txt -index 13b835d..134ab0c 100644 ---- a/doc/Pacemaker_Explained/en-US/Ch-Notifications.txt -+++ b/doc/Pacemaker_Explained/en-US/Ch-Notifications.txt -@@ -1,27 +1,27 @@ - = Receiving Notification for Cluster Events = - - //// --We prefer [[ch-notifications]], but older versions of asciidoc dont deal well -+We prefer [[ch-notifications]], but older versions of asciidoc don't deal well - with that construct for chapter headings - //// - anchor:ch-notifications[Chapter 7, Receiving Notification for Cluster Events] - indexterm:[Resource,Notification] - --A Pacemaker cluster is an event driven system. In this context, an event is a --resource failure or configuration change (not exhaustive). -+A Pacemaker cluster is an event-driven system. In this context, an 'event' -+might be a resource failure or a configuration change, among others. - --The +ocf:pacemaker:ClusterMon+ resource can monitor the cluster status and --triggers alerts on each cluster event. This resource runs +crm_mon+ in the --background at regular intervals (configurable) and uses +crm_mon+ capabilities --to send emails (SMTP), SNMP traps or to execute an external program via the --+extra_options+ parameter. -+The *ocf:pacemaker:ClusterMon* resource can monitor the cluster status and -+trigger alerts on each cluster event. This resource runs `crm_mon` in the -+background at regular (configurable) intervals and uses `crm_mon` capabilities -+to trigger emails (SMTP), SNMP traps or external programs (via the -++extra_options+ parameter). - - [NOTE] - ===== - Depending on your system settings and compilation settings, SNMP or email --alerts might be unavailable. Check +crm_mon --help+ output to see if these -+alerts might be unavailable. Check the output of `crm_mon --help` to see whether these - options are available to you. 
In any case, executing an external agent will --always be available, and you can have this agent to send emails, SNMP traps, -+always be available, and you can use this agent to send emails, SNMP traps - or whatever action you develop. - ===== - -@@ -29,8 +29,12 @@ or whatever action you develop. - == Configuring SNMP Notifications == - indexterm:[Resource,Notification,SNMP] - --Requires an IP to send SNMP traps to, and a SNMP community. --Pacemaker MIB is found in _/usr/share/snmp/mibs/PCMK-MIB.txt_ -+Requires an IP to send SNMP traps to, and an SNMP community string. -+The Pacemaker MIB is provided with the source, and is typically -+installed in +/usr/share/snmp/mibs/PCMK-MIB.txt+. -+ -+This example uses +snmphost.example.com+ as the SNMP IP and -++public+ as the community string: - - .Configuring ClusterMon to send SNMP traps - ===== -@@ -52,7 +56,9 @@ Pacemaker MIB is found in _/usr/share/snmp/mibs/PCMK-MIB.txt_ - == Configuring Email Notifications == - indexterm:[Resource,Notification,SMTP,Email] - --Requires a user to send mail alerts to. "Mail-From", SMTP relay and Subject prefix can also be configured. -+Requires the recipient e-mail address. You can also optionally configure -+the sender e-mail address, the hostname of the SMTP relay, and a prefix string -+for the subject line. - - .Configuring ClusterMon to send email alerts - ===== -@@ -74,8 +80,8 @@ Requires a user to send mail alerts to. "Mail-From", SMTP relay and Subject pref - == Configuring Notifications via External-Agent == - - Requires a program (external-agent) to run when resource operations take --place, and an external-recipient (IP address, Email address, URI). When --triggered, the external-agent is fed with dynamically filled environnement -+place, and an external-recipient (IP address, email address, URI). When -+triggered, the external-agent is fed with dynamically filled environment - variables describing precisely the cluster event that occurred. 
By making - smart usage of these variables in your external-agent code, you can trigger - any action. -diff --git a/doc/Pacemaker_Explained/en-US/Ch-Options.txt b/doc/Pacemaker_Explained/en-US/Ch-Options.txt -index cf1478f..f8bca12 100644 ---- a/doc/Pacemaker_Explained/en-US/Ch-Options.txt -+++ b/doc/Pacemaker_Explained/en-US/Ch-Options.txt -@@ -1,133 +1,98 @@ --= Cluster Options = -+= Cluster-Wide Configuration = - --== Special Options == -+== CIB Properties == - --The reason for these fields to be placed at the top level instead of --with the rest of cluster options is simply a matter of parsing. These --options are used by the configuration database which is, by design, --mostly ignorant of the content it holds. So the decision was made to --place them in an easy to find location. -+Certain settings are defined by CIB properties (that is, attributes of the -++cib+ tag) rather than with the rest of the cluster configuration in the -++configuration+ section. - --== Configuration Version == -+The reason is simply a matter of parsing. These options are used by the -+configuration database which is, by design, mostly ignorant of the content it -+holds. So the decision was made to place them in an easy-to-find location. - --indexterm:[Configuration Version,Cluster] --indexterm:[Cluster,Option,Configuration Version] -- --When a node joins the cluster, the cluster will perform a check to see --who has the best configuration based on the fields below. It then --asks the node with the highest (+admin_epoch+, +epoch+, +num_updates+) --tuple to replace the configuration on all the nodes - which makes --setting them, and setting them correctly, very important. 
-- --.Configuration Version Properties -+.CIB Properties - [width="95%",cols="2m,5<",options="header",align="center"] - |========================================================= - |Field |Description - - | admin_epoch | -+indexterm:[Configuration Version,Cluster] -+indexterm:[Cluster,Option,Configuration Version] - indexterm:[admin_epoch,Cluster Option] - indexterm:[Cluster,Option,admin_epoch] --Never modified by the cluster. Use this to make the configurations on --any inactive nodes obsolete. -- --_Never set this value to zero_, in such cases the cluster cannot tell --the difference between your configuration and the "empty" one used --when nothing is found on disk. -+When a node joins the cluster, the cluster performs a check to see -+which node has the best configuration. It asks the node with the highest -+(+admin_epoch+, +epoch+, +num_updates+) tuple to replace the configuration on -+all the nodes -- which makes setting them, and setting them correctly, very -+important. +admin_epoch+ is never modified by the cluster; you can use this -+to make the configurations on any inactive nodes obsolete. _Never set this -+value to zero_. In such cases, the cluster cannot tell the difference between -+your configuration and the "empty" one used when nothing is found on disk. - - | epoch | - indexterm:[epoch,Cluster Option] - indexterm:[Cluster,Option,epoch] --Incremented every time the configuration is updated (usually by the admin) -+The cluster increments this every time the configuration is updated (usually by -+the administrator). 
- - | num_updates | - indexterm:[num_updates,Cluster Option] - indexterm:[Cluster,Option,num_updates] --Incremented every time the configuration or status is updated (usually by the cluster) -- --|========================================================= -- --== Other Fields == --.Properties Controlling Validation --[width="95%",cols="2m,5<",options="header",align="center"] --|========================================================= --|Field |Description -+The cluster increments this every time the configuration or status is updated -+(usually by the cluster) and resets it to 0 when epoch changes. - - | validate-with | - indexterm:[validate-with,Cluster Option] - indexterm:[Cluster,Option,validate-with] --Determines the type of validation being done on the configuration. If --set to "none", the cluster will not verify that updates conform to the -+Determines the type of XML validation that will be done on the configuration. -+If set to +none+, the cluster will not verify that updates conform to the - DTD (nor reject ones that don't). This option can be useful when --operating a mixed version cluster during an upgrade. -- --|========================================================= -- --== Fields Maintained by the Cluster == -- --.Properties Maintained by the Cluster --[width="95%",cols="2m,5<",options="header",align="center"] --|========================================================= --|Field |Description -+operating a mixed-version cluster during an upgrade. - - |cib-last-written | - indexterm:[cib-last-written,Cluster Property] - indexterm:[Cluster,Property,cib-last-written] --Indicates when the configuration was last written to disk. Informational purposes only. -- --|dc-uuid | --indexterm:[dc-uuid,Cluster Property] --indexterm:[Cluster,Property,dc-uuid] --Indicates which cluster node is the current leader. Used by the --cluster when placing resources and determining the order of some --events. -+Indicates when the configuration was last written to disk. 
Maintained by the -+cluster; for informational purposes only. - - |have-quorum | - indexterm:[have-quorum,Cluster Property] - indexterm:[Cluster,Property,have-quorum] - Indicates if the cluster has quorum. If false, this may mean that the --cluster cannot start resources or fence other nodes. See --+no-quorum-policy+ below. -- --| dc-version | --indexterm:[dc-version,Cluster Peroperty] --indexterm:[Cluster,Peroperty,dc-version] --Version of Pacemaker on the cluster's DC. -- --Often includes the hash which identifies the exact Git changeset it --was built from. Used for diagnostic purposes. -- --| cluster-infrastructure | --indexterm:[cluster-infrastructure,Cluster Peroperty] --indexterm:[Cluster,Peroperty,cluster-infrastructure] --The messaging stack on which Pacemaker is currently running. --Used for informational and diagnostic purposes. -+cluster cannot start resources or fence other nodes (see -++no-quorum-policy+ below). Maintained by the cluster. - --| expected-quorum-votes | --indexterm:[expected-quorum-votes,Cluster Peroperty] --indexterm:[Cluster,Peroperty,expected-quorum-votes] --The number of nodes expected to be in the cluster -- --Used to calculate quorum in Corosync 1.x (not CMAN) based clusters. -+|dc-uuid | -+indexterm:[dc-uuid,Cluster Property] -+indexterm:[Cluster,Property,dc-uuid] -+Indicates which cluster node is the current leader. Used by the -+cluster when placing resources and determining the order of some -+events. Maintained by the cluster. - - |========================================================= - --Note that although these fields can be written to by the admin, in -+=== Working with CIB Properties === -+ -+Although these fields can be written to by the user, in - most cases the cluster will overwrite any values specified by the --admin with the "correct" ones. To change the +admin_epoch+, for --example, one would use: -+user with the "correct" ones. 
- --[source,C] -+To change the ones that can be specified by the user, -+for example +admin_epoch+, one should use: -+---- - # cibadmin --modify --crm_xml '<cib admin_epoch="42"/>' -+---- - --A complete set of fields will look something like this: -+A complete set of CIB properties will look something like this: - --.An example of the fields set for a cib object -+.Attributes set for a cib object - ====== - [source,XML] - ------- -- -+ - ------- - ====== - -@@ -136,82 +101,78 @@ A complete set of fields will look something like this: - Cluster options, as you might expect, control how the cluster behaves - when confronted with certain situations. - --They are grouped into sets and, in advanced configurations, there may --be more than one. --footnote:[This will be described later in the section on --<> where we will show how to have the cluster use --different sets of options during working hours (when downtime is --usually to be avoided at all costs) than it does during the weekends --(when resources can be moved to the their preferred hosts without --bothering end users)] --For now we will describe the simple case where each option is present at most once. -- --== Available Cluster Options == -+They are grouped into sets within the +crm_config+ section, and, in advanced -+configurations, there may be more than one set. (This will be described later -+in the section on <> where we will show how to have the cluster use -+different sets of options during working hours than during weekends.) For now, -+we will describe the simple case where each option is present at most once. -+ -+You can obtain an up-to-date list of cluster options, including -+their default values, by running the `man pengine` and `man crmd` commands. -+ - .Cluster Options --[width="95%",cols="5m,2,11<",options="header",align="center"] -+[width="95%",cols="5m,2,11> instead -+Should a failure to start be treated as fatal for a resource?
-+If FALSE, the cluster will instead use the resource's -++failcount+ and value for +migration-threshold+ (see <>). - --| is-managed-default | TRUE | --indexterm:[is-managed-default,Cluster Option] --indexterm:[Cluster,Option,is-managed-default] --+Deprecated:+ See <> instead -+| enable-startup-probes | TRUE | -+indexterm:[enable-startup-probes,Cluster Option] -+indexterm:[Cluster,Option,enable-startup-probes] -+Should the cluster check for active resources during startup? - - | maintenance-mode | FALSE | - indexterm:[maintenance-mode,Cluster Option] - indexterm:[Cluster,Option,maintenance-mode] --Should the cluster monitor resources and start/stop them as required -+Should the cluster refrain from monitoring, starting and stopping resources? -+ -+| stonith-enabled | TRUE | -+indexterm:[stonith-enabled,Cluster Option] -+indexterm:[Cluster,Option,stonith-enabled] -+Should failed nodes and nodes with resources that can't be stopped be -+shot? If you value your data, set up a STONITH device and enable this. -+ -+If true, or unset, the cluster will refuse to start resources unless -+one or more STONITH resources have been configured. -+ -+| stonith-action | reboot | -+indexterm:[stonith-action,Cluster Option] -+indexterm:[Cluster,Option,stonith-action] -+Action to send to STONITH device. Allowed values are +reboot+ and +off+. -+The value +poweroff+ is also allowed, but is only used for -+legacy devices. 
- - | stonith-timeout | 60s | - indexterm:[stonith-timeout,Cluster Option] - indexterm:[Cluster,Option,stonith-timeout] --How long to wait for the STONITH action to complete -+How long to wait for STONITH actions (reboot, on, off) to complete - --| default-action-timeout | 20s | --indexterm:[default-action-timeout,Cluster Option] --indexterm:[Cluster,Option,default-action-timeout] --+Deprecated:+ See <> instead -+| cluster-delay | 60s | -+indexterm:[cluster-delay,Cluster Option] -+indexterm:[Cluster,Option,cluster-delay] -+Estimated maximum round-trip delay over the network (excluding action -+execution). If the TE requires an action to be executed on another node, -+it will consider the action failed if it does not get a response -+from the other node in this time (after considering the action's -+own timeout). The "correct" value will depend on the speed and load of your -+network and cluster nodes. - - | dc-deadtime | 20s | - indexterm:[dc-deadtime,Cluster Option] -@@ -277,111 +240,159 @@ The "correct" value will depend on the speed/load of your network and the type o - | cluster-recheck-interval | 15min | - indexterm:[cluster-recheck-interval,Cluster Option] - indexterm:[Cluster,Option,cluster-recheck-interval] --Polling interval for time based changes to options, resource parameters and constraints. -+Polling interval for time-based changes to options, resource parameters and constraints. -+ -+The Cluster is primarily event-driven, but your configuration can have -+elements that take effect based on the time of day. To ensure these changes -+take effect, we can optionally poll the cluster's status for changes. A value -+of 0 disables polling. Positive values are an interval (in seconds unless other -+SI units are specified, e.g. 5min). -+ -+| pe-error-series-max | -1 | -+indexterm:[pe-error-series-max,Cluster Option] -+indexterm:[Cluster,Option,pe-error-series-max] -+The number of PE inputs resulting in ERRORs to save. Used when reporting problems. 
-+A value of -1 means unlimited (report all). -+ -+| pe-warn-series-max | -1 | -+indexterm:[pe-warn-series-max,Cluster Option] -+indexterm:[Cluster,Option,pe-warn-series-max] -+The number of PE inputs resulting in WARNINGs to save. Used when reporting problems. -+A value of -1 means unlimited (report all). -+ -+| pe-input-series-max | -1 | -+indexterm:[pe-input-series-max,Cluster Option] -+indexterm:[Cluster,Option,pe-input-series-max] -+The number of "normal" PE inputs to save. Used when reporting problems. -+A value of -1 means unlimited (report all). - --The Cluster is primarily event driven, however the configuration can have elements that change based on time. To ensure these changes take effect, we can optionally poll the cluster's status for changes. -+| remove-after-stop | FALSE | -+indexterm:[remove-after-stop,Cluster Option] -+indexterm:[Cluster,Option,remove-after-stop] -+_Advanced Use Only:_ Should the cluster remove resources from the LRM after -+they are stopped? Values other than the default are, at best, poorly tested and -+potentially dangerous. - --Allowed values: Zero disables polling. Positive values are an interval in seconds (unless other SI units are specified. eg. 5min) -+| startup-fencing | TRUE | -+indexterm:[startup-fencing,Cluster Option] -+indexterm:[Cluster,Option,startup-fencing] -+_Advanced Use Only:_ Should the cluster shoot unseen nodes? -+Not using the default is very unsafe! - - | election-timeout | 2min | - indexterm:[election-timeout,Cluster Option] - indexterm:[Cluster,Option,election-timeout] --+Advanced Use Only+ -- --If need to adjust this value, it probably indicates the presence of a bug. -+_Advanced Use Only:_ If you need to adjust this value, it probably indicates -+the presence of a bug. - - | shutdown-escalation | 20min | - indexterm:[shutdown-escalation,Cluster Option] - indexterm:[Cluster,Option,shutdown-escalation] --+Advanced Use Only+ -- --If need to adjust this value, it probably indicates the presence of a bug. 
-+_Advanced Use Only:_ If you need to adjust this value, it probably indicates -+the presence of a bug. - - | crmd-integration-timeout | 3min | - indexterm:[crmd-integration-timeout,Cluster Option] - indexterm:[Cluster,Option,crmd-integration-timeout] --+Advanced Use Only+ -- --If need to adjust this value, it probably indicates the presence of a bug. -+_Advanced Use Only:_ If you need to adjust this value, it probably indicates -+the presence of a bug. - - | crmd-finalization-timeout | 30min | - indexterm:[crmd-finalization-timeout,Cluster Option] - indexterm:[Cluster,Option,crmd-finalization-timeout] --+Advanced Use Only+ -- --If need to adjust this value, it probably indicates the presence of a bug. -+_Advanced Use Only:_ If you need to adjust this value, it probably indicates -+the presence of a bug. - --| crmd-transition-delay | | -+| crmd-transition-delay | 0s | - indexterm:[crmd-transition-delay,Cluster Option] - indexterm:[Cluster,Option,crmd-transition-delay] --+Advanced Use Only+ Enabling this option will slow down cluster recovery under all conditions. -+_Advanced Use Only:_ Delay cluster recovery for the configured interval to -+allow for additional/related events to occur. Useful if your configuration is -+sensitive to the order in which ping updates arrive. -+Enabling this option will slow down cluster recovery under -+all conditions. - --Delay cluster recovery for the configured interval to allow for additional/related events to occur. Useful if your configuration is sensitive to the order in which ping updates arrive. 
-+|default-resource-stickiness | 0 | -+indexterm:[default-resource-stickiness,Cluster Option] -+indexterm:[Cluster,Option,default-resource-stickiness] -+_Deprecated:_ See <> instead - --|========================================================= -+| is-managed-default | TRUE | -+indexterm:[is-managed-default,Cluster Option] -+indexterm:[Cluster,Option,is-managed-default] -+_Deprecated:_ See <> instead - --You can always obtain an up-to-date list of cluster options, including --their default values, by running the `man pengine` and `man crmd` commands. -+| default-action-timeout | 20s | -+indexterm:[default-action-timeout,Cluster Option] -+indexterm:[Cluster,Option,default-action-timeout] -+_Deprecated:_ See <> instead -+ -+|========================================================= - --== Querying and Setting Cluster Options == -+=== Querying and Setting Cluster Options === - - indexterm:[Querying,Cluster Option] - indexterm:[Setting,Cluster Option] - indexterm:[Cluster,Querying Options] - indexterm:[Cluster,Setting Options] - --Cluster options can be queried and modified using the --`crm_attribute` tool. To get the current --value of +cluster-delay+, simply use: -+Cluster options can be queried and modified using the `crm_attribute` tool. 
To -+get the current value of +cluster-delay+, you can run: - --[source,C] --# crm_attribute --attr-name cluster-delay --get-value -+---- -+# crm_attribute --query --name cluster-delay -+---- - - which is more simply written as - --[source,C] --# crm_attribute --get-value -n cluster-delay -+---- -+# crm_attribute -G -n cluster-delay -+---- - - If a value is found, you'll see a result like this: - --[source,C] --# crm_attribute --get-value -n cluster-delay -- name=cluster-delay value=60s -+---- -+# crm_attribute -G -n cluster-delay -+scope=crm_config name=cluster-delay value=60s -+---- - --However, if no value is found, the tool will display an error: -+If no value is found, the tool will display an error: - --[source,C] --# crm_attribute --get-value -n clusta-deway` --name=clusta-deway value=(null) --Error performing operation: The object/attribute does not exist -+---- -+# crm_attribute -G -n clusta-deway -+scope=crm_config name=clusta-deway value=(null) -+Error performing operation: No such device or address -+---- - --To use a different value, eg. +30+, simply run: -+To use a different value (for example, 30 seconds), simply run: - --[source,C] --# crm_attribute --attr-name cluster-delay --attr-value 30s -+---- -+# crm_attribute --name cluster-delay --update 30s -+---- - --To go back to the cluster's default value you can delete the value, for example with this command: -+To go back to the cluster's default value, you can delete the value, for example: - --[source,C] --# crm_attribute --attr-name cluster-delay --delete-attr -+---- -+# crm_attribute --name cluster-delay --delete -+Deleted crm_config option: id=cib-bootstrap-options-cluster-delay name=cluster-delay -+---- - --== When Options are Listed More Than Once == -+=== When Options are Listed More Than Once === - - If you ever see something like the following, it means that the option you're modifying is present more than once. 
- - .Deleting an option that is listed twice - ======= --[source,C] - ------ --# crm_attribute --attr-name batch-limit --delete-attr -+# crm_attribute --name batch-limit --delete - - Multiple attributes match name=batch-limit in crm_config: - Value: 50 (set=cib-bootstrap-options, id=cib-bootstrap-options-batch-limit) - Value: 100 (set=custom, id=custom-batch-limit) --Please choose from one of the matches above and supply the 'id' with --attr-id -+Please choose from one of the matches above and supply the 'id' with --id - ------- - ======= - --In such cases follow the on-screen instructions to perform the -+In such cases, follow the on-screen instructions to perform the - requested action. To determine which value is currently being used by --the cluster, please refer to <>. -+the cluster, refer to <>. -diff --git a/doc/Pacemaker_Explained/en-US/Ch-Resource-Templates.txt b/doc/Pacemaker_Explained/en-US/Ch-Resource-Templates.txt -index 5c34ae7..06cf32e 100644 ---- a/doc/Pacemaker_Explained/en-US/Ch-Resource-Templates.txt -+++ b/doc/Pacemaker_Explained/en-US/Ch-Resource-Templates.txt -@@ -1,16 +1,13 @@ - = Resource Templates = - --== Abstract == -- - If you want to create lots of resources with similar configurations, defining a - resource template simplifies the task. Once defined, it can be referenced in - primitives or in certain types of constraints. -- - - == Configuring Resources with Templates == - --The primitives referencing the template will inherit all meta --attributes, instance attributes, utilization attributes and operations defined -+The primitives referencing the template will inherit all meta-attributes, -+instance attributes, utilization attributes and operations defined - in the template. And you can define specific attributes and operations for any - of the primitives. 
If any of these are defined in both the template and the - primitive, the values defined in the primitive will take precedence over the -@@ -21,8 +18,10 @@ If any changes are needed, they can be done to the template definition and - will take effect globally in all resource definitions referencing that - template. - --Resource templates have a similar syntax like primitives. For example: -+Resource templates have a syntax similar to that of primitives. - -+.Resource template for a migratable Xen virtual machine -+==== - [source,XML] - ---- - - ---- -+==== - --Once you defined the new resource template, you can use it in primitives: -+Once you define a resource template, you can use it in primitives by specifying the -++template+ property. - -+.Xen primitive resource using a resource template -+==== - [source,XML] - ---- - -@@ -50,10 +53,13 @@ Once you defined the new resource template, you can use it in primitives: - - - ---- -+==== - --The new primitive `vm1` is going to inherit everything from the `vm-template`. For --example, the equivalent of the above two would be: -+In the example above, the new primitive +vm1+ will inherit everything from +vm-template+. For -+example, the equivalent of the above two examples would be: - -+.Equivalent Xen primitive resource not using a resource template -+==== - [source,XML] - ---- - -@@ -73,14 +79,13 @@ example, the equivalent of the above two would be: - - - ---- -+==== - - If you want to overwrite some attributes or operations, add them to the - particular primitive's definition. - --For instance, the following new primitive `vm2` has special --attribute values. Its `monitor` operation has a longer `timeout` and `interval`, and --the primitive has an additional `stop` operation. -- -+.Xen resource overriding template values -+==== - [source,XML] - ---- - -@@ -100,24 +105,31 @@ the primitive has an additional `stop` operation. 
- - - ---- -+==== - --The following command shows the resulting definition of a resource: -+In the example above, the new primitive +vm2+ has special -+attribute values. Its +monitor+ operation has a longer +timeout+ and +interval+, and -+the primitive has an additional +stop+ operation. -+ -+To see the resulting definition of a resource, run: - --[source,C] -+---- - # crm_resource --query-xml --resource vm2 -+---- - --The following command shows its raw definition in cib: -+To see the raw definition of a resource in the CIB, run: - --[source,C] -+---- - # crm_resource --query-xml-raw --resource vm2 -+---- - - == Referencing Templates in Constraints == - - A resource template can be referenced in the following types of constraints: - --- `order` constraints --- `colocation` constraints, --- `rsc_ticket` constraints (for multi-site clusters). -+- +order+ constraints (see <>) -+- +colocation+ constraints (see <>) -+- +rsc_ticket+ constraints (for multi-site clusters as described in <>) - - Resource templates referenced in constraints stand for all primitives which are - derived from that template. This means, the constraint applies to all primitive -@@ -125,12 +137,12 @@ resources referencing the resource template. Referencing resource templates in - constraints is an alternative to resource sets and can simplify the cluster - configuration considerably. - --For example: -+For example, given the example templates earlier in this chapter: - - [source,XML] - - --is the equivalent of the following constraint configuration: -+would colocate all VMs with +base-rsc+ and is the equivalent of the following constraint configuration: - - [source,XML] - ---- -@@ -148,9 +160,11 @@ is the equivalent of the following constraint configuration: - [NOTE] - ====== - In a colocation constraint, only one template may be referenced from either --`rsc` or `with-rsc`, and the other reference must be a regular resource. -+`rsc` or `with-rsc`; the other reference must be a regular resource. 
- ====== - -+=== Referencing Resource Templates in Sequential Resource Sets === -+ - Resource templates can also be referenced in resource sets. - - For example: -@@ -180,6 +194,8 @@ is the equivalent of the following constraint configuration: - - ---- - -+=== Referencing Resource Templates in Parallel Resource Sets === -+ - If the resources referencing the template can run in parallel: - - [source,XML] -diff --git a/doc/Pacemaker_Explained/en-US/Ch-Resources.txt b/doc/Pacemaker_Explained/en-US/Ch-Resources.txt -index 552c1b9..5d5fa33 100644 ---- a/doc/Pacemaker_Explained/en-US/Ch-Resources.txt -+++ b/doc/Pacemaker_Explained/en-US/Ch-Resources.txt -@@ -1,49 +1,42 @@ - = Cluster Resources = - --== What is a Cluster Resource == -+== What is a Cluster Resource? == - - indexterm:[Resource] - --The role of a resource agent is to abstract the service it provides --and present a consistent view to the cluster, which allows the cluster --to be agnostic about the resources it manages. -+A resource is a service made highly available by a cluster. -+The simplest type of resource, a 'primitive' resource, is described -+in this chapter. More complex forms, such as groups and clones, -+are described in later chapters. - -+Every primitive resource has a 'resource agent'. A resource agent is an -+external program that abstracts the service it provides and presents a -+consistent view to the cluster. -+ -+This allows the cluster to be agnostic about the resources it manages. - The cluster doesn't need to understand how the resource works because - it relies on the resource agent to do the right thing when given a --+start+, +stop+ or +monitor+ command. -- --For this reason it is crucial that resource agents are well tested. -+`start`, `stop` or `monitor` command. For this reason, it is crucial that -+resource agents are well-tested. - --Typically resource agents come in the form of shell scripts, however -+Typically, resource agents come in the form of shell scripts.
However, - they can be written using any technology (such as C, Python or Perl) - that the author is comfortable with. - - [[s-resource-supported]] --== Supported Resource Classes == -+== Resource Classes == - - indexterm:[Resource,class] - --There are six classes of agents supported by Pacemaker: -+Pacemaker supports several classes of agents: - - * OCF - * LSB - * Upstart - * Systemd --* Fencing - * Service --* Nagios -- --indexterm:[Resource,Heartbeat] --indexterm:[Heartbeat,Resources] -- --Version 1 of Heartbeat came with its own style of resource agents and --it is highly likely that many people have written their own agents --based on its conventions. footnote:[ See --http://wiki.linux-ha.org/HeartbeatResourceAgent for more information ] -- --Although deprecated with the release of Heartbeat v2, they were --supported by Pacemaker up until the release of 1.1.8 to enable --administrators to continue to use these agents. -+* Fencing -+* Nagios Plugins - - === Open Cluster Framework === - -@@ -52,24 +45,22 @@ indexterm:[OCF,Resources] - indexterm:[Open Cluster Framework,Resources] - - The OCF standard --footnote:[ --http://www.opencf.org/cgi-bin/viewcvs.cgi/specs/ra/resource-agent-api.txt?rev=HEAD - at least as it relates to resource agents. --] footnote:[ --The Pacemaker implementation has been somewhat extended from the OCF --Specs, but none of those changes are incompatible with the original --OCF specification. --] -+footnote:[See -+http://www.opencf.org/cgi-bin/viewcvs.cgi/specs/ra/resource-agent-api.txt?rev=HEAD -+ -- at least as it relates to resource agents. The Pacemaker implementation has -+been somewhat extended from the OCF specs, but none of those changes are -+incompatible with the original OCF specification.] 
- is basically an extension of the Linux Standard Base conventions for - init scripts to: - - * support parameters, --* make them self describing and --* extensible -+* make them self-describing, and -+* make them extensible - - OCF specs have strict definitions of the exit codes that actions must return. - footnote:[ --Included with the cluster is the ocf-tester script, which can be --useful in this regard. -+The resource-agents source code includes the `ocf-tester` script, which -+can be useful in this regard. - ] - - The cluster follows these specifications exactly, and giving the wrong -@@ -78,15 +69,14 @@ find puzzling and annoying. In particular, the cluster needs to - distinguish a completely stopped resource from one which is in some - erroneous and indeterminate state. - --Parameters are passed to the script as environment variables, with the -+Parameters are passed to the resource agent as environment variables, with the - special prefix +OCF_RESKEY_+. So, a parameter which the user thinks --of as ip it will be passed to the script as +OCF_RESKEY_ip+. The --number and purpose of the parameters is completely arbitrary, however --your script should advertise any that it supports using the --+meta-data+ command. -+of as +ip+ will be passed to the resource agent as +OCF_RESKEY_ip+. The -+number and purpose of the parameters is left to the resource agent; however, -+the resource agent should use the `meta-data` command to advertise any that it -+supports. - -- --The OCF class is the most preferred one as it is an industry standard, -+The OCF class is the most preferred as it is an industry standard, - highly flexible (allowing parameters to be passed to agents in a - non-positional manner) and self-describing. - -@@ -99,72 +89,84 @@ indexterm:[Resource,LSB] - indexterm:[LSB,Resources] - indexterm:[Linux Standard Base,Resources] - --LSB resource agents are those found in '/etc/init.d'. -+LSB resource agents are those found in +/etc/init.d+. 
- --Generally they are provided by the OS/distribution and, in order to be used with the cluster, they must conform to the LSB Spec. -+Generally, they are provided by the OS distribution and, in order to be used -+with the cluster, they must conform to the LSB Spec. - footnote:[ - See - http://refspecs.linux-foundation.org/LSB_3.0.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html --for the LSB Spec (as it relates to init scripts). -+for the LSB Spec as it relates to init scripts. - ] - -+[WARNING] -+==== - Many distributions claim LSB compliance but ship with broken init --scripts. For details on how to check if your init script is --LSB-compatible, see <>. The most common problems are: -+scripts. For details on how to check whether your init script is -+LSB-compatible, see <>. Common problematic violations of -+the LSB standard include: - - * Not implementing the status operation at all --* Not observing the correct exit status codes for start/stop/status actions --* Starting a started resource returns an error (this violates the LSB spec) --* Stopping a stopped resource returns an error (this violates the LSB spec) -+* Not observing the correct exit status codes for `start/stop/status` actions -+* Starting a started resource returns an error -+* Stopping a stopped resource returns an error -+==== -+ -+[IMPORTANT] -+==== -+Remember to make sure the computer is _not_ configured to start any -+services at boot time -- that should be controlled by the cluster. -+==== - - === Systemd === - indexterm:[Resource,Systemd] - indexterm:[Systemd,Resources] - - Some newer distributions have replaced the old --http://en.wikipedia.org/wiki/Init#SysV-style[SYS-V] style of --initialization daemons (and scripts) with an alternative called -+http://en.wikipedia.org/wiki/Init#SysV-style["SysV"] style of -+initialization daemons and scripts with an alternative called - http://www.freedesktop.org/wiki/Software/systemd[Systemd]. 
- - Pacemaker is able to manage these services _if they are present_. - --Instead of +init scripts+, systemd has +unit files+. Generally the --services (or unit files) are provided by the OS/distribution but there --are some instructions for converting from init scripts at: --http://0pointer.de/blog/projects/systemd-for-admins-3.html -+Instead of init scripts, systemd has 'unit files'. Generally, the -+services (unit files) are provided by the OS distribution, but there -+are online guides for converting from init scripts. -+footnote:[For example, -+http://0pointer.de/blog/projects/systemd-for-admins-3.html] - --[NOTE] --====== --Remember to make sure the computer is +not+ configured to start any --services at boot time that should be controlled by the cluster. --====== -+[IMPORTANT] -+==== -+Remember to make sure the computer is _not_ configured to start any -+services at boot time -- that should be controlled by the cluster. -+==== - - === Upstart === - indexterm:[Resource,Upstart] - indexterm:[Upstart,Resources] - - Some newer distributions have replaced the old --http://en.wikipedia.org/wiki/Init#SysV-style[SYS-V] style of -+http://en.wikipedia.org/wiki/Init#SysV-style["SysV"] style of - initialization daemons (and scripts) with an alternative called --http://upstart.ubuntu.com[Upstart]. -+http://upstart.ubuntu.com/[Upstart]. - - Pacemaker is able to manage these services _if they are present_. - --Instead of +init scripts+, upstart has +jobs+. Generally the --services (or jobs) are provided by the OS/distribution. -+Instead of init scripts, upstart has 'jobs'. Generally, the -+services (jobs) are provided by the OS distribution. - --[NOTE] --====== --Remember to make sure the computer is +not+ configured to start any --services at boot time that should be controlled by the cluster. --====== -+[IMPORTANT] -+==== -+Remember to make sure the computer is _not_ configured to start any -+services at boot time -- that should be controlled by the cluster. 
-+==== - - === System Services === - indexterm:[Resource,System Services] - indexterm:[System Service,Resources] - --Since there are now many "common" types of system services (+systemd+, --+upstart+, and +lsb+), Pacemaker supports a special alias which -+Since there are various types of system services (+systemd+, -++upstart+, and +lsb+), Pacemaker supports a special +service+ alias which - intelligently figures out which one applies to a given cluster node. - - This is particularly useful when the cluster contains a mix of -@@ -172,7 +174,7 @@ This is particularly useful when the cluster contains a mix of - - In order, Pacemaker will try to find the named service as: - --. an LSB (SYS-V) init script -+. an LSB init script - . a Systemd unit file - . an Upstart job - -@@ -180,32 +182,35 @@ In order, Pacemaker will try to find the named service as: - indexterm:[Resource,STONITH] - indexterm:[STONITH,Resources] - --There is also an additional class, STONITH, which is used exclusively --for fencing related resources. This is discussed later in --<>. -+The STONITH class is used exclusively for fencing-related resources. This is -+discussed later in <>. - - === Nagios Plugins === - indexterm:[Resource,Nagios Plugins] - indexterm:[Nagios Plugins,Resources] - --Nagios plugins allow us to monitor services on the remote hosts. --http://nagiosplugins.org[Nagios Plugins]. -+Nagios Plugins -+footnote:[The project has two independent forks, hosted at -+https://www.nagios-plugins.org/ and https://www.monitoring-plugins.org/. Output -+from both projects' plugins is similar, so plugins from either project can be -+used with pacemaker.] -+allow us to monitor services on remote hosts. - - Pacemaker is able to do remote monitoring with the plugins _if they are - present_. - --An use case is to configure them as resources belonging to a resource --container, which usually is a VM, and the container will be restarted --if any of them has failed. 
While they can also be configured as ordinary --resources to be just used for monitoring hosts or services via network. -+A common use case is to configure them as resources belonging to a resource -+container (usually a virtual machine), and the container will be restarted -+if any of them has failed. Another use is to configure them as ordinary -+resources to be used for monitoring hosts or services via the network. - --The supported parameters are same as the long options of a nagios plugin. -+The supported parameters are the same as the long options of the plugin. - - [[primitive-resource]] - == Resource Properties == - --These values tell the cluster which script to use for the resource, --where to find that script and what standards it conforms to. -+These values tell the cluster which resource agent to use for the resource, -+where to find that resource agent and what standards it conforms to. - - .Properties of a Primitive Resource - [width="95%",cols="1m,6<",options="header",align="center"] -@@ -221,33 +226,35 @@ where to find that script and what standards it conforms to. - - |class - --|The standard the script conforms to. Allowed values: +ocf+, -- +service+, +upstart+, +systemd+, +lsb+, +stonith+ -+|The standard the resource agent conforms to. Allowed values: -++lsb+, +nagios+, +ocf+, +service+, +stonith+, +systemd+, +upstart+ - indexterm:[class,Resource] - indexterm:[Resource,Property,class] - - |type --|The name of the Resource Agent you wish to use. Eg. _IPaddr_ or _Filesystem_ -+|The name of the Resource Agent you wish to use. E.g. +IPaddr+ or +Filesystem+ - indexterm:[type,Resource] - indexterm:[Resource,Property,type] - - |provider - |The OCF spec allows multiple vendors to supply the same -- ResourceAgent. To use the OCF resource agents supplied with -- Heartbeat, you should specify +heartbeat+ here. -+ resource agent. To use the OCF resource agents supplied by -+ the Heartbeat project, you would specify +heartbeat+ here.
- indexterm:[provider,Resource] - indexterm:[Resource,Property,provider] - - |========================================================= - --Resource definitions can be queried with the `crm_resource` tool. For example -+The XML definition of a resource can be queried with the `crm_resource` tool. -+For example: - --[source,C] -+---- - # crm_resource --resource Email --query-xml -+---- - - might produce: - --.An example system resource -+.A system resource definition - ===== - [source,XML] - -@@ -255,17 +262,21 @@ might produce: - - [NOTE] - ===== --One of the main drawbacks to system services (such as LSB, Systemd and -+One of the main drawbacks to system services (LSB, systemd or - Upstart) resources is that they do not allow any parameters! - ===== - --.An example OCF resource -+//// -+See https://tools.ietf.org/html/rfc5737 for choice of example IP address -+//// -+ -+.An OCF resource definition - ===== - [source,XML] - ------- - -- -- -+ -+ - - - ------- -@@ -274,12 +285,18 @@ Upstart) resources is that they do not allow any parameters! - [[s-resource-options]] - == Resource Options == - --Options are used by the cluster to decide how your resource should -+Resources have two types of options: 'meta-attributes' and 'instance attributes'. -+Meta-attributes apply to any type of resource, while instance attributes -+are specific to each resource agent. -+ -+=== Resource Meta-Attributes === -+ -+Meta-attributes are used by the cluster to decide how a resource should - behave and can be easily set using the `--meta` option of the - `crm_resource` command. 
- --.Options for a Primitive Resource --[width="95%",cols="1m,1,4<",options="header",align="center"] -+.Meta-attributes of a Primitive Resource -+[width="95%",cols="2m,2,5> resources, they will not promoted to - master) -- --* 'Master' - Allow the resource to be started and, if appropriate, promoted -+* +master:+ Allow the resource to be started and, if appropriate, promoted - indexterm:[target-role,Resource Option] - indexterm:[Resource,Option,target-role] - - |is-managed --|+TRUE+ -+|TRUE - |Is the cluster allowed to start and stop the resource? Allowed - values: +true+, +false+ - indexterm:[is-managed,Resource Option] - indexterm:[Resource,Option,is-managed] - - |resource-stickiness --|Calculated --|How much does the resource prefer to stay where it is? Defaults to -- the value of +resource-stickiness+ in the +rsc_defaults+ section -+|value of +resource-stickiness+ in the +rsc_defaults+ section -+|How much does the resource prefer to stay where it is? - indexterm:[resource-stickiness,Resource Option] - indexterm:[Resource,Option,resource-stickiness] - - |requires --|Calculated --|Under what conditions can the resource be started. ('Since 1.1.8') -- -- Defaults to +fencing+ unless +stonith-enabled+ is 'false' or +class+ -- is 'stonith' - under those conditions the default is +quorum+. -- Possible values: -- -- * 'nothing' - can always be started -- -- * 'quorum' - The cluster can only start this resource if a majority of -- the configured nodes are active -- -- * 'fencing' - The cluster can only start this resource if a majority -- of the configured nodes are active _and_ any failed or unknown nodes -- have been powered off. 
-- -- * 'unfencing' - The cluster can only start this resource if a majority -- of the configured nodes are active _and_ any failed or unknown nodes -- have been powered off _and_ only on nodes that have been 'unfenced' -- indexterm: Option[requires,Resource] -- indexterm:[Resource,Option,requires] -+|fencing (unless +stonith-enabled+ is +false+ or +class+ is -++stonith+, in which case it defaults to quorum) -+|Conditions under which the resource can be started ('Since 1.1.8') -+Allowed values: -+ -+* +nothing:+ can always be started -+* +quorum:+ The cluster can only start this resource if a majority of -+ the configured nodes are active -+* +fencing:+ The cluster can only start this resource if a majority -+ of the configured nodes are active _and_ any failed or unknown nodes -+ have been powered off -+* +unfencing:+ The cluster can only start this resource if a majority -+ of the configured nodes are active _and_ any failed or unknown nodes -+ have been powered off _and_ only on nodes that have been 'unfenced' -+ -+indexterm:[requires,Resource Option] -+indexterm:[Resource,Option,requires] - - |migration-threshold --|+INFINITY+ (disabled) -+|INFINITY - |How many failures may occur for this resource on a node, before this -- node is marked ineligible to host this resource. -+ node is marked ineligible to host this resource. A value of INFINITY -+ indicates that this feature is disabled. - indexterm:[migration-threshold,Resource Option] - indexterm:[Resource,Option,migration-threshold] - - |failure-timeout --|+0+ (disabled) -+|0 - |How many seconds to wait before acting as if the failure had not - occurred, and potentially allowing the resource back to the node on -- which it failed. -+ which it failed. A value of 0 indicates that this feature is disabled. 
- indexterm:[failure-timeout,Resource Option] - indexterm:[Resource,Option,failure-timeout] - - |multiple-active --|+stop_start+ -+|stop_start - |What should the cluster do if it ever finds the resource active on -- more than one node. Allowed values: -- --* 'block' - mark the resource as unmanaged -+ more than one node? Allowed values: - --* 'stop_only' - stop all active instances and leave them that way -- --* 'stop_start' - stop all active instances and start the resource in -+* +block:+ mark the resource as unmanaged -+* +stop_only:+ stop all active instances and leave them that way -+* +stop_start:+ stop all active instances and start the resource in - one location only - - indexterm:[multiple-active,Resource Option] - indexterm:[Resource,Option,multiple-active] - - |remote-node --|++ (disabled) --|The name of the remote-node this resource defines. This both enables the resource as a remote-node and defines the unique name used to identify the remote-node. If no other parameters are set, this value will also be assumed as the hostname to connect to at port 3121. +WARNING+ This value cannot overlap with any resource or node IDs. -+| -+|The name of the remote-node this resource defines. This both enables the -+resource as a remote-node and defines the unique name used to identify the -+remote-node. If no other parameters are set, this value will also be assumed as -+the hostname to connect to at the port specified by +remote-port+. +WARNING:+ -+This value cannot overlap with any resource or node IDs. If not specified, -+this feature is disabled. - - |remote-port --|+3121+ --|Configure a custom port to use for the guest connection to pacemaker_remote. -+|3121 -+|Port to use for the guest connection to pacemaker_remote - - |remote-addr --|+remote-node+ value used as hostname --|The ip address or hostname to connect to if remote-node's name is not the hostname of the guest. 
-+|value of +remote-node+ -+|The IP address or hostname to connect to if remote-node's name is not the -+hostname of the guest. - - |+remote-connect-timeout+ --|+60s+ -+|60s - |How long before a pending guest connection will time out. - - |========================================================= - --If you performed the following commands on the previous LSB Email resource -+[NOTE] -+==== -+Support for remote nodes was added in pacemaker 1.1.10. If you are using an -+earlier version, options related to remote nodes will not be available. -+==== -+ -+As an example of setting resource options, if you performed the following -+commands on an LSB Email resource: - --[source,C] - ------- --# crm_resource --meta --resource Email --set-parameter priority --property-value 100 --# crm_resource --meta --resource Email --set-parameter multiple-active --property-value block -+# crm_resource --meta --resource Email --set-parameter priority --parameter-value 100 -+# crm_resource -m -r Email -p multiple-active -v block - ------- - --the resulting resource definition would be -+the resulting resource definition might be: - - .An LSB resource with cluster options - ===== - [source,XML] - ------- - -- -- -- -- -+ -+ -+ -+ - - ------- - ===== - - [[s-resource-defaults]] --== Setting Global Defaults for Resource Options == -+=== Setting Global Defaults for Resource Meta-Attributes === - --To set a default value for a resource option, simply add it to the --+rsc_defaults+ section with `crm_attribute`. Thus, -+To set a default value for a resource option, add it to the -++rsc_defaults+ section with `crm_attribute`. 
For example, - --[source,C] --# crm_attribute --type rsc_defaults --attr-name is-managed --attr-value false -+---- -+# crm_attribute --type rsc_defaults --name is-managed --update false -+---- - - would prevent the cluster from starting or stopping any of the - resources in the configuration (unless of course the individual --resources were specifically enabled and had +is-managed+ set to -+resources were specifically enabled by having their +is-managed+ set to - +true+). - --== Instance Attributes == -+=== Resource Instance Attributes === - --The scripts of some resource classes (LSB not being one of them) can --be given parameters which determine how they behave and which instance -+The resource agents of some resource classes (lsb, systemd and upstart 'not' among them) -+can be given parameters which determine how they behave and which instance - of a service they control. - - If your resource agent supports parameters, you can add them with the --`crm_resource` command. For instance -+`crm_resource` command. For example, - --[source,C] --# crm_resource --resource Public-IP --set-parameter ip --property-value 1.2.3.4 -+---- -+# crm_resource --resource Public-IP --set-parameter ip --parameter-value 192.0.2.2 -+---- - - would create an entry in the resource like this: - -@@ -449,70 +472,86 @@ would create an entry in the resource like this: - ------- - - -- -+ - - - ------- - ===== - - For an OCF resource, the result would be an environment variable --called +OCF_RESKEY_ip+ with a value of +1.2.3.4+. -+called +OCF_RESKEY_ip+ with a value of +192.0.2.2+. - --The list of instance attributes supported by an OCF script can be --found by calling the resource script with the `meta-data` command. -+The list of instance attributes supported by an OCF resource agent can be -+found by calling the resource agent with the `meta-data` command. - The output contains an XML description of all the supported - attributes, their purpose and default values. 
- - .Displaying the metadata for the Dummy resource agent template - ===== --[source,C] --------- -+---- - # export OCF_ROOT=/usr/lib/ocf - # $OCF_ROOT/resource.d/pacemaker/Dummy meta-data --------- -+---- - [source,XML] - ------- - -- -- -- 1.0 -- -- -- This is a Dummy Resource Agent. It does absolutely nothing except -- keep track of whether its running or not. -- Its purpose in life is for testing and to serve as a template for RA writers. -- -- Dummy resource agent -- -- -- -- -- Location to store the resource state in. -- -- State file -- -- -- -- -- -- Dummy attribute that can be changed to cause a reload -- -- Dummy attribute that can be changed to cause a reload -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -+ -+ -+1.0 -+ -+ -+This is a Dummy Resource Agent. It does absolutely nothing except -+keep track of whether its running or not. -+Its purpose in life is for testing and to serve as a template for RA writers. -+ -+NB: Please pay attention to the timeouts specified in the actions -+section below. They should be meaningful for the kind of resource -+the agent manages. They should be the minimum advised timeouts, -+but they shouldn't/cannot cover _all_ possible resource -+instances. So, try to be neither overly generous nor too stingy, -+but moderate. The minimum timeouts should never be below 10 seconds. -+ -+Example stateless resource agent -+ -+ -+ -+ -+Location to store the resource state in. -+ -+State file -+ -+ -+ -+ -+ -+Fake attribute that can be changed to cause a reload -+ -+Fake attribute that can be changed to cause a reload -+ -+ -+ -+ -+ -+Number of seconds to sleep during operations. This can be used to test how -+the cluster reacts to operation timeouts. -+ -+Operation sleep duration in seconds. -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ - ------- - ===== - -@@ -520,9 +559,10 @@ attributes, their purpose and default values. 
- - indexterm:[Resource,Action] - --=== Monitoring Resources for Failure === -+Operations are actions the cluster can perform on a resource, -+such as start, stop and monitor. - --By default, the cluster will not ensure your resources are still -+As an example, by default the cluster will not ensure your resources are still - healthy. To instruct the cluster to do this, you need to add a - +monitor+ operation to the resource's definition. - -@@ -535,88 +575,98 @@ healthy. To instruct the cluster to do this, you need to add a - - - -- -+ - - - ------- - ===== - - .Properties of an Operation --[width="95%",cols="1m,6<",options="header",align="center"] -+[width="95%",cols="2m,3,6 - - -- -+ - - - ------- - ===== - --==== Multiple Monitor Operations ==== -+=== Multiple Monitor Operations === - - Provided no two operations (for a single resource) have the same name --and interval you can have as many monitor operations as you like. In --this way you can do a superficial health check every minute and -+and interval, you can have as many monitor operations as you like. In -+this way, you can do a superficial health check every minute and - progressively more intense ones at higher intervals. - - To tell the resource agent what kind of check to perform, you need to - provide each monitor with a different value for a common parameter. - The OCF standard creates a special parameter called +OCF_CHECK_LEVEL+ --for this purpose and dictates that it is _"made available to the --resource agent without the normal +OCF_RESKEY+ prefix"_. -+for this purpose and dictates that it is "made available to the -+resource agent without the normal +OCF_RESKEY+ prefix". - - Whatever name you choose, you can specify it by adding an --+instance_attributes+ block to the op tag. Note that it is up to each -++instance_attributes+ block to the +op+ tag. It is up to each - resource agent to look for the parameter and decide how to use it. 
- --.An OCF resource with two recurring health checks, performing different levels of checks - specified via +OCF_CHECK_LEVEL+. -+.An OCF resource with two recurring health checks, performing different levels of checks specified via +OCF_CHECK_LEVEL+. - ===== - [source,XML] - ------- -@@ -670,13 +720,13 @@ resource agent to look for the parameter and decide how to use it. - - - -- -+ - - - ------- - ===== - --==== Disabling a Monitor Operation ==== -+=== Disabling a Monitor Operation === - - The easiest way to stop a recurring monitor is to just delete it. - However, there can be times when you only want to disable it -@@ -692,15 +742,19 @@ operation's definition. - - - -- -+ - - - ------- - ===== - --This can be achieved from the command-line by executing -+This can be achieved from the command line by executing: - --[source,C] --# cibadmin -M -X '' -+---- -+# cibadmin --modify --xml-text '' -+---- - - Once you've done whatever you needed to do, you can then re-enable it with -+---- -+# cibadmin --modify --xml-text '' -+---- -diff --git a/doc/Pacemaker_Explained/en-US/Ch-Rules.txt b/doc/Pacemaker_Explained/en-US/Ch-Rules.txt -index 4f80983..c65c268 100644 ---- a/doc/Pacemaker_Explained/en-US/Ch-Rules.txt -+++ b/doc/Pacemaker_Explained/en-US/Ch-Rules.txt -@@ -1,7 +1,7 @@ - = Rules = - - //// --We prefer [[ch-rules]], but older versions of asciidoc dont deal well -+We prefer [[ch-rules]], but older versions of asciidoc don't deal well - with that construct for chapter headings - //// - -@@ -24,29 +24,35 @@ on the rule's +boolean-op+ field to determine if the rule ultimately - evaluates to +true+ or +false+. What happens next depends on the - context in which the rule is being used. 
- -+== Rule Properties == -+ - .Properties of a Rule --[width="95%",cols="2m,5<",options="header",align="center"] -+[width="95%",cols="2m,1,5<",options="header",align="center"] - |========================================================= - - |Field -+|Default - |Description - - |role --|Limits the rule to apply only when the resource is in that -- role. Allowed values: _Started_, +Slave,+ and +Master+. NOTE: A rule -- with +role="Master"+ can not determine the initial location of a -- clone instance. It will only affect which of the active instances -+|+started+ -+|Limits the rule to apply only when the resource is in the specified -+ role. Allowed values are +started+, +slave+, and +master+. A rule -+ with +role="master"+ cannot determine the initial location of a -+ clone instance and will only affect which of the active instances - will be promoted. - indexterm:[role,Constraint Rule] - indexterm:[Constraint,Rule,role] - - |score -+| - |The score to apply if the rule evaluates to +true+. Limited to use in - rules that are part of location constraints. - indexterm:[score,Constraint Rule] - indexterm:[Constraint,Rule,score] - - |score-attribute -+| - |The node attribute to look up and use as a score if the rule - evaluates to +true+. Limited to use in rules that are part of - location constraints. -@@ -54,8 +60,9 @@ context in which the rule is being used. - indexterm:[Constraint,Rule,score-attribute] - - |boolean-op -+|+and+ - |How to combine the result of multiple expression objects. Allowed -- values: _and_ and +or+. -+ values are +and+ and +or+. - indexterm:[boolean-op,Constraint Rule] - indexterm:[Constraint,Rule,boolean-op] - -@@ -71,75 +78,75 @@ added by the administrator, each node has a built-in node attribute - called +#uname+ that can also be used. - - .Properties of an Expression --[width="95%",cols="2m,5<",options="header",align="center"] -+[width="95%",cols="1m,1,5 -- - - ------- -@@ -455,9 +460,9 @@ would have its preference increased by +5678+. 
- - == Using Rules to Control Resource Options == - --Often some cluster nodes will be different from their peers; sometimes --these differences (the location of a binary or the names of network --interfaces) require resources to be configured differently depending -+Often some cluster nodes will be different from their peers. Sometimes, -+these differences -- e.g. the location of a binary or the names of network -+interfaces -- require resources to be configured differently depending - on the machine they're hosted on. - - By defining multiple +instance_attributes+ objects for the resource -@@ -497,12 +502,21 @@ port 9999 for all other nodes. - - The order in which +instance_attributes+ objects are evaluated is - determined by their score (highest to lowest). If not supplied, score --defaults to zero and objects with an equal score are processed in --listed order. If the +instance_attributes+ object does not have a --+rule+ or has a +rule+ that evaluates to +true+, then for any --parameter the resource does not yet have a value for, the resource --will use the parameter values defined by the +instance_attributes+ --object. -+defaults to zero, and objects with an equal score are processed in -+listed order. If the +instance_attributes+ object has no rule -+or a +rule+ that evaluates to +true+, then for any parameter the resource does -+not yet have a value for, the resource will use the parameter values defined by -+the +instance_attributes+. -+ -+For example, given the configuration above, if the resource is placed on node1: -+ -+. +special-node1+ has the highest score (3) and so is evaluated first; -+ its rule evaluates to +true+, so +interface+ is set to +eth1+. -+. +special-node2+ is evaluated next with score 2, but its rule evaluates to +false+, -+ so it is ignored. -+. 
+defaults+ is evaluated last with score 1, and has no rule, so its values -+ are examined; +interface+ is already defined, so the value here is not used, -+ but +port+ is not yet defined, so +port+ is set to +9999+. - - == Using Rules to Control Cluster Options == - indexterm:[Rule,Controlling Cluster Options] -@@ -512,9 +526,9 @@ Controlling cluster options is achieved in much the same manner as - specifying different resource options on different nodes. - - The difference is that because they are cluster options, one cannot --(or should not, because they won't work) use attribute based -+(or should not, because they won't work) use attribute-based - expressions. The following example illustrates how to set a different --+resource-stickiness+ value during and outside of work hours. This -++resource-stickiness+ value during and outside work hours. This - allows resources to automatically move back to their most preferred - hosts, but at a time that (in theory) does not interfere with business - activities. -@@ -540,22 +554,20 @@ activities. - ===== - - [[s-rules-recheck]] --== Ensuring Time Based Rules Take Effect == -+== Ensuring Time-Based Rules Take Effect == - --A Pacemaker cluster is an event driven system. As such, it won't --recalculate the best place for resources to run in unless something -+A Pacemaker cluster is an event-driven system. As such, it won't -+recalculate the best place for resources to run unless something - (like a resource failure or configuration change) happens. This can - mean that a location constraint that only allows resource X to run - between 9am and 5pm is not enforced. - --If you rely on time based rules, it is essential that you set the --+cluster-recheck-interval+ option. This tells the cluster to --periodically recalculate the ideal state of the cluster. 
For example, --if you set +cluster-recheck-interval=5m+, then sometime between 9:00 --and 9:05 the cluster would notice that it needs to start resource X, --and between 17:00 and 17:05 it would realize that X needed to be --stopped. -- --Note that the timing of the actual start and stop actions depends on --what else needs to be performed first --. -+If you rely on time-based rules, the +cluster-recheck-interval+ cluster option -+(which defaults to 15 minutes) is essential. This tells the cluster to -+periodically recalculate the ideal state of the cluster. -+ -+For example, if you set +cluster-recheck-interval="5m"+, then sometime between -+09:00 and 09:05 the cluster would notice that it needs to start resource X, -+and between 17:00 and 17:05 it would realize that X needed to be stopped. -+The timing of the actual start and stop actions depends on what other actions -+the cluster may need to perform first. -diff --git a/doc/Pacemaker_Explained/en-US/Ch-Status.txt b/doc/Pacemaker_Explained/en-US/Ch-Status.txt -index 0d19e2f..8083a34 100644 ---- a/doc/Pacemaker_Explained/en-US/Ch-Status.txt -+++ b/doc/Pacemaker_Explained/en-US/Ch-Status.txt -@@ -1,4 +1,4 @@ --= Status - Here be dragons = -+= Status -- Here be dragons = - - Most users never need to understand the contents of the status section - and can be happy with the output from `crm_mon`. -@@ -12,9 +12,9 @@ indexterm:[Node,Status] - indexterm:[Status of a Node] - - In addition to the cluster's configuration, the CIB holds an --up-to-date representation of each cluster node in the status section. -+up-to-date representation of each cluster node in the +status+ section. - --.A bare-bones status entry for a healthy node called +cl-virt-1+ -+.A bare-bones status entry for a healthy node *cl-virt-1* - ====== - [source,XML] - ----- -@@ -25,34 +25,32 @@ up-to-date representation of each cluster node in the status section. 
- ----- - ====== - --Users are highly recommended _not to modify_ any part of a node's -+Users are highly recommended _not_ to modify any part of a node's - state _directly_. The cluster will periodically regenerate the entire --section from authoritative sources. So any changes should be done --with the tools for those subsystems. -+section from authoritative sources, so any changes should be done -+with the tools appropriate to those sources. - - .Authoritative Sources for State Information --[width="95%",cols="5m,5<",options="header",align="center"] -+[width="95%",cols="1m,1<",options="header",align="center"] - |========================================================= - --|Dataset |Authoritative Source -+| CIB Object | Authoritative Source - --|node_state fields |crmd -+|node_state|crmd - --|transient_attributes tag |attrd -+|transient_attributes|attrd - --|lrm tag |lrmd -+|lrm|lrmd - - |========================================================= - - The fields used in the +node_state+ objects are named as they are - largely for historical reasons and are rooted in Pacemaker's origins --as the Heartbeat resource manager. -- --They have remained unchanged to preserve compatibility with older --versions. -+as the Heartbeat resource manager. They have remained unchanged to preserve -+compatibility with older versions. - - .Node Status Fields --[width="95%",cols="2m,5<",options="header",align="center"] -+[width="95%",cols="1m,4<",options="header",align="center"] - |========================================================= - - |Field |Description -@@ -61,9 +59,8 @@ versions. - | id | - indexterm:[id,Node Status] - indexterm:[Node,Status,id] --Unique identifier for the node. Corosync based clusters use the uname --of the machine, Heartbeat clusters use a human-readable (but annoying) --UUID. -+Unique identifier for the node. Corosync-based clusters use a numeric -+counter, while Heartbeat clusters use a (barely) human-readable UUID. 
- - | uname | - indexterm:[uname,Node Status] -@@ -73,24 +70,23 @@ The node's machine name (output from `uname -n`). - | ha | - indexterm:[ha,Node Status] - indexterm:[Node,Status,ha] --Flag specifying whether the cluster software is active on the --node. Allowed values: +active+, +dead+. -+Is the cluster software active on this node? Allowed values: +active+, +dead+. - - | in_ccm | - indexterm:[in_ccm,Node Status] - indexterm:[Node,Status,in_ccm] --Flag for cluster membership; allowed values: +true+, +false+. -+Is the node a member of the cluster? Allowed values: +true+, +false+. - - | crmd | - indexterm:[crmd,Node Status] - indexterm:[Node,Status,crmd] --Flag: is the crmd process active on the node? One of +online+, +offline+. -+Is the crmd process active on the node? Allowed values: +online+, +offline+. - - | join | - indexterm:[join,Node Status] - indexterm:[Node,Status,join] --Flag saying whether the node participates in hosting --resources. Possible values: +down+, +pending+, +member+, +banned+. -+Does the node participate in hosting resources? Allowed values: +down+, -++pending+, +member+, +banned+. - - | expected | - indexterm:[expected,Node Status] -@@ -100,47 +96,46 @@ Expected value for +join+. - | crm-debug-origin | - indexterm:[crm-debug-origin,Node Status] - indexterm:[Node,Status,crm-debug-origin] --Diagnostic indicator: the origin of the most recent change(s). -+The origin of the most recent change(s). For diagnostic purposes. - - |========================================================= - --The cluster uses these fields to determine if, at the node level, the -+The cluster uses these fields to determine whether, at the node level, the - node is healthy or is in a failed state and needs to be fenced. - - == Transient Node Attributes == - - Like regular <>, the name/value --pairs listed here also help to describe the node. However they are --forgotten by the cluster when the node goes offline. 
This can be --useful, for instance, when you want a node to be in standby mode (not --able to run resources) until the next reboot. -+pairs listed in the +transient_attributes+ section help to describe the -+node. However they are forgotten by the cluster when the node goes offline. -+This can be useful, for instance, when you want a node to be in standby mode -+(not able to run resources) just until the next reboot. - - In addition to any values the administrator sets, the cluster will - also store information about failed resources here. - --.Example set of transient node attributes for node "cl-virt-1" -+.A set of transient node attributes for node *cl-virt-1* - ====== - [source,XML] - ----- -- -- -- -- -- -- -- -- -+ -+ -+ -+ -+ -+ -+ -+ - ----- - ====== - - In the above example, we can see that the +pingd:0+ resource has --failed once, at +Mon Apr 6 11:22:22 2009+. -+failed once, at 09:22:22 UTC 6 April 2009. - footnote:[ --You can use the standard +date+ command to print a human readable of --any seconds-since-epoch value: -- # `date -d @number` -+You can use the standard `date` command to print a human-readable version of -+any seconds-since-epoch value, for example `date -d @1239009742`. - ] --We also see that the node is connected to three "pingd" peers and that -+We also see that the node is connected to three *pingd* peers and that - all known resources have been checked for on this machine (+probe_complete+). - - == Operation History == -@@ -149,17 +144,17 @@ indexterm:[Operation History] - A node's resource history is held in the +lrm_resources+ tag (a child - of the +lrm+ tag). The information stored here includes enough - information for the cluster to stop the resource safely if it is --removed from the +configuration+ section. Specifically the resource's -+removed from the +configuration+ section. Specifically, the resource's - +id+, +class+, +type+ and +provider+ are stored. 
- --.A record of the apcstonith resource -+.A record of the +apcstonith+ resource - ====== - [source,XML] - - ====== - - Additionally, we store the last job for every combination of --+resource, action+ and +interval+. The concatenation of the values in -++resource+, +action+ and +interval+. The concatenation of the values in - this tuple are used to create the id of the +lrm_rsc_op+ object. - - .Contents of an +lrm_rsc_op+ job -@@ -214,27 +209,30 @@ details on what the values here mean and how they are interpreted. - indexterm:[last-run,Action Status] - indexterm:[Action,Status,last-run] - --Diagnostic indicator. Machine local date/time, in seconds since epoch, --at which the job was executed. -+Machine-local date/time, in seconds since epoch, -+at which the job was executed. For diagnostic purposes. - - | last-rc-change | - indexterm:[last-rc-change,Action Status] - indexterm:[Action,Status,last-rc-change] - --Diagnostic indicator. Machine local date/time, in seconds since epoch, -+Machine-local date/time, in seconds since epoch, - at which the job first returned the current value of +rc-code+. -+For diagnostic purposes. - - | exec-time | - indexterm:[exec-time,Action Status] - indexterm:[Action,Status,exec-time] - --Diagnostic indicator. Time, in milliseconds, that the job was running for. -+Time, in milliseconds, that the job was running for. -+For diagnostic purposes. - - | queue-time | - indexterm:[queue-time,Action Status] - indexterm:[Action,Status,queue-time] - --Diagnostic indicator. Time, in seconds, that the job was queued for in the LRMd. -+Time, in seconds, that the job was queued for in the LRMd. -+For diagnostic purposes. - - | crm_feature_set | - indexterm:[crm_feature_set,Action Status] -@@ -274,13 +272,14 @@ necessary. - indexterm:[crm-debug-origin,Action Status] - indexterm:[Action,Status,crm-debug-origin] - --Diagnostic indicator. The origin of the current values. -+The origin of the current values. -+For diagnostic purposes. 
- - |========================================================= - --=== Simple Example === -+=== Simple Operation History Example === - --.A monitor operation (determines current state of the apcstonith resource) -+.A monitor operation (determines current state of the +apcstonith+ resource) - ====== - [source,XML] - ----- -@@ -299,26 +298,24 @@ Diagnostic indicator. The origin of the current values. - In the above example, the job is a non-recurring monitor operation - often referred to as a "probe" for the +apcstonith+ resource. - --The cluster schedules probes for every configured resource on when a --new node starts, in order to determine the resource's current state -+The cluster schedules probes for every configured resource on a node when -+the node first starts, in order to determine the resource's current state - before it takes any further action. - - From the +transition-key+, we can see that this was the 22nd action of - the 2nd graph produced by this instance of the crmd - (2668bbeb-06d5-40f9-936d-24cb7f87006a). - --The third field of the +transition-key+ contains a 7, this indicates --that the job expects to find the resource inactive. -- --By looking at the +rc-code+ property, we see that this was the case. -- -+The third field of the +transition-key+ contains a 7, which indicates -+that the job expects to find the resource inactive. By looking at the +rc-code+ -+property, we see that this was the case. - --As that is the only job recorded for this node we can conclude that -+As that is the only job recorded for this node, we can conclude that - the cluster started the resource elsewhere. 
- --=== Complex Resource History Example === -+=== Complex Operation History Example === - --.Resource history of a pingd clone with multiple jobs -+.Resource history of a +pingd+ clone with multiple jobs - ====== - [source,XML] - ----- -@@ -364,7 +361,7 @@ Once sorted, the above example can be summarized as: - - The cluster processes each job record to build up a picture of the - resource's state. After the first and second entries, it is --considered stopped and after the third it considered active. -+considered stopped, and after the third it is considered active. - - Based on the last operation, we can tell that the resource is - currently active. -diff --git a/doc/Pacemaker_Explained/en-US/Ch-Stonith.txt b/doc/Pacemaker_Explained/en-US/Ch-Stonith.txt -index fae0fe5..a3c02cb 100644 ---- a/doc/Pacemaker_Explained/en-US/Ch-Stonith.txt -+++ b/doc/Pacemaker_Explained/en-US/Ch-Stonith.txt -@@ -1,16 +1,16 @@ --= Configure STONITH = -+= STONITH = - - //// --We prefer [[ch-stonith]], but older versions of asciidoc dont deal well -+We prefer [[ch-stonith]], but older versions of asciidoc don't deal well - with that construct for chapter headings - //// - anchor:ch-stonith[Chapter 13, STONITH] - indexterm:[STONITH, Configuration] - --== What Is STONITH == -+== What Is STONITH? == - --STONITH is an acronym for Shoot-The-Other-Node-In-The-Head and it --protects your data from being corrupted by rogue nodes or concurrent -+STONITH (an acronym for "Shoot The Other Node In The Head"), also called -+'fencing', protects your data from being corrupted by rogue nodes or concurrent - access. - - Just because a node is unresponsive, this doesn't mean it isn't -@@ -18,19 +18,18 @@ accessing your data. The only way to be 100% sure that your data is - safe, is to use STONITH so we can be certain that the node is truly - offline, before allowing the data to be accessed from another node. - -- - STONITH also has a role to play in the event that a clustered service - cannot be stopped.
In this case, the cluster uses STONITH to force the - whole node offline, thereby making it safe to start the service - elsewhere. - --== What STONITH Device Should You Use == -+== What STONITH Device Should You Use? == - - It is crucial that the STONITH device can allow the cluster to - differentiate between a node failure and a network one. - - The biggest mistake people make in choosing a STONITH device is to --use remote power switch (such as many on-board IMPI controllers) that -+use a remote power switch (such as many on-board IPMI controllers) that - shares power with the node it controls. In such cases, the cluster - cannot be sure if the node is really offline, or active and suffering - from a network fault. -@@ -38,46 +37,47 @@ from a network fault. - Likewise, any device that relies on the machine being active (such as - SSH-based "devices" used during testing) are inappropriate. - --== Differences of STONITH Resources == -+== Special Treatment of STONITH Resources == -+ -+STONITH resources are somewhat special in Pacemaker. - --Stonith resources are somewhat special in Pacemaker. -+STONITH may be initiated by pacemaker or by other parts of the cluster -+(such as resources like DRBD or DLM). To accommodate this, pacemaker -+does not require the STONITH resource to be in the 'started' state -+in order to be used, thus allowing reliable use of STONITH devices in such a -+case. - --In previous versions, only "running" resources could be used by --Pacemaker for fencing. This requirement has been relaxed to allow --other parts of the cluster (such as resources like DRBD) to reliably --initiate fencing. footnote:[Fencing a node while Pacemaker was moving --stonith resources around would otherwise fail] -+[NOTE] -+==== -+In pacemaker versions 1.1.9 and earlier, this feature either did not exist or -+did not work well. 
Only "running" STONITH resources could be used by Pacemaker -+for fencing, and if another component tried to fence a node while Pacemaker was -+moving STONITH resources, the fencing could fail. -+==== - --Now all nodes have access to their definitions and instantiate them --on-the-fly when needed, however preference is given to 'verified' --instances which are the ones the cluster has explicitly started. -+All nodes have access to STONITH devices' definitions and instantiate them -+on-the-fly when needed, but preference is given to 'verified' instances, which -+are the ones that are 'started' according to the cluster's knowledge. - - In the case of a cluster split, the partition with a verified instance --will have a slight advantage as stonith-ng in the other partition will --have to hear from all its current peers before choosing a node to -+will have a slight advantage, because the STONITH daemon in the other partition -+will have to hear from all its current peers before choosing a node to - perform the fencing. - --[NOTE] --=========== --To disable a fencing device/resource, 'target-role' can be set as you would for a normal resource. --=========== -+Fencing resources do work the same as regular resources in some respects: - --[NOTE] --=========== --To prevent a specific node from using a fencing device, location constraints will work as expected. --=========== -+* +target-role+ can be used to enable or disable the resource -+* Location constraints can be used to prevent a specific node from using the resource - - [IMPORTANT] - =========== -- --Currently there is a limitation that fencing resources may only have a --one set of meta-attributes and one set of instance-attributes. This -+Currently there is a limitation that fencing resources may only have -+one set of meta-attributes and one set of instance attributes. This - can be revisited if it becomes a significant limitation for people. 
-- - =========== - --.Properties of Fencing Devices --[width="95%",cols="1m,1m,1m,5<",options="header",align="center"] -+.Properties of Fencing Resources -+[width="95%",cols="5m,2,3,10+ -+. Find the required parameters associated with the device -+ (replacing $AGENT_NAME with the name obtained from the previous step): -++ -+---- -+# stonith_admin --metadata --agent $AGENT_NAME -+---- - - . Create a file called +stonith.xml+ containing a primitive resource -- with a class of 'stonith', a type of and a parameter -- for each of the values returned in step 2. -+ with a class of +stonith+, a type equal to the agent name obtained earlier, -+ and a parameter for each of the values returned in the previous step. - - . If the device does not know how to fence nodes based on their uname, - you may also need to set the special +pcmk_host_map+ parameter. See -- +man stonithd+ for details. -+ `man stonithd` for details. - --. If the device does not support the list command, you may also need -+. If the device does not support the `list` command, you may also need - to set the special +pcmk_host_list+ and/or +pcmk_host_check+ -- parameters. See +man stonithd+ for details. -+ parameters. See `man stonithd` for details. - - . If the device does not expect the victim to be specified with the -- port parameter, you may also need to set the special -- +pcmk_host_argument+ parameter. See +man stonithd+ for details. -+ `port` parameter, you may also need to set the special -+ +pcmk_host_argument+ parameter. See `man stonithd` for details. - --. Upload it into the CIB using cibadmin: +cibadmin -C -o resources --xml-file stonith.xml+ -+. Upload it into the CIB using cibadmin: -++ -+---- -+# cibadmin -C -o resources --xml-file stonith.xml -+---- - --. Set stonith-enabled to true. +crm_attribute -t crm_config -n stonith-enabled -v true+ -+. Set +stonith-enabled+ to true: -++ -+---- -+# crm_attribute -t crm_config -n stonith-enabled -v true -+---- - --. 
Once the stonith resource is running, you can test it by executing: -- +stonith_admin --reboot nodename+. Although you might want to stop the -- cluster on that machine first. -+. Once the stonith resource is running, you can test it by executing the -+ following (although you might want to stop the cluster on that machine -+ first): -++ -+---- -+# stonith_admin --reboot nodename -+---- - --=== Example === -+=== Example STONITH Configuration === - --Assuming we have an chassis containing four nodes and an IPMI device --active on 10.0.0.1, then we would chose the fence_ipmilan driver in step --2 and obtain the following list of parameters -+Assume we have a chassis containing four nodes and an IPMI device -+active on 192.0.2.1. We would choose the `fence_ipmilan` driver, -+and obtain the following list of parameters: - - .Obtaining a list of STONITH Parameters -- --[source,C] -+==== - ---- - # stonith_admin --metadata -a fence_ipmilan - ---- - - [source,XML] - ---- -- - -- --fence_ipmilan is an I/O Fencing agent which can be used with machines controlled by IPMI. This agent calls support software using ipmitool (http://ipmitool.sf.net/). -- --To use fence_ipmilan with HP iLO 3 you have to enable lanplus option (lanplus / -P) and increase wait after operation to 4 seconds (power_wait=4 / -T 4) -- -- -- -- -- IPMI Lan Auth type (md5, password, or none) -- -- -- -- -- IPMI Lan IP to talk to -- -- -- -- -- Password (if required) to control power on IPMI device -- -- -- -- -- Script to retrieve password (if required) -- -- -- -- -- Use Lanplus -- -- -- -- -- Username/Login (if required) to control power on IPMI device -- -- -- -- -- Operation to perform.
Valid operations: on, off, reboot, status, list, diag, monitor or metadata -- -- -- -- -- Timeout (sec) for IPMI operation -- -- -- -- -- Ciphersuite to use (same as ipmitool -C parameter) -- -- -- -- -- Method to fence (onoff or cycle) -- -- -- -- -- Wait X seconds after on/off operation -- -- -- -- -- Wait X seconds before fencing is started -- -- -- -- -- Verbose mode -- -- -- -- -- -- -- -- -- -- -- -- -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ - - ---- -+==== - --from which we would create a STONITH resource fragment that might look -+Based on that, we would create a STONITH resource fragment that might look - like this: - --.Sample STONITH Resource -+.An IPMI-based STONITH Resource -+==== - [source,XML] - ---- - - - - -- -+ - - - -@@ -413,40 +501,39 @@ like this: - - - ---- -+==== - --And finally, since we disabled it earlier, we need to re-enable STONITH. -- --[source,Bash] -+Finally, we need to enable STONITH: - ---- - # crm_attribute -t crm_config -n stonith-enabled -v true - ---- - --== Advanced Fencing Configurations == -+== Advanced STONITH Configurations == - - Some people consider that having one fencing device is a single point - of failure footnote:[Not true, since a node or resource must fail --before fencing even has a chance to], others prefer removing the node -+before fencing even has a chance to]; others prefer removing the node - from the storage and network instead of turning it off. - - Whatever the reason, Pacemaker supports fencing nodes with multiple --devices through a feature called fencing topologies. -+devices through a feature called 'fencing topologies'. 
- --Simply create the individual devices as you normally would and then --define one or more fencing levels in the fencing-topology section in -+Simply create the individual devices as you normally would, then -+define one or more +fencing-level+ entries in the +fencing-topology+ section of - the configuration. - --* Each level is attempted in +ascending index+ order --* If a device fails, +processing terminates+ for the current level. -- No further devices in that level are exercised and the next level is attempted instead. --* If the operation succeeds for all the listed devices in a level, the level is deemed to have passed --* The operation is finished +when a level has passed+ (success), or all levels have been attempted (failed) --* If the operation failed, the next step is determined by the Policy Engine and/or crmd. -+* Each fencing level is attempted in order of ascending +index+. -+* If a device fails, processing terminates for the current level. -+ No further devices in that level are exercised, and the next level is attempted instead. -+* If the operation succeeds for all the listed devices in a level, the level is deemed to have passed. -+* The operation is finished when a level has passed (success), or all levels have been attempted (failed). -+* If the operation failed, the next step is determined by the Policy Engine and/or `crmd`. 
- - Some possible uses of topologies include: - --* try poison-pill and fail back to power --* try disk and network, and fall back to power if either fails --* initiate a kdump and then poweroff the node -+* Try poison-pill and fall back to power -+* Try disk and network, and fall back to power if either fails -+* Initiate a kdump and then power off the node - - .Properties of Fencing Levels - [width="95%",cols="1m,6<",options="header",align="center"] -@@ -456,7 +543,7 @@ Some possible uses of topologies include: - |Description - - |id --|Your name for the level -+|A unique name for the level - indexterm:[id,fencing-level] - indexterm:[Fencing,fencing-level,id] - -@@ -467,18 +554,19 @@ Some possible uses of topologies include: - - |index - |The order in which to attempt the levels. -- Levels are attempted in +ascending index+ order +until one succeeds+. -+ Levels are attempted in ascending order 'until one succeeds'. - indexterm:[index,fencing-level] - indexterm:[Fencing,fencing-level,index] - - |devices --|A comma separated list of devices for which the -+|A comma-separated list of devices that must all be tried for this level - indexterm:[devices,fencing-level] - indexterm:[Fencing,fencing-level,devices] - - |========================================================= - --=== Example use of Fencing Topologies === -+.Fencing topology with different devices for different nodes -+==== - [source,XML] - ---- - -@@ -498,21 +586,24 @@ Some possible uses of topologies include: - - - ---- -+==== - --=== Example use of advanced Fencing Topologies: dual layer and dual devices === -+=== Example Dual-Layer, Dual-Device Fencing Topologies === - --The following example illustrates an advanced use of +fencing_topology+ in a cluster with the following properties: -+The following example illustrates an advanced use of +fencing-topology+ in a cluster with the following properties: - - * 3 nodes (2 active prod-mysql nodes, 1 prod_mysql-rep in standby for quorum purposes) --* the active
nodes have an IPMI-controlled power board reached at 10.10.10.1 and 10.10.10.2 --* the active nodes also have two independant PSUs (Power Supplu Units) connected to two independant PDUs (Power Distribution Unit) reached at 10.20.1.1 (port 10 and port 11) and 10.20.2.1 (port 10 and port 11) --* the first fencing method uses the +fence_ipmi+ agent --* the second fencing method uses the +fence_apc_snmp+ agent targetting 2 fencing devices (one per PSU, either port 10 or 11) -+* the active nodes have an IPMI-controlled power board reached at 192.0.2.1 and 192.0.2.2 -+* the active nodes also have two independent PSUs (Power Supply Units) -+ connected to two independent PDUs (Power Distribution Units) reached at -+ 198.51.100.1 (port 10 and port 11) and 203.0.113.1 (port 10 and port 11) -+* the first fencing method uses the `fence_ipmi` agent -+* the second fencing method uses the `fence_apc_snmp` agent targeting 2 fencing devices (one per PSU, either port 10 or 11) - * fencing is only implemented for the active nodes and has location constraints - * fencing topology is set to try IPMI fencing first then default to a "sure-kill" dual PDU fencing - --In a normal failure scenario, STONITH will first select +fence_ipmi+ to try and kill the faulty node. --Using a +fencing_topology+, if that first method fails, STONITH will then move on to selecting +fence_apc_snmp+ twice: -+In a normal failure scenario, STONITH will first select +fence_ipmi+ to try to kill the faulty node.
-+Using a fencing topology, if that first method fails, STONITH will then move on to selecting +fence_apc_snmp+ twice: - - * once for the first PDU - * again for the second PDU -@@ -524,33 +615,34 @@ The fence action is considered successful only if both PDUs report the required - Each cluster node has it own dedicated IPMI channel that can be called for fencing using the following primitives: - [source,XML] - ---- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ - ---- - - .Second fencing method: dual PDU devices - --Each cluster node also has two distinct power channels controlled by two distinct PDUs. That means a total of 4 fencing devices configured as follows: -+Each cluster node also has two distinct power channels controlled by two -+distinct PDUs. That means a total of 4 fencing devices configured as follows: - - - Node 1, PDU 1, PSU 1 @ port 10 - - Node 1, PDU 2, PSU 2 @ port 10 -@@ -560,62 +652,62 @@ Each cluster node also has two distinct power channels controlled by two distinc - The matching fencing agents are configured as follows: - [source,XML] - ---- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ - ---- - - .Location Constraints - --To prevent STONITH from running a fencing agent on the very same node it is supposed to fence, constraints are placed on all the fencing primitives: -+To prevent STONITH from trying to run a fencing agent on the same node it is -+supposed to fence, constraints are placed on all the fencing primitives: - [source,XML] - ---- -- -- -- -- -- -- -- -- -+ -+ -+ -+ -+ -+ -+ -+ - ---- - - .Fencing topology -@@ -624,14 +716,14 @@ Now that all the fencing resources are defined, it's time to create the right to - We want to first fence using IPMI and if 
that does not work, fence both PDUs to effectively and surely kill the node. - [source,XML] - ---- -- -- -- -- -- -- -+ -+ -+ -+ -+ -+ - ---- --Please note, in +fencing_topology+, the lower index value determines the priority of the first fencing method. -+Please note, in +fencing-topology+, the lowest +index+ value determines the priority of the first fencing method. - - .Final configuration - -@@ -660,7 +752,7 @@ Put together, the configuration looks like this: - - - -- -+ - - - -@@ -671,7 +763,7 @@ Put together, the configuration looks like this: - - - -- -+ - - - -@@ -682,7 +774,7 @@ Put together, the configuration looks like this: - - - -- -+ - - - -@@ -692,7 +784,7 @@ Put together, the configuration looks like this: - - - -- -+ - - - -@@ -702,7 +794,7 @@ Put together, the configuration looks like this: - - - -- -+ - - - -@@ -712,7 +804,7 @@ Put together, the configuration looks like this: - - - -- -+ - - - -@@ -739,3 +831,29 @@ Put together, the configuration looks like this: - - - ---- -+ -+== Remapping Reboots == -+ -+When the cluster needs to reboot a node, whether because +stonith-action+ is +reboot+ or because -+a reboot was manually requested (such as by `stonith_admin --reboot`), it will remap that to -+other commands in two cases: -+ -+. If the chosen fencing device does not support the +reboot+ command, the cluster -+ will ask it to perform +off+ instead. -+ -+. If a fencing topology level with multiple devices must be executed, the cluster -+ will ask all the devices to perform +off+, then ask the devices to perform +on+. -+ -+To understand the second case, consider the example of a node with redundant -+power supplies connected to intelligent power switches. Rebooting one switch -+and then the other would have no effect on the node. Turning both switches off, -+and then on, actually reboots the node. 
-+ -+In such a case, the fencing operation will be treated as successful as long as -+the +off+ commands succeed, because then it is safe for the cluster to recover -+any resources that were on the node. Timeouts and errors in the +on+ phase will -+be logged but ignored. -+ -+When a reboot operation is remapped, any action-specific timeout for the -+remapped action will be used (for example, +pcmk_off_timeout+ will be used when -+executing the +off+ command, not +pcmk_reboot_timeout+). -diff --git a/doc/Pacemaker_Explained/en-US/Ch-Utilization.txt b/doc/Pacemaker_Explained/en-US/Ch-Utilization.txt -index 6eeb063..afba0a9 100644 ---- a/doc/Pacemaker_Explained/en-US/Ch-Utilization.txt -+++ b/doc/Pacemaker_Explained/en-US/Ch-Utilization.txt -@@ -1,39 +1,38 @@ - = Utilization and Placement Strategy = - --== Background == -- - Pacemaker decides where to place a resource according to the resource - allocation scores on every node. The resource will be allocated to the --node where the resource has the highest score. If the resource allocation --scores on all the nodes are equal, by the `default` placement strategy, --Pacemaker will choose a node with the least number of allocated resources --for balancing the load. If the number of resources on each node is equal, --the first eligible node listed in cib will be chosen to run the resource. -+node where the resource has the highest score. - --Though resources are different. They may consume different amounts of the --capacities of the nodes. Actually, we cannot ideally balance the load just --according to the number of resources allocated to a node. Besides, if --resources are placed such that their combined requirements exceed the --provided capacity, they may fail to start completely or run with degraded --performance. -+If the resource allocation scores on all the nodes are equal, by the default -+placement strategy, Pacemaker will choose a node with the least number of -+allocated resources for balancing the load. 
If the number of resources on each -+node is equal, the first eligible node listed in the CIB will be chosen to run -+the resource. - --To take these into account, Pacemaker allows you to specify the following --configurations: -+Often, in real-world situations, different resources use significantly -+different proportions of a node's capacities (memory, I/O, etc.). -+We cannot balance the load ideally just according to the number of resources -+allocated to a node. Besides, if resources are placed such that their combined -+requirements exceed the provided capacity, they may fail to start completely or -+run with degraded performance. - --. The `capacity` a certain `node provides`. --. The `capacity` a certain `resource requires`. --. An overall `strategy` for placement of resources. -+To take these factors into account, Pacemaker allows you to configure: - -+. The capacity a certain node provides. -+. The capacity a certain resource requires. -+. An overall strategy for placement of resources. - - == Utilization attributes == - --To configure the capacity a node provides and the resource's requirements, --use `utilization` attributes. You can name the `utilization` attributes --according to your preferences and define as many `name/value` pairs as your --configuration needs. However, the attribute's values must be `integers`. -- --First, specify the capacities the nodes provide: -+To configure the capacity that a node provides or a resource requires, -+you can use 'utilization attributes' in +node+ and +resource+ objects. -+You can name utilization attributes according to your preferences and define as -+many name/value pairs as your configuration needs. However, the attributes' -+values must be integers. 
- -+.Specifying CPU and RAM capacities of two nodes -+==== - [source,XML] - ---- - -@@ -49,9 +48,10 @@ First, specify the capacities the nodes provide: - - - ---- -+==== - --Then, specify the capacities the resources require: -- -+.Specifying CPU and RAM consumed by several resources -+==== - [source,XML] - ---- - -@@ -73,115 +73,118 @@ Then, specify the capacities the resources require: - - - ---- -+==== - - A node is considered eligible for a resource if it has sufficient free - capacity to satisfy the resource's requirements. The nature of the required --or provided capacities is completely irrelevant for Pacemaker, it just makes -+or provided capacities is completely irrelevant to Pacemaker -- it just makes - sure that all capacity requirements of a resource are satisfied before placing - a resource to a node. - -- - == Placement Strategy == - - After you have configured the capacities your nodes provide and the --capacities your resources require, you need to set the `placement-strategy` -+capacities your resources require, you need to set the +placement-strategy+ - in the global cluster options, otherwise the capacity configurations have --`no effect`. -+'no effect'. - --Four values are available for the `placement-strategy`: -+Four values are available for the +placement-strategy+: - --`default`:: -++default+:: - --Utilization values are not taken into account at all, per default. -+Utilization values are not taken into account at all. - Resources are allocated according to allocation scores. If scores are equal, - resources are evenly distributed across nodes. - --`utilization`:: -++utilization+:: - --Utilization values are taken into account when deciding whether a node --is considered eligible if it has sufficient free capacity to satisfy the --resource's requirements. However, load-balancing is still done based on the -+Utilization values are taken into account 'only' when deciding whether a node -+is considered eligible (i.e. 
whether it has sufficient free capacity to satisfy -+the resource's requirements). Load-balancing is still done based on the - number of resources allocated to a node. - --`balanced`:: -++balanced+:: - - Utilization values are taken into account when deciding whether a node --is eligible to serve a resource; an attempt is made to spread the resources --evenly, optimizing resource performance. -+is eligible to serve a resource 'and' when load-balancing, so an attempt is -+made to spread the resources in a way that optimizes resource performance. - --`minimal`:: -++minimal+:: - --Utilization values are taken into account when deciding whether a node --is eligible to serve a resource; an attempt is made to concentrate the --resources on as few nodes as possible, thereby enabling possible power savings --on the remaining nodes. -+Utilization values are taken into account 'only' when deciding whether a node -+is eligible to serve a resource. For load-balancing, an attempt is made to -+concentrate the resources on as few nodes as possible, thereby enabling -+possible power savings on the remaining nodes. - - --Set `placement-strategy` with `crm_attribute`: --[source,C] --# crm_attribute --attr-name placement-strategy --attr-value balanced -+Set +placement-strategy+ with `crm_attribute`: -+---- -+# crm_attribute --name placement-strategy --update balanced -+---- - - Now Pacemaker will ensure the load from your resources will be distributed --evenly throughout the cluster - without the need for convoluted sets of -+evenly throughout the cluster, without the need for convoluted sets of - colocation constraints. - -- - == Allocation Details == - --=== Which node is preferred to be chosen to get consumed first on allocating resources? === -+=== Which node is preferred to get consumed first when allocating resources? === - --- The node that is most healthy (which has the highest node weight) gets --consumed first. -+- The node with the highest node weight gets consumed first. 
Node weight -+ is a score maintained by the cluster to represent node health. - --- If their weights are equal: -- * If `placement-strategy="default|utilization"`, -+- If multiple nodes have the same node weight: -+ * If +placement-strategy+ is +default+ or +utilization+, - the node that has the least number of allocated resources gets consumed first. - ** If their numbers of allocated resources are equal, -- the first eligible node listed in cib gets consumed first. -+ the first eligible node listed in the CIB gets consumed first. - -- * If `placement-strategy="balanced"`, -- the node that has more free capacity gets consumed first. -+ * If +placement-strategy+ is +balanced+, -+ the node that has the most free capacity gets consumed first. - ** If the free capacities of the nodes are equal, - the node that has the least number of allocated resources gets consumed first. - *** If their numbers of allocated resources are equal, -- the first eligible node listed in cib gets consumed first. -+ the first eligible node listed in the CIB gets consumed first. - -- * If `placement-strategy="minimal"`, -- the first eligible node listed in cib gets consumed first. -+ * If +placement-strategy+ is +minimal+, -+ the first eligible node listed in the CIB gets consumed first. - -+=== Which node has more free capacity? === - --==== Which node has more free capacity? ==== -+If only one type of utilization attribute has been defined, free capacity -+is a simple numeric comparison. - --This will be quite clear if we only define one type of `capacity`. While if we --define multiple types of `capacity`, for example: -+If multiple types of utilization attributes have been defined, then -+the node that is numerically highest in the most attribute types -+has the most free capacity. For example: - --- If `nodeA` has more free `cpus`, `nodeB` has more free `memory`, -- their free capacities are equal.
-+- If +nodeA+ has more free +cpus+, and +nodeB+ has more free +memory+, -+ then their free capacities are equal. - --- If `nodeA` has more free `cpus`, while `nodeB` has more free `memory` and `storage`, -- `nodeB` has more free capacity. -+- If +nodeA+ has more free +cpus+, while +nodeB+ has more free +memory+ and +storage+, -+ then +nodeB+ has more free capacity. - -+=== Which resource is preferred to be assigned first? === - --=== Which resource is preferred to be chosen to get assigned first? === -+- The resource that has the highest +priority+ (see <>) gets allocated first. - --- The resource that has the highest priority gets allocated first. -+- If their priorities are equal, check whether they are already running. The -+ resource that has the highest score on the node where it's running gets allocated -+ first, to prevent resource shuffling. - --- If their priorities are equal, check if they are already running. The --resource that has the highest score on the node where it's running gets allocated --first (to prevent resource shuffling). -- --- If the scores above are equal or they are not running, the resource has -+- If the scores above are equal or the resources are not running, the resource that has - the highest score on the preferred node gets allocated first. - --- If the scores above are equal, the first runnable resource listed in cib gets allocated first. -+- If the scores above are equal, the first runnable resource listed in the CIB -+ gets allocated first.
- - --== Limitations == -+== Limitations and Workarounds == - --This type of problem Pacemaker is dealing with here is known as the -+The type of problem Pacemaker is dealing with here is known as the - http://en.wikipedia.org/wiki/Knapsack_problem[knapsack problem] and falls into - the http://en.wikipedia.org/wiki/NP-complete[NP-complete] category of computer --science problems - which is fancy way of saying "it takes a really long time -+science problems -- a fancy way of saying "it takes a really long time - to solve". - - Clearly in a HA cluster, it's not acceptable to spend minutes, let alone hours -@@ -193,29 +196,32 @@ service. This means it arrives at a solution much faster than traditional - linear programming algorithms, but by doing so at the price of leaving some - services stopped. - --In the contrived example above: -+In the contrived example at the start of this chapter: - --- `rsc-small` would be allocated to `node1` --- `rsc-medium` would be allocated to `node2` --- `rsc-large` would remain inactive -+- +rsc-small+ would be allocated to +node1+ -+- +rsc-medium+ would be allocated to +node2+ -+- +rsc-large+ would remain inactive - - Which is not ideal. - -+There are various approaches to dealing with the limitations of -+pacemaker's placement strategy: - --== Strategies for Dealing with the Limitations == -+Ensure you have sufficient physical capacity.:: - --- Ensure you have sufficient physical capacity. --It might sounds obvious, but if the physical capacity of your nodes is (close to) -+It might sound obvious, but if the physical capacity of your nodes is (close to) - maxed out by the cluster under normal conditions, then failover isn't going to --go well. Even without the Utilization feature, you'll start hitting timeouts and --getting secondary failures'. -+go well. Even without the utilization feature, you'll start hitting timeouts and -+getting secondary failures. 
-+ -+Build some buffer into the capabilities advertised by the nodes.:: -+ -+Advertise slightly more resources than we physically have, on the (usually valid) -+assumption that a resource will not use 100% of the configured amount of -+CPU, memory and so forth 'all' the time. This practice is sometimes called 'overcommit'. - --- Build some buffer into the capabilities advertised by the nodes. --Advertise slightly more resources than we physically have on the (usually valid) --assumption that a resource will not use 100% of the configured number of --cpu/memory/etc `all` the time. This practice is also known as 'over commit'. -+Specify resource priorities.:: - --- Specify resource priorities. - If the cluster is going to sacrifice services, it should be the ones you care --(comparatively) about the least. Ensure that resource priorities are properly set -+about (comparatively) the least. Ensure that resource priorities are properly set - so that your most important resources are scheduled first. -diff --git a/doc/Pacemaker_Explained/en-US/NOTES b/doc/Pacemaker_Explained/en-US/NOTES -index ae5069b..d6c6f2f 100644 ---- a/doc/Pacemaker_Explained/en-US/NOTES -+++ b/doc/Pacemaker_Explained/en-US/NOTES -@@ -1,69 +1,16 @@ -+why sometimes and sometimes
? examples have title at top, figures have title at bottom - --That's a "+", not a hyphen: -- --Key combinations can be distinguished from keycaps by the hyphen connecting each part of a key --combination. For example: --Press Enter to execute the command. --Press Ctrl+Alt+F2 to switch to the first virtual terminal. Press Ctrl+Alt+F1 to --return to your X-Windows session. -- -- -- -- --doesn't apply here: -- --If source code is discussed, class names, methods, functions, v -- including application names; dialog box text; labeled buttons; check-box and radio button labels; menu titles and sub-menu titles. -- -- --1.2 terminal output has page-break -- -- --2.3 editing via VI --isn't that racy? Are concurrent changes detected? -- -- --why sometimes and sometimes
? example 2.2 has title at top, different to the figures -- -- --2.8 header slightly too long, line broken -- -+Example 2.8 (and others) XML line too long, line broken - - some are in , some in ... I'd like the latter more, or perhaps in a . - Indentation makes whitespace at start of lines ... remove? - --Chapter 3 --========= --- table 3.3 "Properties maintained by the Cluster" incomplete (crm-feature-set, ...) -- -- --4.4.2 structure different from 4.4.1 ... numbered lists -- -- --5.3 Notes have next content overlaid -- -- --ex 5.6: 1.0 ???? -- -- --perhaps use 10.20.30.40 instead of the 1.2.3.4 example IP address? -- -- --6.5 images/resource-set.png missing, "images/two-sets.png" too; images/three-sets; "images/three-sets-complex.png" -- -+tables 3.1 and 3.2 incomplete (crm-feature-set, ...) - - Ch 7 missing? - -- - Remove Ex9.9? - -- --collocate or colocate? Eg. in C.1: --Multi-dimensional colocation and ordering constraints. See Section 6.5, “Ordering Sets of --Resources” and Section 6.6, “Collocating Sets of Resources” -- -- - Ap-Debug.xml not used? - -- - alias for primary? 
-diff --git a/doc/Pacemaker_Explained/en-US/Pacemaker_Explained.ent b/doc/Pacemaker_Explained/en-US/Pacemaker_Explained.ent -index b9137bc..74004c8 100644 ---- a/doc/Pacemaker_Explained/en-US/Pacemaker_Explained.ent -+++ b/doc/Pacemaker_Explained/en-US/Pacemaker_Explained.ent -@@ -1,4 +1,4 @@ - - -- -+ - -diff --git a/doc/Pacemaker_Explained/en-US/Pacemaker_Explained.xml b/doc/Pacemaker_Explained/en-US/Pacemaker_Explained.xml -index aa1eab4..fe054f3 100644 ---- a/doc/Pacemaker_Explained/en-US/Pacemaker_Explained.xml -+++ b/doc/Pacemaker_Explained/en-US/Pacemaker_Explained.xml -@@ -32,10 +32,10 @@ - - Further Reading - -- Project Website -- Project Documentation -+ Project Website: -+ Project Documentation: - -- A comprehensive guide to cluster commands has been written by Novell -+ SUSE High Availability Guide: - - Heartbeat configuration: - Corosync Configuration: -diff --git a/doc/Pacemaker_Explained/en-US/Preface.xml b/doc/Pacemaker_Explained/en-US/Preface.xml -index eadd41d..e63c4be 100644 ---- a/doc/Pacemaker_Explained/en-US/Preface.xml -+++ b/doc/Pacemaker_Explained/en-US/Preface.xml -@@ -1,11 +1,11 @@ - -- - Preface -- -+ - - - -- -+ - - - -diff --git a/doc/Pacemaker_Explained/en-US/Revision_History.xml b/doc/Pacemaker_Explained/en-US/Revision_History.xml -index 0afc90b..eecd34b 100644 ---- a/doc/Pacemaker_Explained/en-US/Revision_History.xml -+++ b/doc/Pacemaker_Explained/en-US/Revision_History.xml -@@ -2,6 +2,7 @@ - - -+ - Revision History - - -@@ -42,6 +43,18 @@ - - - -+ -+ 5-0 -+ Mon Feb 23 2015 -+ KenGaillotkgaillot@redhat.com -+ -+ -+ -+ Update for clarity, stylistic consistency and current command-line syntax -+ -+ -+ -+ - - - -diff --git a/doc/Pacemaker_Remote/en-US/Ch-Baremetal-Tutorial.txt b/doc/Pacemaker_Remote/en-US/Ch-Baremetal-Tutorial.txt -index 9bf6dc6..d0fd14b 100644 ---- a/doc/Pacemaker_Remote/en-US/Ch-Baremetal-Tutorial.txt -+++ b/doc/Pacemaker_Remote/en-US/Ch-Baremetal-Tutorial.txt -@@ -42,7 +42,7 @@ Now install and start the
pacemaker_remote daemon on the baremetal remote-node. - - [source,C] - ---- --# yum install -y paceamaker-remote resource-agents pcs -+# yum install -y pacemaker-remote resource-agents pcs - # systemctl enable pacemaker_remote.service - # systemctl start pacemaker_remote.service - ---- -diff --git a/doc/Pacemaker_Remote/en-US/Ch-Intro.txt b/doc/Pacemaker_Remote/en-US/Ch-Intro.txt -index d8699b3..777bb97 100644 ---- a/doc/Pacemaker_Remote/en-US/Ch-Intro.txt -+++ b/doc/Pacemaker_Remote/en-US/Ch-Intro.txt -@@ -48,7 +48,7 @@ In the past, users desiring this deployment had to make a decision. They would e - - With the pacemaker_remote service we have a new option. - --* The baremetal cluster-nodes run the cluster stack (paceamaker+corosync). -+* The baremetal cluster-nodes run the cluster stack (pacemaker+corosync). - * The virtual remote-nodes run the pacemaker_remote service (nearly zero configuration required on the virtual machine side) - * The cluster stack on the cluster-nodes launch the virtual machines and immediately connect to the pacemaker_remote service, allowing the virtual machines to integrate into the cluster just as if they were a real cluster-node. - -@@ -66,7 +66,7 @@ With this deployment you would have 64 webservers and databases running on 64 vi - - +"I want my traditional High Availability cluster to scale beyond the limits imposed by the corosync messaging layer."+ - --Ultimately the primary advantage of baremetal remote-nodes over traditional nodes running the Corosync+Pacemaker stack is scalability. There are likely some other use cases related to geographically distributed HA clusters that baremetal remote-nodes may serve a purpose in, but those use cases not well understood at this point. The only limitations baremetal remote-nodes have that cluster-nodes do not is the ability to take place in cluster quorum, and the ability to execute fencing agents via stonith. 
That is not to say however that fencing of a baremetal node works any differently than that of a normal cluster-node. The Pacemaker policy engine understands how to fence baremetal remote-nodes. As long as a fencing device exists, the cluster is capable of ensuring baremetal nodes are fenced in the exact same way as normal cluster-nodes are fenced. -+Ultimately the primary advantage of baremetal remote-nodes over traditional nodes running the Corosync+Pacemaker stack is scalability. There are likely some other use cases related to geographically distributed HA clusters that baremetal remote-nodes may serve a purpose in, but those use cases are not well understood at this point. The only limitations baremetal remote-nodes have that cluster-nodes do not is the ability to take place in cluster quorum, and the ability to execute fencing agents via stonith. That is not to say however that fencing of a baremetal node works any differently than that of a normal cluster-node. The Pacemaker policy engine understands how to fence baremetal remote-nodes. As long as a fencing device exists, the cluster is capable of ensuring baremetal nodes are fenced in the exact same way as normal cluster-nodes are fenced. 
- - == Linux Container Use Case == - -diff --git a/doc/Pacemaker_Remote/en-US/Ch-KVM-Tutorial.txt b/doc/Pacemaker_Remote/en-US/Ch-KVM-Tutorial.txt -index adf3422..7b150aa 100644 ---- a/doc/Pacemaker_Remote/en-US/Ch-KVM-Tutorial.txt -+++ b/doc/Pacemaker_Remote/en-US/Ch-KVM-Tutorial.txt -@@ -192,7 +192,7 @@ Now on the +GUEST+ install pacemaker-remote package and enable the daemon to run - - [source,C] - ---- --# yum install -y pacemaker paceamaker-remote resource-agents -+# yum install -y pacemaker pacemaker-remote resource-agents - # systemctl enable pacemaker_remote.service - ---- - -diff --git a/doc/asciidoc.reference b/doc/asciidoc.reference -index a9a171b..9323864 100644 ---- a/doc/asciidoc.reference -+++ b/doc/asciidoc.reference -@@ -1,31 +1,49 @@ -+= Single-chapter part of the documentation = -+ -+== Go-to reference chapter for how we use AsciiDoc on this project == -+ -+[NOTE] -+====== -+This is *not* an attempt at a fully self-hosted AsciiDoc document; -+consider it a plaintext full of AsciiDoc samples (it's up to the reader -+to recognize the borderline) at documentation writers' disposal -+to somewhat standardize the style{empty}footnote:[ -+ style of both source notation and final visual appearance -+].
-+ - See also: - http://powerman.name/doc/asciidoc -+====== - --Commands: `some-tool --with option` --Files: '/tmp/file.name' --Italic: _some text_ -+Emphasis: _some test_ - Mono: +some text+ --Bold: *some text* --Super: ^some text^ --Sub: ~some text~ -+Strong: *some text* -+Super: ^some text^ -+Sub: ~some text~ - Quotes: - ``double quoted'' - `single quoted' - --Tool: command -+Command: `some-tool --with option` -+Newly introduced term: -+ 'some text' (another form of emphasis as of this edit) -+ -+File: mono - Literal: mono -+Tool: command -+Option: mono -+Replaceable: emphasis mono - Varname: mono --Option: italic --Emphasis: italic bold --Replaceable: italic mono -+Term encountered on system (e.g., menu choice, hostname): -+ strong - - --.Title for Eaxmple -+.Title for Example - ===== - Some text - ===== - --.Title for Eaxmple with XML Listing -+.Title for Example with XML Listing - ===== - [source,XML] - ----- -@@ -49,4 +67,4 @@ Section anchors: - - References to section anchors: - --<> or <> -\ No newline at end of file -+<> or <> -diff --git a/doc/coding_guidelines.txt b/doc/coding_guidelines.txt -index 079d7c7..a95134e 100644 ---- a/doc/coding_guidelines.txt -+++ b/doc/coding_guidelines.txt -@@ -89,8 +89,7 @@ code that will be a part of the Pacemaker project. - is indented by one level. Opening brace is on the same line as switch. - - ``` -- switch (expression) -- { -+ switch (expression) { - case 0: - command0; - break; -diff --git a/doc/shared/en-US/pacemaker-intro.txt b/doc/shared/en-US/pacemaker-intro.txt -new file mode 100644 -index 0000000..6b898c9 ---- /dev/null -+++ b/doc/shared/en-US/pacemaker-intro.txt -@@ -0,0 +1,169 @@ -+ -+== What Is 'Pacemaker'? == -+ -+Pacemaker is a 'cluster resource manager', that is, a logic responsible -+for a life-cycle of deployed software -- indirectly perhaps even whole -+systems or their interconnections -- under its control within a set of -+computers (a.k.a. 
'cluster nodes', 'nodes' for short) and driven by -+prescribed rules. -+ -+It achieves maximum availability for your cluster services -+(a.k.a. 'resources') by detecting and recovering from node- and -+resource-level failures by making use of the messaging and membership -+capabilities provided by your preferred cluster infrastructure (either -+http://www.corosync.org/[Corosync] or -+http://linux-ha.org/wiki/Heartbeat[Heartbeat]), and possibly by -+utilizing other parts of the overall cluster stack. -+ -+.High Availability Clusters -+[NOTE] -+For *the goal of minimal downtime* a term 'high availability' was coined -+and together with its acronym, 'HA', is well-established in the sector. -+To differentiate this sort of clusters from high performance computing -+('HPC') ones, should a context require it (apparently, not the case in -+this document), using 'HA cluster' is an option. -+ -+Pacemaker's key features include: -+ -+ * Detection and recovery of node and service-level failures -+ * Storage agnostic, no requirement for shared storage -+ * Resource agnostic, anything that can be scripted can be clustered -+ * Supports 'fencing' (also referred to as the 'STONITH' acronym, -+ <> later on) for ensuring data integrity -+ * Supports large and small clusters -+ * Supports both quorate and resource-driven clusters -+ * Supports practically any redundancy configuration -+ * Automatically replicated configuration that can be updated -+ from any node -+ * Ability to specify cluster-wide service ordering, -+ colocation and anti-colocation -+ * Support for advanced service types -+ ** Clones: for services which need to be active on multiple nodes -+ ** Multi-state: for services with multiple modes -+ (e.g. master/slave, primary/secondary) -+ * Unified, scriptable cluster management tools -+ -+== Pacemaker Architecture == -+ -+At the highest level, the cluster is made up of three pieces: -+ -+ * *Non-cluster-aware components*. 
These pieces -+ include the resources themselves; scripts that start, stop and -+ monitor them; and a local daemon that masks the differences -+ between the different standards these scripts implement. -+ Even though interactions of these resources when run as multiple -+ instances can resemble a distributed system, they still lack -+ the proper HA mechanisms and/or autonomous cluster-wide governance -+ as subsumed in the following item. -+ -+ * *Resource management*. Pacemaker provides the brain that processes -+ and reacts to events regarding the cluster. These events include -+ nodes joining or leaving the cluster; resource events caused by -+ failures, maintenance and scheduled activities; and other -+ administrative actions. Pacemaker will compute the ideal state of -+ the cluster and plot a path to achieve it after any of these -+ events. This may include moving resources, stopping nodes and even -+ forcing them offline with remote power switches. -+ -+ * *Low-level infrastructure*. Projects like 'Corosync', 'CMAN' and -+ 'Heartbeat' provide reliable messaging, membership and quorum -+ information about the cluster. -+ -+When combined with Corosync, Pacemaker also supports popular open -+source cluster filesystems.{empty}footnote:[ -+ Even though Pacemaker also supports Heartbeat, the filesystems need to -+ use the stack for messaging and membership, and Corosync seems to be -+ what they're standardizing on. Technically, it would be possible for -+ them to support Heartbeat as well, but there seems little interest -+ in this. -+] -+ -+Due to past standardization within the cluster filesystem community, -+cluster filesystems make use of a common 'distributed lock manager', -+which makes use of Corosync for its messaging and membership -+capabilities (which nodes are up/down) and Pacemaker for fencing -+services. 
-+ -+.The Pacemaker Stack -+image::images/pcmk-stack.png["The Pacemaker stack",width="10cm",height="7.5cm",align="center"] -+ -+=== Internal Components === -+ -+Pacemaker itself is composed of five key components: -+ -+ * 'Cluster Information Base' ('CIB') -+ * 'Cluster Resource Management daemon' ('CRMd') -+ * 'Local Resource Management daemon' ('LRMd') -+ * 'Policy Engine' ('PEngine' or 'PE') -+ * Fencing daemon ('STONITHd') -+ -+.Internal Components -+image::images/pcmk-internals.png["Subsystems of a Pacemaker cluster",align="center",scaledwidth="65%"] -+ -+The CIB uses XML to represent both the cluster's configuration and -+current state of all resources in the cluster. The contents of the CIB -+are automatically kept in sync across the entire cluster and are used by -+the PEngine to compute the ideal state of the cluster and how it should -+be achieved. -+ -+This list of instructions is then fed to the 'Designated Controller' -+('DC'). Pacemaker centralizes all cluster decision making by electing -+one of the CRMd instances to act as a master. Should the elected CRMd -+process (or the node it is on) fail, a new one is quickly established. -+ -+The DC carries out the PEngine's instructions in the required order by -+passing them to either the Local Resource Management daemon (LRMd) or -+CRMd peers on other nodes via the cluster messaging infrastructure -+(which in turn passes them on to their LRMd process). -+ -+The peer nodes all report the results of their operations back to the DC -+and, based on the expected and actual results, will either execute any -+actions that needed to wait for the previous one to complete, or abort -+processing and ask the PEngine to recalculate the ideal cluster state -+based on the unexpected results. -+ -+In some cases, it may be necessary to power off nodes in order to -+protect shared data or complete resource recovery. For this, Pacemaker -+comes with STONITHd. 
-+ -+[[s-intro-stonith]] -+.STONITH -+[NOTE] -+*STONITH* is an acronym for 'Shoot-The-Other-Node-In-The-Head', -+a recommended practice whereby a misbehaving node is promptly -+'fenced' (shut off, cut from shared resources or otherwise immobilized); -+it is usually implemented with a remote power switch. -+ -+In Pacemaker, STONITH devices are modeled as resources (and configured -+in the CIB) to enable them to be easily monitored for failure; however, -+STONITHd takes care of understanding the STONITH topology such that its -+clients simply request a node be fenced, and it does the rest. -+ -+== Types of Pacemaker Clusters == -+ -+Pacemaker makes no assumptions about your environment. This allows it -+to support practically any -+http://en.wikipedia.org/wiki/High-availability_cluster#Node_configurations[redundancy -+configuration] including 'Active/Active', 'Active/Passive', 'N+1', -+'N+M', 'N-to-1' and 'N-to-N'. -+ -+.Active/Passive Redundancy -+image::images/pcmk-active-passive.png["Active/Passive Redundancy",width="10cm",height="7.5cm",align="center"] -+ -+Two-node Active/Passive clusters using Pacemaker and 'DRBD' are -+a cost-effective solution for many High Availability situations. -+ -+.Shared Failover -+image::images/pcmk-shared-failover.png["Shared Failover",width="10cm",height="7.5cm",align="center"] -+ -+By supporting many nodes, Pacemaker can dramatically reduce hardware -+costs by allowing several active/passive clusters to be combined and -+share a common backup node. -+ -+.N to N Redundancy -+image::images/pcmk-active-active.png["N to N Redundancy",width="10cm",height="7.5cm",align="center"] -+ -+When shared storage is available, every node can potentially be used for -+failover. Pacemaker can even run multiple copies of services to spread -+out the workload.
-+ -diff --git a/extra/Makefile.am b/extra/Makefile.am -index daaf137..40e6cad 100644 ---- a/extra/Makefile.am -+++ b/extra/Makefile.am -@@ -18,7 +18,7 @@ - - MAINTAINERCLEANFILES = Makefile.in - --SUBDIRS = resources rgmanager logrotate -+SUBDIRS = resources logrotate - - mibdir = $(datadir)/snmp/mibs - mib_DATA = PCMK-MIB.txt -diff --git a/extra/buildbot.helper b/extra/buildbot.helper -index 2f2d7d8..459f175 100755 ---- a/extra/buildbot.helper -+++ b/extra/buildbot.helper -@@ -5,7 +5,7 @@ self=`basename $0` - - if [ x$1 = xinstall ]; then - # Basic test phase -- mock --configdir=$PWD --root=mock --resultdir=./mock -v --install ./mock/*.rpm nano sudo valgrind lcov psmisc -+ mock --configdir=$PWD --root=mock --resultdir=./mock -v --install nano lcov psmisc sudo valgrind ./mock/*.rpm - elif [ x$1 = xdownloads ]; then - # Extra test phase - mock --configdir=$PWD --root=mock --resultdir=./mock -v --install ./downloads/*.rpm nano sudo valgrind lcov -diff --git a/extra/resources/Dummy b/extra/resources/Dummy -index 2410c4d..8a38ef5 100644 ---- a/extra/resources/Dummy -+++ b/extra/resources/Dummy -@@ -66,6 +66,14 @@ Location to store the resource state in. - - - -+ -+ -+Fake password field -+ -+Password -+ -+ -+ - - - Fake attribute that can be changed to cause a reload -@@ -129,6 +137,7 @@ dummy_stop() { - if [ $? = $OCF_SUCCESS ]; then - rm ${OCF_RESKEY_state} - fi -+ rm -f ${VERIFY_SERIALIZED_FILE} - return $OCF_SUCCESS - } - -@@ -137,10 +146,21 @@ dummy_monitor() { - # (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING). - # That is THREE states, not just yes/no. - -- sleep ${OCF_RESKEY_op_sleep} -+ if [ "$OCF_RESKEY_op_sleep" -ne "0" ]; then -+ if [ -f ${VERIFY_SERIALIZED_FILE} ]; then -+ # two monitor ops have occurred at the same time. -+ # this is to verify a condition in the lrmd regression tests. 
-+ ocf_log err "$VERIFY_SERIALIZED_FILE exists already" -+ return $OCF_ERR_GENERIC -+ fi -+ -+ touch ${VERIFY_SERIALIZED_FILE} -+ sleep ${OCF_RESKEY_op_sleep} -+ rm ${VERIFY_SERIALIZED_FILE} -+ fi - - if [ -f ${OCF_RESKEY_state} ]; then -- return $OCF_SUCCESS -+ return $OCF_SUCCESS - fi - if false ; then - return $OCF_ERR_GENERIC -@@ -176,6 +196,7 @@ if [ "x$OCF_RESKEY_state" = "x" ]; then - OCF_RESKEY_state="${HA_VARRUN}/Dummy-${OCF_RESOURCE_INSTANCE}.state" - fi - fi -+VERIFY_SERIALIZED_FILE="${OCF_RESKEY_state}.serialized" - - case $__OCF_ACTION in - meta-data) meta_data -diff --git a/extra/resources/HealthSMART b/extra/resources/HealthSMART -index 45bb0f1..3747bfa 100644 ---- a/extra/resources/HealthSMART -+++ b/extra/resources/HealthSMART -@@ -63,7 +63,7 @@ Location to store the resource state in. - - - -- -+ - - The drive(s) to check as a SPACE separated list. Enter the full path to the device, e.g. "/dev/sda". - -@@ -71,7 +71,7 @@ The drive(s) to check as a SPACE separated list. Enter the full path to the devi - - - -- -+ - - The device type(s) to assume for the drive(s) being tested as a SPACE separated list. - -@@ -79,7 +79,7 @@ The device type(s) to assume for the drive(s) being tested as a SPACE separated - - - -- -+ - - Lower limit of the temperature in deg C of the drive(s). Below this limit the status will be red. - -@@ -87,7 +87,7 @@ Lower limit of the temperature in deg C of the drive(s). Below this limit the st - - - -- -+ - - Upper limit of the temperature if deg C of the drives(s). If the drive reports - a temperature higher than this value the status of #health-smart will be red. -@@ -96,7 +96,7 @@ a temperature higher than this value the status of #health-smart will be red. - - - -- -+ - - Number of deg C below/above the upper/lower temp limits at which point the status of #health-smart will change to yellow. 
- -@@ -170,7 +170,7 @@ init_smart() { - upper_yellow_limit=$((${upper_red_limit}-${yellow_threshold})) - - #Set disk defaults -- if [ -z ${OCF_RESKEY_drives} ] ; then -+ if [ -z "${OCF_RESKEY_drives}" ] ; then - DRIVES="/dev/sda" - else - DRIVES=${OCF_RESKEY_drives} -diff --git a/extra/resources/Makefile.am b/extra/resources/Makefile.am -index cc162e5..a090a16 100644 ---- a/extra/resources/Makefile.am -+++ b/extra/resources/Makefile.am -@@ -21,6 +21,9 @@ include $(top_srcdir)/Makefile.common - - EXTRA_DIST = $(ocf_SCRIPTS) - -+ -+isolationtechdir = @OCF_RA_DIR@/.isolation -+ - ocfdir = @OCF_RA_DIR@/pacemaker - - ocf_SCRIPTS = ClusterMon \ -@@ -36,6 +39,8 @@ ocf_SCRIPTS = ClusterMon \ - SystemHealth \ - remote - -+isolationtech_SCRIPTS = docker-wrapper -+ - if BUILD_XML_HELP - - man7_MANS = $(ocf_SCRIPTS:%=ocf_pacemaker_%.7) -diff --git a/extra/resources/docker-wrapper b/extra/resources/docker-wrapper -new file mode 100755 -index 0000000..4b0b87b ---- /dev/null -+++ b/extra/resources/docker-wrapper -@@ -0,0 +1,536 @@ -+#!/bin/bash -+# -+# Copyright (c) 2015 David Vossel -+# All Rights Reserved. -+# -+# This program is free software; you can redistribute it and/or modify -+# it under the terms of version 2 of the GNU General Public License as -+# published by the Free Software Foundation. -+# -+# This program is distributed in the hope that it would be useful, but -+# WITHOUT ANY WARRANTY; without even the implied warranty of -+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. -+# -+# Further, this software is distributed without any warranty that it is -+# free of the rightful claim of any third person regarding infringement -+# or the like. Any license provided herein, whether implied or -+# otherwise, applies only to this software file. Patent licenses, if -+# any, provided herein do not apply to combinations of this program with -+# other software, or any other product whatsoever. 
-+# -+# You should have received a copy of the GNU General Public License -+# along with this program; if not, write the Free Software Foundation, -+# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. -+# -+ -+####################################################################### -+# Initialization: -+ -+: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} -+. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs -+ -+####################################################################### -+ -+CONF_PREFIX="pcmk_docker" -+ -+meta_data() { -+ cat < -+ -+ -+1.0 -+ -+ -+Docker technology wrapper for pacemaker remote. -+ -+docker wrapper -+ -+ -+ -+ -+Docker image to run resources within -+ -+docker image -+ -+ -+ -+ -+ -+Give resources within container access to cluster resources -+such as the CIB and the ability to manage cluster attributes. -+ -+NOTE: Do not confuse this with the docker run command's -+'--priviledged' option which gives a container permission -+to access system devices. To toggle the docker run option, -+ set --priviledged=true as part of the ${CONF_PREFIS}_run_opts -+arguments. The ${CONF_PREFIX}_privileged option only pertains -+to whether or not the container has access to the cluster's -+CIB or not. Some multistate resources need to be able to write -+values to the cib, which would require enabling ${CONF_PREFIX}_privileged -+ -+is privileged -+ -+ -+ -+ -+ -+Add options to be appended to the 'docker run' command which is used -+when creating the container during the start action. This option allows -+users to do things such as setting a custom entry point and injecting -+environment variables into the newly created container. Note the '-d' -+option is supplied regardless of this value to force containers to run -+in the background. -+ -+NOTE: Do not explicitly specify the --name argument in the run_opts. 
This -+agent will set --name using the resource's instance name -+ -+ -+run options -+ -+ -+ -+ -+ -+Allow the container to be reused after stopping the container. By default -+containers are removed after stop. With the reuse option containers -+will persist after the container stops. -+ -+reuse container -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+END -+} -+ -+####################################################################### -+ -+ -+CLIENT="/usr/libexec/pacemaker/lrmd_internal_ctl" -+DOCKER_AGENT="/usr/lib/ocf/resource.d/heartbeat/docker" -+KEY_VAL_STR="" -+PROVIDER=$OCF_RESKEY_CRM_meta_provider -+CLASS=$OCF_RESKEY_CRM_meta_class -+TYPE=$OCF_RESKEY_CRM_meta_type -+ -+CONTAINER=$OCF_RESKEY_CRM_meta_isolation_instance -+if [ -z "$CONTAINER" ]; then -+ CONTAINER=$OCF_RESOURCE_INSTANCE -+fi -+ -+RSC_STATE_DIR="${HA_RSCTMP}/docker-wrapper/${CONTAINER}-data/" -+RSC_STATE_FILE="$RSC_STATE_DIR/$OCF_RESOURCE_INSTANCE.state" -+CONNECTION_FAILURE=0 -+HOST_LOG_DIR="${HA_RSCTMP}/docker-wrapper/${CONTAINER}-logs" -+HOST_LOG_FILE="$HOST_LOG_DIR/pacemaker.log" -+GUEST_LOG_DIR="/var/log/pcmk" -+GUEST_LOG_FILE="$GUEST_LOG_DIR/pacemaker.log" -+ -+pcmk_docker_wrapper_usage() { -+ cat < $RSC_STATE_FILE -+ fi -+} -+ -+clear_state_file() -+{ -+ if [ -f "$RSC_STATE_FILE" ]; then -+ rm -f $RSC_STATE_FILE -+ fi -+} -+ -+clear_state_dir() -+{ -+ [ -d "$RSC_STATE_DIR" ] || return 0 -+ -+ rm -rf $RSC_STATE_DIR -+} -+ -+num_active_resources() -+{ -+ local count -+ -+ [ -d "$RSC_STATE_DIR" ] || return 0 -+ -+ count="$(ls $RSC_STATE_DIR | wc -w)" -+ if [ $? -ne 0 ] || [ -z "$count" ]; then -+ return 0 -+ fi -+ return $count -+} -+ -+random_port() -+{ -+ local port=$(python -c 'import socket; s=socket.socket(); s.bind(("localhost", 0)); print(s.getsockname()[1]); s.close()') -+ if [ $? 
-eq 0 ] && [ -n "$port" ]; then -+ echo "$port" -+ fi -+} -+ -+get_active_port() -+{ -+ PORT="$(docker port $CONTAINER 3121 | awk -F: '{ print $2 }')" -+} -+ -+# separate docker args from ocf resource args. -+separate_args() -+{ -+ local env key value -+ -+ # write out arguments to key value string for ocf agent -+ while read -r line; -+ do -+ key="$(echo $line | awk -F= '{print $1}' | sed 's/^OCF_RESKEY_//g')" -+ val="$(echo $line | awk -F= '{print $2}')" -+ KEY_VAL_STR="$KEY_VAL_STR -k '$key' -v '$val'" -+ done < <(printenv | grep "^OCF.*" | grep -v "^OCF_RESKEY_${CONF_PREFIX}_.*") -+ -+ # sanitize args for DOCKER agent's consumption -+ while read -r line; -+ do -+ env="$(echo $line | awk -F= '{print $1}')" -+ val="$(echo $line | awk -F= '{print $2}')" -+ key="$(echo "$env" | sed "s/^OCF_RESKEY_${CONF_PREFIX}/OCF_RESKEY/g")" -+ export $key="$val" -+ done < <(printenv | grep "^OCF_RESKEY_${CONF_PREFIX}_.*") -+ -+ if ocf_is_true $OCF_RESKEY_privileged ; then -+ export OCF_RESKEY_run_cmd="/usr/sbin/pacemaker_remoted" -+ # on start set random port to run_opts -+ # write port to state file... or potentially get from ps? maybe docker info or inspect as well? -+ -+ else -+ export OCF_RESKEY_run_cmd="/usr/libexec/pacemaker/lrmd" -+ fi -+ export OCF_RESKEY_name="$CONTAINER" -+} -+ -+monitor_container() -+{ -+ local rc -+ -+ $DOCKER_AGENT monitor -+ rc=$? -+ if [ $rc -ne $OCF_SUCCESS ]; then -+ clear_state_dir -+ return $rc -+ fi -+ -+ poke_remote -+ rc=$? -+ if [ $rc -ne $OCF_SUCCESS ]; then -+ # container is up without an active daemon. this is bad -+ ocf_log err "Container, $CONTAINER, is active without a responsive pacemaker_remote instance" -+ CONNECTION_FAILURE=1 -+ return $OCF_ERR_GENERIC -+ fi -+ CONNECTION_FAILURE=0 -+ -+ return $rc -+} -+ -+pcmk_docker_wrapper_monitor() { -+ local rc -+ -+ monitor_container -+ rc=$? -+ if [ $rc -ne $OCF_SUCCESS ]; then -+ return $rc -+ fi -+ -+ client_action "monitor" -+ rc=$? 
-+ if [ $rc -eq $OCF_SUCCESS ] || [ $rc -eq $OCF_RUNNING_MASTER ]; then -+ write_state_file -+ else -+ clear_state_file -+ fi -+ -+ return $rc -+} -+ -+pcmk_docker_wrapper_generic_action() -+{ -+ local rc -+ -+ monitor_container -+ rc=$? -+ if [ $? -ne $OCF_SUCCESS ]; then -+ return $rc -+ fi -+ -+ client_action "$1" -+} -+ -+client_action() -+{ -+ local action=$1 -+ local agent_type="-T $TYPE -C $CLASS" -+ local rc=0 -+ -+ if [ -n "$PROVIDER" ]; then -+ agent_type="$agent_type -P $PROVIDER" -+ fi -+ -+ if ocf_is_true $OCF_RESKEY_privileged ; then -+ if [ -z "$PORT" ]; then -+ get_active_port -+ fi -+ export PCMK_logfile=$HOST_LOG_FILE -+ ocf_log info "$CLIENT -c 'exec' -S '127.0.0.1' -p '$PORT' -a '$action' -r '$OCF_RESOURCE_INSTANCE' -n '$CONTAINER' '$agent_type' $KEY_VAL_STR " -+ eval $CLIENT -c 'exec' -S '127.0.0.1' -p '$PORT' -a '$action' -r '$OCF_RESOURCE_INSTANCE' -n '$CONTAINER' '$agent_type' $KEY_VAL_STR -+ else -+ export PCMK_logfile=$GUEST_LOG_FILE -+ ocf_log info "$CLIENT -c \"exec\" -a $action -r \"$OCF_RESOURCE_INSTANCE\" $agent_type $KEY_VAL_STR" -+ echo "$CLIENT -c \"exec\" -a $action -r \"$OCF_RESOURCE_INSTANCE\" $agent_type $KEY_VAL_STR " | nsenter --target $(docker inspect --format {{.State.Pid}} ${CONTAINER}) --mount --uts --ipc --net --pid -+ fi -+ rc=$? -+ -+ ocf_log debug "Client action $action with result $rc" -+ return $rc -+} -+ -+poke_remote() -+{ -+ # verifies daemon in container is active -+ if ocf_is_true $OCF_RESKEY_privileged ; then -+ get_active_port -+ ocf_log info "Attempting to contect $CONTAINER on port $PORT" -+ $CLIENT -c "poke" -S "127.0.0.1" -p $PORT -n $CONTAINER -+ fi -+ # no op for non privileged containers since we handed the -+ # client monitor action as the monitor_cmd for the docker agent -+} -+ -+start_container() -+{ -+ local rc -+ -+ monitor_container -+ rc=$? 
-+ if [ $rc -eq $OCF_SUCCESS ]; then -+ return $rc -+ fi -+ -+ mkdir -p $HOST_LOG_DIR -+ export OCF_RESKEY_run_opts="-e PCMK_logfile=$GUEST_LOG_FILE $OCF_RESKEY_run_opts" -+ export OCF_RESKEY_run_opts="-v $HOST_LOG_DIR:$GUEST_LOG_DIR $OCF_RESKEY_run_opts" -+ if ocf_is_true $OCF_RESKEY_privileged ; then -+ if ! [ -f "/etc/pacemaker/authkey" ]; then -+ # generate an authkey if it doesn't exist. -+ mkdir -p /etc/pacemaker/ -+ dd if=/dev/urandom of=/etc/pacemaker/authkey bs=4096 count=1 > /dev/null 2>&1 -+ chmod 600 /etc/pacemaker/authkey -+ fi -+ -+ PORT=$(random_port) -+ if [ -z "$PORT" ]; then -+ ocf_exit_reason "Unable to assign random port for pacemaker remote" -+ return $OCF_ERR_GENERIC -+ fi -+ export OCF_RESKEY_run_opts="-p 127.0.0.1:${PORT}:3121 $OCF_RESKEY_run_opts" -+ export OCF_RESKEY_run_opts="-v /etc/pacemaker/authkey:/etc/pacemaker/authkey $OCF_RESKEY_run_opts" -+ ocf_log debug "using privileged mode: run_opts=$OCF_RESKEY_run_opts" -+ else -+ export OCF_RESKEY_monitor_cmd="$CLIENT -c poke" -+ fi -+ -+ $DOCKER_AGENT start -+ rc=$? -+ if [ $rc -ne $OCF_SUCCESS ]; then -+ -+ docker ps > /dev/null 2>&1 -+ if [ $? -ne 0 ]; then -+ ocf_exit_reason "docker daemon is inactive." -+ fi -+ return $rc -+ fi -+ -+ monitor_container -+} -+ -+pcmk_docker_wrapper_start() { -+ local rc -+ -+ start_container -+ rc=$? -+ if [ $rc -ne $OCF_SUCCESS ]; then -+ return $rc -+ fi -+ -+ client_action "start" -+ rc=$? -+ if [ $? -ne "$OCF_SUCCESS" ]; then -+ ocf_exit_reason "Failed to start agent within container" -+ return $rc -+ fi -+ -+ pcmk_docker_wrapper_monitor -+ rc=$? -+ if [ $rc -eq $OCF_SUCCESS ]; then -+ ocf_log notice "$OCF_RESOURCE_INSTANCE started successfully. Container's logfile can be found at $HOST_LOG_FILE" -+ fi -+ -+ return $rc -+} -+ -+stop_container() -+{ -+ local rc -+ local count -+ -+ num_active_resources -+ count=$? -+ if [ $count -ne 0 ]; then -+ ocf_log err "Failed to stop agent within container. 
Killing container $CONTAINER with $count active resources" -+ fi -+ -+ $DOCKER_AGENT "stop" -+ rc=$? -+ if [ $rc -ne $OCF_SUCCESS ]; then -+ ocf_exit_reason "Docker container failed to stop" -+ return $rc -+ fi -+ clear_state_dir -+ return $rc -+} -+ -+stop_resource() -+{ -+ local rc -+ -+ client_action "stop" -+ rc=$? -+ if [ $? -ne "$OCF_SUCCESS" ]; then -+ export OCF_RESKEY_force_stop="true" -+ kill_now=1 -+ else -+ clear_state_file -+ fi -+} -+ -+pcmk_docker_wrapper_stop() { -+ local rc -+ local kill_now=0 -+ local all_stopped=0 -+ -+ pcmk_docker_wrapper_monitor -+ rc=$? -+ if [ $rc -eq $OCF_NOT_RUNNING ]; then -+ rc=$OCF_SUCCESS -+ num_active_resources -+ if [ $? -eq 0 ]; then -+ # stop container if no more resources are running -+ ocf_log info "Gracefully stopping container $CONTAINER because no resources are left running." -+ stop_container -+ rc=$? -+ fi -+ return $rc -+ fi -+ -+ # if we can't talk to the remote daemon but the container is -+ # active, we have to force kill the container. -+ if [ $CONNECTION_FAILURE -eq 1 ]; then -+ export OCF_RESKEY_force_kill="true" -+ stop_container -+ return $? -+ fi -+ -+ -+ # If we've gotten this far, the container is up, and we -+ # need to gracefully stop a resource within the container. -+ client_action "stop" -+ rc=$? -+ if [ $? -ne "$OCF_SUCCESS" ]; then -+ export OCF_RESKEY_force_stop="true" -+ # force kill the container if we fail to stop a resource. -+ stop_container -+ rc=$? -+ else -+ clear_state_file -+ num_active_resources -+ if [ $? -eq 0 ]; then -+ # stop container if no more resources are running -+ ocf_log info "Gracefully stopping container $CONTAINER because last resource has stopped" -+ stop_container -+ rc=$? -+ fi -+ fi -+ -+ return $rc -+} -+ -+pcmk_docker_wrapper_validate() { -+ check_binary docker -+ -+ if [ -z "$CLASS" ] || [ -z "$TYPE" ]; then -+ ocf_exit_reason "Update pacemaker to a version that supports container wrappers." -+ return $OCF_ERR_CONFIGURED -+ fi -+ -+ if ! 
[ -f "$DOCKER_AGENT" ]; then -+ ocf_exit_reason "Requires $DOCKER_AGENT to be installed. update the resource-agents package" -+ return $OCF_ERR_INSTALLED -+ fi -+ $DOCKER_AGENT validate-all -+ return $? -+} -+ -+case $__OCF_ACTION in -+meta-data) meta_data -+ exit $OCF_SUCCESS -+ ;; -+usage|help) pcmk_docker_wrapper_usage -+ exit $OCF_SUCCESS -+ ;; -+esac -+ -+separate_args -+pcmk_docker_wrapper_validate -+rc=$? -+if [ $rc -ne 0 ]; then -+ case $__OCF_ACTION in -+ stop) exit $OCF_SUCCESS;; -+ monitor) exit $OCF_NOT_RUNNING;; -+ *) exit $rc;; -+ esac -+fi -+ -+case $__OCF_ACTION in -+ start) pcmk_docker_wrapper_start;; -+ stop) pcmk_docker_wrapper_stop;; -+ monitor|status) pcmk_docker_wrapper_monitor;; -+ reload|promote|demote|notify) -+ pcmk_docker_wrapper_generic_action $__OCF_ACTION;; -+ validate-all) pcmk_docker_wrapper_validate;; -+ *) pcmk_docker_wrapper_usage -+ exit $OCF_ERR_UNIMPLEMENTED -+ ;; -+esac -+rc=$? -+ocf_log debug "Docker-wrapper ${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" -+exit $rc -+ -diff --git a/extra/resources/ping b/extra/resources/ping -index b9a69b8..ca9db75 100755 ---- a/extra/resources/ping -+++ b/extra/resources/ping -@@ -43,8 +43,7 @@ meta_data() { - 1.0 - - --Every time the monitor action is run, this resource agent records (in the CIB) the current number of ping nodes the host can connect to. --It is essentially the same as pingd except that it uses the system ping tool to obtain the results. -+Every time the monitor action is run, this resource agent records (in the CIB) the current number of nodes the host can connect to using the system fping (preferred) or ping tool. - - node connectivity - -@@ -77,7 +76,7 @@ The name of the attributes to set. This is the name to be used in the constrain - The number by which to multiply the number of connected ping nodes by - - Value multiplier -- -+ - - - -@@ -93,7 +92,7 @@ The list of ping nodes to count. - Number of ping attempts, per host, before declaring it dead - - no. 
of ping attempts -- -+ - - - -@@ -121,6 +120,15 @@ Default never fails. - - - -+ -+ -+Use fping rather than ping, if found. If set to 0, fping -+will not be used even if present. -+ -+Use fping if available -+ -+ -+ - - - Enables to use default attrd_updater verbose logging on every call. -@@ -154,7 +162,7 @@ ping_conditional_log() { - - ping_usage() { - cat <$f_out 2>$f_err; rc=$? - active=`grep alive $f_out|wc -l` - -@@ -274,7 +282,7 @@ ping_check() { - - ping_update() { - -- if have_binary fping; then -+ if ocf_is_true "$OCF_RESKEY_use_fping" && have_binary fping; then - fping_check - active=$? - else -@@ -306,6 +314,7 @@ ping_update() { - : ${OCF_RESKEY_multiplier:="1"} - : ${OCF_RESKEY_debug:="false"} - : ${OCF_RESKEY_failure_score:="0"} -+: ${OCF_RESKEY_use_fping:="1"} - - : ${OCF_RESKEY_CRM_meta_timeout:="20000"} - : ${OCF_RESKEY_CRM_meta_globally_unique:="true"} -diff --git a/extra/resources/remote b/extra/resources/remote -index 9e0482b..c481863 100644 ---- a/extra/resources/remote -+++ b/extra/resources/remote -@@ -44,7 +44,8 @@ meta_data() { - - - -- 0.1 -+0.1 -+remote resource agent - - - -@@ -53,20 +54,33 @@ meta_data() { - Server location - - -- -+ - - tcp port to connect to. - - tcp port -- -+ -+ -+ -+ -+ Time in seconds to wait before attempting to reconnect to a remote node -+ after an active connection to the remote node has been severed. This wait -+ is recurring. If reconnect fails after the wait period, a new reconnect -+ attempt will be made after observing the wait time. When this option is -+ in use, pacemaker will keep attempting to reach out and connect to the -+ remote node indefinitely after each wait interval. 
-+ -+ reconnect interval -+ - - - -- -- -- -- -- -+ -+ -+ -+ -+ -+ - - - -@@ -96,8 +110,9 @@ start) remote_unsupported;; - stop) remote_unsupported;; - monitor) remote_unsupported;; - migrate_to) remote_unsupported;; --migrate_from) remote_unsupported;; --validate-all) remote_unsupported;; -+migrate_from) remote_unsupported;; -+reload) remote_unsupported;; -+validate-all) remote_unsupported;; - usage|help) remote_usage - exit $OCF_SUCCESS - ;; -diff --git a/extra/rgmanager/Makefile.am b/extra/rgmanager/Makefile.am -deleted file mode 100644 -index 7fa7299..0000000 ---- a/extra/rgmanager/Makefile.am -+++ /dev/null -@@ -1,39 +0,0 @@ --# --# Copyright (C) 2004-2009 Andrew Beekhof --# --# This program is free software; you can redistribute it and/or --# modify it under the terms of the GNU General Public License --# as published by the Free Software Foundation; either version 2 --# of the License, or (at your option) any later version. --# --# This program is distributed in the hope that it will be useful, --# but WITHOUT ANY WARRANTY; without even the implied warranty of --# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the --# GNU General Public License for more details. --# --# You should have received a copy of the GNU General Public License --# along with this program; if not, write to the Free Software --# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
--# -- --MAINTAINERCLEANFILES = Makefile.in -- --if BUILD_CMAN -- --# Migration tools --bin_PROGRAMS = ccs_flatten disable_rgmanager --bin_SCRIPTS = ccs2cib --ccs_flatten_SOURCES = reslist.c resrules.c restree.c flatten.c xmlconf.c --disable_rgmanager_SOURCES = disable_rgmanager.c -- --dist_pkgdata_DATA = cluconf2cib.xsl -- --ccs2cib: ccs2cib.in -- echo "#!/bin/sh" > $@ -- echo >> $@ -- echo XSL_FILE=\"$(pkgdatadir)/cluconf2cib.xsl\" >> $@ -- echo SAXON_JAR=\"$(datadir)/java/saxon.jar\" >> $@ -- echo >> $@ -- cat $^ >> $@ -- chmod +x $@ --endif -diff --git a/extra/rgmanager/README b/extra/rgmanager/README -index b32a990..24e10e7 100644 ---- a/extra/rgmanager/README -+++ b/extra/rgmanager/README -@@ -1,21 +1,30 @@ --Linux-cluster cluster.conf to Pacemaker CIB translation --utility. -+# Legacy: Linux-cluster cluster.conf to Pacemaker CIB translation utility - --Development is on Fedora 14+ -+This directory used contain several parts related to the procedure of -+cluster stacks/configuration migration, in particular and as the directory -+name suggests: from (CMAN+RGManager)-based stack of HA components to -+the (Corosync+Pacemaker)-based one. - --2 phases: -- Step 1: Flatten tree -- Step 2: Convert -+This initial effort laid here was used as a foundation for a more -+sophisticated and maintained tool, [clufter](https://github.com/jnpkrn/clufter), -+which has made local tool-set obsolete since then. - --Requires: -- resource-agents -- libxml2 -- java -- saxon >= 9.1 -- java-1.6.0-openjdk -+In case any dependencies on previously offered bits arose as time was -+passing by, a mapping to the clufter-provided alternatives is provided: - --TODO: -- * Fencing -- * Resource-defaults [ skipping ] -- ... 
-- * Profit -+* `ccs2cib.in` resulting in `ccs2cib` conversion launcher -+ - use `clufter ccs2pcs` (perhaps with some switches) and wrap it with -+ some usage-specific script if suitable -+* `ccs_flatten.c` and the rest of C-files resulting in `ccs_flatten` binary -+ - you can find the same modulo few modifications in `ccs-flatten` -+ subdirectory within the released tarballs or repo itself -+ and the resulting binary is a crucial part of clufter installation -+* `cluconf2cib.xsl` -+ - deprecated in favor of multiple, sequentially chained XSL stylesheets -+ decomposed into tree-like structure (refer to `filters/cluster` -+ subdirectory but beware, it is accompanied with transformations for -+ various other purposes) -+* `disable_rgmanager.c` resulting in `disable_rgmanager` utility -+ - use `clufter ccs-disable-rg` (since `clufter-0.12.0`) -+* `tests` subdirectory -+ - configs have been put under `tests/cluster.conf/orig` in clufter repo -diff --git a/extra/rgmanager/README.markdown b/extra/rgmanager/README.markdown -new file mode 120000 -index 0000000..100b938 ---- /dev/null -+++ b/extra/rgmanager/README.markdown -@@ -0,0 +1 @@ -+README -\ No newline at end of file -diff --git a/extra/rgmanager/ccs2cib.in b/extra/rgmanager/ccs2cib.in -deleted file mode 100644 -index e49ce2f..0000000 ---- a/extra/rgmanager/ccs2cib.in -+++ /dev/null -@@ -1,169 +0,0 @@ --# Above is autogenerated; DEBUG here is run-time --DEBUG=0 -- --export XSL_FILE SAXON_JAR DEBUG -- --die() --{ -- echo "$*" -- exit 1 --} -- --_validate() --{ -- if [ $DEBUG -eq 1 ]; then -- echo "debug: adding . to $PATH" -- export PATH="$PATH:." -- fi -- -- which ccs_flatten &> /dev/null || die "Can't find ccs_flatten in path!" -- which java &> /dev/null || die "Can not find java in path!" -- -- if [ -z "$XSL_FILE" ]; then -- if [ $DEBUG -eq 1 ]; then -- XSL_FILE=./cluconf2cib.xsl -- echo "debug: using $XSL_FILE" -- else -- die "Please specify path to XSLT script using -X ." 
-- fi -- fi -- -- if [ -z "$SAXON_JAR" ]; then -- if [ $DEBUG -eq 1 ]; then -- SAXON_JAR=/usr/share/java/saxon.jar -- echo "debug: using $SAXON_JAR" -- else -- die "Please specify path to saxon.jar using -J ." -- fi -- fi -- -- [ -d /usr/share/cluster ] || die "/usr/share/cluster does not exist." -- [ -f /usr/share/cluster/service.sh ] || die "Missing rgmanager resource agents?" -- -- [ -f "$XSL_FILE" ] || die "$XSL_FILE does not exist!" -- [ -f "$SAXON_JAR" ] || die "$SAXON_JAR does not exist!" -- -- [ -f "$1" ] || die "Input file $1 not found" -- if [ -f "$2" ]; then -- [ $3 -ne 0 ] || die "Output file $2 exists; please remove or use -f" -- fi -- -- return 0 --} -- -- --help() --{ --cat < Specify path to XSLT script -- -J Specify path to Saxon jar file -- -h This message --EOT --} -- -- --# main --declare conf_in cib_out xsl_file saxon_jar conf_out opt do_update tmp --declare force_update no_verify -- --# defaults --conf_in="/etc/cluster/cluster.conf" --cib_out="cib-converted.xml" --conf_out="" --do_update=0 --no_verify=0 --force_update=0 --tmp=$(mktemp /tmp/ccs2cib.tmp.XXXXXX) -- --while getopts i:o:X:J:Rr:dnhf opt; do -- case $opt in -- d) -- DEBUG=1 -- ;; -- i) -- conf_in="$OPTARG" -- ;; -- o) -- cib_out="$OPTARG" -- ;; -- R) -- do_update=1 -- ;; -- r) -- do_update=1 -- conf_out="$OPTARG" -- ;; -- n) -- no_verify=1 -- ;; -- f) -- force_update=1 -- ;; -- X) -- XSL_FILE="$OPTARG" -- ;; -- J) -- SAXON_JAR="$OPTARG" -- ;; -- h) -- help $0 -- exit 0 -- ;; -- *) -- echo "Error parsing $opt" -- help $0 -- exit 1 -- ;; -- esac --done -- --[ -z "$conf_out" ] && conf_out="$conf_in" -- --_validate "$conf_in" "$cib_out" $force_update -- --echo " * Converting configuration" --if ! ccs_flatten "$conf_in" > $tmp; then -- rm -f $tmp -- die "Flattening of configuration file failed." --fi -- --if ! java -jar $SAXON_JAR -xsl:$XSL_FILE $tmp > $cib_out; then -- rm -f $tmp -- die "Conversion failed." --fi -- --echo " * Calling crm_verify to validate the configuration." 
-- --if [ $no_verify -eq 0 ]; then -- crm_verify --xml-file $cib_out -V || die "Validation failed." --fi -- --if [ $do_update -ne 0 ]; then -- echo " * Disabling rgmanager in $conf_out" -- rm -f $tmp -- disable_rgmanager "$conf_in" > "$tmp" || die "Failed to disable rgmanager" -- mv "$tmp" "$conf_out" -- if [ "$conf_out" = "/etc/cluster/cluster.conf" ]; then -- if clustat -Q &> /dev/null; then -- echo " * Calling cman_tool to update cluster.conf" -- cman_tool version -r -- else -- echo " * You will need to manually copy $conf_out to the other cluster" -- echo " nodes using scp, ccs_sync, or some other utility." -- fi -- fi --fi -- --echo " * Proposed cib stored in $cib_out" -- --exit 0 -diff --git a/extra/rgmanager/cluconf2cib.xsl b/extra/rgmanager/cluconf2cib.xsl -deleted file mode 100644 -index 885bc51..0000000 ---- a/extra/rgmanager/cluconf2cib.xsl -+++ /dev/null -@@ -1,242 +0,0 @@ -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 500000 -- -- -- -- -- 1000000 -- -- -- -- -- -- -- -- INFINITY -- -INFINITY -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -diff --git a/extra/rgmanager/disable_rgmanager.c b/extra/rgmanager/disable_rgmanager.c -deleted file mode 100644 -index 05c2695..0000000 ---- a/extra/rgmanager/disable_rgmanager.c -+++ /dev/null -@@ -1,121 +0,0 @@ --/* -- Copyright Red Hat, Inc. 
2004-2006 -- -- This program is free software; you can redistribute it and/or modify it -- under the terms of the GNU General Public License as published by the -- Free Software Foundation; either version 2, or (at your option) any -- later version. -- -- This program is distributed in the hope that it will be useful, but -- WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -- General Public License for more details. -- -- You should have received a copy of the GNU General Public License -- along with this program; see the file COPYING. If not, write to the -- Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -- MA 02110-1301 USA --*/ --#include --#include --#include --#include --#include --#include --#include -- --#define shift() {++argv; --argc;} -- --static int --disable_rgmanager(xmlDocPtr doc) --{ -- char buf[32]; -- xmlNodePtr n, o; -- char *a; -- int x; -- -- if (!doc) -- return 0; -- n = xmlDocGetRootElement(doc); -- if (!n) -- return 0; -- if (strcmp((char *)n->name, "cluster")) { -- fprintf(stderr, "Expected cluster tag, got %s\n", (char *)n->name); -- return 0; -- } -- -- a = (char *)xmlGetProp(n, (xmlChar *) "config_version"); -- if (!a) { -- fprintf(stderr, "No config_version found on cluster tag\n"); -- return 0; -- } -- -- x = atoi(a); -- if (x == 0) { -- fprintf(stderr, "config_version was invalid\n"); -- return 0; -- } -- -- ++x; -- snprintf(buf, sizeof(buf), "%d", x); -- if (xmlSetProp(n, (xmlChar *) "config_version", (xmlChar *) buf) == NULL) { -- fprintf(stderr, "Failed to update config_version\n"); -- return 0; -- } -- -- for (o = n->xmlChildrenNode; o; o = o->next) { -- if (o->type != XML_ELEMENT_NODE) -- continue; -- if (!strcmp((char *)o->name, "rm")) -- break; -- } -- -- if (!o) -- return 0; -- -- if (xmlSetProp(o, (xmlChar *) "disabled", (xmlChar *) "1") == NULL) { -- fprintf(stderr, "Failed to disable rgmanager\n"); -- return 0; -- } -- -- 
return 1; --} -- --static void --usage(const char *arg0, int ret) --{ -- fprintf(stderr, "usage: %s [output.conf]\n", arg0); -- exit(ret); --} -- --int --main(int argc, char **argv) --{ -- char *arg0 = basename(argv[0]); -- int ret = 0; -- xmlDocPtr doc = NULL; -- -- if (argc < 2) { -- usage(arg0, 1); -- } -- -- if (!strcasecmp(argv[1], "-h") || !strcasecmp(argv[1], "-?")) { -- usage(arg0, 0); -- } -- -- xmlInitParser(); -- xmlIndentTreeOutput = 1; -- xmlKeepBlanksDefault(0); -- -- shift(); -- doc = xmlParseFile(argv[0]); -- -- if (disable_rgmanager(doc)) { -- xmlDocFormatDump(stdout, doc, 1); -- } -- -- if (doc) -- xmlFreeDoc(doc); -- xmlCleanupParser(); -- return ret; --} -diff --git a/extra/rgmanager/flatten.c b/extra/rgmanager/flatten.c -deleted file mode 100644 -index d500bd4..0000000 ---- a/extra/rgmanager/flatten.c -+++ /dev/null -@@ -1,240 +0,0 @@ --/* -- Copyright Red Hat, Inc. 2004-2006 -- -- This program is free software; you can redistribute it and/or modify it -- under the terms of the GNU General Public License as published by the -- Free Software Foundation; either version 2, or (at your option) any -- later version. -- -- This program is distributed in the hope that it will be useful, but -- WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -- General Public License for more details. -- -- You should have received a copy of the GNU General Public License -- along with this program; see the file COPYING. 
If not, write to the -- Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -- MA 02110-1301 USA --*/ --#include --#include --#include --#include --#include --#include --#include --#include --#include --#include --#include --#include -- --#define shift() {++argv; --argc;} -- --const char *agentpath = RESOURCE_ROOTDIR; -- --static xmlNode * --get_rm_node(xmlDocPtr doc) --{ -- xmlNodePtr n, o; -- -- if (!doc) -- return NULL; -- n = xmlDocGetRootElement(doc); -- if (!n) -- return NULL; -- if (strcmp((char *)n->name, "cluster")) { -- fprintf(stderr, "Expected cluster tag, got %s\n", (char *)n->name); -- return NULL; -- } -- -- for (o = n->xmlChildrenNode; o; o = o->next) { -- if (o->type != XML_ELEMENT_NODE) -- continue; -- if (!strcmp((char *)o->name, "rm")) -- return o; -- } -- -- return NULL; --} -- --static void --remove_resources_block(xmlNodePtr rm) --{ -- xmlNodePtr o, r = NULL; -- -- for (o = rm->xmlChildrenNode; o; o = o->next) { -- if (o->type != XML_ELEMENT_NODE) -- continue; -- if (!strcmp((char *)o->name, "resources")) { -- r = o; -- break; -- } -- } -- -- if (!r) -- return; -- -- xmlUnlinkNode(r); -- xmlFreeNode(r); --} -- --static int --replace_resource(xmlNodePtr rm, char *restype, char *primattr, char *ident, xmlNodePtr n) --{ -- xmlNodePtr o, r = NULL; -- char *p; -- -- for (o = rm->xmlChildrenNode; o; o = o->next) { -- if (o->type != XML_ELEMENT_NODE) -- continue; -- if (!strcmp((char *)o->name, restype)) { -- p = (char *)xmlGetProp(o, (xmlChar *) primattr); -- if (!strcmp(p, ident)) { -- r = o; -- break; -- } -- } -- } -- -- if (!r) -- return -1; -- -- xmlUnlinkNode(r); -- xmlFreeNode(r); -- -- xmlAddChild(rm, n); -- -- return 0; --} -- --static int --flatten(int argc, char **argv, xmlDocPtr * doc) --{ -- xmlDocPtr d = NULL; -- xmlNode *n = NULL, *rm = NULL, *new_rb = NULL; -- resource_rule_t *rulelist = NULL; -- resource_t *reslist = NULL, *curres; -- resource_node_t *tree = NULL, *rn; -- FILE *f = stdout; -- int ret = 0; -- -- 
conf_setconfig(argv[0]); -- if (conf_open() < 0) { -- xmlFree(new_rb); -- goto out; -- } -- -- while (argc >= 2) { -- shift(); -- if (!strcmp(argv[0], "-r")) { -- if (!new_rb) -- new_rb = xmlNewNode(NULL, (xmlChar *) "rm"); -- } else { -- if (f == stdout) -- f = fopen(argv[0], "w+"); -- } -- } -- -- d = conf_get_doc(); -- rm = get_rm_node(d); -- -- load_resource_rules(agentpath, &rulelist); -- if (!rulelist) { -- fprintf(stderr, "No resource rules available\n"); -- goto out; -- } -- load_resources(&reslist, &rulelist); -- build_resource_tree(&tree, &rulelist, &reslist); -- if (!tree) { -- fprintf(stderr, "No resource trees defined; nothing to do\n"); -- goto out; -- } --#ifdef DEBUG -- fprintf(stderr, "Resources %p tree %p\n", reslist, tree); --#endif -- -- shift(); -- -- list_do(&tree, rn) { -- n = NULL; -- -- curres = rn->rn_resource; -- --#ifdef DEBUG -- fprintf(stderr, "Flatten %s:%s ... \n", curres->r_rule->rr_type, -- curres->r_attrs[0].ra_value); --#endif -- if (res_flatten(&n, new_rb, &tree, curres)) { -- fprintf(stderr, "FAIL 1\n"); -- ret = -1; -- goto out; -- } -- -- if (replace_resource(rm, curres->r_rule->rr_type, -- curres->r_attrs[0].ra_name, curres->r_attrs[0].ra_value, n) != 0) { -- fprintf(stderr, "FAIL 2\n"); -- ret = -1; -- goto out; -- } -- -- } -- while (!list_done(&tree, rn)) ; -- -- remove_resources_block(rm); -- if (new_rb) { -- xmlAddChild(rm, new_rb); -- } -- -- xmlDocFormatDump(f, d, 1); -- if (f != stdout) -- fclose(f); -- -- out: -- if (ret < 0) { -- xmlFreeDoc(d); -- } else { -- *doc = d; -- } -- conf_close(); -- destroy_resource_tree(&tree); -- destroy_resources(&reslist); -- destroy_resource_rules(&rulelist); -- -- return ret; --} -- --static void --usage(const char *arg0, int ret) --{ -- fprintf(stderr, "usage: %s [output.conf] [-r]\n", arg0); -- exit(ret); --} -- --int --main(int argc, char **argv) --{ -- char *arg0 = basename(argv[0]); -- int ret = 0; -- xmlDocPtr doc = NULL; -- -- if (argc < 2) { -- usage(arg0, 1); -- } -- -- if 
(!strcasecmp(argv[1], "-h") || !strcasecmp(argv[1], "-?")) { -- usage(arg0, 0); -- } -- -- xmlInitParser(); -- xmlIndentTreeOutput = 1; -- xmlKeepBlanksDefault(0); -- -- shift(); -- ret = flatten(argc, argv, &doc); -- -- //if (doc) -- //xmlFreeDoc(doc); -- xmlCleanupParser(); -- return ret; --} -diff --git a/extra/rgmanager/list.h b/extra/rgmanager/list.h -deleted file mode 100644 -index 6cd5511..0000000 ---- a/extra/rgmanager/list.h -+++ /dev/null -@@ -1,91 +0,0 @@ --#ifndef _LIST_H --# define _LIST_H -- --/** -- Simple list handlig macros. -- Needs rewrite or inclusion of /usr/include/linux/list.h as a replacement. -- */ -- --/* Must be first if structure is going to use it. */ --struct list_entry { -- struct list_entry *le_next, *le_prev; --}; -- --# define list_head() struct list_entry _list_head -- --# define le(p) (&((*p)._list_head)) -- --# define list_insert(list, newnode) \ --do { \ -- if (!(*list)) { \ -- le(newnode)->le_next = \ -- le(newnode)->le_prev = le(newnode); \ -- *list = (void *)le(newnode); \ -- } else { \ -- le(*list)->le_prev->le_next = le(newnode); \ -- le(newnode)->le_next = le(*list); \ -- le(newnode)->le_prev = le(*list)->le_prev; \ -- le(*list)->le_prev = le(newnode); \ -- } \ --} while (0) -- --# define list_prepend(list, newnode) \ --do { \ -- list_insert(list, newnode); \ -- *list = newnode; \ --} while (0) -- --# define list_remove(list, oldnode) \ --do { \ -- if (le(oldnode) == le(*list)) { \ -- *list = (void *)le(*list)->le_next; \ -- } \ -- if (le(oldnode) == le(*list)) { \ -- le(oldnode)->le_next = NULL; \ -- le(oldnode)->le_prev = NULL; \ -- *list = NULL; \ -- } else { \ -- le(oldnode)->le_next->le_prev = le(oldnode)->le_prev; \ -- le(oldnode)->le_prev->le_next = le(oldnode)->le_next; \ -- le(oldnode)->le_prev = NULL; \ -- le(oldnode)->le_next = NULL; \ -- } \ --} while (0) -- --/* -- list_do(list, node) { -- stuff; -- } while (!list_done(list, node)); -- */ --# define list_do(list, curr) \ -- if (*list && (curr = *list)) do -- 
--# define list_done(list, curr) \ -- (curr && (((curr = (void *)le(curr)->le_next)) && (curr == *list))) -- --/* -- * list_for(list, tmp, counter) { -- * stuff; -- * } -- * -- * counter = # of items in list when done. -- * * sets cnt to 0 before even checking list; -- * * checks for valid list -- * * traverses list, incrementing counter. If we get to the for loop, -- * there must be at least one item in the list -- */ --# define list_for(list, curr, cnt) \ -- if (!(cnt=0) && (list != NULL) && (*list != NULL)) \ -- for (curr = *list; \ -- (cnt == 0) || (curr != *list); \ -- curr = (void*)le(curr)->le_next, \ -- cnt++) -- --# define list_for_rev(list, curr, cnt) \ -- if (!(cnt=0) && list && *list) \ -- for (curr = (void *)(le(*list)->le_prev); \ -- (cnt == 0) || ((void *)curr != le(*list)->le_prev); \ -- curr = (void*)(le(curr)->le_prev), \ -- cnt++) -- --#endif -diff --git a/extra/rgmanager/resgroup.h b/extra/rgmanager/resgroup.h -deleted file mode 100644 -index dbaecb4..0000000 ---- a/extra/rgmanager/resgroup.h -+++ /dev/null -@@ -1,122 +0,0 @@ --#ifndef __RESGROUP_H --# define __RESGROUP_H -- --# include --# include --# include --# include --# include --# include --# include --# include -- --/* Requests */ --# define RG_SUCCESS 0 --# define RG_FAIL 1 --# define RG_START 2 --# define RG_STOP 3 --# define RG_STATUS 4 --# define RG_DISABLE 5 --# define RG_STOP_RECOVER 6 --# define RG_START_RECOVER 7 --# define RG_RESTART 8 --# define RG_EXITING 9 --# define RG_INIT 10 --# define RG_ENABLE 11 --# define RG_STATUS_NODE 12 --# define RG_RELOCATE 13 --# define RG_CONDSTOP 14 --# define RG_CONDSTART 15 --# define RG_START_REMOTE 16 /* Part of a relocate */ --# define RG_STOP_USER 17 /* User-stop request */ --# define RG_STOP_EXITING 18 -- /* Exiting. 
*/ --# define RG_LOCK 19 --# define RG_UNLOCK 20 --# define RG_QUERY_LOCK 21 --# define RG_MIGRATE 22 --# define RG_FREEZE 23 --# define RG_UNFREEZE 24 --# define RG_STATUS_INQUIRY 25 --# define RG_CONVALESCE 26 --# define RG_NONE 999 -- --/* Resource group states (for now) */ --# define RG_STATE_BASE 110 --# define RG_STATE_STOPPED 110 /** Resource group is stopped */ --# define RG_STATE_STARTING 111 /** Resource is starting */ --# define RG_STATE_STARTED 112 /** Resource is started */ --# define RG_STATE_STOPPING 113 /** Resource is stopping */ --# define RG_STATE_FAILED 114 -- /** Resource has failed */ --# define RG_STATE_UNINITIALIZED 115 -- /** Thread not running yet */ --# define RG_STATE_CHECK 116 -- /** Checking status */ --# define RG_STATE_ERROR 117 -- /** Recoverable error */ --# define RG_STATE_RECOVER 118 /** Pending recovery */ --# define RG_STATE_DISABLED 119 /** Resource not allowd to run */ --# define RG_STATE_MIGRATE 120 /** Resource migrating */ -- --# define DEFAULT_CHECK_INTERVAL 10 -- --/* Resource group flags (for now) */ --# define RG_FLAG_FROZEN (1<<0) -- /** Resource frozen */ --# define RG_FLAG_PARTIAL (1<<1) -- /** One or more non-critical -- resources offline */ -- --/* Return codes */ --# define RG_EEXCL -16 /* Service not runnable due to -- the fact that it is tagged -- exclusive and there are no -- empty nodes. */ --# define RG_EDOMAIN -15 /* Service not runnable given the -- set of nodes and its failover -- domain */ --# define RG_ESCRIPT -14 /* S/Lang script failed */ --# define RG_EFENCE -13 /* Fencing operation pending */ --# define RG_ENODE -12 /* Node is dead/nonexistent */ --# define RG_EFROZEN -11 /* Forward compat. 
with -HEAD */ --# define RG_ERUN -10 -- /* Service is already running */ --# define RG_EQUORUM -9 /* Operation requires quorum */ --# define RG_EINVAL -8 /* Invalid operation for resource */ --# define RG_EDEPEND -7 /* Operation violates dependency */ --# define RG_EAGAIN -6 /* Try again */ --# define RG_EDEADLCK -5 /* Aborted - would deadlock */ --# define RG_ENOSERVICE -4 /* Service does not exist */ --# define RG_EFORWARD -3 /* Service not mastered locally */ --# define RG_EABORT -2 /* Abort; service unrecoverable */ --# define RG_EFAIL -1 /* Generic failure */ --# define RG_ESUCCESS 0 --# define RG_YES 1 --# define RG_NO 2 -- --const char *rg_strerror(int val); -- --/* -- * Fail-over domain states -- */ --# define FOD_ILLEGAL 0 --# define FOD_GOOD 1 --# define FOD_BETTER 2 --# define FOD_BEST 3 -- --/* -- Fail-over domain flags -- */ --# define FOD_ORDERED (1<<0) --# define FOD_RESTRICTED (1<<1) --# define FOD_NOFAILBACK (1<<2) -- --/* -- Status tree flags -- */ --# define SFL_FAILURE (1<<0) --# define SFL_RECOVERABLE (1<<1) --# define SFL_PARTIAL (1<<2) -- --#endif -diff --git a/extra/rgmanager/reslist.c b/extra/rgmanager/reslist.c -deleted file mode 100644 -index c0b0699..0000000 ---- a/extra/rgmanager/reslist.c -+++ /dev/null -@@ -1,518 +0,0 @@ --/* -- Copyright Red Hat, Inc. 2004 -- -- This program is free software; you can redistribute it and/or modify it -- under the terms of the GNU General Public License as published by the -- Free Software Foundation; either version 2, or (at your option) any -- later version. -- -- This program is distributed in the hope that it will be useful, but -- WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -- General Public License for more details. -- -- You should have received a copy of the GNU General Public License -- along with this program; see the file COPYING. 
If not, write to the -- Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -- MA 02110-1301 USA --*/ --#include --#include --#include --#include --#include --#include --#include --#include --#include --#include --#include --#include -- --void --res_build_name(char *buf, size_t buflen, resource_t * res) --{ -- snprintf(buf, buflen, "%s:%s", res->r_rule->rr_type, res->r_attrs[0].ra_value); --} -- --/** -- Find and determine an attribute's value. -- -- @param res Resource node to look examine -- @param attrname Attribute to retrieve. -- @return value of attribute or NULL if not found -- */ --char * --res_attr_value(resource_t * res, const char *attrname) --{ -- resource_attr_t *ra; -- int x; -- -- for (x = 0; res->r_attrs && res->r_attrs[x].ra_name; x++) { -- if (strcmp(attrname, res->r_attrs[x].ra_name)) -- continue; -- -- ra = &res->r_attrs[x]; -- -- if (ra->ra_flags & RA_INHERIT) -- /* Can't check inherited resources */ -- return NULL; -- -- return ra->ra_value; -- } -- -- return NULL; --} -- --/** -- Find and determine an attribute's value. Takes into account inherited -- attribute flag, and append attribute flag, which isn't implemented yet. -- -- @param node Resource tree node to look examine -- @param attrname Attribute to retrieve. 
-- @param ptype Resource type to look for (if inheritance) -- @return value of attribute or NULL if not found -- */ --static char * --_attr_value(resource_node_t * node, const char *attrname, const char *ptype) --{ -- resource_t *res; -- resource_attr_t *ra; -- char *c, p_type[32]; -- ssize_t len; -- int x; -- -- if (!node) -- return NULL; -- -- res = node->rn_resource; -- -- /* Go up the tree if it's not the right parent type */ -- if (ptype && strcmp(res->r_rule->rr_type, ptype)) -- return _attr_value(node->rn_parent, attrname, ptype); -- -- for (x = 0; res->r_attrs && res->r_attrs[x].ra_name; x++) { -- if (strcmp(attrname, res->r_attrs[x].ra_name)) -- continue; -- -- ra = &res->r_attrs[x]; -- -- if (!(ra->ra_flags & RA_INHERIT)) -- return ra->ra_value; -- /* -- Handle resource_type%field to be more precise, so we -- don't have to worry about this being a child -- of an unexpected type. E.g. lots of things have the -- "name" attribute. -- */ -- c = strchr(ra->ra_value, '%'); -- if (!c) { -- /* Someone doesn't care or uses older -- semantics on inheritance */ -- return _attr_value(node->rn_parent, ra->ra_value, NULL); -- } -- -- len = (c - ra->ra_value); -- memset(p_type, 0, sizeof(p_type)); -- memcpy(p_type, ra->ra_value, len); -- -- /* Skip the "%" and recurse */ -- return _attr_value(node->rn_parent, ++c, p_type); -- } -- -- return NULL; --} -- --char * --attr_value(resource_node_t * node, const char *attrname) --{ -- return _attr_value(node, attrname, NULL); --} -- --char * --primary_attr_value(resource_t * res) --{ -- int x; -- resource_attr_t *ra; -- -- for (x = 0; res->r_attrs && res->r_attrs[x].ra_name; x++) { -- ra = &res->r_attrs[x]; -- -- if (!(ra->ra_flags & RA_PRIMARY)) -- continue; -- -- return ra->ra_value; -- } -- -- return NULL; --} -- --/** -- Find a resource given its reference. A reference is the value of the -- primary attribute. -- -- @param reslist List of resources to traverse. -- @param type Type of resource to look for. 
-- @param ref Reference -- @return Resource matching type/ref or NULL if none. -- */ --resource_t * --find_resource_by_ref(resource_t ** reslist, char *type, char *ref) --{ -- resource_t *curr; -- int x; -- -- list_do(reslist, curr) { -- if (strcmp(curr->r_rule->rr_type, type)) -- continue; -- -- /* -- This should be one operation - the primary attr -- is generally at the head of the array. -- */ -- for (x = 0; curr->r_attrs && curr->r_attrs[x].ra_name; x++) { -- if (!(curr->r_attrs[x].ra_flags & RA_PRIMARY)) -- continue; -- if (strcmp(ref, curr->r_attrs[x].ra_value)) -- continue; -- -- return curr; -- } -- } -- while (!list_done(reslist, curr)) ; -- -- return NULL; --} -- --/** -- Store a resource in the resource list if it's legal to do so. -- Otherwise, don't store it. -- Note: This function needs to be rewritten; it's way too long and way -- too indented. -- -- @param reslist Resource list to store the new resource. -- @param newres Resource to store -- @return 0 on succes; nonzero on failure. -- */ --int --store_resource(resource_t ** reslist, resource_t * newres) --{ -- resource_t *curr; -- int x, y; -- -- if (!*reslist) { -- /* first resource */ -- list_insert(reslist, newres); -- return 0; -- } -- -- list_do(reslist, curr) { -- -- if (strcmp(curr->r_rule->rr_type, newres->r_rule->rr_type)) -- continue; -- -- for (x = 0; newres->r_attrs && newres->r_attrs[x].ra_name; x++) { -- /* -- Look for conflicting primary/unique keys -- */ -- if (!(newres->r_attrs[x].ra_flags & (RA_PRIMARY | RA_UNIQUE))) -- continue; -- -- for (y = 0; curr->r_attrs[y].ra_name; y++) { -- if (curr->r_attrs[y].ra_flags & RA_INHERIT) -- continue; -- -- if (strcmp(curr->r_attrs[y].ra_name, newres->r_attrs[x].ra_name)) -- continue; -- if (!strcmp(curr->r_attrs[y].ra_value, newres->r_attrs[x].ra_value)) { -- /* -- Unique/primary is not unique -- */ -- fprintf(stderr, -- "%s attribute collision. " -- "type=%s attr=%s value=%s\n", -- (newres->r_attrs[x].ra_flags & -- RA_PRIMARY) ? 
"Primary" : -- "Unique", -- newres->r_rule->rr_type, -- newres->r_attrs[x].ra_name, newres->r_attrs[x].ra_value); -- return -1; -- } -- break; -- } -- } -- } -- while (!list_done(reslist, curr)) ; -- -- list_insert(reslist, newres); -- return 0; --} -- --/** -- Obliterate a resource_t structure. -- -- @param res Resource to free. -- */ --void --destroy_resource(resource_t * res) --{ -- int x; -- -- if (res->r_name) -- free(res->r_name); -- -- if (res->r_attrs) { -- for (x = 0; res->r_attrs && res->r_attrs[x].ra_name; x++) { -- free(res->r_attrs[x].ra_name); -- free(res->r_attrs[x].ra_value); -- } -- -- free(res->r_attrs); -- } -- -- if (res->r_actions) { -- /* Don't free the strings; they're part of the rule */ -- free(res->r_actions); -- } -- -- free(res); --} -- --/** -- Obliterate a resource_t list. -- -- @param list Resource list to free. -- */ --void --destroy_resources(resource_t ** list) --{ -- resource_t *res; -- -- while ((res = *list)) { -- list_remove(list, res); -- destroy_resource(res); -- } --} -- --void * --act_dup(resource_act_t * acts) --{ -- int x; -- resource_act_t *newacts; -- -- for (x = 0; acts[x].ra_name; x++) ; -- -- ++x; -- x *= sizeof(resource_act_t); -- -- newacts = malloc(x); -- if (!newacts) -- return NULL; -- -- memcpy(newacts, acts, x); -- -- return newacts; --} -- --/* Copied from resrules.c -- _get_actions */ --static void --_get_actions_ccs(const char *base, resource_t * res) --{ -- char xpath[256]; -- int idx = 0; -- char *act, *ret; -- int interval, timeout, depth; -- -- do { -- /* setting these to -1 prevents overwriting with 0 */ -- interval = -1; -- depth = -1; -- act = NULL; -- timeout = -1; -- -- snprintf(xpath, sizeof(xpath), "%s/action[%d]/@name", base, ++idx); -- -- if (conf_get(xpath, &act) != 0) -- break; -- -- snprintf(xpath, sizeof(xpath), "%s/action[%d]/@timeout", base, idx); -- if (conf_get(xpath, &ret) == 0 && ret) { -- timeout = expand_time(ret); -- if (timeout < 0) -- timeout = 0; -- free(ret); -- } -- -- 
snprintf(xpath, sizeof(xpath), "%s/action[%d]/@interval", base, idx); -- if (conf_get(xpath, &ret) == 0 && ret) { -- interval = expand_time(ret); -- if (interval < 0) -- interval = 0; -- free(ret); -- } -- -- if (!strcmp(act, "status") || !strcmp(act, "monitor")) { -- snprintf(xpath, sizeof(xpath), "%s/action[%d]/@depth", base, idx); -- if (conf_get(xpath, &ret) == 0 && ret) { -- depth = atoi(ret); -- if (depth < 0) -- depth = 0; -- -- /* */ -- if (ret[0] == '*') -- depth = -1; -- free(ret); -- } -- } -- -- if (store_action(&res->r_actions, act, depth, timeout, interval) != 0) -- free(act); -- } while (1); --} -- --/** -- Try to load all the attributes in our rule set. If none are found, -- or an error occurs, return NULL and move on to the next one. -- -- @param rule Resource rule set to use when looking for data -- @param base Base XPath path to start with. -- @return New resource if legal or NULL on failure/error -- */ --resource_t * --load_resource(resource_rule_t * rule, const char *base) --{ -- resource_t *res; -- char ccspath[1024]; -- char *attrname, *attr; -- int x, found = 0, flags; -- -- res = malloc(sizeof(*res)); -- if (!res) { -- fprintf(stderr, "Out of memory\n"); -- return NULL; -- } -- -- memset(res, 0, sizeof(*res)); -- res->r_rule = rule; -- -- for (x = 0; res->r_rule->rr_attrs && res->r_rule->rr_attrs[x].ra_name; x++) { -- -- flags = rule->rr_attrs[x].ra_flags; -- attrname = strdup(rule->rr_attrs[x].ra_name); -- if (!attrname) { -- destroy_resource(res); -- return NULL; -- } -- -- /* -- Ask CCS for the respective attribute -- */ -- attr = NULL; -- snprintf(ccspath, sizeof(ccspath), "%s/@%s", base, attrname); -- -- if (conf_get(ccspath, &attr) != 0) { -- -- if (flags & (RA_REQUIRED | RA_PRIMARY)) { -- /* Missing required attribute. We're done. */ -- free(attrname); -- destroy_resource(res); -- return NULL; -- } -- -- if (!(flags & RA_INHERIT)) { -- /* -- If we don't have the inherit flag, see if -- we have a value anyway. 
If we do, -- this value is the default value, and -- should be used. -- */ -- if (!rule->rr_attrs[x].ra_value) { -- free(attrname); -- continue; -- } -- -- /* Copy default value from resource rule */ -- attr = strdup(rule->rr_attrs[x].ra_value); -- } -- } -- -- found = 1; -- -- /* -- If we are supposed to inherit and we don't have an -- instance of the specified attribute in CCS, then we -- keep the inherit flag and use it as the attribute. -- -- However, if we _do_ have the attribute for this instance, -- we drop the inherit flag and use the attribute. -- */ -- if (flags & RA_INHERIT) { -- if (attr) { -- flags &= ~RA_INHERIT; -- } else { -- attr = strdup(rule->rr_attrs[x].ra_value); -- if (!attr) { -- destroy_resource(res); -- free(attrname); -- return NULL; -- } -- } -- } -- -- /* -- Store the attribute. We'll ensure all required -- attributes are present soon. -- */ -- if (attrname && attr) -- store_attribute(&res->r_attrs, attrname, attr, flags); -- } -- -- if (!found) { -- destroy_resource(res); -- return NULL; -- } -- -- res->r_actions = act_dup(rule->rr_actions); -- _get_actions_ccs(base, res); -- -- return res; --} -- --/** -- Read all resources in the resource manager block in CCS. -- -- @param reslist Empty list to fill with resources. -- @param rulelist List of rules to use when searching CCS. -- @return 0 on success, nonzero on failure. 
-- */ --int --load_resources(resource_t ** reslist, resource_rule_t ** rulelist) --{ -- int resID = 0; -- resource_t *newres; -- resource_rule_t *currule; -- char tok[256]; -- -- list_do(rulelist, currule) { -- -- for (resID = 1;; resID++) { -- snprintf(tok, sizeof(tok), RESOURCE_BASE "/%s[%d]", currule->rr_type, resID); -- -- newres = load_resource(currule, tok); -- if (!newres) -- break; -- -- if (store_resource(reslist, newres) != 0) { -- fprintf(stderr, "Error storing %s resource\n", newres->r_rule->rr_type); -- -- destroy_resource(newres); -- } -- -- /* Just information */ -- newres->r_flags = RF_DEFINED; -- } -- } -- while (!list_done(rulelist, currule)) ; -- -- return 0; --} -diff --git a/extra/rgmanager/reslist.h b/extra/rgmanager/reslist.h -deleted file mode 100644 -index 07c84fe..0000000 ---- a/extra/rgmanager/reslist.h -+++ /dev/null -@@ -1,205 +0,0 @@ --/* -- Copyright Red Hat, Inc. 2004 -- -- This program is free software; you can redistribute it and/or modify it -- under the terms of the GNU General Public License as published by the -- Free Software Foundation; either version 2, or (at your option) any -- later version. -- -- This program is distributed in the hope that it will be useful, but -- WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -- General Public License for more details. -- -- You should have received a copy of the GNU General Public License -- along with this program; see the file COPYING. 
If not, write to the -- Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -- MA 02110-1301 USA --*/ --#ifndef _RESLIST_H --# define _RESLIST_H -- --# include --# include --# include --# include -- --# define RA_PRIMARY (1<<0) /** Primary key */ --# define RA_UNIQUE (1<<1) /** Unique for given type */ --# define RA_REQUIRED (1<<2) /** Required (or an error if not present */ --# define RA_INHERIT (1<<3) /** Inherit a parent resource's attr */ --# define RA_RECONFIG (1<<4) /** Allow inline reconfiguration */ -- --# define RF_INLINE (1<<0) --# define RF_DEFINED (1<<1) --# define RF_NEEDSTART (1<<2) /** Used when adding/changing resources */ --# define RF_NEEDSTOP (1<<3) /** Used when deleting/changing resources */ --# define RF_COMMON (1<<4) /** " */ --# define RF_INDEPENDENT (1<<5) -- /** Define this for a resource if it is -- otherwise an independent subtree */ --# define RF_RECONFIG (1<<6) -- --# define RF_INIT (1<<7) -- /** Resource rule: Initialize this resource -- class on startup */ --# define RF_DESTROY (1<<8) /** Resource rule flag: Destroy this -- resource class if you delete it from -- the configuration */ --# define RF_ENFORCE_TIMEOUTS (1<<9) -- /** Enforce timeouts for this node */ --# define RF_NON_CRITICAL (1<<10) -- /** stop this resource if it fails */ --# define RF_QUIESCE (1<<11) /** don't restart this resource */ -- --# define RES_STOPPED (0) --# define RES_STARTED (1) --# define RES_FAILED (2) --# define RES_DISABLED (3) -- --# ifndef SHAREDIR --# define SHAREDIR "/usr/share/cluster" --# endif -- --# define RESOURCE_ROOTDIR SHAREDIR --# define RESOURCE_TREE_ROOT "//rm" --# define RESOURCE_BASE RESOURCE_TREE_ROOT "/resources" --# define RESOURCE_ROOT_FMT RESOURCE_TREE_ROOT "/%s[%d]" -- --# define RESOURCE_MAX_LEVELS 100 -- --/* Include OCF definitions */ --//#include -- --typedef struct _resource_attribute { -- char *ra_name; -- char *ra_value; -- int ra_flags; -- int _pad_; --} resource_attr_t; -- --typedef struct _resource_child 
{ -- char *rc_name; -- int rc_startlevel; -- int rc_stoplevel; -- int rc_forbid; -- int rc_flags; --} resource_child_t; -- --typedef struct _resource_act { -- char *ra_name; -- time_t ra_timeout; -- time_t ra_last; -- time_t ra_interval; -- int ra_depth; -- int _pad_; --} resource_act_t; -- --typedef struct _resource_rule { -- list_head(); -- char *rr_type; -- char *rr_agent; -- char *rr_version; /** agent XML spec version; OCF-ism */ -- int rr_flags; -- int rr_maxrefs; -- resource_attr_t *rr_attrs; -- resource_child_t *rr_childtypes; -- resource_act_t *rr_actions; --} resource_rule_t; -- --typedef struct _resource { -- list_head(); -- resource_rule_t *r_rule; -- char *r_name; -- resource_attr_t *r_attrs; -- resource_act_t *r_actions; -- int r_flags; -- int r_refs; -- int r_incarnations; /** Number of instances running locally */ -- int _pad_; /* align */ --} resource_t; -- --typedef struct _rg_node { -- list_head(); -- struct _rg_node *rn_child, *rn_parent; -- resource_t *rn_resource; -- resource_act_t *rn_actions; -- int rn_state; /* State of this instance of rn_resource */ -- int rn_flags; -- int rn_last_status; -- int rn_last_depth; -- int rn_checked; -- int rn_pad; --} resource_node_t; -- --typedef struct _fod_node { -- list_head(); -- char *fdn_name; -- int fdn_prio; -- int fdn_nodeid; /* on rhel4 this will be 64-bit int */ --} fod_node_t; -- --typedef struct _fod { -- list_head(); -- char *fd_name; -- fod_node_t *fd_nodes; -- int fd_flags; -- int _pad_; /* align */ --} fod_t; -- --/* -- Exported Functions -- */ --int res_flatten(xmlNode ** n, xmlNode * r, resource_node_t ** tree, resource_t * res); -- --int expand_time(char *val); --int store_action(resource_act_t ** actsp, char *name, int depth, int timeout, int interval); -- --/* -- Load/kill resource rule sets -- */ --int load_resource_rules(const char *rpath, resource_rule_t ** rules); --void destroy_resource_rules(resource_rule_t ** rules); -- --/* -- Load/kill resource sets -- */ --int 
load_resources(resource_t ** reslist, resource_rule_t ** rulelist); --void dump_resources(FILE * fp, resource_t ** reslist); --void destroy_resources(resource_t ** list); -- --/* -- Construct/deconstruct resource trees -- */ --int build_resource_tree(resource_node_t ** tree, -- resource_rule_t ** rulelist, resource_t ** reslist); --void destroy_resource_tree(resource_node_t ** tree); -- --/* -- Construct/deconstruct failover domains -- */ --int construct_domains(fod_t ** domains); --void deconstruct_domains(fod_t ** domains); -- --/* -- Handy functions -- */ --resource_t *find_resource_by_ref(resource_t ** reslist, char *type, char *ref); --resource_rule_t *find_rule_by_type(resource_rule_t ** rulelist, char *type); --void res_build_name(char *, size_t, resource_t *); -- --/* -- Internal functions; shouldn't be needed. -- */ --int store_attribute(resource_attr_t ** attrsp, char *name, char *value, int flags); -- --resource_t *load_resource(resource_rule_t * rule, const char *base); --int store_resource(resource_t ** reslist, resource_t * newres); --void destroy_resource(resource_t * res); -- --char *attr_value(resource_node_t * node, const char *attrname); --char *res_attr_value(resource_t * res, const char *attrname); --char *primary_attr_value(resource_t *); --void *act_dup(resource_act_t * acts); -- --#endif /* _RESLIST_H */ -diff --git a/extra/rgmanager/resrules.c b/extra/rgmanager/resrules.c -deleted file mode 100644 -index cb003c3..0000000 ---- a/extra/rgmanager/resrules.c -+++ /dev/null -@@ -1,971 +0,0 @@ --/* -- Copyright Red Hat, Inc. 2004-2010 -- -- This program is free software; you can redistribute it and/or modify it -- under the terms of the GNU General Public License as published by the -- Free Software Foundation; either version 2, or (at your option) any -- later version. 
-- -- This program is distributed in the hope that it will be useful, but -- WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -- General Public License for more details. -- -- You should have received a copy of the GNU General Public License -- along with this program; see the file COPYING. If not, write to the -- Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -- MA 02110-1301 USA --*/ --#include --#include --#include --#include --#include --#include --#include --#include --#include --#include --#include --#include --#include --#include --#include --#include -- --/** -- Store a new resource rule in the given rule list. -- -- @param rulelist List of rules to store new rule in. -- @param newrule New rule to store. -- @return 0 on success or -1 if rule with same name -- already exists in rulelist -- */ --static int --store_rule(resource_rule_t ** rulelist, resource_rule_t * newrule) --{ -- resource_rule_t *curr; -- -- list_do(rulelist, curr) { -- if (!strcmp(newrule->rr_type, curr->rr_type)) { -- fprintf(stderr, "Error storing %s: Duplicate\n", newrule->rr_type); -- return -1; -- } -- -- } -- while (!list_done(rulelist, curr)) ; -- -- list_insert(rulelist, newrule); -- return 0; --} -- --/** -- Obliterate a resource_rule_t structure. -- -- @param rr Resource rule to free. 
-- */ --static void --destroy_resource_rule(resource_rule_t * rr) --{ -- int x; -- -- if (rr->rr_type) -- free(rr->rr_type); -- if (rr->rr_agent) -- free(rr->rr_agent); -- if (rr->rr_version) -- free(rr->rr_version); -- -- if (rr->rr_attrs) { -- for (x = 0; rr->rr_attrs && rr->rr_attrs[x].ra_name; x++) { -- free(rr->rr_attrs[x].ra_name); -- if (rr->rr_attrs[x].ra_value) -- free(rr->rr_attrs[x].ra_value); -- } -- -- free(rr->rr_attrs); -- } -- -- if (rr->rr_actions) { -- for (x = 0; rr->rr_actions && rr->rr_actions[x].ra_name; x++) { -- free(rr->rr_actions[x].ra_name); -- } -- -- free(rr->rr_actions); -- } -- -- if (rr->rr_childtypes) { -- for (x = 0; rr->rr_childtypes && rr->rr_childtypes[x].rc_name; x++) -- free(rr->rr_childtypes[x].rc_name); -- free(rr->rr_childtypes); -- } -- -- free(rr); --} -- --/** -- Destroy a list of resource rules. -- -- @param rules List of rules to destroy. -- */ --void --destroy_resource_rules(resource_rule_t ** rules) --{ -- resource_rule_t *rr; -- -- while ((rr = *rules)) { -- list_remove(rules, rr); -- destroy_resource_rule(rr); -- } --} -- --/** -- Get and store the maxparents (max instances) attribute for a given -- resource rule set. -- -- @param doc Pre-parsed XML document pointer. -- @param ctx Pre-allocated XML XPath context pointer. -- @param base XPath prefix to search -- @param rr Resource rule to store new information in. -- */ --static void --_get_maxparents(xmlDocPtr doc, xmlXPathContextPtr ctx, char *base, resource_rule_t * rr) --{ -- char xpath[256]; -- char *ret = NULL; -- -- snprintf(xpath, sizeof(xpath), "%s/attributes/@maxinstances", base); -- ret = xpath_get_one(doc, ctx, xpath); -- if (ret) { -- rr->rr_maxrefs = atoi(ret); -- if (rr->rr_maxrefs < 0) -- rr->rr_maxrefs = 0; -- free(ret); -- } --} -- --/** -- Get and store a bit field. -- -- @param doc Pre-parsed XML document pointer. -- @param ctx Pre-allocated XML XPath context pointer. 
-- @param base XPath prefix to search -- @param rr Resource rule to store new information in. -- */ --static void --_get_rule_flag(xmlDocPtr doc, xmlXPathContextPtr ctx, const char *base, -- resource_rule_t * rr, const char *flag, int bit) --{ -- char xpath[256]; -- char *ret = NULL; -- -- snprintf(xpath, sizeof(xpath), "%s/attributes/@%s", base, flag); -- ret = xpath_get_one(doc, ctx, xpath); -- if (ret) { -- if (atoi(ret)) { -- rr->rr_flags |= bit; -- } else { -- rr->rr_flags &= ~bit; -- } -- free(ret); -- } --} -- --/** -- Get and store the version -- -- @param doc Pre-parsed XML document pointer. -- @param ctx Pre-allocated XML XPath context pointer. -- @param base XPath prefix to search -- @param rr Resource rule to store new information in. -- */ --static void --_get_version(xmlDocPtr doc, xmlXPathContextPtr ctx, char *base, resource_rule_t * rr) --{ -- char xpath[256]; -- char *ret = NULL; -- -- snprintf(xpath, sizeof(xpath), "%s/@version", base); -- ret = xpath_get_one(doc, ctx, xpath); -- if (ret) { -- rr->rr_version = ret; -- free(ret); -- } -- rr->rr_version = NULL; --} -- --int --expand_time(char *val) --{ -- int curval, len; -- int ret = 0; -- char *start = val, ival[16]; -- -- if (!val) -- return (time_t) 0; -- -- while (start[0]) { -- -- len = 0; -- curval = 0; -- memset(ival, 0, sizeof(ival)); -- -- while (isdigit(start[len])) { -- ival[len] = start[len]; -- len++; -- } -- -- if (len) { -- curval = atoi(ival); -- } else { -- len = 1; -- } -- -- switch (start[len]) { -- case 0: -- case 'S': -- case 's': -- break; -- case 'M': -- case 'm': -- curval *= 60; -- break; -- case 'h': -- case 'H': -- curval *= 3600; -- break; -- case 'd': -- case 'D': -- curval *= 86400; -- break; -- case 'w': -- case 'W': -- curval *= 604800; -- break; -- case 'y': -- case 'Y': -- curval *= 31536000; -- break; -- default: -- curval = 0; -- } -- -- ret += (time_t) curval; -- start += len; -- } -- -- return ret; --} -- --/** -- * Store a resource action -- * @param actsp 
Action array; may be modified and returned! -- * @param name Name of the action -- * @param depth Resource depth (status/monitor; -1 means *ALL LEVELS* -- * ... this means that only the highest-level check depth -- * will ever be performed!) -- * @param timeout Timeout (not used) -- * @param interval Time interval for status/monitor -- * @return 0 on success, -1 on failure -- * -- */ --int --store_action(resource_act_t ** actsp, char *name, int depth, int timeout, int interval) --{ -- int x = 0, replace = 0; -- resource_act_t *acts = *actsp; -- -- if (!name) -- return -1; -- -- if (depth < 0 && timeout < 0 && interval < 0) -- return -1; -- -- if (!acts) { -- /* Can't create with anything < 0 */ -- if (depth < 0 || timeout < 0 || interval < 0) -- return -1; -- -- acts = malloc(sizeof(resource_act_t) * 2); -- if (!acts) -- return -1; -- acts[0].ra_name = name; -- acts[0].ra_depth = depth; -- acts[0].ra_timeout = timeout; -- acts[0].ra_interval = interval; -- acts[0].ra_last = 0; -- acts[1].ra_name = NULL; -- -- *actsp = acts; -- return 0; -- } -- -- for (x = 0; acts[x].ra_name; x++) { -- if (!strcmp(acts[x].ra_name, name) && (depth == acts[x].ra_depth || depth == -1)) { -- fprintf(stderr, "Replacing action '%s' depth %d: ", name, acts[x].ra_depth); -- if (timeout >= 0) { -- fprintf(stderr, "timeout: %d->%d ", (int)acts[x].ra_timeout, (int)timeout); -- acts[x].ra_timeout = timeout; -- } -- if (interval >= 0) { -- fprintf(stderr, "interval: %d->%d", (int)acts[x].ra_interval, (int)interval); -- acts[x].ra_interval = interval; -- } -- fprintf(stderr, "\n"); -- replace = 1; -- } -- } -- -- if (replace) -- /* If we replaced something, we're done */ -- return 1; -- -- /* Can't create with anything < 0 */ -- if (depth < 0 || timeout < 0 || interval < 0) -- return -1; -- -- acts = realloc(acts, sizeof(resource_act_t) * (x + 2)); -- if (!acts) -- return -1; -- -- acts[x].ra_name = name; -- acts[x].ra_depth = depth; -- acts[x].ra_timeout = timeout; -- acts[x].ra_interval = 
interval; -- acts[x].ra_last = 0; -- -- acts[x + 1].ra_name = NULL; -- -- *actsp = acts; -- return 0; --} -- --static void --_get_actions(xmlDocPtr doc, xmlXPathContextPtr ctx, char *base, resource_rule_t * rr) --{ -- char xpath[256]; -- int idx = 0; -- char *act, *ret; -- int interval, timeout, depth; -- -- do { -- interval = 0; -- depth = 0; -- act = NULL; -- timeout = 0; -- -- snprintf(xpath, sizeof(xpath), "%s/action[%d]/@name", base, ++idx); -- -- act = xpath_get_one(doc, ctx, xpath); -- if (!act) -- break; -- -- snprintf(xpath, sizeof(xpath), "%s/action[%d]/@timeout", base, idx); -- ret = xpath_get_one(doc, ctx, xpath); -- if (ret) { -- timeout = expand_time(ret); -- if (timeout < 0) -- timeout = 0; -- free(ret); -- } -- -- snprintf(xpath, sizeof(xpath), "%s/action[%d]/@interval", base, idx); -- ret = xpath_get_one(doc, ctx, xpath); -- if (ret) { -- interval = expand_time(ret); -- if (interval < 0) -- interval = 0; -- free(ret); -- } -- -- if (!strcmp(act, "status") || !strcmp(act, "monitor")) { -- snprintf(xpath, sizeof(xpath), "%s/action[%d]/@depth", base, idx); -- ret = xpath_get_one(doc, ctx, xpath); -- if (ret) { -- depth = atoi(ret); -- if (depth < 0) -- depth = 0; -- free(ret); -- } -- } -- -- if (store_action(&rr->rr_actions, act, depth, timeout, interval) != 0) -- free(act); -- } while (1); --} -- --/** -- Store an attribute with the given name, value, and flags in a resource_t -- structure. -- XXX This could be rewritten to use the list macros. -- -- @param attrsp Attribute array to store new attribute in. -- @param name Name of attribute (must be non-null) -- @param value Value of attribute -- @param flags Attribute flags, or 0 if none. 
-- @return 0 on success, nonzero on error/failure -- */ --int --store_attribute(resource_attr_t ** attrsp, char *name, char *value, int flags) --{ -- int x = 0; -- resource_attr_t *attrs = *attrsp; -- -- if (!name) -- return -1; -- -- if (!attrs) { -- attrs = malloc(sizeof(resource_attr_t) * 2); -- if (!attrs) -- return -1; -- attrs[0].ra_name = name; -- attrs[0].ra_value = value; -- attrs[0].ra_flags = flags; -- attrs[1].ra_name = NULL; -- attrs[1].ra_value = NULL; -- -- *attrsp = attrs; -- return 0; -- } -- -- for (x = 0; attrs[x].ra_name; x++) ; -- -- attrs = realloc(attrs, sizeof(resource_attr_t) * (x + 2)); -- if (!attrs) -- return -1; -- -- /* Primary attribute goes first. This makes this interaction -- with CCS work way faster. */ -- if (flags & RA_PRIMARY) { -- attrs[x].ra_name = attrs[0].ra_name; -- attrs[x].ra_value = attrs[0].ra_value; -- attrs[x].ra_flags = attrs[0].ra_flags; -- attrs[0].ra_name = name; -- attrs[0].ra_value = value; -- attrs[0].ra_flags = flags; -- } else { -- attrs[x].ra_name = name; -- attrs[x].ra_value = value; -- attrs[x].ra_flags = flags; -- } -- attrs[x + 1].ra_name = NULL; -- attrs[x + 1].ra_value = NULL; -- -- *attrsp = attrs; -- return 0; --} -- --/** -- Store a child type in the child array of a resource rule. -- XXX Could be rewritten to use list macros. -- -- @param childp Child array. Might be modified. 
-- @param name Name of child type -- @param start Start level -- @param stop Stop level -- @param forbid Do NOT allow this child type to exist -- @param flags set to 1 to note that it was defined inline -- @return 0 on success, nonzero on failure -- */ --static int --store_childtype(resource_child_t ** childp, char *name, int start, int stop, int forbid, int flags) --{ -- int x = 0; -- resource_child_t *child = *childp; -- -- if (!name) -- return -1; -- -- if (!child) { -- child = malloc(sizeof(resource_child_t) * 2); -- if (!child) -- return -1; -- child[0].rc_name = name; -- child[0].rc_startlevel = start; -- child[0].rc_stoplevel = stop; -- child[0].rc_forbid = forbid; -- child[0].rc_flags = flags; -- child[1].rc_name = NULL; -- -- *childp = child; -- return 0; -- } -- -- for (x = 0; child[x].rc_name; x++) ; -- -- child = realloc(child, sizeof(resource_child_t) * (x + 2)); -- if (!child) -- return -1; -- -- child[x].rc_name = name; -- child[x].rc_startlevel = start; -- child[x].rc_stoplevel = stop; -- child[x].rc_forbid = forbid; -- child[x].rc_flags = flags; -- child[x + 1].rc_name = NULL; -- -- *childp = child; -- return 0; --} -- --/** -- Get and store attributes for a given instance of a resource rule. -- -- @param doc Pre-parsed XML document pointer. -- @param ctx Pre-allocated XML XPath context pointer. -- @param base XPath prefix to search -- @param rr Resource rule to store new information in. -- @return 0 -- */ --static int --_get_rule_attrs(xmlDocPtr doc, xmlXPathContextPtr ctx, const char *base, resource_rule_t * rr) --{ -- char *ret, *attrname, *dflt = NULL, xpath[256]; -- int x, flags, primary_found = 0; -- -- for (x = 1; 1; x++) { -- snprintf(xpath, sizeof(xpath), "%s/parameter[%d]/@name", base, x); -- -- ret = xpath_get_one(doc, ctx, xpath); -- if (!ret) -- break; -- -- flags = 0; -- attrname = ret; -- -- /* -- See if there's a default value. 
-- */ -- snprintf(xpath, sizeof(xpath), "%s/parameter[%d]/content/@default", base, x); -- dflt = xpath_get_one(doc, ctx, xpath); -- -- /* -- See if this is either the primary identifier or -- a required field. -- */ -- snprintf(xpath, sizeof(xpath), "%s/parameter[%d]/@required", base, x); -- if ((ret = xpath_get_one(doc, ctx, xpath))) { -- if ((atoi(ret) != 0) || (ret[0] == 'y')) -- flags |= RA_REQUIRED; -- free(ret); -- } -- -- /* -- See if this is supposed to be unique -- */ -- snprintf(xpath, sizeof(xpath), "%s/parameter[%d]/@unique", base, x); -- if ((ret = xpath_get_one(doc, ctx, xpath))) { -- if ((atoi(ret) != 0) || (ret[0] == 'y')) -- flags |= RA_UNIQUE; -- free(ret); -- } -- -- snprintf(xpath, sizeof(xpath), "%s/parameter[%d]/@primary", base, x); -- if ((ret = xpath_get_one(doc, ctx, xpath))) { -- if ((atoi(ret) != 0) || (ret[0] == 'y')) { -- if (primary_found) { -- free(ret); -- fprintf(stderr, "Multiple primary " -- "definitions for " "resource type %s\n", rr->rr_type); -- return -1; -- } -- flags |= RA_PRIMARY; -- primary_found = 1; -- } -- free(ret); -- } -- -- /* -- See if this can be reconfigured on the fly without a -- stop/start -- */ -- snprintf(xpath, sizeof(xpath), "%s/parameter[%d]/@reconfig", base, x); -- if ((ret = xpath_get_one(doc, ctx, xpath))) { -- if ((atoi(ret) != 0) || (ret[0] == 'y')) -- flags |= RA_RECONFIG; -- free(ret); -- } -- -- /* -- See if this is supposed to be inherited -- */ -- snprintf(xpath, sizeof(xpath), "%s/parameter[%d]/@inherit", base, x); -- if ((ret = xpath_get_one(doc, ctx, xpath))) { -- flags |= RA_INHERIT; -- -- if (flags & (RA_REQUIRED | RA_PRIMARY | RA_UNIQUE)) { -- free(ret); -- fprintf(stderr, "Can not inherit and be primary, " "unique, or required\n"); -- return -1; -- } -- /* -- don't free ret. Store as attr value. If we had -- a default value specified from above, free it; -- inheritance supercedes a specified default value. 
-- */ -- if (dflt) -- free(dflt); -- } else { -- /* -- Use default value, if specified, as the attribute -- value. -- */ -- ret = dflt; -- } -- -- /* -- Store the attribute. We'll ensure all required -- attributes are present soon. -- */ -- if (attrname) -- store_attribute(&rr->rr_attrs, attrname, ret, flags); -- } -- -- return 0; --} -- --/** -- Get and store attributes for a given instance of a resource. -- -- @param doc Pre-parsed XML document pointer. -- @param ctx Pre-allocated XML XPath context pointer. -- @param base XPath prefix to search -- @param rr Resource rule to store new information in. -- @return 0 -- */ --static int --_get_childtypes(xmlDocPtr doc, xmlXPathContextPtr ctx, char *base, resource_rule_t * rr) --{ -- char *ret, *childname, xpath[256]; -- int x, startlevel = 0, stoplevel = 0, forbid = 0; -- -- for (x = 1; 1; x++) { -- snprintf(xpath, sizeof(xpath), "%s/child[%d]/@type", base, x); -- -- ret = xpath_get_one(doc, ctx, xpath); -- if (!ret) -- break; -- -- startlevel = stoplevel = forbid = 0; -- childname = ret; -- -- /* -- Try to get the start level if it exists -- */ -- snprintf(xpath, sizeof(xpath), "%s/child[%d]/@start", base, x); -- if ((ret = xpath_get_one(doc, ctx, xpath))) { -- startlevel = atoi(ret); -- free(ret); -- } -- -- /* -- Try to get the stop level if it exists -- */ -- snprintf(xpath, sizeof(xpath), "%s/child[%d]/@stop", base, x); -- if ((ret = xpath_get_one(doc, ctx, xpath))) { -- stoplevel = atoi(ret); -- free(ret); -- } -- -- /* -- Get the 'forbidden' flag if it exists -- */ -- snprintf(xpath, sizeof(xpath), "%s/child[%d]/@forbid", base, x); -- if ((ret = xpath_get_one(doc, ctx, xpath))) { -- forbid = atoi(ret); -- free(ret); -- } -- -- /* -- Store the attribute. We'll ensure all required -- attributes are present soon. -- */ -- if (childname) -- store_childtype(&rr->rr_childtypes, childname, startlevel, stoplevel, forbid, 0); -- } -- -- return 0; --} -- --/** -- Read a file from a stdout pipe. 
-- */ --static int --read_pipe(int fd, char **file, size_t * length) --{ -- char buf[4096]; -- int n, done = 0; -- -- *file = NULL; -- *length = 0; -- -- while (!done) { -- -- n = read(fd, buf, sizeof(buf)); -- if (n < 0) { -- -- if (errno == EINTR) -- continue; -- -- if (*file) -- free(*file); -- return -1; -- } -- -- if (n == 0 && (!*length)) -- return 0; -- -- if (n == 0) { -- done = 1; -- } -- -- if (*file) -- *file = realloc(*file, (*length) + n + done); -- else -- *file = malloc(n + done); -- -- if (!*file) -- return -1; -- -- memcpy((*file) + (*length), buf, n); -- *length += (done + n); -- } -- -- /* Null terminator */ -- (*file)[(*length) - 1] = 0; -- -- return 0; --} -- --static xmlDocPtr --read_resource_agent_metadata(char *filename) --{ -- int pid; -- int _pipe[2]; -- char *data; -- size_t size; -- xmlDocPtr doc; -- -- if (pipe(_pipe) == -1) -- return NULL; -- -- pid = fork(); -- if (pid == -1) { -- close(_pipe[0]); -- close(_pipe[1]); -- } -- -- if (pid == 0) { -- /* child */ -- close(0); -- close(1); -- close(2); -- -- close(_pipe[0]); -- dup2(_pipe[1], 1); -- close(_pipe[1]); -- -- /* exec */ -- execl(filename, filename, "meta-data", NULL); -- exit(1); -- } -- -- close(_pipe[1]); -- /* parent */ -- if (read_pipe(_pipe[0], &data, &size) == -1) { -- close(_pipe[0]); -- return NULL; -- } -- -- waitpid(pid, NULL, 0); -- close(_pipe[0]); -- -- if (!size) -- return NULL; -- -- doc = xmlParseMemory(data, size); -- free(data); -- return doc; --} -- --/** -- Load the XML rule set for a resource and store attributes, constructing -- a new resource_t structure. 
-- -- @param filename File name to load rules from -- @param rules Rule list to add new rules to -- @return 0 -- */ --static int --load_resource_rulefile(char *filename, resource_rule_t ** rules) --{ -- resource_rule_t *rr = NULL; -- xmlDocPtr doc = NULL; -- xmlXPathContextPtr ctx = NULL; -- int ruleid = 0; -- char *type; -- char base[256]; -- -- doc = read_resource_agent_metadata(filename); -- if (!doc) -- return 0; -- ctx = xmlXPathNewContext(doc); -- -- do { -- /* Look for resource types */ -- snprintf(base, sizeof(base), "/resource-agent[%d]/@name", ++ruleid); -- type = xpath_get_one(doc, ctx, base); -- if (!type) -- break; -- -- if (!strcasecmp(type, "action")) { -- fprintf(stderr, "Error: Resource type '%s' is reserved", type); -- free(type); -- break; -- } -- -- rr = malloc(sizeof(*rr)); -- if (!rr) -- break; -- memset(rr, 0, sizeof(*rr)); -- -- rr->rr_flags = RF_INIT | RF_DESTROY; -- rr->rr_type = type; -- snprintf(base, sizeof(base), "/resource-agent[%d]", ruleid); -- -- /* -- First, grab the global attributes if existent -- */ -- _get_version(doc, ctx, base, rr); -- -- snprintf(base, sizeof(base), "/resource-agent[%d]/special[@tag=\"rgmanager\"]", ruleid); -- _get_maxparents(doc, ctx, base, rr); -- _get_rule_flag(doc, ctx, base, rr, "init_on_add", RF_INIT); -- _get_rule_flag(doc, ctx, base, rr, "destroy_on_delete", RF_DESTROY); -- rr->rr_agent = strdup(filename); -- -- /* -- Second, add the children fields -- */ -- _get_childtypes(doc, ctx, base, rr); -- -- /* -- Get the OCF status check intervals/monitor. 
-- */ -- snprintf(base, sizeof(base), "/resource-agent[%d]/actions", ruleid); -- _get_actions(doc, ctx, base, rr); -- -- /* -- Last, load the attributes from our XML file and their -- respective instantiations from CCS -- */ -- snprintf(base, sizeof(base), "/resource-agent[%d]/parameters", ruleid); -- if (_get_rule_attrs(doc, ctx, base, rr) < 0) { -- destroy_resource_rule(rr); -- rr = NULL; -- } -- -- if (!rr) -- continue; -- -- if (store_rule(rules, rr) != 0) { -- destroy_resource_rule(rr); -- rr = NULL; -- } -- } while (1); -- -- if (ctx) -- xmlXPathFreeContext(ctx); -- if (doc) -- xmlFreeDoc(doc); -- -- return 0; --} -- --/** -- Load all the resource rules we can find from our resource root -- directory. -- -- @param rules Rule list to create/add to -- @return 0 on success, -1 on failure. Sucess does not -- imply any rules have been found; only that no -- errors were encountered. -- */ --int --load_resource_rules(const char *rpath, resource_rule_t ** rules) --{ -- DIR *dir; -- struct dirent *de; -- char *fn, *dot; -- char path[2048]; -- struct stat st_buf; -- -- dir = opendir(rpath); -- if (!dir) -- return -1; -- -- xmlInitParser(); -- while ((de = readdir(dir))) { -- -- fn = basename(de->d_name); -- if (!fn) -- continue; -- -- /* Ignore files with common backup extension */ -- if ((fn != NULL) && (strlen(fn) > 0) && (fn[strlen(fn) - 1] == '~')) -- continue; -- -- /* Ignore hidden files */ -- if (*fn == '.') -- continue; -- -- dot = strrchr(fn, '.'); -- if (dot) { -- /* Ignore RPM installed save files, patches, -- diffs, etc. 
*/ -- if (!strncasecmp(dot, ".rpm", 4)) { -- fprintf(stderr, "Warning: " -- "Ignoring %s/%s: Bad extension %s\n", rpath, de->d_name, dot); -- continue; -- } -- } -- -- snprintf(path, sizeof(path), "%s/%s", rpath, de->d_name); -- -- if (stat(path, &st_buf) < 0) -- continue; -- -- if (S_ISDIR(st_buf.st_mode)) -- continue; -- -- if (st_buf.st_mode & (S_IXUSR | S_IXOTH | S_IXGRP)) { -- //printf("Loading resource rule from %s\n", path); -- load_resource_rulefile(path, rules); -- } -- } -- -- closedir(dir); -- -- return 0; --} -- --/** -- Find a resource rule given its type. -- -- @param rulelist Rule list to search -- @param type Rule type identifier -- @return Resource rule or NULL if not found. -- */ --resource_rule_t * --find_rule_by_type(resource_rule_t ** rulelist, char *type) --{ -- resource_rule_t *curr = NULL; -- -- list_do(rulelist, curr) { -- if (!strcmp(curr->rr_type, type)) -- return curr; -- } -- while (!list_done(rulelist, curr)) ; -- -- return NULL; --} -diff --git a/extra/rgmanager/restree.c b/extra/rgmanager/restree.c -deleted file mode 100644 -index d06be85..0000000 ---- a/extra/rgmanager/restree.c -+++ /dev/null -@@ -1,723 +0,0 @@ --/* -- Copyright Red Hat, Inc. 2004-2006 -- -- This program is free software; you can redistribute it and/or modify it -- under the terms of the GNU General Public License as published by the -- Free Software Foundation; either version 2, or (at your option) any -- later version. -- -- This program is distributed in the hope that it will be useful, but -- WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -- General Public License for more details. -- -- You should have received a copy of the GNU General Public License -- along with this program; see the file COPYING. 
If not, write to the -- Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, -- MA 02110-1301 USA -- -- Fix for #193859 - relocation of a service w/o umounting file-systems -- by Navid Sheikhol-Eslami [ navid at redhat dot com ] --*/ --#include --#include --#include --#include --#include --#include --#include --#include --#include --#include --#include --#include --#include --#include -- --/* XXX from resrules.c */ --int store_childtype(resource_child_t ** childp, char *name, int start, -- int stop, int forbid, int flags); --int _res_op(xmlNode ** xpp, xmlNode * rmp, resource_node_t ** tree, resource_t * first, char *type); --static inline int -- -- --_res_op_internal(xmlNode ** xpp, xmlNode * rmp, resource_node_t ** tree, resource_t * first, -- char *type, resource_node_t * node); -- --/* XXX from reslist.c */ --void *act_dup(resource_act_t * acts); -- --/** -- Fold a resource into an XML node. -- -- @param xpp XML node pp -- @param rmp resources block pp -- @param node Resource tree node we're dealing with -- @param op Operation to perform (stop/start/etc.) -- @param depth OCF Check level/depth -- @return Return value of script. 
-- @see build_env -- */ --static int --res_do_flatten(xmlNode ** xpp, xmlNode * rmp, resource_node_t * node, const char *arg, int depth) --{ -- xmlNode *n, *r; -- resource_attr_t *ra; -- resource_t *res = node->rn_resource; -- char *val; -- char buf[256]; -- int x, y; -- -- n = xmlNewNode(NULL, (xmlChar *) node->rn_resource->r_rule->rr_type); -- -- xmlSetProp(n, (xmlChar *) "rgmanager-meta-agent", -- (xmlChar *) basename(node->rn_resource->r_rule->rr_agent)); -- -- /* Multiple-instance resources must be decomposed into separate -- resources */ -- if (node->rn_resource->r_refs > 1) { -- snprintf(buf, sizeof(buf), "%s_%d", -- primary_attr_value(node->rn_resource), node->rn_resource->r_incarnations); -- ++node->rn_resource->r_incarnations; -- } else { -- snprintf(buf, sizeof(buf), "%s", primary_attr_value(node->rn_resource)); -- } -- -- for (x = 0; node->rn_resource->r_attrs && node->rn_resource->r_attrs[x].ra_name; x++) { -- ra = &node->rn_resource->r_attrs[x]; -- -- if (ra->ra_flags & RA_PRIMARY) { -- xmlSetProp(n, (xmlChar *) ra->ra_name, (xmlChar *) buf); -- } else { -- val = attr_value(node, res->r_attrs[x].ra_name); -- if (!val) -- continue; -- -- for (y = 0; res->r_rule->rr_attrs[y].ra_name; y++) { -- if (strcmp(ra->ra_name, res->r_rule->rr_attrs[y].ra_name)) -- continue; -- -- if (!res->r_rule->rr_attrs[y].ra_value || -- strcmp(val, res->r_rule->rr_attrs[y].ra_value)) -- xmlSetProp(n, (xmlChar *) ra->ra_name, (xmlChar *) val); -- } -- } -- } -- -- if (!*xpp) { -- /* Add top-level container */ -- *xpp = n; -- } else { -- if (!rmp) { -- xmlAddChild(*xpp, n); -- } else { -- r = xmlNewNode(NULL, (xmlChar *) node->rn_resource->r_rule->rr_type); -- xmlSetProp(r, (xmlChar *) "ref", (xmlChar *) primary_attr_value(node->rn_resource)); -- xmlAddChild(rmp, n); -- xmlAddChild(*xpp, r); -- } -- } -- -- return 0; --} -- --static inline void --assign_restart_policy(resource_t * curres, resource_node_t * parent, -- resource_node_t * node, char *base) --{ -- char *val; -- int 
max_restarts = 0; -- time_t restart_expire_time = 0; -- char tok[1024]; -- -- if (!curres || !node) -- return; -- if (parent && !(node->rn_flags & RF_INDEPENDENT)) -- return; -- -- if (node->rn_flags & RF_INDEPENDENT) { -- /* per-resource-node failures / expire times */ -- snprintf(tok, sizeof(tok), "%s/@__max_restarts", base); -- if (conf_get(tok, &val) == 0) { -- max_restarts = atoi(val); -- if (max_restarts <= 0) -- max_restarts = 0; -- free(val); -- } -- -- snprintf(tok, sizeof(tok), "%s/@__restart_expire_time", base); -- if (conf_get(tok, &val) == 0) { -- restart_expire_time = (time_t) expand_time(val); -- if ((int64_t) restart_expire_time <= 0) -- restart_expire_time = 0; -- free(val); -- } -- //if (restart_expire_time == 0 || max_restarts == 0) -- return; -- //goto out_assign; -- } -- -- val = (char *)res_attr_value(curres, "max_restarts"); -- if (!val) -- return; -- max_restarts = atoi(val); -- if (max_restarts <= 0) -- return; -- val = res_attr_value(curres, "restart_expire_time"); -- if (val) { -- restart_expire_time = (time_t) expand_time(val); -- if ((int64_t) restart_expire_time < 0) -- return; -- } --//out_assign: -- return; --} -- --static inline int --do_load_resource(char *base, -- resource_rule_t * rule, -- resource_node_t ** tree, -- resource_t ** reslist, resource_node_t * parent, resource_node_t ** newnode) --{ -- char tok[512]; -- char *ref; -- resource_node_t *node; -- resource_t *curres; -- time_t failure_expire = 0; -- int max_failures = 0; -- -- snprintf(tok, sizeof(tok), "%s/@ref", base); -- -- if (conf_get(tok, &ref) != 0) { -- /* There wasn't an existing resource. 
See if there -- is one defined inline */ -- curres = load_resource(rule, base); -- if (!curres) { -- /* No ref and no new one inline == -- no more of the selected type */ -- return 1; -- } -- -- if (store_resource(reslist, curres) != 0) { -- fprintf(stderr, "Error storing %s resource\n", curres->r_rule->rr_type); -- destroy_resource(curres); -- return -1; -- } -- -- curres->r_flags = RF_INLINE; -- -- } else { -- -- curres = find_resource_by_ref(reslist, rule->rr_type, ref); -- if (!curres) { -- fprintf(stderr, "Error: Reference to nonexistent " -- "resource %s (type %s)\n", ref, rule->rr_type); -- free(ref); -- return -1; -- } -- -- if (curres->r_flags & RF_INLINE) { -- fprintf(stderr, "Error: Reference to inlined " -- "resource %s (type %s) is illegal\n", ref, rule->rr_type); -- free(ref); -- return -1; -- } -- free(ref); -- } -- -- /* Load it if its max refs hasn't been exceeded */ -- if (rule->rr_maxrefs && (curres->r_refs >= rule->rr_maxrefs)) { -- fprintf(stderr, "Warning: Max references exceeded for resource" -- " %s (type %s)\n", curres->r_attrs[0].ra_name, rule->rr_type); -- return -1; -- } -- -- node = malloc(sizeof(*node)); -- if (!node) -- return -1; -- -- memset(node, 0, sizeof(*node)); -- -- //printf("New resource tree node: %s:%s \n", curres->r_rule->rr_type,curres->r_attrs->ra_value); -- -- node->rn_child = NULL; -- node->rn_parent = parent; -- node->rn_resource = curres; -- node->rn_state = RES_STOPPED; -- node->rn_flags = 0; -- node->rn_actions = (resource_act_t *) act_dup(curres->r_actions); -- -- if (parent) { -- /* Independent subtree / non-critical for top-level is -- * not useful and can interfere with restart thresholds for -- * non critical resources */ -- snprintf(tok, sizeof(tok), "%s/@__independent_subtree", base); -- if (conf_get(tok, &ref) == 0) { -- if (atoi(ref) == 1 || strcasecmp(ref, "yes") == 0) -- node->rn_flags |= RF_INDEPENDENT; -- if (atoi(ref) == 2 || strcasecmp(ref, "non-critical") == 0) { -- curres->r_flags |= 
RF_NON_CRITICAL; -- } -- free(ref); -- } -- } -- -- snprintf(tok, sizeof(tok), "%s/@__enforce_timeouts", base); -- if (conf_get(tok, &ref) == 0) { -- if (atoi(ref) > 0 || strcasecmp(ref, "yes") == 0) -- node->rn_flags |= RF_ENFORCE_TIMEOUTS; -- free(ref); -- } -- -- /* per-resource-node failures / expire times */ -- snprintf(tok, sizeof(tok), "%s/@__max_failures", base); -- if (conf_get(tok, &ref) == 0) { -- max_failures = atoi(ref); -- if (max_failures < 0) -- max_failures = 0; -- free(ref); -- } -- -- snprintf(tok, sizeof(tok), "%s/@__failure_expire_time", base); -- if (conf_get(tok, &ref) == 0) { -- failure_expire = (time_t) expand_time(ref); -- if ((int64_t) failure_expire < 0) -- failure_expire = 0; -- free(ref); -- } -- -- if (max_failures && failure_expire) { -- /* -- node->rn_failure_counter = restart_init(failure_expire, -- max_failures); -- */ -- } -- -- curres->r_refs++; -- -- if (curres->r_refs > 1 && (curres->r_flags & RF_NON_CRITICAL)) { -- res_build_name(tok, sizeof(tok), curres); -- fprintf(stderr, "Non-critical flag for %s is being cleared due to multiple references.\n", -- tok); -- curres->r_flags &= ~RF_NON_CRITICAL; -- } -- -- if (curres->r_flags & RF_NON_CRITICAL) { -- /* Independent subtree is implied if a -- * resource is non-critical -- */ -- node->rn_flags |= RF_NON_CRITICAL | RF_INDEPENDENT; -- -- } -- -- assign_restart_policy(curres, parent, node, base); -- -- *newnode = node; -- -- list_insert(tree, node); -- -- return 0; --} -- --/** -- Build the resource tree. If a new resource is defined inline, add it to -- the resource list. All rules, however, must have already been read in. -- -- @param tree Tree to modify/insert on to -- @param parent Parent node, if one exists. -- @param rule Rule surrounding the new node -- @param rulelist List of all rules allowed in the tree. -- @param reslist List of all currently defined resources -- @param base Base CCS path. 
-- @see destroy_resource_tree -- */ --#define RFL_FOUND 0x1 --#define RFL_FORBID 0x2 --static int --build_tree(resource_node_t ** tree, -- resource_node_t * parent, -- resource_rule_t * rule, resource_rule_t ** rulelist, resource_t ** reslist, char *base) --{ -- char tok[512]; -- resource_rule_t *childrule; -- resource_node_t *node; -- char *ref; -- char *tmp; -- int ccount = 0, x = 0, y = 0, flags = 0; -- -- //printf("DESCEND: %s / %s\n", rule?rule->rr_type:"(none)", base); -- -- /* Pass 1: typed / defined children */ -- for (y = 0; rule && rule->rr_childtypes && rule->rr_childtypes[y].rc_name; y++) { -- -- flags = 0; -- list_for(rulelist, childrule, x) { -- if (strcmp(rule->rr_childtypes[y].rc_name, childrule->rr_type)) -- continue; -- -- flags |= RFL_FOUND; -- -- if (rule->rr_childtypes[y].rc_forbid) -- flags |= RFL_FORBID; -- -- break; -- } -- -- if (flags & RFL_FORBID) -- /* Allow all *but* forbidden */ -- continue; -- -- if (!(flags & RFL_FOUND)) -- /* Not found? Wait for pass 2 */ -- continue; -- -- //printf("looking for %s %s @ %s\n", -- //rule->rr_childtypes[y].rc_name, -- //childrule->rr_type, base); -- for (x = 1;; x++) { -- -- /* Search for base/type[x]/@ref - reference an existing -- resource */ -- snprintf(tok, sizeof(tok), "%s/%s[%d]", base, childrule->rr_type, x); -- -- flags = 1; -- switch (do_load_resource(tok, childrule, tree, reslist, parent, &node)) { -- case -1: -- continue; -- case 1: -- /* 1 == no more */ -- //printf("No resource found @ %s\n", tok); -- flags = 0; -- break; -- case 0: -- break; -- } -- if (!flags) -- break; -- -- /* Got a child :: bump count */ -- snprintf(tok, sizeof(tok), "%s/%s[%d]", base, childrule->rr_type, x); -- -- /* Kaboom */ -- build_tree(&node->rn_child, node, childrule, rulelist, reslist, tok); -- -- } -- } -- -- /* Pass 2: untyped children */ -- for (ccount = 1;; ccount++) { -- snprintf(tok, sizeof(tok), "%s/child::*[%d]", base, ccount); -- -- if (conf_get(tok, &ref) != 0) { -- /* End of the line. 
*/ -- //printf("End of the line: %s\n", tok); -- break; -- } -- -- tmp = strchr(ref, '='); -- if (tmp) { -- *tmp = 0; -- } else { -- /* no = sign... bad */ -- free(ref); -- continue; -- } -- -- /* Find the resource rule */ -- flags = 0; -- list_for(rulelist, childrule, x) { -- if (!strcasecmp(childrule->rr_type, ref)) { -- /* Ok, matching rule found */ -- flags = 1; -- break; -- } -- } -- /* No resource rule matching the child? Press on... */ -- if (!flags) { -- free(ref); -- continue; -- } -- -- flags = 0; -- /* Don't descend on anything we should have already picked -- up on in the above loop */ -- for (y = 0; rule && rule->rr_childtypes && rule->rr_childtypes[y].rc_name; y++) { -- /* SKIP defined child types of any type */ -- if (strcmp(rule->rr_childtypes[y].rc_name, ref)) -- continue; -- if (rule->rr_childtypes[y].rc_flags == 0) { -- /* 2 = defined as a real child */ -- flags = 2; -- break; -- } -- -- flags = 1; -- break; -- } -- -- free(ref); -- if (flags == 2) -- continue; -- -- x = 1; -- switch (do_load_resource(tok, childrule, tree, reslist, parent, &node)) { -- case -1: -- continue; -- case 1: -- /* no more found */ -- x = 0; -- fprintf(stderr, "No resource found @ %s\n", tok); -- break; -- case 0: -- /* another is found */ -- break; -- } -- if (!x) /* no more found */ -- break; -- -- /* childrule = rule set of this child at this point */ -- /* tok = set above; if we got this far, we're all set */ -- /* Kaboom */ -- -- build_tree(&node->rn_child, node, childrule, rulelist, reslist, tok); -- } -- -- //printf("ASCEND: %s / %s\n", rule?rule->rr_type:"(none)", base); -- return 0; --} -- --/** -- Set up to call build_tree. Hides the nastiness from the user. -- -- @param tree Tree pointer. Should start as a pointer to NULL. 
-- @param rulelist List of all rules allowed -- @param reslist List of all currently defined resources -- @return 0 -- @see build_tree destroy_resource_tree -- */ --int --build_resource_tree(resource_node_t ** tree, resource_rule_t ** rulelist, resource_t ** reslist) --{ -- resource_node_t *root = NULL; -- char tok[512]; -- -- snprintf(tok, sizeof(tok), "%s", RESOURCE_TREE_ROOT); -- -- /* Find and build the list of root nodes */ -- build_tree(&root, NULL, NULL /*curr */ , rulelist, reslist, tok); -- -- if (root) -- *tree = root; -- -- return 0; --} -- --/** -- Deconstruct a resource tree. -- -- @param tree Tree to obliterate. -- @see build_resource_tree -- */ --void --destroy_resource_tree(resource_node_t ** tree) --{ -- resource_node_t *node; -- -- while ((node = *tree)) { -- if ((*tree)->rn_child) -- destroy_resource_tree(&(*tree)->rn_child); -- -- list_remove(tree, node); -- -- if (node->rn_actions) { -- free(node->rn_actions); -- } -- free(node); -- } --} -- --static inline int --_do_child_levels(xmlNode ** xpp, xmlNode * rmp, resource_node_t ** tree, resource_t * first) --{ -- resource_node_t *node = *tree; -- resource_t *res = node->rn_resource; -- resource_rule_t *rule = res->r_rule; -- int l, lev, x, rv = 0; -- -- for (l = 1; l <= RESOURCE_MAX_LEVELS; l++) { -- -- for (x = 0; rule->rr_childtypes && rule->rr_childtypes[x].rc_name; x++) { -- -- lev = rule->rr_childtypes[x].rc_startlevel; -- -- if (!lev || lev != l) -- continue; -- -- /* Do op on all children at our level */ -- rv |= _res_op(xpp, rmp, &node->rn_child, first, rule->rr_childtypes[x].rc_name); -- -- if (rv & SFL_FAILURE) -- return rv; -- } -- -- if (rv != 0) -- return rv; -- } -- -- return rv; --} -- --static inline int --_xx_child_internal(xmlNode ** xpp, xmlNode * rmp, resource_node_t * node, resource_t * first, -- resource_node_t * child) --{ -- int x; -- resource_rule_t *rule = node->rn_resource->r_rule; -- -- for (x = 0; rule->rr_childtypes && rule->rr_childtypes[x].rc_name; x++) { -- if 
(!strcmp(child->rn_resource->r_rule->rr_type, rule->rr_childtypes[x].rc_name)) { -- if (rule->rr_childtypes[x].rc_startlevel || rule->rr_childtypes[x].rc_stoplevel) { -- return 0; -- } -- } -- } -- -- return _res_op_internal(xpp, rmp, &child, first, child->rn_resource->r_rule->rr_type, child); --} -- --static inline int --_do_child_default_level(xmlNode ** xpp, xmlNode * rmp, resource_node_t ** tree, resource_t * first) --{ -- resource_node_t *node = *tree, *child; -- int y, rv = 0; -- -- list_for(&node->rn_child, child, y) { -- rv |= _xx_child_internal(xpp, rmp, node, first, child); -- -- if (rv & SFL_FAILURE) -- return rv; -- } -- -- return rv; --} -- --/** -- Nasty codependent function. Perform an operation by numerical level -- at some point in the tree. This allows indirectly-dependent resources -- (such as IP addresses and user scripts) to have ordering without requiring -- a direct dependency. -- -- @param tree Resource tree to search/perform operations on -- @param first Resource we're looking to perform the operation on, -- if one exists. -- @param ret Unused, but will be used to store status information -- such as resources consumed, etc, in the future. -- @param op Operation to perform if either first is found, -- or no first is declared (in which case, all nodes -- in the subtree). -- @see _res_op res_exec -- */ --static int --_res_op_by_level(xmlNode ** xpp, xmlNode * rmp, resource_node_t ** tree, resource_t * first) --{ -- resource_node_t *node = *tree; -- resource_t *res = node->rn_resource; -- resource_rule_t *rule = res->r_rule; -- int rv = 0; -- -- if (!rule->rr_childtypes) -- return _res_op(xpp, rmp, &node->rn_child, first, NULL); -- -- rv |= _do_child_levels(xpp, rmp, tree, first); -- if (rv & SFL_FAILURE) -- return rv; -- -- /* default level after specified ones */ -- rv |= _do_child_default_level(xpp, rmp, tree, first); -- -- return rv; --} -- --/** -- Nasty codependent function. 
Perform an operation by type for all siblings -- at some point in the tree. This allows indirectly-dependent resources -- (such as IP addresses and user scripts) to have ordering without requiring -- a direct dependency. -- -- @param tree Resource tree to search/perform operations on -- @param first Resource we're looking to perform the operation on, -- if one exists. -- @param type Type to look for. -- @see _res_op_by_level res_exec -- */ --static inline int --_res_op_internal(xmlNode ** xpp, xmlNode * rmp, -- resource_node_t __attribute__ ((unused)) ** tree, -- resource_t * first, char *type, resource_node_t * node) --{ -- int rv = 0, me; -- -- /* Restore default operation. */ -- -- /* If we're starting by type, do that funky thing. */ -- if (type && strlen(type) && strcmp(node->rn_resource->r_rule->rr_type, type)) -- return 0; -- -- /* If the resource is found, all nodes in the subtree must -- have the operation performed as well. */ -- me = !first || (node->rn_resource == first); -- -- /* Start starts before children */ -- if (me) { -- -- rv = res_do_flatten(xpp, rmp, node, NULL, 0); -- -- } -- -- if (node->rn_child) { -- rv |= _res_op_by_level(xpp, rmp, &node, me ? NULL : first); -- } -- -- return rv; --} -- --/** -- Nasty codependent function. Perform an operation by type for all siblings -- at some point in the tree. This allows indirectly-dependent resources -- (such as IP addresses and user scripts) to have ordering without requiring -- a direct dependency. -- -- @param tree Resource tree to search/perform operations on -- @param first Resource we're looking to perform the operation on, -- if one exists. -- @param type Type to look for. 
-- @see _res_op_by_level res_exec -- */ --int --_res_op(xmlNode ** xpp, xmlNode * rmp, resource_node_t ** tree, resource_t * first, char *type) --{ -- resource_node_t *node; -- int count = 0, rv = 0; -- -- list_for(tree, node, count) { -- rv |= _res_op_internal(xpp, rmp, tree, first, type, node); -- -- if (rv & SFL_FAILURE) -- return rv; -- } -- -- return rv; --} -- --/** -- Flatten resources for a service and return the pointer to it. -- -- @param tree Tree to search for our resource. -- @param res Resource to start/stop -- @param ret Unused -- */ --int --res_flatten(xmlNode ** xpp, xmlNode * rmp, resource_node_t ** tree, resource_t * res) --{ -- return _res_op(xpp, rmp, tree, res, NULL); --} -diff --git a/extra/rgmanager/tests/test1.conf b/extra/rgmanager/tests/test1.conf -deleted file mode 100644 -index 4b0354f..0000000 ---- a/extra/rgmanager/tests/test1.conf -+++ /dev/null -@@ -1,38 +0,0 @@ -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --