diff --git a/SOURCES/0039-prevent-segfault-when-logging.patch b/SOURCES/0039-prevent-segfault-when-logging.patch
new file mode 100644
index 0000000..5764033
--- /dev/null
+++ b/SOURCES/0039-prevent-segfault-when-logging.patch
@@ -0,0 +1,25 @@
+From 6c495a49d444404d0ed3fe910ace58befd2db8dc Mon Sep 17 00:00:00 2001
+From: Andrew Beekhof
+Date: Wed, 18 Nov 2015 08:48:57 +1100
+Subject: [PATCH] Fix: systemd: Prevent segfault when logging failed operations
+
+---
+ lib/services/systemd.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/lib/services/systemd.c b/lib/services/systemd.c
+index 3d5a600..a851bc6 100644
+--- a/lib/services/systemd.c
++++ b/lib/services/systemd.c
+@@ -442,7 +442,7 @@ systemd_exec_result(DBusMessage *reply, svc_action_t *op)
+ 
+         /* ignore "already started" or "not running" errors */
+         if (!systemd_mask_error(op, error.name)) {
+-            crm_err("Could not issue %s for %s: %s (%s)", op->action, op->rsc, error.message);
++            crm_err("Could not issue %s for %s: %s", op->action, op->rsc, error.message);
+         }
+ 
+     } else {
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0040-update-top-format-in-HealthCPU.patch b/SOURCES/0040-update-top-format-in-HealthCPU.patch
new file mode 100644
index 0000000..e4be29a
--- /dev/null
+++ b/SOURCES/0040-update-top-format-in-HealthCPU.patch
@@ -0,0 +1,27 @@
+From e8b884997f2d49871c6a19b36095a66e377f54e4 Mon Sep 17 00:00:00 2001
+From: Ken Gaillot
+Date: Thu, 3 Dec 2015 15:29:14 -0600
+Subject: [PATCH] Fix: resources: allow for top output with or without percent
+ sign in HealthCPU
+
+Problem found and patch provided by Malcome Cowe.
+---
+ extra/resources/HealthCPU | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/extra/resources/HealthCPU b/extra/resources/HealthCPU
+index d320320..1ceaa01 100644
+--- a/extra/resources/HealthCPU
++++ b/extra/resources/HealthCPU
+@@ -136,7 +136,7 @@ dummy_monitor() {
+ 
+     if [ -f ${OCF_RESKEY_state} ]; then
+ 
+-        IDLE=`top -b -n2 | grep Cpu | tail -1 | awk -F",|\.[0-9]%id" '{ print $4 }'`
++        IDLE=`top -b -n2 | grep Cpu | tail -1 | awk -F",|.[0-9][ %]id" '{ print $4 }'`
+         # echo "System idle: " $IDLE
+         # echo "$OCF_RESKEY_red_limit"
+         # echo $OCF_RESKEY_yellow_limit
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0041-delete-fence-attributes-correctly.patch b/SOURCES/0041-delete-fence-attributes-correctly.patch
new file mode 100644
index 0000000..06be3a6
--- /dev/null
+++ b/SOURCES/0041-delete-fence-attributes-correctly.patch
@@ -0,0 +1,32 @@
+From 98e69e033835b3d4dfdc8c9cabacae28770725f1 Mon Sep 17 00:00:00 2001
+From: Klaus Wenninger
+Date: Wed, 9 Dec 2015 15:01:25 +0100
+Subject: [PATCH] Fix RHBZ#1287315: stonithd: Trigger cib_devices_update in
+ case of deletion of just an attribute
+
+---
+ fencing/main.c | 7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+diff --git a/fencing/main.c b/fencing/main.c
+index e9831f0..0dc4492 100644
+--- a/fencing/main.c
++++ b/fencing/main.c
+@@ -796,8 +796,13 @@ update_cib_stonith_devices_v2(const char *event, xmlNode * msg)
+     } else if(safe_str_eq(op, "delete") && strstr(xpath, XML_CIB_TAG_RESOURCE)) {
+         const char *rsc_id = NULL;
+         char *search = NULL;
+-        char *mutable = strdup(xpath);
++        char *mutable = NULL;
+ 
++        if (strstr(xpath, XML_TAG_ATTR_SETS)) {
++            needs_update = TRUE;
++            break;
++        }
++        mutable = strdup(xpath);
+         rsc_id = strstr(mutable, "primitive[@id=\'") + strlen("primitive[@id=\'");
+         search = strchr(rsc_id, '\'');
+         search[0] = 0;
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0042-handle-systemd-shutdown.patch b/SOURCES/0042-handle-systemd-shutdown.patch
new file mode 100644
index 0000000..9b7a51c
--- /dev/null
+++ b/SOURCES/0042-handle-systemd-shutdown.patch
@@ -0,0 +1,55 @@
+From 6aae8542abedc755b90c8c49aa5c429718fd12f1 Mon Sep 17 00:00:00 2001
+From: Klaus Wenninger
+Date: Tue, 12 Jan 2016 15:46:26 +0100
+Subject: [PATCH] Fix RHBZ#1286316: Do an ordered shutdown of systemd
+ resources: have lrmd wait till systemd actually starts bringing down
+ systemd resources instead of being confused if the service is still
+ active on the first status; send a reload to systemd whenever a unitfile
+ is changed instead of doing this just with every 10th change
+
+---
+ lib/services/systemd.c | 11 ++++-------
+ lrmd/lrmd.c            |  2 ++
+ 2 files changed, 6 insertions(+), 7 deletions(-)
+
+diff --git a/lib/services/systemd.c b/lib/services/systemd.c
+index a851bc6..eb5f8aa 100644
+--- a/lib/services/systemd.c
++++ b/lib/services/systemd.c
+@@ -150,16 +150,13 @@ systemd_daemon_reload(int timeout)
+ {
+     static unsigned int reload_count = 0;
+     const char *method = "Reload";
+-
++    DBusMessage *msg = systemd_new_method(BUS_NAME".Manager", method);
+ 
+     reload_count++;
+-    if(reload_count % 10 == 0) {
+-        DBusMessage *msg = systemd_new_method(BUS_NAME".Manager", method);
++    CRM_ASSERT(msg != NULL);
++    pcmk_dbus_send(msg, systemd_proxy, systemd_daemon_reload_complete, GUINT_TO_POINTER(reload_count), timeout);
++    dbus_message_unref(msg);
+ 
+-        CRM_ASSERT(msg != NULL);
+-        pcmk_dbus_send(msg, systemd_proxy, systemd_daemon_reload_complete, GUINT_TO_POINTER(reload_count), timeout);
+-        dbus_message_unref(msg);
+-    }
+     return TRUE;
+ }
+ 
+diff --git a/lrmd/lrmd.c b/lrmd/lrmd.c
+index a64b430..518d5d1 100644
+--- a/lrmd/lrmd.c
++++ b/lrmd/lrmd.c
+@@ -900,6 +900,8 @@ action_complete(svc_action_t * action)
+         /* Ok, so this is the follow up monitor action to check if start actually completed */
+         if(cmd->lrmd_op_status == PCMK_LRM_OP_DONE && cmd->exec_rc == PCMK_OCF_PENDING) {
+             goagain = true;
++        } else if(cmd->exec_rc == PCMK_OCF_OK && safe_str_eq(cmd->real_action, "stop")) {
++            goagain = true;
+ 
+         } else {
+ #ifdef HAVE_SYS_TIMEB_H
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0043-cts-fix-for-command-lines.patch b/SOURCES/0043-cts-fix-for-command-lines.patch
new file mode 100644
index 0000000..120f87d
--- /dev/null
+++ b/SOURCES/0043-cts-fix-for-command-lines.patch
@@ -0,0 +1,125 @@
+From 94ebc967f2e74301ef5e10ed102832168503c7d9 Mon Sep 17 00:00:00 2001
+From: Ken Gaillot
+Date: Thu, 1 Oct 2015 12:00:26 -0500
+Subject: [PATCH] Test: CTS: get Reattach test working again and up-to-date
+
+Previously, Reattach relied on command lines being logged,
+which 8dae683 removed. Now, it doesn't.
+
+Previously, Reattach used the now-deprecated is-managed-default cluster option;
+now, it uses the is-managed option in rsc_defaults.
+---
+ cts/CTStests.py | 59 +++++++++++++++++++++++++++-----------------------------
+ 1 file changed, 29 insertions(+), 30 deletions(-)
+
+diff --git a/cts/CTStests.py b/cts/CTStests.py
+index ddd8c4a..e4207aa 100644
+--- a/cts/CTStests.py
++++ b/cts/CTStests.py
+@@ -1693,6 +1693,19 @@ class Reattach(CTSTest):
+         self.stopall = SimulStopLite(cm)
+         self.is_unsafe = 0 # Handled by canrunnow()
+ 
++    def _is_managed(self, node):
++        is_managed = self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -Q -G -d true", 1)
++        is_managed = is_managed[:-1] # Strip off the newline
++        return is_managed == "true"
++
++    def _set_unmanaged(self, node):
++        self.debug("Disable resource management")
++        self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -v false")
++
++    def _set_managed(self, node):
++        self.debug("Re-enable resource management")
++        self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -D")
++
+     def setup(self, node):
+         attempt = 0
+         if not self.startall(None):
+@@ -1717,17 +1730,11 @@ class Reattach(CTSTest):
+         start = StartTest(self.CM)
+         start(node)
+ 
+-        is_managed = self.rsh(node, "crm_attribute -Q -G -t crm_config -n is-managed-default -d true", 1)
+-        is_managed = is_managed[:-1] # Strip off the newline
+-        if is_managed != "true":
+-            self.logger.log("Attempting to re-enable resource management on %s (%s)" % (node, is_managed))
+-            managed = self.create_watch(["is-managed-default"], 60)
+-            managed.setwatch()
+-
+-            self.rsh(node, "crm_attribute -V -D -n is-managed-default")
+-
+-            if not managed.lookforall():
+-                self.logger.log("Patterns not found: " + repr(managed.unmatched))
++        if not self._is_managed(node):
++            self.logger.log("Attempting to re-enable resource management on %s" % node)
++            self._set_managed(node)
++            self.CM.cluster_stable()
++            if not self._is_managed(node):
+                 self.logger.log("Could not re-enable resource management")
+                 return 0
+ 
+@@ -1744,11 +1751,12 @@ class Reattach(CTSTest):
+         self.incr("calls")
+ 
+         pats = []
+-        managed = self.create_watch(["is-managed-default"], 60)
++        # Conveniently, pengine will display this message when disabling management,
++        # even if fencing is not enabled, so we can rely on it.
++        managed = self.create_watch(["Delaying fencing operations"], 60)
+         managed.setwatch()
+ 
+-        self.debug("Disable resource management")
+-        self.rsh(node, "crm_attribute -V -n is-managed-default -v false")
++        self._set_unmanaged(node)
+ 
+         if not managed.lookforall():
+             self.logger.log("Patterns not found: " + repr(managed.unmatched))
+@@ -1767,37 +1775,28 @@ class Reattach(CTSTest):
+         self.debug("Shutting down the cluster")
+         ret = self.stopall(None)
+         if not ret:
+-            self.debug("Re-enable resource management")
+-            self.rsh(node, "crm_attribute -V -D -n is-managed-default")
++            self._set_managed(node)
+             return self.failure("Couldn't shut down the cluster")
+ 
+         self.debug("Bringing the cluster back up")
+         ret = self.startall(None)
+         time.sleep(5) # allow ping to update the CIB
+         if not ret:
+-            self.debug("Re-enable resource management")
+-            self.rsh(node, "crm_attribute -V -D -n is-managed-default")
++            self._set_managed(node)
+             return self.failure("Couldn't restart the cluster")
+ 
+         if self.local_badnews("ResourceActivity:", watch):
+-            self.debug("Re-enable resource management")
+-            self.rsh(node, "crm_attribute -V -D -n is-managed-default")
++            self._set_managed(node)
+             return self.failure("Resources stopped or started during cluster restart")
+ 
+         watch = self.create_watch(pats, 60, "StartupActivity")
+         watch.setwatch()
+ 
+-        managed = self.create_watch(["is-managed-default"], 60)
+-        managed.setwatch()
+-
+-        self.debug("Re-enable resource management")
+-        self.rsh(node, "crm_attribute -V -D -n is-managed-default")
+-
+-        if not managed.lookforall():
+-            self.logger.log("Patterns not found: " + repr(managed.unmatched))
+-            return self.failure("Resource management not enabled")
+-
++        # Re-enable resource management (and verify it happened).
++        self._set_managed(node)
+         self.CM.cluster_stable()
++        if not self._is_managed(node):
++            return self.failure("Could not re-enable resource management")
+ 
+         # Ignore actions for STONITH resources
+         ignore = []
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0100-Refactor-lrmd-handle-shutdown-a-little-more-cleanly.patch b/SOURCES/0100-Refactor-lrmd-handle-shutdown-a-little-more-cleanly.patch
new file mode 100644
index 0000000..faf4146
--- /dev/null
+++ b/SOURCES/0100-Refactor-lrmd-handle-shutdown-a-little-more-cleanly.patch
@@ -0,0 +1,71 @@
+From f289115b5a3693934bb3140725e2dc9aef3a6a13 Mon Sep 17 00:00:00 2001
+From: Ken Gaillot
+Date: Tue, 22 Dec 2015 12:24:14 -0600
+Subject: [PATCH] Refactor: lrmd: handle shutdown a little more cleanly
+
+---
+ lrmd/main.c | 33 +++++++++++++++++----------------
+ 1 file changed, 17 insertions(+), 16 deletions(-)
+
+diff --git a/lrmd/main.c b/lrmd/main.c
+index a3b7929..73519e2 100644
+--- a/lrmd/main.c
++++ b/lrmd/main.c
+@@ -231,9 +231,23 @@ void
+ lrmd_shutdown(int nsig)
+ {
+     crm_info("Terminating with %d clients", crm_hash_table_size(client_connections));
++
++    if (stonith_api) {
++        stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT);
++        stonith_api->cmds->disconnect(stonith_api);
++        stonith_api_delete(stonith_api);
++    }
+     if (ipcs) {
+         mainloop_del_ipc_server(ipcs);
+     }
++
++#ifdef ENABLE_PCMK_REMOTE
++    lrmd_tls_server_destroy();
++    ipc_proxy_cleanup();
++#endif
++
++    crm_client_cleanup();
++    g_hash_table_destroy(rsc_list);
+     crm_exit(pcmk_ok);
+ }
+ 
+@@ -255,7 +269,6 @@ static struct crm_option long_options[] = {
+ int
+ main(int argc, char **argv)
+ {
+-    int rc = 0;
+     int flag = 0;
+     int index = 0;
+     const char *option = NULL;
+@@ -349,19 +362,7 @@ main(int argc, char **argv)
+     crm_info("Starting");
+     g_main_run(mainloop);
+ 
+-    mainloop_del_ipc_server(ipcs);
+-#ifdef ENABLE_PCMK_REMOTE
+-    lrmd_tls_server_destroy();
+-    ipc_proxy_cleanup();
+-#endif
+-    crm_client_cleanup();
+-
+-    g_hash_table_destroy(rsc_list);
+-
+-    if (stonith_api) {
+-        stonith_api->cmds->disconnect(stonith_api);
+-        stonith_api_delete(stonith_api);
+-    }
+-
+-    return rc;
++    /* should never get here */
++    lrmd_shutdown(SIGTERM);
++    return pcmk_ok;
+ }
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0101-Refactor-lrmd-make-proxied-IPC-providers-clients-opa.patch b/SOURCES/0101-Refactor-lrmd-make-proxied-IPC-providers-clients-opa.patch
new file mode 100644
index 0000000..43219a8
--- /dev/null
+++ b/SOURCES/0101-Refactor-lrmd-make-proxied-IPC-providers-clients-opa.patch
@@ -0,0 +1,111 @@
+From 68e7bb19d69a999443524ba79203979b35f54e83 Mon Sep 17 00:00:00 2001
+From: Ken Gaillot
+Date: Tue, 22 Dec 2015 11:41:56 -0600
+Subject: [PATCH 101/105] Refactor: lrmd: make proxied IPC providers/clients
+ opaque
+
+This removes an unused extern declaration in crmd.h,
+makes the ipc_providers and ipc_clients tables static to ipc_proxy.c,
+and adds an ipc_proxy_get_provider() function for future use.
+---
+ crmd/crmd.h         |  1 -
+ lrmd/ipc_proxy.c    | 48 ++++++++++++++++++++++++++++------------------
+ lrmd/lrmd_private.h |  1 +
+ 3 files changed, 31 insertions(+), 19 deletions(-)
+
+diff --git a/crmd/crmd.h b/crmd/crmd.h
+index 031f414..6039c85 100644
+--- a/crmd/crmd.h
++++ b/crmd/crmd.h
+@@ -24,7 +24,6 @@
+ # define DAEMON_DEBUG DEVEL_DIR"/"SYS_NAME".debug"
+ 
+ extern GMainLoop *crmd_mainloop;
+-extern GHashTable *ipc_clients;
+ extern bool no_quorum_suicide_escalation;
+ 
+ extern void crmd_metadata(void);
+diff --git a/lrmd/ipc_proxy.c b/lrmd/ipc_proxy.c
+index 84fb3ec..d95a396 100644
+--- a/lrmd/ipc_proxy.c
++++ b/lrmd/ipc_proxy.c
+@@ -42,34 +42,46 @@ static qb_ipcs_service_t *crmd_ipcs = NULL;
+ static qb_ipcs_service_t *stonith_ipcs = NULL;
+ 
+ /* ipc providers == crmd clients connecting from cluster nodes */
+-GHashTable *ipc_providers;
++static GHashTable *ipc_providers = NULL;
+ /* ipc clients == things like cibadmin, crm_resource, connecting locally */
+-GHashTable *ipc_clients;
++static GHashTable *ipc_clients = NULL;
++
++/*!
++ * \internal
++ * \brief Get an IPC proxy provider
++ *
++ * \return Pointer to a provider if one exists, NULL otherwise
++ *
++ * \note Grab the first provider available; any provider will work, and usually
++ *       there will be only one. These are client connections originating from a
++ *       cluster node's crmd.
++ */
++crm_client_t *
++ipc_proxy_get_provider()
++{
++    if (ipc_providers) {
++        GHashTableIter iter;
++        gpointer key = NULL;
++        gpointer value = NULL;
++
++        g_hash_table_iter_init(&iter, ipc_providers);
++        if (g_hash_table_iter_next(&iter, &key, &value)) {
++            return (crm_client_t*)value;
++        }
++    }
++    return NULL;
++}
+ 
+ static int32_t
+ ipc_proxy_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid, const char *ipc_channel)
+ {
+-    void *key = NULL;
+-    void *value = NULL;
+     crm_client_t *client;
+-    crm_client_t *ipc_proxy = NULL;
+-    GHashTableIter iter;
++    crm_client_t *ipc_proxy = ipc_proxy_get_provider();
+     xmlNode *msg;
+ 
+     crm_trace("Connection %p on channel %s", c, ipc_channel);
+ 
+-    if (g_hash_table_size(ipc_providers) == 0) {
+-        crm_err("No ipc providers available for uid %d gid %d", uid, gid);
+-        return -EREMOTEIO;
+-    }
+-
+-    g_hash_table_iter_init(&iter, ipc_providers);
+-    if (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
+-        /* grab the first provider available, any provider in this
+-         * table will work. Usually there will only be one. These are
+-         * lrmd client connections originating for a cluster node's crmd. */
+-        ipc_proxy = value;
+-    } else {
++    if (ipc_proxy == NULL) {
+         crm_err("No ipc providers available for uid %d gid %d", uid, gid);
+         return -EREMOTEIO;
+     }
+diff --git a/lrmd/lrmd_private.h b/lrmd/lrmd_private.h
+index ddb1506..52f79b8 100644
+--- a/lrmd/lrmd_private.h
++++ b/lrmd/lrmd_private.h
+@@ -103,6 +103,7 @@ void ipc_proxy_cleanup(void);
+ void ipc_proxy_add_provider(crm_client_t *client);
+ void ipc_proxy_remove_provider(crm_client_t *client);
+ void ipc_proxy_forward_client(crm_client_t *client, xmlNode *xml);
++crm_client_t *ipc_proxy_get_provider(void);
+ #endif
+ 
+ #endif
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0102-Refactor-crmd-lrmd-liblrmd-use-defined-constants-for.patch b/SOURCES/0102-Refactor-crmd-lrmd-liblrmd-use-defined-constants-for.patch
new file mode 100644
index 0000000..b22ffd3
--- /dev/null
+++ b/SOURCES/0102-Refactor-crmd-lrmd-liblrmd-use-defined-constants-for.patch
@@ -0,0 +1,181 @@
+From 6239d1dd84a50585a30175978be7e6d8ffb0b155 Mon Sep 17 00:00:00 2001
+From: Ken Gaillot
+Date: Tue, 22 Dec 2015 15:59:21 -0600
+Subject: [PATCH 102/105] Refactor: crmd,lrmd,liblrmd: use defined constants
+ for lrmd IPC operations
+
+Reduces chance of typos.
+---
+ crmd/lrm_state.c        |  6 +++---
+ include/crm/lrmd.h      |  6 ++++++
+ lib/lrmd/proxy_common.c |  6 +++---
+ lrmd/ipc_proxy.c        | 14 +++++++-------
+ lrmd/remote_ctl.c       |  6 +++---
+ 5 files changed, 22 insertions(+), 16 deletions(-)
+
+diff --git a/crmd/lrm_state.c b/crmd/lrm_state.c
+index 0e52ff6..497d3f9 100644
+--- a/crmd/lrm_state.c
++++ b/crmd/lrm_state.c
+@@ -481,7 +481,7 @@ remote_proxy_cb(lrmd_t *lrmd, void *userdata, xmlNode *msg)
+     crm_element_value_int(msg, F_LRMD_IPC_MSG_ID, &msg_id);
+ 
+     /* This is msg from remote ipc client going to real ipc server */
+-    if (safe_str_eq(op, "new")) {
++    if (safe_str_eq(op, LRMD_IPC_OP_NEW)) {
+         const char *channel = crm_element_value(msg, F_LRMD_IPC_IPC_SERVER);
+ 
+         CRM_CHECK(channel != NULL, return);
+@@ -490,10 +490,10 @@ remote_proxy_cb(lrmd_t *lrmd, void *userdata, xmlNode *msg)
+             remote_proxy_notify_destroy(lrmd, session);
+         }
+         crm_trace("new remote proxy client established to %s, session id %s", channel, session);
+-    } else if (safe_str_eq(op, "destroy")) {
++    } else if (safe_str_eq(op, LRMD_IPC_OP_DESTROY)) {
+         remote_proxy_end_session(session);
+ 
+-    } else if (safe_str_eq(op, "request")) {
++    } else if (safe_str_eq(op, LRMD_IPC_OP_REQUEST)) {
+         int flags = 0;
+         xmlNode *request = get_message_xml(msg, F_LRMD_IPC_MSG);
+         const char *name = crm_element_value(msg, F_LRMD_IPC_CLIENT);
+diff --git a/include/crm/lrmd.h b/include/crm/lrmd.h
+index 5a3c6ce..5c74798 100644
+--- a/include/crm/lrmd.h
++++ b/include/crm/lrmd.h
+@@ -90,6 +90,12 @@ typedef struct lrmd_key_value_s {
+ #define LRMD_OP_POKE "lrmd_rsc_poke"
+ #define LRMD_OP_NEW_CLIENT "lrmd_rsc_new_client"
+ 
++#define LRMD_IPC_OP_NEW "new"
++#define LRMD_IPC_OP_DESTROY "destroy"
++#define LRMD_IPC_OP_EVENT "event"
++#define LRMD_IPC_OP_REQUEST "request"
++#define LRMD_IPC_OP_RESPONSE "response"
++
+ #define F_LRMD_IPC_OP "lrmd_ipc_op"
+ #define F_LRMD_IPC_IPC_SERVER "lrmd_ipc_server"
+ #define F_LRMD_IPC_SESSION "lrmd_ipc_session"
+diff --git a/lib/lrmd/proxy_common.c b/lib/lrmd/proxy_common.c
+index 50c59c3..a0f5e62 100644
+--- a/lib/lrmd/proxy_common.c
++++ b/lib/lrmd/proxy_common.c
+@@ -39,7 +39,7 @@ remote_proxy_notify_destroy(lrmd_t *lrmd, const char *session_id)
+ {
+     /* sending to the remote node that an ipc connection has been destroyed */
+     xmlNode *msg = create_xml_node(NULL, T_LRMD_IPC_PROXY);
+-    crm_xml_add(msg, F_LRMD_IPC_OP, "destroy");
++    crm_xml_add(msg, F_LRMD_IPC_OP, LRMD_IPC_OP_DESTROY);
+     crm_xml_add(msg, F_LRMD_IPC_SESSION, session_id);
+     lrmd_internal_proxy_send(lrmd, msg);
+     free_xml(msg);
+@@ -50,7 +50,7 @@ remote_proxy_relay_event(lrmd_t *lrmd, const char *session_id, xmlNode *msg)
+ {
+     /* sending to the remote node an event msg. */
+     xmlNode *event = create_xml_node(NULL, T_LRMD_IPC_PROXY);
+-    crm_xml_add(event, F_LRMD_IPC_OP, "event");
++    crm_xml_add(event, F_LRMD_IPC_OP, LRMD_IPC_OP_EVENT);
+     crm_xml_add(event, F_LRMD_IPC_SESSION, session_id);
+     add_message_xml(event, F_LRMD_IPC_MSG, msg);
+     crm_log_xml_explicit(event, "EventForProxy");
+@@ -63,7 +63,7 @@ remote_proxy_relay_response(lrmd_t *lrmd, const char *session_id, xmlNode *msg,
+ {
+     /* sending to the remote node a response msg. */
+     xmlNode *response = create_xml_node(NULL, T_LRMD_IPC_PROXY);
+-    crm_xml_add(response, F_LRMD_IPC_OP, "response");
++    crm_xml_add(response, F_LRMD_IPC_OP, LRMD_IPC_OP_RESPONSE);
+     crm_xml_add(response, F_LRMD_IPC_SESSION, session_id);
+     crm_xml_add_int(response, F_LRMD_IPC_MSG_ID, msg_id);
+     add_message_xml(response, F_LRMD_IPC_MSG, msg);
+diff --git a/lrmd/ipc_proxy.c b/lrmd/ipc_proxy.c
+index d95a396..164a9ff 100644
+--- a/lrmd/ipc_proxy.c
++++ b/lrmd/ipc_proxy.c
+@@ -101,7 +101,7 @@ ipc_proxy_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid, const char *ipc
+     g_hash_table_insert(ipc_clients, client->id, client);
+ 
+     msg = create_xml_node(NULL, T_LRMD_IPC_PROXY);
+-    crm_xml_add(msg, F_LRMD_IPC_OP, "new");
++    crm_xml_add(msg, F_LRMD_IPC_OP, LRMD_IPC_OP_NEW);
+     crm_xml_add(msg, F_LRMD_IPC_IPC_SERVER, ipc_channel);
+     crm_xml_add(msg, F_LRMD_IPC_SESSION, client->id);
+     lrmd_server_send_notify(ipc_proxy, msg);
+@@ -157,7 +157,7 @@ ipc_proxy_forward_client(crm_client_t *ipc_proxy, xmlNode *xml)
+ 
+     if (ipc_client == NULL) {
+         xmlNode *msg = create_xml_node(NULL, T_LRMD_IPC_PROXY);
+-        crm_xml_add(msg, F_LRMD_IPC_OP, "destroy");
++        crm_xml_add(msg, F_LRMD_IPC_OP, LRMD_IPC_OP_DESTROY);
+         crm_xml_add(msg, F_LRMD_IPC_SESSION, session);
+         lrmd_server_send_notify(ipc_proxy, msg);
+         free_xml(msg);
+@@ -176,11 +176,11 @@ ipc_proxy_forward_client(crm_client_t *ipc_proxy, xmlNode *xml)
+      * and forwarding it to connection 1.
+      */
+ 
+-    if (safe_str_eq(msg_type, "event")) {
++    if (safe_str_eq(msg_type, LRMD_IPC_OP_EVENT)) {
+         crm_trace("Sending event to %s", ipc_client->id);
+         rc = crm_ipcs_send(ipc_client, 0, msg, crm_ipc_server_event);
+ 
+-    } else if (safe_str_eq(msg_type, "response")) {
++    } else if (safe_str_eq(msg_type, LRMD_IPC_OP_RESPONSE)) {
+         int msg_id = 0;
+ 
+         crm_element_value_int(xml, F_LRMD_IPC_MSG_ID, &msg_id);
+@@ -190,7 +190,7 @@ ipc_proxy_forward_client(crm_client_t *ipc_proxy, xmlNode *xml)
+         CRM_LOG_ASSERT(msg_id == ipc_client->request_id);
+         ipc_client->request_id = 0;
+ 
+-    } else if (safe_str_eq(msg_type, "destroy")) {
++    } else if (safe_str_eq(msg_type, LRMD_IPC_OP_DESTROY)) {
+         qb_ipcs_disconnect(ipc_client->ipcs);
+ 
+     } else {
+@@ -245,7 +245,7 @@ ipc_proxy_dispatch(qb_ipcs_connection_t * c, void *data, size_t size)
+     client->request_id = id;
+ 
+     msg = create_xml_node(NULL, T_LRMD_IPC_PROXY);
+-    crm_xml_add(msg, F_LRMD_IPC_OP, "request");
++    crm_xml_add(msg, F_LRMD_IPC_OP, LRMD_IPC_OP_REQUEST);
+     crm_xml_add(msg, F_LRMD_IPC_SESSION, client->id);
+     crm_xml_add(msg, F_LRMD_IPC_CLIENT, crm_client_name(client));
+     crm_xml_add(msg, F_LRMD_IPC_USER, client->user);
+@@ -275,7 +275,7 @@ ipc_proxy_closed(qb_ipcs_connection_t * c)
+ 
+     if (ipc_proxy) {
+         xmlNode *msg = create_xml_node(NULL, T_LRMD_IPC_PROXY);
+-        crm_xml_add(msg, F_LRMD_IPC_OP, "destroy");
++        crm_xml_add(msg, F_LRMD_IPC_OP, LRMD_IPC_OP_DESTROY);
+         crm_xml_add(msg, F_LRMD_IPC_SESSION, client->id);
+         lrmd_server_send_notify(ipc_proxy, msg);
+         free_xml(msg);
+diff --git a/lrmd/remote_ctl.c b/lrmd/remote_ctl.c
+index ad85954..1983c88 100644
+--- a/lrmd/remote_ctl.c
++++ b/lrmd/remote_ctl.c
+@@ -333,7 +333,7 @@ remote_proxy_cb(lrmd_t *lrmd, void *userdata, xmlNode *msg)
+     crm_element_value_int(msg, F_LRMD_IPC_MSG_ID, &msg_id);
+ 
+     /* This is msg from remote ipc client going to real ipc server */
+-    if (safe_str_eq(op, "new")) {
++    if (safe_str_eq(op, LRMD_IPC_OP_NEW)) {
+         const char *channel = crm_element_value(msg, F_LRMD_IPC_IPC_SERVER);
+ 
+         CRM_CHECK(channel != NULL, return);
+@@ -342,10 +342,10 @@ remote_proxy_cb(lrmd_t *lrmd, void *userdata, xmlNode *msg)
+             remote_proxy_notify_destroy(lrmd, session);
+         }
+         crm_info("new remote proxy client established to %s, session id %s", channel, session);
+-    } else if (safe_str_eq(op, "destroy")) {
++    } else if (safe_str_eq(op, LRMD_IPC_OP_DESTROY)) {
+         remote_proxy_end_session(session);
+ 
+-    } else if (safe_str_eq(op, "request")) {
++    } else if (safe_str_eq(op, LRMD_IPC_OP_REQUEST)) {
+         int flags = 0;
+         xmlNode *request = get_message_xml(msg, F_LRMD_IPC_MSG);
+         const char *name = crm_element_value(msg, F_LRMD_IPC_CLIENT);
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0103-Test-cts-simulate-pacemaker_remote-failure-with-kill.patch b/SOURCES/0103-Test-cts-simulate-pacemaker_remote-failure-with-kill.patch
new file mode 100644
index 0000000..3a1ecd0
--- /dev/null
+++ b/SOURCES/0103-Test-cts-simulate-pacemaker_remote-failure-with-kill.patch
@@ -0,0 +1,46 @@
+From 48246b5916745a56cb0ceb7b4e148b9e587708fe Mon Sep 17 00:00:00 2001
+From: Ken Gaillot
+Date: Wed, 23 Dec 2015 14:36:51 -0600
+Subject: [PATCH 103/105] Test: cts: simulate pacemaker_remote failure with
+ kill
+
+Previously, failure was simulated by stopping pacemaker_remote, but
+that will eventually cause a graceful stop rather than a failure,
+so first kill the process.
+
+rebased commit to 1.1.14-rc5
+---
+ cts/CTStests.py | 14 +++++++++++---
+ 1 file changed, 11 insertions(+), 3 deletions(-)
+
+diff --git a/cts/CTStests.py b/cts/CTStests.py
+index e6f3abe..fb1c5f2 100644
+--- a/cts/CTStests.py
++++ b/cts/CTStests.py
+@@ -2764,6 +2764,14 @@ class RemoteDriver(CTSTest):
+                 self.pcmk_started = 1
+                 break
+ 
++    def kill_pcmk_remote(self, node):
++        """ Simulate a Pacemaker Remote daemon failure. """
++
++        # We kill the process to prevent a graceful stop,
++        # then stop it to prevent the OS from restarting it.
++        self.rsh(node, "killall -9 pacemaker_remoted")
++        self.stop_pcmk_remote(node)
++
+     def start_metal(self, node):
+         pcmk_started = 0
+ 
+@@ -2855,7 +2863,7 @@ class RemoteDriver(CTSTest):
+ 
+         # force stop the pcmk remote daemon. this will result in fencing
+         self.debug("Force stopped active remote node")
+-        self.stop_pcmk_remote(node)
++        self.kill_pcmk_remote(node)
+ 
+         self.debug("Waiting for remote node to be fenced.")
+         self.set_timer("remoteMetalFence")
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0104-Feature-lrmd-liblrmd-add-lrmd-IPC-operations-for-req.patch b/SOURCES/0104-Feature-lrmd-liblrmd-add-lrmd-IPC-operations-for-req.patch
new file mode 100644
index 0000000..f956820
--- /dev/null
+++ b/SOURCES/0104-Feature-lrmd-liblrmd-add-lrmd-IPC-operations-for-req.patch
@@ -0,0 +1,119 @@
+From 29cc1018cb98b1ff864f2aed090cb6b591963275 Mon Sep 17 00:00:00 2001
+From: Ken Gaillot
+Date: Wed, 23 Dec 2015 15:01:48 -0600
+Subject: [PATCH 104/105] Feature: lrmd,liblrmd: add lrmd IPC operations for
+ requesting and acknowledging shutdown
+
+This adds two new lrmd IPC operations, LRMD_IPC_OP_SHUTDOWN_REQ and
+LRMD_IPC_OP_SHUTDOWN_ACK, along with functions to send them.
+This will support the ability to stop pacemaker_remote gracefully.
+
+At this point, no code uses these new operations.
+---
+ include/crm/lrmd.h      |  2 ++
+ include/crm_internal.h  |  1 +
+ lib/lrmd/proxy_common.c | 14 ++++++++++++++
+ lrmd/ipc_proxy.c        | 24 ++++++++++++++++++++++++
+ lrmd/lrmd_private.h     |  1 +
+ 5 files changed, 42 insertions(+)
+
+diff --git a/include/crm/lrmd.h b/include/crm/lrmd.h
+index 5c74798..6660fb9 100644
+--- a/include/crm/lrmd.h
++++ b/include/crm/lrmd.h
+@@ -95,6 +95,8 @@ typedef struct lrmd_key_value_s {
+ #define LRMD_IPC_OP_EVENT "event"
+ #define LRMD_IPC_OP_REQUEST "request"
+ #define LRMD_IPC_OP_RESPONSE "response"
++#define LRMD_IPC_OP_SHUTDOWN_REQ "shutdown_req"
++#define LRMD_IPC_OP_SHUTDOWN_ACK "shutdown_ack"
+ 
+ #define F_LRMD_IPC_OP "lrmd_ipc_op"
+ #define F_LRMD_IPC_IPC_SERVER "lrmd_ipc_server"
+diff --git a/include/crm_internal.h b/include/crm_internal.h
+index e0bbb06..c5fbcb7 100644
+--- a/include/crm_internal.h
++++ b/include/crm_internal.h
+@@ -380,6 +380,7 @@ typedef struct remote_proxy_s {
+ 
+ } remote_proxy_t;
+ void remote_proxy_notify_destroy(lrmd_t *lrmd, const char *session_id);
++void remote_proxy_ack_shutdown(lrmd_t *lrmd);
+ void remote_proxy_relay_event(lrmd_t *lrmd, const char *session_id, xmlNode *msg);
+ void remote_proxy_relay_response(lrmd_t *lrmd, const char *session_id, xmlNode *msg, int msg_id);
+ void remote_proxy_end_session(const char *session);
+diff --git a/lib/lrmd/proxy_common.c b/lib/lrmd/proxy_common.c
+index a0f5e62..eb17e4e 100644
+--- a/lib/lrmd/proxy_common.c
++++ b/lib/lrmd/proxy_common.c
+@@ -45,6 +45,20 @@ remote_proxy_notify_destroy(lrmd_t *lrmd, const char *session_id)
+     free_xml(msg);
+ }
+ 
++/*!
++ * \brief Send an acknowledgment of a remote proxy shutdown request.
++ *
++ * \param[in] lrmd  Connection to proxy
++ */
++void
++remote_proxy_ack_shutdown(lrmd_t *lrmd)
++{
++    xmlNode *msg = create_xml_node(NULL, T_LRMD_IPC_PROXY);
++    crm_xml_add(msg, F_LRMD_IPC_OP, LRMD_IPC_OP_SHUTDOWN_ACK);
++    lrmd_internal_proxy_send(lrmd, msg);
++    free_xml(msg);
++}
++
+ void
+ remote_proxy_relay_event(lrmd_t *lrmd, const char *session_id, xmlNode *msg)
+ {
+diff --git a/lrmd/ipc_proxy.c b/lrmd/ipc_proxy.c
+index 164a9ff..9633a67 100644
+--- a/lrmd/ipc_proxy.c
++++ b/lrmd/ipc_proxy.c
+@@ -259,6 +259,30 @@ ipc_proxy_dispatch(qb_ipcs_connection_t * c, void *data, size_t size)
+     return 0;
+ }
+ 
++/*!
++ * \internal
++ * \brief Notify a proxy provider that we wish to shut down
++ *
++ * \return 0 on success, -1 on error
++ */
++int
++ipc_proxy_shutdown_req(crm_client_t *ipc_proxy)
++{
++    xmlNode *msg = create_xml_node(NULL, T_LRMD_IPC_PROXY);
++    int rc;
++
++    crm_xml_add(msg, F_LRMD_IPC_OP, LRMD_IPC_OP_SHUTDOWN_REQ);
++
++    /* We don't really have a session, but crmd needs this attribute
++     * to recognize this as proxy communication.
++     */
++    crm_xml_add(msg, F_LRMD_IPC_SESSION, "0");
++
++    rc = (lrmd_server_send_notify(ipc_proxy, msg) < 0)? -1 : 0;
++    free_xml(msg);
++    return rc;
++}
++
+ static int32_t
+ ipc_proxy_closed(qb_ipcs_connection_t * c)
+ {
+diff --git a/lrmd/lrmd_private.h b/lrmd/lrmd_private.h
+index 52f79b8..78f14c9 100644
+--- a/lrmd/lrmd_private.h
++++ b/lrmd/lrmd_private.h
+@@ -104,6 +104,7 @@ void ipc_proxy_add_provider(crm_client_t *client);
+ void ipc_proxy_remove_provider(crm_client_t *client);
+ void ipc_proxy_forward_client(crm_client_t *client, xmlNode *xml);
+ crm_client_t *ipc_proxy_get_provider(void);
++int ipc_proxy_shutdown_req(crm_client_t *ipc_proxy);
+ #endif
+ 
+ #endif
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0105-Feature-crmd-support-graceful-pacemaker_remote-stops.patch b/SOURCES/0105-Feature-crmd-support-graceful-pacemaker_remote-stops.patch
new file mode 100644
index 0000000..523c089
--- /dev/null
+++ b/SOURCES/0105-Feature-crmd-support-graceful-pacemaker_remote-stops.patch
@@ -0,0 +1,32 @@
+From 8eebc8a30a55645164d3c41acaf028dd75fab275 Mon Sep 17 00:00:00 2001
+From: Ken Gaillot
+Date: Wed, 23 Dec 2015 15:18:38 -0600
+Subject: [PATCH 105/105] Feature: crmd: support graceful pacemaker_remote
+ stops
+
+NOT YET IMPLEMENTED. This is just a placeholder.
+---
+ crmd/lrm_state.c | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+diff --git a/crmd/lrm_state.c b/crmd/lrm_state.c
+index 497d3f9..5ee5b83 100644
+--- a/crmd/lrm_state.c
++++ b/crmd/lrm_state.c
+@@ -478,6 +478,13 @@ remote_proxy_cb(lrmd_t *lrmd, void *userdata, xmlNode *msg)
+     CRM_CHECK(op != NULL, return);
+     CRM_CHECK(session != NULL, return);
+ 
++    if (safe_str_eq(op, LRMD_IPC_OP_SHUTDOWN_REQ)) {
++        crm_warn("Graceful proxy shutdown not yet supported");
++        /* TODO: uncomment this, then put node in standby: */
++        /* remote_proxy_ack_shutdown(lrmd); */
++        return;
++    }
++
+     crm_element_value_int(msg, F_LRMD_IPC_MSG_ID, &msg_id);
+ 
+     /* This is msg from remote ipc client going to real ipc server */
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0106-Feature-pacemaker_remote-support-graceful-stops.patch b/SOURCES/0106-Feature-pacemaker_remote-support-graceful-stops.patch
new file mode 100644
index 0000000..5fd1506
--- /dev/null
+++ b/SOURCES/0106-Feature-pacemaker_remote-support-graceful-stops.patch
@@ -0,0 +1,263 @@
+From c83dc10b975aa70a3da85dc2e63cec99a0b729b2 Mon Sep 17 00:00:00 2001
+From: Ken Gaillot
+Date: Wed, 23 Dec 2015 15:19:28 -0600
+Subject: [PATCH] Feature: pacemaker_remote: support graceful stops
+
+When pacemaker_remote gets an interrupt signal, if there are any connected
+proxy providers, it will send an lrmd IPC op for a shutdown request,
+and stop accepting new provider connections. If the provider acknowledges the
+request, pacemaker_remote will wait until all providers disconnect before
+exiting itself. This gives the cluster the opportunity to stop any resources
+running on the node that is shutting down.
+
+If the provider is an older version that does not support graceful stops,
+pacemaker_remote will time out waiting for the ack, then exit immediately.
+
+Since we are now waiting for resources to exit, the systemd stop timeout
+for pacemaker_remote has been raised to match pacemaker's.
+---
+ lrmd/ipc_proxy.c                 |  12 +++-
+ lrmd/lrmd_private.h              |   4 +-
+ lrmd/main.c                      | 121 +++++++++++++++++++++++++++++++++++++--
+ lrmd/pacemaker_remote.service.in |   4 +-
+ lrmd/tls_backend.c               |   3 +-
+ 5 files changed, 135 insertions(+), 9 deletions(-)
+
+diff --git a/lrmd/ipc_proxy.c b/lrmd/ipc_proxy.c
+index 9633a67..07c13ab 100644
+--- a/lrmd/ipc_proxy.c
++++ b/lrmd/ipc_proxy.c
+@@ -152,9 +152,19 @@ ipc_proxy_forward_client(crm_client_t *ipc_proxy, xmlNode *xml)
+     const char *session = crm_element_value(xml, F_LRMD_IPC_SESSION);
+     const char *msg_type = crm_element_value(xml, F_LRMD_IPC_OP);
+     xmlNode *msg = get_message_xml(xml, F_LRMD_IPC_MSG);
+-    crm_client_t *ipc_client = crm_client_get_by_id(session);
++    crm_client_t *ipc_client;
+     int rc = 0;
+ 
++    /* If the IPC provider is acknowledging our shutdown request,
++     * defuse the short exit timer to give the cluster time to
++     * stop any resources we're running.
++     */
++    if (safe_str_eq(msg_type, LRMD_IPC_OP_SHUTDOWN_ACK)) {
++        handle_shutdown_ack();
++        return;
++    }
++
++    ipc_client = crm_client_get_by_id(session);
+     if (ipc_client == NULL) {
+         xmlNode *msg = create_xml_node(NULL, T_LRMD_IPC_PROXY);
+         crm_xml_add(msg, F_LRMD_IPC_OP, LRMD_IPC_OP_DESTROY);
+diff --git a/lrmd/lrmd_private.h b/lrmd/lrmd_private.h
+index 78f14c9..29146f5 100644
+--- a/lrmd/lrmd_private.h
++++ b/lrmd/lrmd_private.h
+@@ -80,7 +80,9 @@ void process_lrmd_message(crm_client_t * client, uint32_t id, xmlNode * request)
+ 
+ void free_rsc(gpointer data);
+ 
+-void lrmd_shutdown(int nsig);
++void handle_shutdown_ack(void);
++
++void lrmd_client_destroy(crm_client_t *client);
+ 
+ void client_disconnect_cleanup(const char *client_id);
+ 
+diff --git a/lrmd/main.c b/lrmd/main.c
+index 73519e2..98a1412 100644
+--- a/lrmd/main.c
++++ b/lrmd/main.c
+@@ -40,6 +40,16 @@ static qb_ipcs_service_t *ipcs = NULL;
+ stonith_t *stonith_api = NULL;
+ int lrmd_call_id = 0;
+ 
++#ifdef ENABLE_PCMK_REMOTE
++/* whether shutdown request has been sent */
++static volatile sig_atomic_t shutting_down = FALSE;
++
++/* timer for waiting for acknowledgment of shutdown request */
++static volatile guint shutdown_ack_timer = 0;
++
++static gboolean lrmd_exit(gpointer data);
++#endif
++
+ static void
+ stonith_connection_destroy_cb(stonith_t * st, stonith_event_t * e)
+ {
+@@ -151,6 +161,27 @@ lrmd_ipc_dispatch(qb_ipcs_connection_t * c, void *data, size_t size)
+     return 0;
+ }
+ 
++/*!
++ * \internal
++ * \brief Free a client connection, and exit if appropriate
++ *
++ * \param[in] client  Client connection to free
++ */
++void
++lrmd_client_destroy(crm_client_t *client)
++{
++    crm_client_destroy(client);
++
++#ifdef ENABLE_PCMK_REMOTE
++    /* If we were waiting to shut down, we can now safely do so
++     * if there are no more proxied IPC providers
++     */
++    if (shutting_down && (ipc_proxy_get_provider() == NULL)) {
++        lrmd_exit(NULL);
++    }
++#endif
++}
++
+ static int32_t
+ lrmd_ipc_closed(qb_ipcs_connection_t * c)
+ {
+@@ -165,7 +196,7 @@ lrmd_ipc_closed(qb_ipcs_connection_t * c)
+ #ifdef ENABLE_PCMK_REMOTE
+     ipc_proxy_remove_provider(client);
+ #endif
+-    crm_client_destroy(client);
++    lrmd_client_destroy(client);
+     return 0;
+ }
+ 
+@@ -227,8 +258,17 @@ lrmd_server_send_notify(crm_client_t * client, xmlNode * msg)
+     return -1;
+ }
+ 
+-void
+-lrmd_shutdown(int nsig)
++/*!
++ * \internal
++ * \brief Clean up and exit immediately
++ *
++ * \param[in] data  Ignored
++ *
++ * \return Doesn't return
++ * \note This can be used as a timer callback.
++ */
++static gboolean
++lrmd_exit(gpointer data)
+ {
+     crm_info("Terminating with %d clients", crm_hash_table_size(client_connections));
+ 
+@@ -249,6 +289,79 @@
+     crm_client_cleanup();
+     g_hash_table_destroy(rsc_list);
+     crm_exit(pcmk_ok);
++    return FALSE;
++}
++
++/*!
++ * \internal
++ * \brief Request cluster shutdown if appropriate, otherwise exit immediately
++ *
++ * \param[in] nsig  Signal that caused invocation (ignored)
++ */
++static void
++lrmd_shutdown(int nsig)
++{
++#ifdef ENABLE_PCMK_REMOTE
++    crm_client_t *ipc_proxy = ipc_proxy_get_provider();
++
++    /* If there are active proxied IPC providers, then we may be running
++     * resources, so notify the cluster that we wish to shut down.
++     */
++    if (ipc_proxy) {
++        if (shutting_down) {
++            crm_trace("Shutdown already in progress");
++            return;
++        }
++
++        crm_info("Sending shutdown request to cluster");
++        if (ipc_proxy_shutdown_req(ipc_proxy) < 0) {
++            crm_crit("Shutdown request failed, exiting immediately");
++
++        } else {
++            /* We requested a shutdown. Now, we need to wait for an
++             * acknowledgement from the proxy host (which ensures the proxy host
++             * supports shutdown requests), then wait for all proxy hosts to
++             * disconnect (which ensures that all resources have been stopped).
++             */
++            shutting_down = TRUE;
++
++            /* Stop accepting new proxy connections */
++            lrmd_tls_server_destroy();
++
++            /* Older crmd versions will never acknowledge our request, so set a
++             * fairly short timeout to exit quickly in that case. If we get the
++             * ack, we'll defuse this timer.
++             */
++            shutdown_ack_timer = g_timeout_add_seconds(20, lrmd_exit, NULL);
++
++            /* Currently, we let the OS kill us if the clients don't disconnect
++             * in a reasonable time. We could instead set a long timer here
++             * (shorter than what the OS is likely to use) and exit immediately
++             * if it pops.
++             */
++            return;
++        }
++    }
++#endif
++    lrmd_exit(NULL);
++}
++
++/*!
++ * \internal
++ * \brief Defuse short exit timer if shutting down
++ */
++void handle_shutdown_ack()
++{
++#ifdef ENABLE_PCMK_REMOTE
++    if (shutting_down) {
++        crm_info("Received shutdown ack");
++        if (shutdown_ack_timer > 0) {
++            g_source_remove(shutdown_ack_timer);
++        }
++        return;
++    }
++#endif
++    crm_debug("Ignoring unexpected shutdown ack");
+ }
+ 
+ /* *INDENT-OFF* */
+@@ -363,6 +476,6 @@ main(int argc, char **argv)
+     g_main_run(mainloop);
+ 
+     /* should never get here */
+-    lrmd_shutdown(SIGTERM);
++    lrmd_exit(NULL);
+     return pcmk_ok;
+ }
+diff --git a/lrmd/pacemaker_remote.service.in b/lrmd/pacemaker_remote.service.in
+index 15e61fb..7252976 100644
+--- a/lrmd/pacemaker_remote.service.in
++++ b/lrmd/pacemaker_remote.service.in
+@@ -13,7 +13,9 @@ EnvironmentFile=-/etc/sysconfig/pacemaker
+ 
+ ExecStart=@sbindir@/pacemaker_remoted
+ 
+-TimeoutStopSec=30s
++# Pacemaker Remote can exit only after all managed services have shut down;
++# an HA database could conceivably take even longer than this
++TimeoutStopSec=30min
+ TimeoutStartSec=30s
+ 
+ # Restart options include: no, on-success, on-failure, on-abort or always
+diff --git a/lrmd/tls_backend.c b/lrmd/tls_backend.c
+index df5387f..7b8ef9d 100644
+--- a/lrmd/tls_backend.c
++++ b/lrmd/tls_backend.c
+@@ -163,8 +163,7 @@ lrmd_remote_client_destroy(gpointer user_data)
+         close(csock);
+     }
+ 
+-    crm_client_destroy(client);
+-
++    lrmd_client_destroy(client);
+     return;
+ }
+ 
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0107-Feature-PE-Honor-the-shutdown-transient-attributes-f.patch b/SOURCES/0107-Feature-PE-Honor-the-shutdown-transient-attributes-f.patch
new file mode 100644
index 0000000..5a897a9
--- /dev/null
+++ b/SOURCES/0107-Feature-PE-Honor-the-shutdown-transient-attributes-f.patch
@@ -0,0 +1,42 @@
+From 0edc762e63801b92b5a931c10446287f9b3d6406 Mon Sep 17 00:00:00 2001
+From: Andrew Beekhof
+Date: Wed, 6 Jan 2016 15:15:24 +1100
+Subject: [PATCH 107/108] Feature: PE: Honor the shutdown transient attributes
+ for remote nodes
+
+---
+ lib/pengine/unpack.c | 11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
+index 6a125b0..75d9dd8 100644
+--- a/lib/pengine/unpack.c
++++ b/lib/pengine/unpack.c
+@@ -1145,6 +1145,8 @@ unpack_remote_status(xmlNode * status, pe_working_set_t * data_set)
+ {
+     const char *id = NULL;
+     const char *uname = NULL;
++    const char *shutdown = NULL;
++
+     GListPtr gIter = NULL;
+ 
+     xmlNode *state = NULL;
+@@ -1190,6 +1192,15 @@ unpack_remote_status(xmlNode * status, pe_working_set_t * data_set)
+         attrs = find_xml_node(state, XML_TAG_TRANSIENT_NODEATTRS, FALSE);
+         add_node_attrs(attrs, this_node, TRUE, data_set);
+ 
++        shutdown = g_hash_table_lookup(this_node->details->attrs, XML_CIB_ATTR_SHUTDOWN);
++        if (shutdown != NULL && safe_str_neq("0", shutdown)) {
++            resource_t *rsc = this_node->details->remote_rsc;
++
++            crm_info("Node %s is shutting down", this_node->details->uname);
++            this_node->details->shutdown = TRUE;
++            rsc->next_role = RSC_ROLE_STOPPED;
++        }
++
+         if (crm_is_true(g_hash_table_lookup(this_node->details->attrs, "standby"))) {
+             crm_info("Node %s is in standby-mode", this_node->details->uname);
+             this_node->details->standby = TRUE;
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0108-Feature-crmd-Set-the-shutdown-transient-attribute-in.patch b/SOURCES/0108-Feature-crmd-Set-the-shutdown-transient-attribute-in.patch
new file mode 100644
index 0000000..d4f7292
--- /dev/null
+++ b/SOURCES/0108-Feature-crmd-Set-the-shutdown-transient-attribute-in.patch
@@ -0,0 +1,49 @@
+From 0a883a90eeeee4c9b156023da693d4ff93a9d69a Mon Sep 17 00:00:00 2001
+From: Andrew Beekhof
+Date: Wed, 6 Jan 2016 15:17:06 +1100
+Subject: [PATCH 108/108] Feature: crmd: Set the shutdown transient attribute
+ in response to LRMD_IPC_OP_SHUTDOWN_REQ from remote nodes
+
+---
+ crmd/lrm_state.c | 22 ++++++++++++++--------
+ 1 file changed, 14 insertions(+), 8 deletions(-)
+
+diff --git a/crmd/lrm_state.c b/crmd/lrm_state.c
+index 5ee5b83..7833ebb 100644
+--- a/crmd/lrm_state.c
++++ b/crmd/lrm_state.c
+@@ -478,17 +478,23 @@ remote_proxy_cb(lrmd_t *lrmd, void *userdata, xmlNode *msg)
+     CRM_CHECK(op != NULL, return);
+     CRM_CHECK(session != NULL, return);
+ 
++    crm_element_value_int(msg, F_LRMD_IPC_MSG_ID, &msg_id);
++    /* This is msg from remote ipc client going to real ipc server */
++
+     if (safe_str_eq(op, LRMD_IPC_OP_SHUTDOWN_REQ)) {
+-        crm_warn("Graceful proxy shutdown not yet supported");
+-        /* TODO: uncomment this, then put node in standby: */
+-        /* remote_proxy_ack_shutdown(lrmd); */
+-        return;
+-    }
++        char *now_s = NULL;
++        time_t now = time(NULL);
+ 
+-    crm_element_value_int(msg, F_LRMD_IPC_MSG_ID, &msg_id);
++        crm_warn("Graceful proxy shutdown of %s not yet tested", lrm_state->node_name);
+ 
+-    /* This is msg from remote ipc client going to real ipc server */
+-    if (safe_str_eq(op, LRMD_IPC_OP_NEW)) {
++        now_s = crm_itoa(now);
++        update_attrd(lrm_state->node_name, XML_CIB_ATTR_SHUTDOWN, now_s, NULL, FALSE);
++        free(now_s);
++
++        remote_proxy_ack_shutdown(lrmd);
++        return;
++
++    } else if (safe_str_eq(op, LRMD_IPC_OP_NEW)) {
+         const char *channel = crm_element_value(msg, F_LRMD_IPC_IPC_SERVER);
+ 
+         CRM_CHECK(channel != NULL, return);
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0109-Fix-attrd-Hook-up-the-client-name-so-we-can-track-re.patch b/SOURCES/0109-Fix-attrd-Hook-up-the-client-name-so-we-can-track-re.patch
new file mode 100644
index 0000000..d13d0c4
--- /dev/null
+++ b/SOURCES/0109-Fix-attrd-Hook-up-the-client-name-so-we-can-track-re.patch
@@ -0,0 +1,28 @@
+From 6968a8b8c48a63af8c813ed47652662cbce837be Mon Sep 17 00:00:00 2001
+From: Andrew Beekhof
+Date: Thu, 7 Jan 2016 11:28:14 +1100
+Subject: [PATCH] Fix: attrd: Hook up the client name so we can track requests
+
+---
+ attrd/main.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/attrd/main.c b/attrd/main.c
+index 069e9fa..0198396 100644
+--- a/attrd/main.c
++++ b/attrd/main.c
+@@ -226,6 +226,11 @@ attrd_ipc_dispatch(qb_ipcs_connection_t * c, void *data, size_t size)
+ 
+     op = crm_element_value(xml, F_ATTRD_TASK);
+ 
++    if (client->name == NULL) {
++        const char *value = crm_element_value(xml, F_ORIG);
++        client->name = crm_strdup_printf("%s.%d", value?value:"unknown", client->pid);
++    }
++
+     if (safe_str_eq(op, ATTRD_OP_PEER_REMOVE)) {
+         attrd_send_ack(client, id, flags);
+         attrd_client_peer_remove(client->name, xml);
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0110-Fix-attrd-Correctly-implement-mass-removal-of-a-node.patch b/SOURCES/0110-Fix-attrd-Correctly-implement-mass-removal-of-a-node.patch
new file mode 100644
index 0000000..33c75be
--- /dev/null
+++ b/SOURCES/0110-Fix-attrd-Correctly-implement-mass-removal-of-a-node.patch
@@ -0,0 +1,59 @@
+From da17fd0265ffe3b4456c4f81141439c851504281 Mon Sep 17 00:00:00 2001
+From: Andrew Beekhof
+Date: Thu, 7 Jan 2016 11:33:34 +1100
+Subject: [PATCH] Fix: attrd: Correctly implement mass removal of a node's
+ attributes
+
+---
+ attrd/commands.c | 21 +++++++++++++++++----
+ 1 file changed, 17 insertions(+), 4 deletions(-)
+
+diff --git a/attrd/commands.c b/attrd/commands.c
+index 378a4f8..28e4a81 100644
+--- a/attrd/commands.c
++++ b/attrd/commands.c
+@@ -541,8 +541,9 @@ attrd_peer_message(crm_node_t *peer, xmlNode *xml)
+         } else {
+             host = NULL;
+         }
+-        attrd_peer_remove(host_id, host, TRUE, peer->uname);
+ 
++        crm_notice("Processing %s from %s: %s %u", op, peer->uname, host, host_id);
++        attrd_peer_remove(host_id, host, TRUE, peer->uname);
+ 
+     } else if (safe_str_eq(op, ATTRD_OP_SYNC_RESPONSE)
+               && safe_str_neq(peer->uname, attrd_cluster->uname)) {
+@@ -589,15 +590,27 @@ attrd_peer_remove(uint32_t nodeid, const char *host, gboolean uncache, const cha
+     attribute_t *a = NULL;
+     GHashTableIter aIter;
+ 
+-    crm_notice("Removing all %s attributes for %s", host, source);
++    crm_notice("Removing all %s (%u) attributes for %s", host, nodeid, source);
+     if(host == NULL) {
+         return;
+     }
+ 
+     g_hash_table_iter_init(&aIter, attributes);
+     while (g_hash_table_iter_next(&aIter, NULL, (gpointer *) & a)) {
+-        if(g_hash_table_remove(a->values, host)) {
+-            crm_debug("Removed %s[%s] for %s", a->id, host, source);
++        attribute_value_t *v = g_hash_table_lookup(a->values, host);
++
++        if(v && v->current) {
++            free(v->current);
++            v->current = NULL;
++            a->changed = TRUE;
++
++            crm_debug("Removed %s[%s]=%s for %s", a->id, host, v->current, source);
++            if(a->timer) {
++                crm_trace("Delayed write out (%dms) for %s", a->timeout_ms, a->id);
++                mainloop_timer_start(a->timer);
++            } else {
++                write_or_elect_attribute(a);
++            }
+         }
+     }
+ 
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0111-Log-crmd-Graceful-proxy-shutdown-is-now-tested.patch b/SOURCES/0111-Log-crmd-Graceful-proxy-shutdown-is-now-tested.patch
new file mode 100644
index 0000000..227a7f5
--- /dev/null
+++ b/SOURCES/0111-Log-crmd-Graceful-proxy-shutdown-is-now-tested.patch
@@ -0,0 +1,25 @@
+From e2c7f8d987f090a3bb2ba3ec0e007a6dbf138ad2 Mon Sep 17 00:00:00 2001
+From: Andrew Beekhof
+Date: Mon, 11 Jan 2016 08:28:24 +1100
+Subject: [PATCH] Log: crmd: Graceful proxy shutdown is now tested
+
+---
+ crmd/lrm_state.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/crmd/lrm_state.c b/crmd/lrm_state.c
+index 7833ebb..62e1c76 100644
+--- a/crmd/lrm_state.c
++++ b/crmd/lrm_state.c
+@@ -485,7 +485,7 @@ remote_proxy_cb(lrmd_t *lrmd, void *userdata, xmlNode *msg)
+         char *now_s = NULL;
+         time_t now = time(NULL);
+ 
+-        crm_warn("Graceful proxy shutdown of %s not yet tested", lrm_state->node_name);
++        crm_notice("Graceful proxy shutdown of %s", lrm_state->node_name);
+ 
+         now_s = crm_itoa(now);
+         update_attrd(lrm_state->node_name, XML_CIB_ATTR_SHUTDOWN, now_s, NULL, FALSE);
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0112-Fix-crmd-set-remote-flag.patch b/SOURCES/0112-Fix-crmd-set-remote-flag.patch
new file mode 100644
index 0000000..a40862a
--- /dev/null
+++ b/SOURCES/0112-Fix-crmd-set-remote-flag.patch
@@ -0,0 +1,26 @@
+From 615b0784516933106a8446272bc3c043b0a0d50a Mon Sep 17 00:00:00 2001
+From: Ken Gaillot
+Date: Tue, 26 Jan 2016 14:04:30 -0600
+Subject: [PATCH] Fix: crmd: set remote flag when gracefully shutting down
+ remote nodes
+
+---
+ crmd/lrm_state.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/crmd/lrm_state.c b/crmd/lrm_state.c
+index 62e1c76..7ea4e8a 100644
+--- a/crmd/lrm_state.c
++++ b/crmd/lrm_state.c
+@@ -488,7 +488,7 @@ remote_proxy_cb(lrmd_t *lrmd, void *userdata, xmlNode *msg)
+         crm_notice("Graceful proxy shutdown of %s", lrm_state->node_name);
+ 
+         now_s = crm_itoa(now);
+-        update_attrd(lrm_state->node_name, XML_CIB_ATTR_SHUTDOWN, now_s, NULL, FALSE);
++        update_attrd(lrm_state->node_name, XML_CIB_ATTR_SHUTDOWN, now_s, NULL, TRUE);
+         free(now_s);
+ 
+         remote_proxy_ack_shutdown(lrmd);
+-- 
+1.8.3.1
+
diff --git a/SOURCES/0113-Fix-attrd-correct-peer-cache.patch b/SOURCES/0113-Fix-attrd-correct-peer-cache.patch
new file mode 100644
index 0000000..e8d7343
--- /dev/null
+++ b/SOURCES/0113-Fix-attrd-correct-peer-cache.patch
@@ -0,0 +1,59 @@
+From 942efa4e8edcfdbdce42505c30c18cacd1d8fff0 Mon Sep 17 00:00:00 2001
+From: Ken Gaillot
+Date: Tue, 26 Jan 2016 15:55:46 -0600
+Subject: [PATCH] Fix: attrd: ensure remote nodes are in correct peer cache
+
+If attrd receives an update for an unknown node name, it assumes the unknown
+node is a cluster node, and adds it to the cluster peer cache.
+
+Previously, if the name was later used for a remote node, that would prevent
+its attributes from being written to the CIB. Now, when an attribute is
+received for a remote node, attrd will purge any inactive cluster peer cache
+entry before adding the node to the remote peer cache.
+---
+ attrd/commands.c | 22 +++++++++++++++++-----
+ 1 file changed, 17 insertions(+), 5 deletions(-)
+
+diff --git a/attrd/commands.c b/attrd/commands.c
+index 28e4a81..b2cc83a 100644
+--- a/attrd/commands.c
++++ b/attrd/commands.c
+@@ -634,6 +634,22 @@ static attribute_value_t *
+ attrd_lookup_or_create_value(GHashTable *values, const char *host, xmlNode *xml)
+ {
+     attribute_value_t *v = g_hash_table_lookup(values, host);
++    int is_remote = 0;
++
++    crm_element_value_int(xml, F_ATTRD_IS_REMOTE, &is_remote);
++    if (is_remote) {
++        /* If we previously assumed this node was an unseen cluster node,
++         * remove its entry from the cluster peer cache.
++         */
++        crm_node_t *dup = crm_find_peer(0, host);
++
++        if (dup && (dup->uuid == NULL)) {
++            reap_crm_member(0, host);
++        }
++
++        /* Ensure this host is in the remote peer cache */
++        crm_remote_peer_cache_add(host);
++    }
+ 
+     if (v == NULL) {
+         v = calloc(1, sizeof(attribute_value_t));
+@@ -642,11 +658,7 @@ attrd_lookup_or_create_value(GHashTable *values, const char *host, xmlNode *xml)
+         v->nodename = strdup(host);
+         CRM_ASSERT(v->nodename != NULL);
+ 
+-        crm_element_value_int(xml, F_ATTRD_IS_REMOTE, &v->is_remote);
+-        if (v->is_remote == TRUE) {
+-            crm_remote_peer_cache_add(host);
+-        }
+-
++        v->is_remote = is_remote;
+         g_hash_table_replace(values, v->nodename, v);
+     }
+     return(v);
+-- 
+1.8.3.1
+
diff --git a/SPECS/pacemaker.spec b/SPECS/pacemaker.spec
index 06663b6..299eaea 100644
--- a/SPECS/pacemaker.spec
+++ b/SPECS/pacemaker.spec
@@ -57,7 +57,7 @@
 Name: pacemaker
 Summary: Scalable High-Availability cluster resource manager
 Version: 1.1.13
-Release: %{pcmk_release}%{?dist}
+Release: %{pcmk_release}%{?dist}.2
 License: GPLv2+ and LGPLv2+
 Url: http://www.clusterlabs.org
 Group: System Environment/Daemons
@@ -103,6 +103,27 @@
 Patch35: 0035-Fix-crm_resource-Correctly-update-existing-meta-attr.patch
 Patch36: 0036-Log-crm_resource-restart-Improved-user-feedback-on-f.patch
 Patch37: 0037-Fix-crm_resource-Correctly-delete-existing-meta-attr.patch
 Patch38: 0038-Fix-crm_resource-Correctly-observe-force-when-deleti.patch
+Patch39: 0039-prevent-segfault-when-logging.patch
+Patch40: 0040-update-top-format-in-HealthCPU.patch
+Patch41: 0041-delete-fence-attributes-correctly.patch
+Patch42: 0042-handle-systemd-shutdown.patch
+Patch43: 0043-cts-fix-for-command-lines.patch
+
+# graceful pacemaker_remote stops
+Patch100: 0100-Refactor-lrmd-handle-shutdown-a-little-more-cleanly.patch
+Patch101: 0101-Refactor-lrmd-make-proxied-IPC-providers-clients-opa.patch
+Patch102: 0102-Refactor-crmd-lrmd-liblrmd-use-defined-constants-for.patch
+Patch103: 0103-Test-cts-simulate-pacemaker_remote-failure-with-kill.patch
+Patch104: 0104-Feature-lrmd-liblrmd-add-lrmd-IPC-operations-for-req.patch
+Patch105: 0105-Feature-crmd-support-graceful-pacemaker_remote-stops.patch
+Patch106: 0106-Feature-pacemaker_remote-support-graceful-stops.patch
+Patch107: 0107-Feature-PE-Honor-the-shutdown-transient-attributes-f.patch
+Patch108: 0108-Feature-crmd-Set-the-shutdown-transient-attribute-in.patch
+Patch109: 0109-Fix-attrd-Hook-up-the-client-name-so-we-can-track-re.patch
+Patch110: 0110-Fix-attrd-Correctly-implement-mass-removal-of-a-node.patch
+Patch111: 0111-Log-crmd-Graceful-proxy-shutdown-is-now-tested.patch
+Patch112: 0112-Fix-crmd-set-remote-flag.patch
+Patch113: 0113-Fix-attrd-correct-peer-cache.patch
 BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX)
 AutoReqProv: on
@@ -627,6 +648,23 @@ exit 0
 %attr(0644,root,root) %{_datadir}/pacemaker/nagios/plugins-metadata/*
 
 %changelog
+* Tue Jan 26 2016 Ken Gaillot - 1.1.13-10.2
+- Properly cache remote nodes when adding node attributes
+- Resolves: rhbz#1299348
+
+* Mon Jan 18 2016 Ken Gaillot - 1.1.13-10.1
+- Prevent lrmd crash when logging certain systemd operation failures
+- Handle systemd shutdown properly
+- Don't delete fence device when deleting an attribute
+- Handle new top output format in HealthCPU resource
+- Implement graceful stopping of pacemaker_remote
+- Update CTS to match applied code patches
+- Resolves: rhbz#1299339
+- Resolves: rhbz#1299340
+- Resolves: rhbz#1299341
+- Resolves: rhbz#1299342
+- Resolves: rhbz#1299348
+
 * Thu Oct 08 2015 Andrew Beekhof - 1.1.13-10
 - More improvements when updating and deleting meta attributes
 - Resolves: rhbz#1267265