From 14bb468ab404228cae34809420ef0763d3d54482 Mon Sep 17 00:00:00 2001 From: Klaus Wenninger Date: Thu, 13 Jun 2019 15:31:24 +0200 Subject: [PATCH] Fix: fence-history: fail leftover pending-actions after fenced-restart --- daemons/fenced/fenced_history.c | 15 +++++++++++++++ daemons/fenced/fenced_remote.c | 6 +++--- daemons/fenced/pacemaker-fenced.h | 8 ++++++++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c index 7c129cc..b65b64c 100644 --- a/daemons/fenced/fenced_history.c +++ b/daemons/fenced/fenced_history.c @@ -347,6 +347,21 @@ stonith_merge_in_history_list(GHashTable *history) updated = TRUE; g_hash_table_iter_steal(&iter); + + if ((op->state != st_failed) && + (op->state != st_done) && + safe_str_eq(op->originator, stonith_our_uname)) { + crm_warn("received pending action we are supposed to be the " + "owner but it's not in our records -> fail it"); + op->state = st_failed; + op->completed = time(NULL); + /* use -EHOSTUNREACH to not introduce a new return-code that might + trigger unexpected results at other places and to prevent + remote_op_done from setting the delegate if not present + */ + stonith_bcast_result_to_peers(op, -EHOSTUNREACH); + } + g_hash_table_insert(stonith_remote_op_list, op->id, op); /* we could trim the history here but if we bail * out after trim we might miss more recent entries diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c index 7d61249..5b86f0f 100644 --- a/daemons/fenced/fenced_remote.c +++ b/daemons/fenced/fenced_remote.c @@ -369,8 +369,8 @@ create_op_done_notify(remote_fencing_op_t * op, int rc) return notify_data; } -static void -bcast_result_to_peers(remote_fencing_op_t * op, int rc) +void +stonith_bcast_result_to_peers(remote_fencing_op_t * op, int rc) { static int count = 0; xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY); @@ -509,7 +509,7 @@ remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup) subt = crm_element_value(data, F_SUBTYPE); if (dup == FALSE && safe_str_neq(subt, "broadcast")) { /* Defer notification until the bcast message arrives */ - bcast_result_to_peers(op, rc); + stonith_bcast_result_to_peers(op, rc); goto remote_op_done_cleanup; } diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h index 3a2edbb..a8531a6 100644 --- a/daemons/fenced/pacemaker-fenced.h +++ b/daemons/fenced/pacemaker-fenced.h @@ -149,6 +149,14 @@ typedef struct remote_fencing_op_s { } remote_fencing_op_t; +/*! + * \internal + * \brief Broadcast the result of an operation to the peers. + * \param op, Operation whose result should be broadcast + * \param rc, Result of the operation + */ +void stonith_bcast_result_to_peers(remote_fencing_op_t * op, int rc); + enum st_callback_flags { st_callback_unknown = 0x0000, st_callback_notify_fence = 0x0001, -- 1.8.3.1 From a0bc0d3ab5aed64e37b1caae746f5c421696df1b Mon Sep 17 00:00:00 2001 From: Klaus Wenninger Date: Fri, 14 Jun 2019 13:41:43 +0200 Subject: [PATCH] Fix: controld-fencing: remove-notifications upon connection-destroy --- daemons/controld/controld_fencing.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c index 92336e9..b925bc5 100644 --- a/daemons/controld/controld_fencing.c +++ b/daemons/controld/controld_fencing.c @@ -403,7 +403,14 @@ tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e) } if (stonith_api) { - stonith_api->state = stonith_disconnected; + /* the client API won't properly reconnect notifications + * if they are still in the table - so remove them + */ + stonith_api->cmds->remove_notification(st, T_STONITH_NOTIFY_DISCONNECT); + stonith_api->cmds->remove_notification(st, T_STONITH_NOTIFY_FENCE); + if (stonith_api->state != stonith_disconnected) { + stonith_api->cmds->disconnect(st); + } } if (AM_I_DC) { -- 1.8.3.1 From 487cdd9e3ec6ab47fde5074acbb2ff564047d59c Mon Sep 17 00:00:00 2001 From: Klaus Wenninger Date: Tue, 18 Jun 2019 14:09:20 +0200 Subject: [PATCH] Feature: fence-history: add notification upon history-synced --- daemons/fenced/fenced_history.c | 5 +++++ daemons/fenced/pacemaker-fenced.c | 3 +++ daemons/fenced/pacemaker-fenced.h | 11 ++++++----- include/crm/stonith-ng.h | 1 + 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c index b65b64c..cd08d74 100644 --- a/daemons/fenced/fenced_history.c +++ b/daemons/fenced/fenced_history.c @@ -420,6 +420,11 @@ stonith_fence_history(xmlNode *msg, xmlNode **output, stonith_fence_history_cleanup(target, crm_element_value(msg, F_STONITH_CALLID) != NULL); } else if (options & st_opt_broadcast) { + /* there is no clear sign atm for when a history sync + is done so send a notification for anything + that smells like history-sync + */ + do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY_SYNCED, 0, NULL); if (crm_element_value(msg, F_STONITH_CALLID)) { /* this is coming from the stonith-API * diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c index 7e9bb07..7a87f93 100644 --- a/daemons/fenced/pacemaker-fenced.c +++ b/daemons/fenced/pacemaker-fenced.c @@ -279,6 +279,9 @@ get_stonith_flag(const char *name) } else if (safe_str_eq(name, T_STONITH_NOTIFY_HISTORY)) { return st_callback_notify_history; + } else if (safe_str_eq(name, T_STONITH_NOTIFY_HISTORY_SYNCED)) { + return st_callback_notify_history_synced; + } return st_callback_unknown; } diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h index a8531a6..583cb47 100644 --- a/daemons/fenced/pacemaker-fenced.h +++ b/daemons/fenced/pacemaker-fenced.h @@ -158,11 +158,12 @@ typedef struct remote_fencing_op_s { void stonith_bcast_result_to_peers(remote_fencing_op_t * op, int rc); enum st_callback_flags { - st_callback_unknown = 0x0000, - st_callback_notify_fence = 0x0001, - st_callback_device_add = 0x0004, - st_callback_device_del = 0x0010, - st_callback_notify_history = 0x0020 + st_callback_unknown = 0x0000, + st_callback_notify_fence = 0x0001, + st_callback_device_add = 0x0004, + st_callback_device_del = 0x0010, + st_callback_notify_history = 0x0020, + st_callback_notify_history_synced = 0x0040 }; /* diff --git a/include/crm/stonith-ng.h b/include/crm/stonith-ng.h index b640732..418a03c 100644 --- a/include/crm/stonith-ng.h +++ b/include/crm/stonith-ng.h @@ -29,6 +29,7 @@ extern "C" { # define T_STONITH_NOTIFY_DISCONNECT "st_notify_disconnect" # define T_STONITH_NOTIFY_FENCE "st_notify_fence" # define T_STONITH_NOTIFY_HISTORY "st_notify_history" +# define T_STONITH_NOTIFY_HISTORY_SYNCED "st_notify_history_synced" /* *INDENT-OFF* */ enum stonith_state { -- 1.8.3.1 From 03c4455fced74f093deb782198b1ba3076e52015 Mon Sep 17 00:00:00 2001 From: Klaus Wenninger Date: Tue, 18 Jun 2019 14:12:27 +0200 Subject: [PATCH] Fix: fence-history: resync fence-history after fenced crash Setting up a 30s fallback timer to trigger history-sync if the sync via DC doesn't happen --- daemons/controld/controld_callbacks.c | 2 +- daemons/controld/controld_control.c | 2 + daemons/controld/controld_fencing.c | 86 ++++++++++++++++++++++++++++++----- daemons/controld/controld_fencing.h | 3 +- 4 files changed, 79 insertions(+), 14 deletions(-) diff --git a/daemons/controld/controld_callbacks.c b/daemons/controld/controld_callbacks.c index 3ce7470..48225ac 100644 --- a/daemons/controld/controld_callbacks.c +++ b/daemons/controld/controld_callbacks.c @@ -211,7 +211,7 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d } else if(AM_I_DC) { if (appeared) { - te_trigger_stonith_history_sync(); + te_trigger_stonith_history_sync(FALSE); } else { erase_status_tag(node->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local); } diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c index e99d605..f3bb20f 100644 --- a/daemons/controld/controld_control.c +++ b/daemons/controld/controld_control.c @@ -259,6 +259,8 @@ crmd_exit(crm_exit_t exit_code) crm_timer_stop(wait_timer); crm_timer_stop(recheck_timer); + te_cleanup_stonith_history_sync(NULL, TRUE); + free(transition_timer); transition_timer = NULL; free(integration_timer); integration_timer = NULL; free(finalization_timer); finalization_timer = NULL; diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c index b925bc5..22fa727 100644 --- a/daemons/controld/controld_fencing.c +++ b/daemons/controld/controld_fencing.c @@ -20,6 +20,9 @@ # include #endif +static void +tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event); + /* * stonith failure counting * @@ -394,6 +397,8 @@ fail_incompletable_stonith(crm_graph_t *graph) static void tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e) { + te_cleanup_stonith_history_sync(st, FALSE); + if (is_set(fsa_input_register, R_ST_REQUIRED)) { crm_crit("Fencing daemon connection failed"); mainloop_set_trigger(stonith_reconnect); @@ -406,11 +411,12 @@ tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e) /* the client API won't properly reconnect notifications * if they are still in the table - so remove them */ - stonith_api->cmds->remove_notification(st, T_STONITH_NOTIFY_DISCONNECT); - stonith_api->cmds->remove_notification(st, T_STONITH_NOTIFY_FENCE); if (stonith_api->state != stonith_disconnected) { stonith_api->cmds->disconnect(st); } + stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT); + stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_FENCE); + stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_HISTORY_SYNCED); } if (AM_I_DC) { @@ -622,7 +628,12 @@ te_connect_stonith(gpointer user_data) stonith_api->cmds->register_notification(stonith_api, T_STONITH_NOTIFY_FENCE, tengine_stonith_notify); + stonith_api->cmds->register_notification(stonith_api, + T_STONITH_NOTIFY_HISTORY_SYNCED, + tengine_stonith_history_synced); + te_trigger_stonith_history_sync(TRUE); } + return TRUE; } @@ -649,7 +660,12 @@ controld_disconnect_fencer(bool destroy) // Prevent fencer connection from coming up again clear_bit(fsa_input_register, R_ST_REQUIRED); - stonith_api->cmds->disconnect(stonith_api); + if (stonith_api->state != stonith_disconnected) { + stonith_api->cmds->disconnect(stonith_api); + } + stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT); + stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_FENCE); + stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_HISTORY_SYNCED); } if (destroy) { if (stonith_api) { @@ -673,6 +689,7 @@ do_stonith_history_sync(gpointer user_data) if (stonith_api && (stonith_api->state != stonith_disconnected)) { stonith_history_t *history = NULL; + te_cleanup_stonith_history_sync(stonith_api, FALSE); stonith_api->cmds->history(stonith_api, st_opt_sync_call | st_opt_broadcast, NULL, &history, 5); @@ -845,7 +862,33 @@ te_fence_node(crm_graph_t *graph, crm_action_t *action) */ static crm_trigger_t *stonith_history_sync_trigger = NULL; -static mainloop_timer_t *stonith_history_sync_timer = NULL; +static mainloop_timer_t *stonith_history_sync_timer_short = NULL; +static mainloop_timer_t *stonith_history_sync_timer_long = NULL; + +void +te_cleanup_stonith_history_sync(stonith_t *st, bool free_timers) +{ + if (free_timers) { + mainloop_timer_del(stonith_history_sync_timer_short); + stonith_history_sync_timer_short = NULL; + mainloop_timer_del(stonith_history_sync_timer_long); + stonith_history_sync_timer_long = NULL; + } else { + mainloop_timer_stop(stonith_history_sync_timer_short); + mainloop_timer_stop(stonith_history_sync_timer_long); + } + + if (st) { + st->cmds->remove_notification(st, T_STONITH_NOTIFY_HISTORY_SYNCED); + } +} + +static void +tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event) +{ + te_cleanup_stonith_history_sync(st, FALSE); + crm_debug("Fence-history synced - cancel all timers"); +} static gboolean stonith_history_sync_set_trigger(gpointer user_data) @@ -855,11 +898,18 @@ stonith_history_sync_set_trigger(gpointer user_data) } void -te_trigger_stonith_history_sync(void) +te_trigger_stonith_history_sync(bool long_timeout) { /* trigger a sync in 5s to give more nodes the * chance to show up so that we don't create * unnecessary stonith-history-sync traffic + * + * the long timeout of 30s is there as a fallback + * so that after a successful connection to fenced + * we will wait for 30s for the DC to trigger a + * history-sync + * if this doesn't happen we trigger a sync locally + * (e.g. fenced segfaults and is restarted by pacemakerd) */ /* as we are finally checking the stonith-connection @@ -873,14 +923,26 @@ te_trigger_stonith_history_sync(void) do_stonith_history_sync, NULL); } - if(stonith_history_sync_timer == NULL) { - stonith_history_sync_timer = - mainloop_timer_add("history_sync", 5000, - FALSE, stonith_history_sync_set_trigger, - NULL); + if (long_timeout) { + if(stonith_history_sync_timer_long == NULL) { + stonith_history_sync_timer_long = + mainloop_timer_add("history_sync_long", 30000, + FALSE, stonith_history_sync_set_trigger, + NULL); + } + crm_info("Fence history will be synchronized cluster-wide within 30 seconds"); + mainloop_timer_start(stonith_history_sync_timer_long); + } else { + if(stonith_history_sync_timer_short == NULL) { + stonith_history_sync_timer_short = + mainloop_timer_add("history_sync_short", 5000, + FALSE, stonith_history_sync_set_trigger, + NULL); + } + crm_info("Fence history will be synchronized cluster-wide within 5 seconds"); + mainloop_timer_start(stonith_history_sync_timer_short); } - crm_info("Fence history will be synchronized cluster-wide within 5 seconds"); - mainloop_timer_start(stonith_history_sync_timer); + } /* end stonith history synchronization functions */ diff --git a/daemons/controld/controld_fencing.h b/daemons/controld/controld_fencing.h index 8f7f19b..2fe6d88 100644 --- a/daemons/controld/controld_fencing.h +++ b/daemons/controld/controld_fencing.h @@ -29,6 +29,7 @@ void purge_stonith_cleanup(void); void execute_stonith_cleanup(void); // stonith history synchronization -void te_trigger_stonith_history_sync(void); +void te_trigger_stonith_history_sync(bool long_timeout); +void te_cleanup_stonith_history_sync(stonith_t *st, bool free_timers); #endif -- 1.8.3.1 From 2b038831edf6dd345c3f39f0fc27cfbf9503f512 Mon Sep 17 00:00:00 2001 From: Klaus Wenninger Date: Tue, 18 Jun 2019 21:54:49 +0200 Subject: [PATCH] Fix: st_client: make safe to remove notifications from notifications While cycling over the notification-list just mark for deletion and delete afterwards. --- lib/fencing/st_client.c | 58 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c index 629887a..ba23ac5 100644 --- a/lib/fencing/st_client.c +++ b/lib/fencing/st_client.c @@ -67,6 +67,8 @@ typedef struct stonith_private_s { mainloop_io_t *source; GHashTable *stonith_op_callback_table; GList *notify_list; + int notify_refcnt; + bool notify_deletes; void (*op_callback) (stonith_t * st, stonith_callback_data_t * data); @@ -77,6 +79,7 @@ typedef struct stonith_notify_client_s { const char *obj_id; /* implement one day */ const char *obj_type; /* implement one day */ void (*notify) (stonith_t * st, stonith_event_t * e); + bool delete; } stonith_notify_client_t; @@ -211,6 +214,38 @@ log_action(stonith_action_t *action, pid_t pid) } } +/* when cycling through the list we don't want to delete items + so just mark them and when we know nobody is using the list + loop over it to remove the marked items + */ +static void +foreach_notify_entry (stonith_private_t *private, + GFunc func, + gpointer user_data) +{ + private->notify_refcnt++; + g_list_foreach(private->notify_list, func, user_data); + private->notify_refcnt--; + if ((private->notify_refcnt == 0) && + private->notify_deletes) { + GList *list_item = private->notify_list; + + private->notify_deletes = FALSE; + while (list_item != NULL) + { + stonith_notify_client_t *list_client = list_item->data; + GList *next = g_list_next(list_item); + + if (list_client->delete) { + free(list_client); + private->notify_list = + g_list_delete_link(private->notify_list, list_item); + } + list_item = next; + } + } +} + static void stonith_connection_destroy(gpointer user_data) { @@ -230,7 +265,7 @@ stonith_connection_destroy(gpointer user_data) crm_xml_add(blob.xml, F_TYPE, T_STONITH_NOTIFY); crm_xml_add(blob.xml, F_SUBTYPE, T_STONITH_NOTIFY_DISCONNECT); - g_list_foreach(native->notify_list, stonith_send_notification, &blob); + foreach_notify_entry(native, stonith_send_notification, &blob); free_xml(blob.xml); } @@ -1140,6 +1175,10 @@ stonithlib_GCompareFunc(gconstpointer a, gconstpointer b) const stonith_notify_client_t *a_client = a; const stonith_notify_client_t *b_client = b; + if (a_client->delete || b_client->delete) { + /* make entries marked for deletion not findable */ + return -1; + } CRM_CHECK(a_client->event != NULL && b_client->event != NULL, return 0); rc = strcmp(a_client->event, b_client->event); if (rc == 0) { @@ -1394,7 +1433,7 @@ stonith_dispatch_internal(const char *buffer, ssize_t length, gpointer userdata) stonith_perform_callback(st, blob.xml, 0, 0); } else if (safe_str_eq(type, T_STONITH_NOTIFY)) { - g_list_foreach(private->notify_list, stonith_send_notification, &blob); + foreach_notify_entry(private, stonith_send_notification, &blob); } else if (safe_str_eq(type, T_STONITH_TIMEOUT_VALUE)) { int call_id = 0; int timeout = 0; @@ -1592,8 +1631,13 @@ stonith_api_del_notification(stonith_t * stonith, const char *event) if (list_item != NULL) { stonith_notify_client_t *list_client = list_item->data; - private->notify_list = g_list_remove(private->notify_list, list_client); - free(list_client); + if (private->notify_refcnt) { + list_client->delete = TRUE; + private->notify_deletes = TRUE; + } else { + private->notify_list = g_list_remove(private->notify_list, list_client); + free(list_client); + } crm_trace("Removed callback"); @@ -1754,6 +1798,10 @@ stonith_send_notification(gpointer data, gpointer user_data) crm_warn("Skipping callback - NULL callback client"); return; + } else if (entry->delete) { + crm_trace("Skipping callback - marked for deletion"); + return; + } else if (entry->notify == NULL) { crm_warn("Skipping callback - NULL callback"); return; @@ -2037,6 +2085,8 @@ stonith_api_new(void) private->stonith_op_callback_table = g_hash_table_new_full(g_direct_hash, g_direct_equal, NULL, stonith_destroy_op_callback); private->notify_list = NULL; + private->notify_refcnt = 0; + private->notify_deletes = FALSE; new_stonith->call_id = 1; new_stonith->state = stonith_disconnected; -- 1.8.3.1 From 03765b7803f935f0db149843a0b90aa9c872d922 Mon Sep 17 00:00:00 2001 From: Klaus Wenninger Date: Fri, 21 Jun 2019 14:13:10 +0200 Subject: [PATCH] Test: CTS: new pattern to identify fenced reconnected Now that we are removing notifications upon disconnect a duplicate notification can't be used as sign for reconnection any more. --- cts/patterns.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cts/patterns.py b/cts/patterns.py index 1b86ee7..8de67b1 100644 --- a/cts/patterns.py +++ b/cts/patterns.py @@ -303,7 +303,7 @@ class crm_corosync(BasePatterns): self.components["pacemaker-fenced"] = [ r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", r"Fencing daemon connection failed", - r"pacemaker-controld.*:\s*warn.*:\s*Callback already present", + r"pacemaker-controld.*Fencer successfully connected", ] self.components["pacemaker-fenced-ignore"] = [ r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", -- 1.8.3.1 From c45c98cd77cb3e0913bcdb18fd6b116c3a25285d Mon Sep 17 00:00:00 2001 From: Klaus Wenninger Date: Fri, 21 Jun 2019 16:40:47 +0200 Subject: [PATCH] Fix: controld-fencing: add notice-log for successful fencer-connect --- daemons/controld/controld_fencing.c | 1 + 1 file changed, 1 insertion(+) diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c index 22fa727..2428168 100644 --- a/daemons/controld/controld_fencing.c +++ b/daemons/controld/controld_fencing.c @@ -632,6 +632,7 @@ te_connect_stonith(gpointer user_data) T_STONITH_NOTIFY_HISTORY_SYNCED, tengine_stonith_history_synced); te_trigger_stonith_history_sync(TRUE); + crm_notice("Fencer successfully connected"); } return TRUE; -- 1.8.3.1