From 0a884f325e1049febc28bf0419ab307dd0bce5af Mon Sep 17 00:00:00 2001 From: Ken Gaillot Date: Thu, 16 May 2019 20:04:57 -0500 Subject: [PATCH] Log: various: improve fencer connection messages Previously, log messages around fencer connections were inconsistent. This attempts to make them more consistent by: having stonith_api_signon() log only at debug level, letting the callers log at a level appropriate to the situation using the return code; functionizing retrying a connection; and using similar wording across clients. This also does a bit of refactoring for better error checking and improved efficiency. --- daemons/controld/controld_control.c | 7 +- daemons/controld/controld_te_utils.c | 59 ++++++----- daemons/execd/pacemaker-execd.c | 28 ++--- daemons/fenced/cts-fence-helper.c | 38 +++---- include/crm/stonith-ng.h | 4 + lib/fencing/st_client.c | 195 ++++++++++++++++++++--------------- tools/crm_mon.c | 1 - tools/stonith_admin.c | 29 +----- 8 files changed, 181 insertions(+), 180 deletions(-) diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c index 89b5b5d..6d9f335 100644 --- a/daemons/controld/controld_control.c +++ b/daemons/controld/controld_control.c @@ -628,10 +628,11 @@ do_started(long long action, register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } + // Try connecting to fencer (retrying later in mainloop if failed) if (stonith_reconnect == NULL) { - int dummy; - - stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW, te_connect_stonith, &dummy); + stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW, + te_connect_stonith, + GINT_TO_POINTER(TRUE)); } set_bit(fsa_input_register, R_ST_REQUIRED); mainloop_set_trigger(stonith_reconnect); diff --git a/daemons/controld/controld_te_utils.c b/daemons/controld/controld_te_utils.c index 5606ed6..22f83ad 100644 --- a/daemons/controld/controld_te_utils.c +++ b/daemons/controld/controld_te_utils.c @@ -1,5 +1,5 @@ /* - * Copyright 2004-2018 Andrew Beekhof + * Copyright 2004-2019 the Pacemaker project contributors * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. @@ -385,10 +385,18 @@ te_trigger_stonith_history_sync(void) mainloop_timer_start(stonith_history_sync_timer); } +/*! + * \brief Connect to fencer + * + * \param[in] user_data If NULL, retry failures now, otherwise retry in main loop + * + * \return TRUE + * \note If user_data is NULL, this will wait 2s between attempts, for up to + * 30 attempts, meaning the controller could be blocked as long as 58s. + */ gboolean te_connect_stonith(gpointer user_data) { - int lpc = 0; int rc = pcmk_ok; if (stonith_api == NULL) { @@ -396,42 +404,41 @@ te_connect_stonith(gpointer user_data) } if (stonith_api->state != stonith_disconnected) { - crm_trace("Still connected"); + crm_trace("Already connected to fencer, no need to retry"); return TRUE; } - for (lpc = 0; lpc < 30; lpc++) { - crm_debug("Attempting connection to fencing daemon..."); - - sleep(1); - rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL); - - if (rc == pcmk_ok) { - break; + if (user_data == NULL) { + // Blocking (retry failures now until successful) + rc = stonith_api_connect_retry(stonith_api, crm_system_name, 30); + if (rc != pcmk_ok) { + crm_err("Could not connect to fencer in 30 attempts: %s " + CRM_XS " rc=%d", pcmk_strerror(rc), rc); } - - if (user_data != NULL) { + } else { + // Non-blocking (retry failures later in main loop) + rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL); + if (rc != pcmk_ok) { if (is_set(fsa_input_register, R_ST_REQUIRED)) { - crm_err("Sign-in failed: triggered a retry"); + crm_err("Fencer connection failed (will retry): %s " + CRM_XS " rc=%d", pcmk_strerror(rc), rc); mainloop_set_trigger(stonith_reconnect); } else { - crm_info("Sign-in failed, but no longer required"); + crm_info("Fencer connection failed (ignoring because no longer required): %s " + CRM_XS " rc=%d", pcmk_strerror(rc), rc); } return TRUE; } - - crm_err("Sign-in failed: pausing and trying again in 2s..."); - sleep(1); } - CRM_CHECK(rc == pcmk_ok, return TRUE); /* If not, we failed 30 times... just get out */ - stonith_api->cmds->register_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT, - tengine_stonith_connection_destroy); - - stonith_api->cmds->register_notification(stonith_api, T_STONITH_NOTIFY_FENCE, - tengine_stonith_notify); - - crm_trace("Connected"); + if (rc == pcmk_ok) { + stonith_api->cmds->register_notification(stonith_api, + T_STONITH_NOTIFY_DISCONNECT, + tengine_stonith_connection_destroy); + stonith_api->cmds->register_notification(stonith_api, + T_STONITH_NOTIFY_FENCE, + tengine_stonith_notify); + } return TRUE; } diff --git a/daemons/execd/pacemaker-execd.c b/daemons/execd/pacemaker-execd.c index 21bb0ed..e2fdfca 100644 --- a/daemons/execd/pacemaker-execd.c +++ b/daemons/execd/pacemaker-execd.c @@ -65,28 +65,20 @@ get_stonith_connection(void) stonith_api = NULL; } - if (!stonith_api) { - int rc = 0; - int tries = 10; + if (stonith_api == NULL) { + int rc = pcmk_ok; stonith_api = stonith_api_new(); - do { - rc = stonith_api->cmds->connect(stonith_api, "pacemaker-execd", NULL); - if (rc == pcmk_ok) { - stonith_api->cmds->register_notification(stonith_api, - T_STONITH_NOTIFY_DISCONNECT, - stonith_connection_destroy_cb); - break; - } - sleep(1); - tries--; - } while (tries); - - if (rc) { - crm_err("Unable to connect to stonith daemon to execute command. error: %s", - pcmk_strerror(rc)); + rc = stonith_api_connect_retry(stonith_api, crm_system_name, 10); + if (rc != pcmk_ok) { + crm_err("Could not connect to fencer in 10 attempts: %s " + CRM_XS " rc=%d", pcmk_strerror(rc), rc); stonith_api_delete(stonith_api); stonith_api = NULL; + } else { + stonith_api->cmds->register_notification(stonith_api, + T_STONITH_NOTIFY_DISCONNECT, + stonith_connection_destroy_cb); } } return stonith_api; diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c index c5ce1ab..4552fc1 100644 --- a/daemons/fenced/cts-fence-helper.c +++ b/daemons/fenced/cts-fence-helper.c @@ -1,5 +1,5 @@ /* - * Copyright 2009-2018 Andrew Beekhof + * Copyright 2009-2019 the Pacemaker project contributors * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. @@ -124,8 +124,10 @@ passive_test(void) int rc = 0; rc = st->cmds->connect(st, crm_system_name, &pollfd.fd); - crm_debug("Connect: %d", rc); - + if (rc != pcmk_ok) { + stonith_api_delete(st); + crm_exit(CRM_EX_DISCONNECT); + } st->cmds->register_notification(st, T_STONITH_NOTIFY_DISCONNECT, st_callback); st->cmds->register_notification(st, T_STONITH_NOTIFY_FENCE, st_callback); st->cmds->register_notification(st, STONITH_OP_DEVICE_ADD, st_callback); @@ -271,8 +273,10 @@ sanity_tests(void) int rc = 0; rc = st->cmds->connect(st, crm_system_name, &pollfd.fd); - crm_debug("Connect: %d", rc); - + if (rc != pcmk_ok) { + stonith_api_delete(st); + crm_exit(CRM_EX_DISCONNECT); + } st->cmds->register_notification(st, T_STONITH_NOTIFY_DISCONNECT, st_callback); st->cmds->register_notification(st, T_STONITH_NOTIFY_FENCE, st_callback); st->cmds->register_notification(st, STONITH_OP_DEVICE_ADD, st_callback); @@ -295,7 +299,10 @@ standard_dev_test(void) stonith_key_value_t *params = NULL; rc = st->cmds->connect(st, crm_system_name, &pollfd.fd); - crm_debug("Connect: %d", rc); + if (rc != pcmk_ok) { + stonith_api_delete(st); + crm_exit(CRM_EX_DISCONNECT); + } params = stonith_key_value_add(params, "pcmk_host_map", "some-host=pcmk-7 true_1_node1=3,4"); @@ -502,23 +509,12 @@ test_register_async_devices(int check_event) static void try_mainloop_connect(int check_event) { - int tries = 10; - int i = 0; - int rc = 0; + int rc = stonith_api_connect_retry(st, crm_system_name, 10); - for (i = 0; i < tries; i++) { - rc = st->cmds->connect(st, crm_system_name, NULL); - - if (!rc) { - crm_info("stonith client connection established"); - mainloop_test_done(TRUE); - return; - } else { - crm_info("stonith client connection failed"); - } - sleep(1); + if (rc == pcmk_ok) { + mainloop_test_done(TRUE); + return; } - crm_err("API CONNECTION FAILURE"); mainloop_test_done(FALSE); } diff --git a/include/crm/stonith-ng.h b/include/crm/stonith-ng.h index b7365a9..b640732 100644 --- a/include/crm/stonith-ng.h +++ b/include/crm/stonith-ng.h @@ -430,6 +430,10 @@ void stonith_key_value_freeall(stonith_key_value_t * kvp, int keys, int values); void stonith_history_free(stonith_history_t *history); +// Convenience functions +int stonith_api_connect_retry(stonith_t *st, const char *name, + int max_attempts); + /* Basic helpers that allows nodes to be fenced and the history to be * queried without mainloop or the caller understanding the full API * diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c index 270ef8d..ceee944 100644 --- a/lib/fencing/st_client.c +++ b/lib/fencing/st_client.c @@ -1,5 +1,5 @@ /* - * Copyright 2004-2018 Andrew Beekhof + * Copyright 2004-2019 the Pacemaker project contributors * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. @@ -1415,14 +1415,21 @@ static int stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd) { int rc = pcmk_ok; - stonith_private_t *native = stonith->st_private; + stonith_private_t *native = NULL; + const char *display_name = name? name : "client"; static struct ipc_client_callbacks st_callbacks = { .dispatch = stonith_dispatch_internal, .destroy = stonith_connection_destroy }; - crm_trace("Connecting command channel"); + CRM_CHECK(stonith != NULL, return -EINVAL); + + native = stonith->st_private; + CRM_ASSERT(native != NULL); + + crm_debug("Attempting fencer connection by %s with%s mainloop", + display_name, (stonith_fd? "out" : "")); stonith->state = stonith_connected_command; if (stonith_fd) { @@ -1432,8 +1439,9 @@ stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd) if (native->ipc && crm_ipc_connect(native->ipc)) { *stonith_fd = crm_ipc_get_fd(native->ipc); } else if (native->ipc) { - crm_perror(LOG_ERR, "Connection to fencer failed"); - rc = -ENOTCONN; + crm_ipc_close(native->ipc); + crm_ipc_destroy(native->ipc); + native->ipc = NULL; } } else { @@ -1444,11 +1452,8 @@ stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd) } if (native->ipc == NULL) { - crm_debug("Could not connect to the Stonith API"); rc = -ENOTCONN; - } - - if (rc == pcmk_ok) { + } else { xmlNode *reply = NULL; xmlNode *hello = create_xml_node(NULL, "stonith_command"); @@ -1458,11 +1463,12 @@ stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd) rc = crm_ipc_send(native->ipc, hello, crm_ipc_client_response, -1, &reply); if (rc < 0) { - crm_perror(LOG_DEBUG, "Couldn't complete registration with the fencing API: %d", rc); + crm_debug("Couldn't register with the fencer: %s " + CRM_XS " rc=%d", pcmk_strerror(rc), rc); rc = -ECOMM; } else if (reply == NULL) { - crm_err("Did not receive registration reply"); + crm_debug("Couldn't register with the fencer: no reply"); rc = -EPROTO; } else { @@ -1470,18 +1476,23 @@ stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd) const char *tmp_ticket = crm_element_value(reply, F_STONITH_CLIENTID); if (safe_str_neq(msg_type, CRM_OP_REGISTER)) { - crm_err("Invalid registration message: %s", msg_type); - crm_log_xml_err(reply, "Bad reply"); + crm_debug("Couldn't register with the fencer: invalid reply type '%s'", + (msg_type? msg_type : "(missing)")); + crm_log_xml_debug(reply, "Invalid fencer reply"); rc = -EPROTO; } else if (tmp_ticket == NULL) { - crm_err("No registration token provided"); - crm_log_xml_err(reply, "Bad reply"); + crm_debug("Couldn't register with the fencer: no token in reply"); + crm_log_xml_debug(reply, "Invalid fencer reply"); rc = -EPROTO; } else { - crm_trace("Obtained registration token: %s", tmp_ticket); native->token = strdup(tmp_ticket); +#if HAVE_MSGFROMIPC_TIMEOUT + stonith->call_timeout = MAX_IPC_DELAY; +#endif + crm_debug("Connection to fencer by %s succeeded (registration token: %s)", + display_name, native->token); rc = pcmk_ok; } } @@ -1490,16 +1501,11 @@ stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd) free_xml(hello); } - if (rc == pcmk_ok) { -#if HAVE_MSGFROMIPC_TIMEOUT - stonith->call_timeout = MAX_IPC_DELAY; -#endif - crm_debug("Connection to fencer successful"); - return pcmk_ok; + if (rc != pcmk_ok) { + crm_debug("Connection attempt to fencer by %s failed: %s " + CRM_XS " rc=%d", display_name, pcmk_strerror(rc), rc); + stonith->cmds->disconnect(stonith); } - - crm_debug("Connection to fencer failed: %s", pcmk_strerror(rc)); - stonith->cmds->disconnect(stonith); return rc; } @@ -2071,6 +2077,36 @@ stonith_api_new(void) return new_stonith; } +/*! + * \brief Make a blocking connection attempt to the fencer + * + * \param[in,out] st Fencer API object + * \param[in] name Client name to use with fencer + * \param[in] max_attempts Return error if this many attempts fail + * + * \return pcmk_ok on success, result of last attempt otherwise + */ +int +stonith_api_connect_retry(stonith_t *st, const char *name, int max_attempts) +{ + int rc = -EINVAL; // if max_attempts is not positive + + for (int attempt = 1; attempt <= max_attempts; attempt++) { + rc = st->cmds->connect(st, name, NULL); + if (rc == pcmk_ok) { + return pcmk_ok; + } else if (attempt < max_attempts) { + crm_notice("Fencer connection attempt %d of %d failed (retrying in 2s): %s " + CRM_XS " rc=%d", + attempt, max_attempts, pcmk_strerror(rc), rc); + sleep(2); + } + } + crm_notice("Could not connect to fencer: %s " CRM_XS " rc=%d", + pcmk_strerror(rc), rc); + return rc; +} + stonith_key_value_t * stonith_key_value_add(stonith_key_value_t * head, const char *key, const char *value) { @@ -2122,85 +2158,78 @@ stonith_key_value_freeall(stonith_key_value_t * head, int keys, int values) int stonith_api_kick(uint32_t nodeid, const char *uname, int timeout, bool off) { - char *name = NULL; - const char *action = "reboot"; - - int rc = -EPROTO; - stonith_t *st = NULL; - enum stonith_call_options opts = st_opt_sync_call | st_opt_allow_suicide; + int rc = pcmk_ok; + stonith_t *st = stonith_api_new(); + const char *action = off? "off" : "reboot"; api_log_open(); - st = stonith_api_new(); - if (st) { - rc = st->cmds->connect(st, "stonith-api", NULL); - if(rc != pcmk_ok) { - api_log(LOG_ERR, "Connection failed, could not kick (%s) node %u/%s : %s (%d)", action, nodeid, uname, pcmk_strerror(rc), rc); - } + if (st == NULL) { + api_log(LOG_ERR, "API initialization failed, could not kick (%s) node %u/%s", + action, nodeid, uname); + return -EPROTO; } - if (uname != NULL) { - name = strdup(uname); - - } else if (nodeid > 0) { - opts |= st_opt_cs_nodeid; - name = crm_itoa(nodeid); - } - - if (off) { - action = "off"; - } - - if (rc == pcmk_ok) { + rc = st->cmds->connect(st, "stonith-api", NULL); + if (rc != pcmk_ok) { + api_log(LOG_ERR, "Connection failed, could not kick (%s) node %u/%s : %s (%d)", + action, nodeid, uname, pcmk_strerror(rc), rc); + } else { + char *name = NULL; + enum stonith_call_options opts = st_opt_sync_call | st_opt_allow_suicide; + + if (uname != NULL) { + name = strdup(uname); + } else if (nodeid > 0) { + opts |= st_opt_cs_nodeid; + name = crm_itoa(nodeid); + } rc = st->cmds->fence(st, opts, name, action, timeout, 0); - if(rc != pcmk_ok) { - api_log(LOG_ERR, "Could not kick (%s) node %u/%s : %s (%d)", action, nodeid, uname, pcmk_strerror(rc), rc); + free(name); + + if (rc != pcmk_ok) { + api_log(LOG_ERR, "Could not kick (%s) node %u/%s : %s (%d)", + action, nodeid, uname, pcmk_strerror(rc), rc); } else { - api_log(LOG_NOTICE, "Node %u/%s kicked: %s ", nodeid, uname, action); + api_log(LOG_NOTICE, "Node %u/%s kicked: %s", nodeid, uname, action); } } - if (st) { - st->cmds->disconnect(st); - stonith_api_delete(st); - } - - free(name); + stonith_api_delete(st); return rc; } time_t stonith_api_time(uint32_t nodeid, const char *uname, bool in_progress) { - int rc = 0; - char *name = NULL; - + int rc = pcmk_ok; time_t when = 0; - stonith_t *st = NULL; + stonith_t *st = stonith_api_new(); stonith_history_t *history = NULL, *hp = NULL; - enum stonith_call_options opts = st_opt_sync_call; - - st = stonith_api_new(); - if (st) { - rc = st->cmds->connect(st, "stonith-api", NULL); - if(rc != pcmk_ok) { - api_log(LOG_NOTICE, "Connection failed: %s (%d)", pcmk_strerror(rc), rc); - } - } - - if (uname != NULL) { - name = strdup(uname); - } else if (nodeid > 0) { - opts |= st_opt_cs_nodeid; - name = crm_itoa(nodeid); + if (st == NULL) { + api_log(LOG_ERR, "Could not retrieve fence history for %u/%s: " + "API initialization failed", nodeid, uname); + return when; } - if (st && rc == pcmk_ok) { + rc = st->cmds->connect(st, "stonith-api", NULL); + if (rc != pcmk_ok) { + api_log(LOG_NOTICE, "Connection failed: %s (%d)", pcmk_strerror(rc), rc); + } else { int entries = 0; int progress = 0; int completed = 0; - + char *name = NULL; + enum stonith_call_options opts = st_opt_sync_call; + + if (uname != NULL) { + name = strdup(uname); + } else if (nodeid > 0) { + opts |= st_opt_cs_nodeid; + name = crm_itoa(nodeid); + } rc = st->cmds->history(st, opts, name, &history, 120); + free(name); for (hp = history; hp; hp = hp->next) { entries++; @@ -2227,15 +2256,11 @@ stonith_api_time(uint32_t nodeid, const char *uname, bool in_progress) } } - if (st) { - st->cmds->disconnect(st); - stonith_api_delete(st); - } + stonith_api_delete(st); if(when) { api_log(LOG_INFO, "Node %u/%s last kicked at: %ld", nodeid, uname, (long int)when); } - free(name); return when; } diff --git a/tools/crm_mon.c b/tools/crm_mon.c index e101b62..bed0796 100644 --- a/tools/crm_mon.c +++ b/tools/crm_mon.c @@ -298,7 +298,6 @@ cib_connect(gboolean full) } if ((fence_connect) && (st->state == stonith_disconnected)) { - crm_trace("Connecting to stonith"); rc = st->cmds->connect(st, crm_system_name, NULL); if (rc == pcmk_ok) { crm_trace("Setting up stonith callbacks"); diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c index d960fb1..6be66c6 100644 --- a/tools/stonith_admin.c +++ b/tools/stonith_admin.c @@ -198,31 +198,6 @@ struct { int rc; } async_fence_data; -static int -try_mainloop_connect(void) -{ - stonith_t *st = async_fence_data.st; - int tries = 10; - int i = 0; - int rc = 0; - - for (i = 0; i < tries; i++) { - crm_debug("Connecting as %s", async_fence_data.name); - rc = st->cmds->connect(st, async_fence_data.name, NULL); - - if (!rc) { - crm_debug("stonith client connection established"); - return 0; - } else { - crm_debug("stonith client connection failed"); - } - sleep(1); - } - - crm_err("Could not connect to the fencer"); - return -1; -} - static void notify_callback(stonith_t * st, stonith_event_t * e) { @@ -251,8 +226,10 @@ async_fence_helper(gpointer user_data) { stonith_t *st = async_fence_data.st; int call_id = 0; + int rc = stonith_api_connect_retry(st, async_fence_data.name, 10); - if (try_mainloop_connect()) { + if (rc != pcmk_ok) { + fprintf(stderr, "Could not connect to fencer: %s\n", pcmk_strerror(rc)); g_main_loop_quit(mainloop); return TRUE; } -- 1.8.3.1