commit d9b95435059189843e1fb7b1f7530fc163fdfc13 Author: David Vossel Date: Wed Sep 25 17:02:50 2013 -0400 properly set remote node attributes diff --git a/crmd/lrm.c b/crmd/lrm.c index 0254a9f..7157e24 100644 --- a/crmd/lrm.c +++ b/crmd/lrm.c @@ -367,7 +367,7 @@ lrm_state_verify_stopped(lrm_state_t * lrm_state, enum crmd_fsa_state cur_state, } if (counter > 0) { - do_crm_log(log_level, "%d pending LRM operations at %s%s", counter, when); + do_crm_log(log_level, "%d pending LRM operations at %s", counter, when); if (cur_state == S_TERMINATE || !is_set(fsa_input_register, R_SENT_RSC_STOP)) { g_hash_table_iter_init(&gIter, lrm_state->pending_ops); diff --git a/crmd/membership.c b/crmd/membership.c index 370d1a2..e2bcd45 100644 --- a/crmd/membership.c +++ b/crmd/membership.c @@ -260,6 +260,13 @@ populate_cib_nodes(enum node_update_flags flags, const char *source) do_update_node_cib(node, flags, node_list, source); } + if (crm_remote_peer_cache) { + g_hash_table_iter_init(&iter, crm_remote_peer_cache); + while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { + do_update_node_cib(node, flags, node_list, source); + } + } + fsa_cib_update(XML_CIB_TAG_STATUS, node_list, call_options, call_id, NULL); fsa_register_cib_callback(call_id, FALSE, NULL, crmd_node_update_complete); last_peer_update = call_id; diff --git a/crmd/messages.c b/crmd/messages.c index 9aa69cc..057383a 100644 --- a/crmd/messages.c +++ b/crmd/messages.c @@ -930,7 +930,7 @@ send_msg_via_ipc(xmlNode * msg, const char *sys) crmd_proxy_send(sys, msg); } else { - crm_err("Unknown Sub-system (%s)... discarding message.", crm_str(sys)); + crm_debug("Unknown Sub-system (%s)... 
discarding message.", crm_str(sys)); send_ok = FALSE; } diff --git a/crmd/pengine.c b/crmd/pengine.c index 5546d7e..2f3eba8 100644 --- a/crmd/pengine.c +++ b/crmd/pengine.c @@ -271,6 +271,9 @@ do_pe_invoke_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void CRM_LOG_ASSERT(output != NULL); + /* refresh our remote-node cache when the pengine is invoked */ + crm_remote_peer_cache_refresh(output); + crm_xml_add(output, XML_ATTR_DC_UUID, fsa_our_uuid); crm_xml_add_int(output, XML_ATTR_HAVE_QUORUM, fsa_has_quorum); diff --git a/crmd/remote_lrmd_ra.c b/crmd/remote_lrmd_ra.c index d38d7f0..42ea043 100644 --- a/crmd/remote_lrmd_ra.c +++ b/crmd/remote_lrmd_ra.c @@ -333,7 +333,10 @@ remote_lrm_op_callback(lrmd_event_data_t * op) } else { /* make sure we have a clean status section to start with */ + lrm_state_reset_tables(lrm_state); remote_init_cib_status(lrm_state); + erase_status_tag(lrm_state->node_name, XML_CIB_TAG_LRM, cib_scope_local); + erase_status_tag(lrm_state->node_name, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local); cmd->rc = PCMK_EXECRA_OK; cmd->op_status = PCMK_LRM_OP_DONE; @@ -430,15 +433,6 @@ handle_remote_ra_exec(gpointer user_data) g_list_free_1(first); if (!strcmp(cmd->action, "start") || !strcmp(cmd->action, "migrate_from")) { - xmlNode *status = create_xml_node(NULL, XML_CIB_TAG_STATE); - - /* clear node status in cib */ - crm_xml_add(status, XML_ATTR_ID, lrm_state->node_name); - lrm_state_reset_tables(lrm_state); - fsa_cib_delete(XML_CIB_TAG_STATUS, status, cib_quorum_override, rc, NULL); - crm_info("Forced a remote LRM refresh before connection start: call=%d", rc); - crm_log_xml_trace(status, "CLEAR LRM"); - free_xml(status); rc = handle_remote_ra_start(lrm_state, cmd, cmd->timeout); if (rc == 0) { diff --git a/crmd/te_utils.c b/crmd/te_utils.c index 54fae04..239af63 100644 --- a/crmd/te_utils.c +++ b/crmd/te_utils.c @@ -390,16 +390,7 @@ abort_transition_graph(int abort_priority, enum transition_action abort_action, if 
(safe_str_eq(XML_CIB_TAG_STATE, kind) || safe_str_eq(XML_CIB_TAG_NODE, kind)) { - if (crm_is_true(crm_element_value(search, XML_NODE_IS_REMOTE))) { - /* Remote node uname and uuids are the same. - * We also don't want them to be present in the - * peer cache, so we shouldn't look them up with - * crm_peer_uname() - */ - uname = ID(search); - } else { - uname = crm_peer_uname(ID(search)); - } + uname = crm_peer_uname(ID(search)); break; } search = search->parent; diff --git a/include/crm/cluster.h b/include/crm/cluster.h index 54b7f58..960c3d0 100644 --- a/include/crm/cluster.h +++ b/include/crm/cluster.h @@ -32,6 +32,7 @@ extern gboolean crm_have_quorum; extern GHashTable *crm_peer_cache; +extern GHashTable *crm_remote_peer_cache; extern unsigned long long crm_peer_seq; # ifndef CRM_SERVICE @@ -55,12 +56,16 @@ enum crm_join_phase }; /* *INDENT-ON* */ +enum crm_node_flags +{ + crm_remote_node = 0x0001, +}; typedef struct crm_peer_node_s { uint32_t id; /* Only used by corosync derivatives */ uint64_t born; /* Only used by heartbeat and the legacy plugin */ uint64_t last_seen; - uint64_t flags; /* Unused, but might be a good place to specify 'remote' */ + uint64_t flags; /* Specified by crm_node_flags enum */ int32_t votes; /* Only used by the legacy plugin */ uint32_t processes; @@ -124,11 +129,25 @@ enum crm_ais_msg_types { crm_msg_pe = 8, crm_msg_stonith_ng = 9, }; + +/* used with crm_get_peer_full */ +enum crm_get_peer_flags { + CRM_GET_PEER_CLUSTER = 0x0001, + CRM_GET_PEER_REMOTE = 0x0002, +}; /* *INDENT-ON* */ gboolean send_cluster_message(crm_node_t * node, enum crm_ais_msg_types service, xmlNode * data, gboolean ordered); + +/* Initialize and refresh the remote peer cache from a cib config */ +void crm_remote_peer_cache_refresh(xmlNode *cib); + +/* allows filtering of remote and cluster nodes using crm_get_peer_flags */ +crm_node_t *crm_get_peer_full(unsigned int id, const char *uname, int flags); + +/* only searches cluster nodes */ crm_node_t 
*crm_get_peer(unsigned int id, const char *uname); guint crm_active_peers(void); diff --git a/lib/cib/cib_attrs.c b/lib/cib/cib_attrs.c index 4af077c..d1e1b74 100644 --- a/lib/cib/cib_attrs.c +++ b/lib/cib/cib_attrs.c @@ -430,6 +430,8 @@ get_remote_node_uuid(cib_t * the_cib, const char *uname, char **uuid) cib_sync_call | cib_scope_local | cib_xpath, NULL); free(xpath_string); free(xml_search); + xml_search = NULL; + xpath_string = NULL; if (rc != pcmk_ok) { len = strlen(REMOTE_NODE_XPATH2) + strlen(uname) + 1; diff --git a/lib/cluster/cluster.c b/lib/cluster/cluster.c index 5820c8d..5b743f9 100644 --- a/lib/cluster/cluster.c +++ b/lib/cluster/cluster.c @@ -361,6 +361,11 @@ crm_peer_uname(const char *uuid) CRM_CHECK(uuid != NULL, return NULL); + /* remote nodes have the same uname and uuid */ + if (g_hash_table_lookup(crm_remote_peer_cache, uuid)) { + return uuid; + } + /* avoid blocking calls where possible */ g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { diff --git a/lib/cluster/membership.c b/lib/cluster/membership.c index bc1684e..e3082b4 100644 --- a/lib/cluster/membership.c +++ b/lib/cluster/membership.c @@ -33,9 +33,56 @@ #include GHashTable *crm_peer_cache = NULL; +GHashTable *crm_remote_peer_cache = NULL; unsigned long long crm_peer_seq = 0; gboolean crm_have_quorum = FALSE; +static void +remote_cache_refresh_helper(xmlNode *cib, const char *xpath, const char *field, int flags) +{ + const char *remote = NULL; + crm_node_t *node = NULL; + xmlXPathObjectPtr xpathObj = NULL; + int max = 0; + int lpc = 0; + + xpathObj = xpath_search(cib, xpath); + max = numXpathResults(xpathObj); + for (lpc = 0; lpc < max; lpc++) { + xmlNode *xml = getXpathResult(xpathObj, lpc); + + CRM_CHECK(xml != NULL, continue); + + remote = crm_element_value(xml, field); + if (remote) { + crm_trace("added %s to remote cache", remote); + node = calloc(1, sizeof(crm_node_t)); + CRM_ASSERT(node); /* check the allocation before the first write through it */ + node->flags = flags; + 
node->uname = strdup(remote); + node->uuid = strdup(remote); + node->state = strdup(CRM_NODE_MEMBER); + g_hash_table_replace(crm_remote_peer_cache, node->uname, node); + } + } + freeXpathObject(xpathObj); +} + +void crm_remote_peer_cache_refresh(xmlNode *cib) +{ + const char *xpath = NULL; + + g_hash_table_remove_all(crm_remote_peer_cache); + + /* remote nodes associated with a cluster resource */ + xpath = "//" XML_TAG_CIB "//" XML_CIB_TAG_CONFIGURATION "//" XML_CIB_TAG_RESOURCE "//" XML_TAG_META_SETS "//" XML_CIB_TAG_NVPAIR "[@name='remote-node']"; + remote_cache_refresh_helper(cib, xpath, "value", crm_remote_node); + + /* remote nodes seen in the status section */ + xpath = "//" XML_TAG_CIB "//" XML_CIB_TAG_STATUS "//" XML_CIB_TAG_STATE "[@remote_node='true']"; + remote_cache_refresh_helper(cib, xpath, "id", crm_remote_node); +} + gboolean crm_is_peer_active(const crm_node_t * node) { @@ -146,6 +193,10 @@ crm_peer_init(void) if (crm_peer_cache == NULL) { crm_peer_cache = g_hash_table_new_full(crm_str_hash, g_str_equal, free, destroy_crm_node); } + + if (crm_remote_peer_cache == NULL) { + crm_remote_peer_cache = g_hash_table_new_full(crm_str_hash, g_str_equal, NULL, destroy_crm_node); + } } void @@ -156,6 +207,12 @@ crm_peer_destroy(void) g_hash_table_destroy(crm_peer_cache); crm_peer_cache = NULL; } + + if (crm_remote_peer_cache != NULL) { + crm_trace("Destroying remote peer cache with %d members", g_hash_table_size(crm_remote_peer_cache)); + g_hash_table_destroy(crm_remote_peer_cache); + crm_remote_peer_cache = NULL; + } } void (*crm_status_callback) (enum crm_status_type, crm_node_t *, const void *) = NULL; @@ -186,6 +243,25 @@ static gboolean crm_hash_find_by_data(gpointer key, gpointer value, gpointer use return FALSE; } +crm_node_t * +crm_get_peer_full(unsigned int id, const char *uname, int flags) +{ + crm_node_t *node = NULL; + + CRM_ASSERT(id > 0 || uname != NULL); + + crm_peer_init(); + + if (uname != NULL && (flags & CRM_GET_PEER_REMOTE)) { /* remote cache is keyed by uname; id-only lookups skip it */ + node = 
g_hash_table_lookup(crm_remote_peer_cache, uname); + } + + if (node == NULL && (flags & CRM_GET_PEER_CLUSTER)) { + node = crm_get_peer(id, uname); + } + return node; +} + /* coverity[-alloc] Memory is referenced in one or both hashtables */ crm_node_t * crm_get_peer(unsigned int id, const char *uname) diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c index bdaf18c..e577293 100644 --- a/lib/lrmd/lrmd_client.c +++ b/lib/lrmd/lrmd_client.c @@ -89,6 +89,11 @@ typedef struct lrmd_private_s { gnutls_psk_client_credentials_t psk_cred_c; int sock; + /* since tls requires a round trip across the network for a + * request/reply, there are times where we just want to be able + * to send a request from the client and not wait around (or even care + * about) what the reply is. */ + int expected_late_replies; GList *pending_notify; crm_trigger_t *process_notify; #endif @@ -241,9 +246,7 @@ lrmd_dispatch_internal(lrmd_t * lrmd, xmlNode * msg) /* this is proxy business */ lrmd_internal_proxy_dispatch(lrmd, msg); return 1; - } - - if (!native->callback) { + } else if (!native->callback) { /* no callback set */ crm_trace("notify event received but client has not set callback"); return 1; @@ -371,7 +374,19 @@ lrmd_tls_dispatch(gpointer userdata) xml = crm_remote_parse_buffer(native->remote); } while (xml) { - lrmd_dispatch_internal(lrmd, xml); + const char *msg_type = crm_element_value(xml, F_LRMD_REMOTE_MSG_TYPE); + if (safe_str_eq(msg_type, "notify")) { + lrmd_dispatch_internal(lrmd, xml); + } else if (safe_str_eq(msg_type, "reply")) { + if (native->expected_late_replies > 0) { + native->expected_late_replies--; + } else { + int reply_id = 0; + crm_element_value_int(xml, F_LRMD_CALLID, &reply_id); + /* if this happens, we want to know about it */ + crm_err("Got outdated reply %d", reply_id); + } + } free_xml(xml); xml = crm_remote_parse_buffer(native->remote); } @@ -617,7 +632,11 @@ lrmd_tls_recv_reply(lrmd_t * lrmd, int total_timeout, int expected_reply_id, int 
free_xml(xml); xml = NULL; } else if (reply_id != expected_reply_id) { - crm_err("Got outdated reply, expected id %d got id %d", expected_reply_id, reply_id); + if (native->expected_late_replies > 0) { + native->expected_late_replies--; + } else { + crm_err("Got outdated reply, expected id %d got id %d", expected_reply_id, reply_id); + } free_xml(xml); xml = NULL; } @@ -724,6 +743,12 @@ lrmd_send_xml_no_reply(lrmd_t * lrmd, xmlNode * msg) #ifdef HAVE_GNUTLS_GNUTLS_H case CRM_CLIENT_TLS: rc = lrmd_tls_send(lrmd, msg); + if (rc == pcmk_ok) { + /* we don't want to wait around for the reply, but + * since the request/reply protocol needs to behave the same + * as libqb, a reply will eventually come later anyway. */ + native->expected_late_replies++; + } break; #endif default: diff --git a/lrmd/ipc_proxy.c b/lrmd/ipc_proxy.c index bbf9b24..3a51a5b 100644 --- a/lrmd/ipc_proxy.c +++ b/lrmd/ipc_proxy.c @@ -313,6 +313,8 @@ ipc_proxy_remove_provider(crm_client_t *ipc_proxy) GHashTableIter iter; crm_client_t *ipc_client = NULL; char *key = NULL; + GList *remove_these = NULL; + GListPtr gIter = NULL; if (ipc_providers == NULL) { return; @@ -326,9 +328,19 @@ ipc_proxy_remove_provider(crm_client_t *ipc_proxy) if (safe_str_eq(proxy_id, ipc_proxy->id)) { crm_info("ipc proxy connection for client %s pid %d destroyed because cluster node disconnected.", ipc_client->id, ipc_client->pid); - qb_ipcs_disconnect(ipc_client->ipcs); + /* we can't remove during the iteration, so copy items + * to a list we can destroy later */ + remove_these = g_list_append(remove_these, ipc_client); } } + + for (gIter = remove_these; gIter != NULL; gIter = gIter->next) { + ipc_client = gIter->data; + qb_ipcs_disconnect(ipc_client->ipcs); + } + + /* just frees the list, not the elements in the list */ + g_list_free(remove_these); } void diff --git a/lrmd/lrmd.c b/lrmd/lrmd.c index a4747cb..b5bbea0 100644 --- a/lrmd/lrmd.c +++ b/lrmd/lrmd.c @@ -1297,6 +1297,7 @@ process_lrmd_message(crm_client_t * client, 
uint32_t id, xmlNode * request) do_reply = 1; } else if (crm_str_eq(op, LRMD_OP_POKE, TRUE)) { do_notify = 1; + do_reply = 1; } else { rc = -EOPNOTSUPP; do_reply = 1; diff --git a/pengine/allocate.c b/pengine/allocate.c index cf8f4d4..bfa8e7b 100644 --- a/pengine/allocate.c +++ b/pengine/allocate.c @@ -1578,6 +1578,8 @@ apply_remote_node_ordering(pe_working_set_t *data_set) container = remote_rsc->container; if (safe_str_eq(action->task, "monitor") || safe_str_eq(action->task, "start") || + safe_str_eq(action->task, "promote") || + safe_str_eq(action->task, "demote") || safe_str_eq(action->task, CRM_OP_LRM_REFRESH) || safe_str_eq(action->task, CRM_OP_CLEAR_FAILCOUNT) || safe_str_eq(action->task, "delete")) {