Blame SOURCES/bz773464-9-totemsrp-Improve-logging-of-left-down-nodes.patch

f1cb04
From ab8942f6260fde93824ed2a18e09e572b59ceb25 Mon Sep 17 00:00:00 2001
f1cb04
From: Christine Caulfield <ccaulfie@redhat.com>
f1cb04
Date: Fri, 12 Jun 2015 16:16:45 +0100
f1cb04
Subject: [PATCH] totemsrp: Improve logging of left/down nodes
f1cb04
f1cb04
This patch from Hideo Yamauchi improves the logging of
f1cb04
whether nodes leave the cluster cleanly or uncleanly,
f1cb04
making it easier to determine if a node was shut down
f1cb04
by the operator. There is also the possibility that a
f1cb04
LEAVE message could get missed (due to the node being
f1cb04
in flush state) so this can also make that clearer.
f1cb04
f1cb04
The modifications are as follows.
f1cb04
f1cb04
Change 1) I added a list to totemsrp that tracks the nodes that sent LEAVE.
f1cb04
Change 2) I added functions to register, search for, and clear LEAVE
f1cb04
nodes.
f1cb04
Change 3) I added log output for left nodes whose LEAVE message was not received.
f1cb04
Change 4) I changed the output level of a log message.
f1cb04
f1cb04
Signed-off-by: Hideo Yamauchi <renayama19661014@ybb.ne.jp>
f1cb04
Reviewed-by: Christine Caulfield <ccaulfie@redhat.com>
f1cb04
Reviewed-by: Jan Friesse <jfriesse@redhat.com>
f1cb04
---
f1cb04
 exec/totemsrp.c |  105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
f1cb04
 1 files changed, 104 insertions(+), 1 deletions(-)
f1cb04
f1cb04
diff --git a/exec/totemsrp.c b/exec/totemsrp.c
f1cb04
index 6357f5a..3aa61cc 100644
f1cb04
--- a/exec/totemsrp.c
f1cb04
+++ b/exec/totemsrp.c
f1cb04
@@ -316,6 +316,8 @@ struct totemsrp_instance {
f1cb04
 
f1cb04
 	struct srp_addr my_left_memb_list[PROCESSOR_COUNT_MAX];
f1cb04
 
f1cb04
+	unsigned int my_leave_memb_list[PROCESSOR_COUNT_MAX];
f1cb04
+	
f1cb04
 	int my_proc_list_entries;
f1cb04
 
f1cb04
 	int my_failed_list_entries;
f1cb04
@@ -329,6 +331,8 @@ struct totemsrp_instance {
f1cb04
 	int my_deliver_memb_entries;
f1cb04
 
f1cb04
 	int my_left_memb_entries;
f1cb04
+	
f1cb04
+	int my_leave_memb_entries;
f1cb04
 
f1cb04
 	struct memb_ring_id my_ring_id;
f1cb04
 
f1cb04
@@ -513,6 +517,8 @@ struct totemsrp_instance {
f1cb04
 	uint32_t threaded_mode_enabled;
f1cb04
 
f1cb04
 	uint32_t waiting_trans_ack;
f1cb04
+
f1cb04
+	int 	flushing;
f1cb04
 	
f1cb04
 	void * token_recv_event_handle;
f1cb04
 	void * token_sent_event_handle;
f1cb04
@@ -1476,6 +1482,52 @@ static void memb_set_print (
f1cb04
 	}
f1cb04
 }
f1cb04
 #endif
f1cb04
+static void my_leave_memb_clear(
f1cb04
+        struct totemsrp_instance *instance)
f1cb04
+{
f1cb04
+        memset(instance->my_leave_memb_list, 0, sizeof(instance->my_leave_memb_list));
f1cb04
+        instance->my_leave_memb_entries = 0;
f1cb04
+}
f1cb04
+
f1cb04
+static unsigned int my_leave_memb_match(
f1cb04
+        struct totemsrp_instance *instance,
f1cb04
+        unsigned int nodeid)
f1cb04
+{
f1cb04
+        int i;
f1cb04
+        unsigned int ret = 0;
f1cb04
+
f1cb04
+        for (i = 0; i < instance->my_leave_memb_entries; i++){
f1cb04
+                if (instance->my_leave_memb_list[i] ==  nodeid){
f1cb04
+                        ret = nodeid;
f1cb04
+                        break;
f1cb04
+                }
f1cb04
+        }
f1cb04
+        return ret;
f1cb04
+}
f1cb04
+
f1cb04
+static void my_leave_memb_set(
f1cb04
+        struct totemsrp_instance *instance,
f1cb04
+        unsigned int nodeid)
f1cb04
+{
f1cb04
+        int i, found = 0;
f1cb04
+        for (i = 0; i < instance->my_leave_memb_entries; i++){
f1cb04
+                if (instance->my_leave_memb_list[i] ==  nodeid){
f1cb04
+                        found = 1;
f1cb04
+                        break;
f1cb04
+                }
f1cb04
+        }
f1cb04
+        if (found == 1) {
f1cb04
+                return;
f1cb04
+        }
f1cb04
+        if (instance->my_leave_memb_entries < (PROCESSOR_COUNT_MAX - 1)) {
f1cb04
+                instance->my_leave_memb_list[instance->my_leave_memb_entries] = nodeid;
f1cb04
+                instance->my_leave_memb_entries++;
f1cb04
+        } else {
f1cb04
+                log_printf (instance->totemsrp_log_level_warning,
f1cb04
+                        "Cannot set LEAVE nodeid=%d", nodeid);
f1cb04
+        }
f1cb04
+}
f1cb04
+
f1cb04
 
f1cb04
 static void *totemsrp_buffer_alloc (struct totemsrp_instance *instance)
f1cb04
 {
f1cb04
@@ -1837,6 +1889,7 @@ static void memb_state_operational_enter (struct totemsrp_instance *instance)
f1cb04
 	unsigned int res;
f1cb04
 	char left_node_msg[1024];
f1cb04
 	char joined_node_msg[1024];
f1cb04
+	char failed_node_msg[1024];
f1cb04
 
f1cb04
 	instance->originated_orf_token = 0;
f1cb04
 
f1cb04
@@ -2008,15 +2061,30 @@ static void memb_state_operational_enter (struct totemsrp_instance *instance)
f1cb04
 
f1cb04
 	if (instance->my_left_memb_entries) {
f1cb04
 		int sptr = 0;
f1cb04
+		int sptr2 = 0;
f1cb04
 		sptr += snprintf(left_node_msg, sizeof(left_node_msg)-sptr, " left:");
f1cb04
 		for (i=0; i< instance->my_left_memb_entries; i++) {
f1cb04
 			sptr += snprintf(left_node_msg+sptr, sizeof(left_node_msg)-sptr, " %u", left_list[i]);
f1cb04
 		}
f1cb04
+		for (i=0; i< instance->my_left_memb_entries; i++) {
f1cb04
+			if (my_leave_memb_match(instance, left_list[i]) == 0) {
f1cb04
+				if (sptr2 == 0) {
f1cb04
+					sptr2 += snprintf(failed_node_msg, sizeof(failed_node_msg)-sptr2, " failed:");
f1cb04
+				}
f1cb04
+				sptr2 += snprintf(failed_node_msg+sptr2, sizeof(left_node_msg)-sptr2, " %u", left_list[i]);
f1cb04
+			}		
f1cb04
+		}
f1cb04
+		if (sptr2 == 0) {
f1cb04
+			failed_node_msg[0] = '\0';
f1cb04
+		}
f1cb04
 	}
f1cb04
 	else {
f1cb04
 		left_node_msg[0] = '\0';
f1cb04
+		failed_node_msg[0] = '\0';
f1cb04
 	}
f1cb04
 
f1cb04
+	my_leave_memb_clear(instance);
f1cb04
+
f1cb04
 	log_printf (instance->totemsrp_log_level_debug,
f1cb04
 		"entering OPERATIONAL state.");
f1cb04
 	log_printf (instance->totemsrp_log_level_notice,
f1cb04
@@ -2025,6 +2093,13 @@ static void memb_state_operational_enter (struct totemsrp_instance *instance)
f1cb04
 		instance->my_ring_id.seq,
f1cb04
 		joined_node_msg,
f1cb04
 		left_node_msg);
f1cb04
+
f1cb04
+	if (strlen(failed_node_msg)) {
f1cb04
+		log_printf (instance->totemsrp_log_level_notice,
f1cb04
+			"Failed to receive the leave message.%s",
f1cb04
+			failed_node_msg);
f1cb04
+	}
f1cb04
+
f1cb04
 	instance->memb_state = MEMB_STATE_OPERATIONAL;
f1cb04
 
f1cb04
 	instance->stats.operational_entered++;
f1cb04
@@ -3597,8 +3672,9 @@ static int message_handler_orf_token (
f1cb04
 		return (0);
f1cb04
 	}
f1cb04
 #endif
f1cb04
-
f1cb04
+	instance->flushing = 1;
f1cb04
 	totemrrp_recv_flush (instance->totemrrp_context);
f1cb04
+	instance->flushing = 0;
f1cb04
 
f1cb04
 	/*
f1cb04
 	 * Determine if we should hold (in reality drop) the token
f1cb04
@@ -4130,6 +4206,32 @@ static void memb_join_process (
f1cb04
 	memb_set_print ("my_faillist", instance->my_failed_list, instance->my_failed_list_entries);
f1cb04
 -*/
f1cb04
 
f1cb04
+	if (memb_join->header.type == MESSAGE_TYPE_MEMB_JOIN) {
f1cb04
+		if (instance->flushing) {
f1cb04
+			if (memb_join->header.nodeid == LEAVE_DUMMY_NODEID) {
f1cb04
+				log_printf (instance->totemsrp_log_level_warning,
f1cb04
+			    		"Discarding LEAVE message during flush, nodeid=%u", 
f1cb04
+						memb_join->failed_list_entries > 0 ? failed_list[memb_join->failed_list_entries - 1 ].addr[0].nodeid : LEAVE_DUMMY_NODEID);
f1cb04
+				if (memb_join->failed_list_entries > 0) {
f1cb04
+					my_leave_memb_set(instance, failed_list[memb_join->failed_list_entries - 1 ].addr[0].nodeid);
f1cb04
+				}
f1cb04
+			} else {
f1cb04
+				log_printf (instance->totemsrp_log_level_warning,
f1cb04
+			    		"Discarding JOIN message during flush, nodeid=%d", memb_join->header.nodeid);
f1cb04
+			}
f1cb04
+			return;
f1cb04
+		} else {
f1cb04
+			if (memb_join->header.nodeid == LEAVE_DUMMY_NODEID) {
f1cb04
+				log_printf (instance->totemsrp_log_level_debug,
f1cb04
+		    		"Recieve LEAVE message from %u", memb_join->failed_list_entries > 0 ? failed_list[memb_join->failed_list_entries - 1 ].addr[0].nodeid : LEAVE_DUMMY_NODEID);
f1cb04
+				if (memb_join->failed_list_entries > 0) {
f1cb04
+					my_leave_memb_set(instance, failed_list[memb_join->failed_list_entries - 1 ].addr[0].nodeid);
f1cb04
+				}
f1cb04
+			}
f1cb04
+		}
f1cb04
+		
f1cb04
+	}
f1cb04
+
f1cb04
 	if (memb_set_equal (proc_list,
f1cb04
 		memb_join->proc_list_entries,
f1cb04
 		instance->my_proc_list,
f1cb04
@@ -4573,6 +4675,7 @@ void main_deliver_fn (
f1cb04
 		return;
f1cb04
 	}
f1cb04
 
f1cb04
+
f1cb04
 	switch (message_header->type) {
f1cb04
 	case MESSAGE_TYPE_ORF_TOKEN:
f1cb04
 		instance->stats.orf_token_rx++;
f1cb04
-- 
f1cb04
1.7.1
f1cb04