Blame SOURCES/bz1234261-2-totem-Ignore-duplicated-commit-tokens-in-recovery.patch

f1cb04
From 4ee84c51fa73c4ec7cbee922111a140a3aaf75df Mon Sep 17 00:00:00 2001
f1cb04
From: Jason <huzhijiang@gmail.com>
f1cb04
Date: Sat, 10 Jan 2015 17:35:47 +0800
f1cb04
Subject: [PATCH] totem: Ignore duplicated commit tokens in recovery
f1cb04
f1cb04
In active rrp mode, commit tokens are treated as mcast data messages,
f1cb04
thus, rrp directly delivers them to srp layer by active_mcast_recv().
f1cb04
This will result in duplicated commit tokens being received by srp from
f1cb04
different heartbeat links. If node is in recovery state and has already
f1cb04
sent out the initial orf token, those duplicated commit tokens will
f1cb04
cause message_handler_memb_commit_token() to send initial orf token
f1cb04
again! This is wrong because it resets the orf token content in
f1cb04
instance->orf_token_retransmit, which breaks the token retransmission
f1cb04
state.
f1cb04
f1cb04
Furthermore, by sending those initial orf tokens again and again,
f1cb04
it may lead active_token_recv() to drop some subsequent orf tokens.
f1cb04
It is OK for rrp because srp will do token retransmission,
f1cb04
but as said above, srp retransmission state has already been broken,
f1cb04
so finally we meet a "token lost in recovery state" condition caused
f1cb04
by software. If token timeout value is large, then it will take a long
f1cb04
time to create a new ring.
f1cb04
f1cb04
This can be reproduced by having two nodes set to active rrp mode, with
f1cb04
two heartbeat links. Then with one node always on, let the other one do
f1cb04
stop/start again and again. The problem reproduces with low probability.
f1cb04
In theory, I think, the more heartbeat links used, the more easily it
f1cb04
can be reproduced.
f1cb04
f1cb04
This problem can be resolved by letting
f1cb04
message_handler_memb_commit_token() ignore duplicated commit tokens
f1cb04
in recovery state if the node (the ring representative) has already sent
f1cb04
out the initial orf token.
f1cb04
f1cb04
Unlike the previous take, this version does not depend on stored token
f1cb04
data but uses originated_orf_token in totemsrp_instance to remember
f1cb04
if initial orf token has been already originated for current membership.
f1cb04
f1cb04
Signed-off-by: Jason <huzhijiang@gmail.com>
f1cb04
Reviewed-by: Steven Dake <sdake@redhat.com>
f1cb04
Reviewed-by: Jan Friesse <jfriesse@redhat.com>
f1cb04
Reviewed-by: Christine Caulfield <ccaulfie@redhat.com>
f1cb04
---
f1cb04
 exec/totemsrp.c |   16 ++++++++++++++++
f1cb04
 1 files changed, 16 insertions(+), 0 deletions(-)
f1cb04
f1cb04
diff --git a/exec/totemsrp.c b/exec/totemsrp.c
f1cb04
index 95736b1..b05773a 100644
f1cb04
--- a/exec/totemsrp.c
f1cb04
+++ b/exec/totemsrp.c
f1cb04
@@ -508,6 +508,8 @@ struct totemsrp_instance {
f1cb04
 
f1cb04
 	uint32_t orf_token_discard;
f1cb04
 
f1cb04
+	uint32_t originated_orf_token;
f1cb04
+
f1cb04
 	uint32_t threaded_mode_enabled;
f1cb04
 
f1cb04
 	uint32_t waiting_trans_ack;
f1cb04
@@ -731,6 +733,8 @@ static void totemsrp_instance_initialize (struct totemsrp_instance *instance)
f1cb04
 
f1cb04
 	instance->orf_token_discard = 0;
f1cb04
 
f1cb04
+	instance->originated_orf_token = 0;
f1cb04
+
f1cb04
 	instance->commit_token = (struct memb_commit_token *)instance->commit_token_storage;
f1cb04
 
f1cb04
 	instance->my_id.no_addrs = INTERFACE_MAX;
f1cb04
@@ -1834,6 +1838,8 @@ static void memb_state_operational_enter (struct totemsrp_instance *instance)
f1cb04
 	char left_node_msg[1024];
f1cb04
 	char joined_node_msg[1024];
f1cb04
 
f1cb04
+	instance->originated_orf_token = 0;
f1cb04
+
f1cb04
 	memb_consensus_reset (instance);
f1cb04
 
f1cb04
 	old_ring_state_reset (instance);
f1cb04
@@ -2045,6 +2051,8 @@ static void memb_state_gather_enter (
f1cb04
 {
f1cb04
 	instance->orf_token_discard = 1;
f1cb04
 
f1cb04
+	instance->originated_orf_token = 0;
f1cb04
+
f1cb04
 	memb_set_merge (
f1cb04
 		&instance->my_id, 1,
f1cb04
 		instance->my_proc_list, &instance->my_proc_list_entries);
f1cb04
@@ -4510,6 +4518,14 @@ static int message_handler_memb_commit_token (
f1cb04
 
f1cb04
 		case MEMB_STATE_RECOVERY:
f1cb04
 			if (totemip_equal (&instance->my_id.addr[0], &instance->my_ring_id.rep)) {
f1cb04
+
f1cb04
+				/* Filter out duplicated tokens */
f1cb04
+				if (instance->originated_orf_token) {
f1cb04
+					break;
f1cb04
+				}
f1cb04
+
f1cb04
+				instance->originated_orf_token = 1;
f1cb04
+
f1cb04
 				log_printf (instance->totemsrp_log_level_debug,
f1cb04
 					"Sending initial ORF token");
f1cb04
 
f1cb04
-- 
f1cb04
1.7.1
f1cb04