Blob Blame History Raw
From 64010f573801b73222e80821a19140c59b003f5c Mon Sep 17 00:00:00 2001
From: Jan Friesse <jfriesse@redhat.com>
Date: Thu, 19 Aug 2021 16:13:53 +0200
Subject: [PATCH] totem: Add cancel_token_hold_on_retransmit config option

Previously, the existence of retransmit messages canceled holding
of the token (and never allowed the representative to enter the
token hold state).

This makes the token rotate at maximum speed and keeps the processor
resending messages over and over again - overloading the network
and reducing the chance of successfully delivering the messages.

There were also reports of various antivirus / IPS / IDS products which
slow down delivery of packets of certain sizes (packets bigger than the
token), which makes Corosync retransmit messages over and over again.

The proposed solution is to allow the representative to enter the token
hold state when there are only retransmit messages. This allows the network
to handle the overload and/or gives the antivirus/IPS/IDS enough time to scan
and deliver packets without Corosync entering the "FAILED TO RECEIVE" state
and adding more load to the network.

(backported from master cdf72925db5a81e546ca8e8d7d8291ee1fc77be4)

Signed-off-by: Jan Friesse <jfriesse@redhat.com>
Reviewed-by: Christine Caulfield <ccaulfie@redhat.com>
---
 exec/totemconfig.c             |  6 ++++++
 exec/totemsrp.c                |  5 +++--
 include/corosync/totem/totem.h |  2 ++
 man/corosync.conf.5            | 15 ++++++++++++++-
 4 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/exec/totemconfig.c b/exec/totemconfig.c
index 38da46b5..a9d8fcf5 100644
--- a/exec/totemconfig.c
+++ b/exec/totemconfig.c
@@ -78,6 +78,7 @@
 #define RRP_PROBLEM_COUNT_THRESHOLD_MIN		2
 #define RRP_AUTORECOVERY_CHECK_TIMEOUT		1000
 #define BLOCK_UNLISTED_IPS			1
+#define CANCEL_TOKEN_HOLD_ON_RETRANSMIT		0
 
 #define DEFAULT_PORT				5405
 
@@ -133,6 +134,8 @@ static uint32_t *totem_get_param_by_name(struct totem_config *totem_config, cons
 		return &totem_config->miss_count_const;
 	if (strcmp(param_name, "totem.block_unlisted_ips") == 0)
 		return &totem_config->block_unlisted_ips;
+	if (strcmp(param_name, "totem.cancel_token_hold_on_retransmit") == 0)
+		return &totem_config->cancel_token_hold_on_retransmit;
 
 	return NULL;
 }
@@ -293,6 +296,9 @@ static void totem_volatile_config_read (struct totem_config *totem_config, const
 
 	totem_volatile_config_set_boolean_value(totem_config, "totem.block_unlisted_ips", deleted_key,
 	    BLOCK_UNLISTED_IPS);
+
+	totem_volatile_config_set_boolean_value(totem_config, "totem.cancel_token_hold_on_retransmit",
+	    deleted_key, CANCEL_TOKEN_HOLD_ON_RETRANSMIT);
 }
 
 static int totem_volatile_config_validate (
diff --git a/exec/totemsrp.c b/exec/totemsrp.c
index 41c2deee..269c6e6c 100644
--- a/exec/totemsrp.c
+++ b/exec/totemsrp.c
@@ -4105,8 +4105,9 @@ static int message_handler_orf_token (
 		transmits_allowed = fcc_calculate (instance, token);
 		mcasted_retransmit = orf_token_rtr (instance, token, &transmits_allowed);
 
-		if (instance->my_token_held == 1 &&
-			(token->rtr_list_entries > 0 || mcasted_retransmit > 0)) {
+		if (instance->totem_config->cancel_token_hold_on_retransmit &&
+		    instance->my_token_held == 1 &&
+		    (token->rtr_list_entries > 0 || mcasted_retransmit > 0)) {
 			instance->my_token_held = 0;
 			forward_token = 1;
 		}
diff --git a/include/corosync/totem/totem.h b/include/corosync/totem/totem.h
index 86968817..90f3cf17 100644
--- a/include/corosync/totem/totem.h
+++ b/include/corosync/totem/totem.h
@@ -193,6 +193,8 @@ struct totem_config {
 
 	unsigned int block_unlisted_ips;
 
+	unsigned int cancel_token_hold_on_retransmit;
+
 	void (*totem_memb_ring_id_create_or_load) (
 	    struct memb_ring_id *memb_ring_id,
 	    const struct totem_ip_address *addr);
diff --git a/man/corosync.conf.5 b/man/corosync.conf.5
index 0487794d..5685da4e 100644
--- a/man/corosync.conf.5
+++ b/man/corosync.conf.5
@@ -32,7 +32,7 @@
 .\" * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 .\" * THE POSSIBILITY OF SUCH DAMAGE.
 .\" */
-.TH COROSYNC_CONF 5 2019-05-23 "corosync Man Page" "Corosync Cluster Engine Programmer's Manual"
+.TH COROSYNC_CONF 5 2021-08-19 "corosync Man Page" "Corosync Cluster Engine Programmer's Manual"
 .SH NAME
 corosync.conf - corosync executive configuration file
 
@@ -539,6 +539,19 @@ with an old configuration.
 
 The default value is yes.
 
+.TP
+cancel_token_hold_on_retransmit
+Allows Corosync to hold token by representative when there are too many
+retransmit messages. This allows network to process increased load without
+overloading it. The mechanism used is the same as described for
+.B hold
+directive.
+
+Some deployments may prefer to never hold token when there are
+retransmit messages. If so, option should be set to yes.
+
+The default value is no.
+
 .PP
 Within the
 .B logging
-- 
2.27.0