Blame SOURCES/bz2112924-icmp-faster-link-down-detection.patch

2c21a8
commit 28ddb87a2f4562c5d1752a778744cc56136f81c1
2c21a8
Author: Fabio M. Di Nitto <fdinitto@redhat.com>
2c21a8
Date:   Sun Nov 7 17:02:05 2021 +0100
2c21a8
2c21a8
    [udp] use ICMP error messages to trigger faster link down detection
2c21a8
    
2c21a8
    this solves a possible race condition when:
2c21a8
    
2c21a8
    - node1 is running
2c21a8
    - node2 restarts very fast
2c21a8
    - node1 does NOT have enough time to detect that node2 has gone
2c21a8
      and reset the local seq numbers / buffers
2c21a8
    - node1 will start rejecting valid packets from node2
2c21a8
    
2c21a8
    There is still a potential minor race condition where the app
2c21a8
    can restart so fast that kernel / network don't have time
2c21a8
    to generate an ICMP error. This will be addressed using
2c21a8
    instance id in onwire v2 protocol, as suggested by Jan F.
2c21a8
    
2c21a8
    Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>
2c21a8
2c21a8
diff --git a/libknet/transport_udp.c b/libknet/transport_udp.c
2c21a8
index 963340d..32dd032 100644
2c21a8
--- a/libknet/transport_udp.c
2c21a8
+++ b/libknet/transport_udp.c
2c21a8
@@ -364,6 +364,46 @@ static int read_errs_from_sock(knet_handle_t knet_h, int sockfd)
2c21a8
 									log_debug(knet_h, KNET_SUB_TRANSP_UDP, "Received ICMP error from %s: %s destination unknown", addr_str, strerror(sock_err->ee_errno));
2c21a8
 								} else {
2c21a8
 									log_debug(knet_h, KNET_SUB_TRANSP_UDP, "Received ICMP error from %s: %s %s", addr_str, strerror(sock_err->ee_errno), addr_remote_str);
2c21a8
+									if ((sock_err->ee_errno == ECONNREFUSED) || /* knet is not running on the other node */
2c21a8
+									    (sock_err->ee_errno == ECONNABORTED) || /* local kernel closed the socket */
2c21a8
+									    (sock_err->ee_errno == ENONET)       || /* network does not exist */
2c21a8
+									    (sock_err->ee_errno == ENETUNREACH)  || /* network unreachable */
2c21a8
+									    (sock_err->ee_errno == EHOSTUNREACH) || /* host unreachable */
2c21a8
+									    (sock_err->ee_errno == EHOSTDOWN)    || /* host down (from kernel/net/ipv4/icmp.c */
2c21a8
+									    (sock_err->ee_errno == ENETDOWN)) {     /* network down */
2c21a8
+										struct knet_host *host = NULL;
2c21a8
+										struct knet_link *kn_link = NULL;
2c21a8
+										int link_idx, found = 0;
2c21a8
+
2c21a8
+										for (host = knet_h->host_head; host != NULL; host = host->next) {
2c21a8
+											for (link_idx = 0; link_idx < KNET_MAX_LINK; link_idx++) {
2c21a8
+												kn_link = &host->link[link_idx];
2c21a8
+												if (kn_link->outsock == sockfd) {
2c21a8
+													if (!cmpaddr(&remote, &kn_link->dst_addr)) {
2c21a8
+														found = 1;
2c21a8
+														break;
2c21a8
+													}
2c21a8
+												}
2c21a8
+											}
2c21a8
+											if (found) {
2c21a8
+												break;
2c21a8
+											}
2c21a8
+										}
2c21a8
+
2c21a8
+										if ((host) && (kn_link) &&
2c21a8
+										    (kn_link->status.connected)) {
2c21a8
+											log_debug(knet_h, KNET_SUB_TRANSP_UDP, "Setting down host %u link %i", host->host_id, kn_link->link_id);
2c21a8
+											/*
2c21a8
+											 * setting transport_connected = 0 will trigger
2c21a8
+											 * thread_heartbeat link_down process.
2c21a8
+											 *
2c21a8
+											 * the process terminates calling into transport_link_down
2c21a8
+											 * below that will set transport_connected = 1
2c21a8
+											 */
2c21a8
+											kn_link->transport_connected = 0;
2c21a8
+										}
2c21a8
+
2c21a8
+									}
2c21a8
 								}
2c21a8
 							}
2c21a8
 							break;
2c21a8
@@ -436,5 +476,9 @@ int udp_transport_link_dyn_connect(knet_handle_t knet_h, int sockfd, struct knet
2c21a8
 
2c21a8
 int udp_transport_link_is_down(knet_handle_t knet_h, struct knet_link *kn_link)
2c21a8
 {
2c21a8
+	/*
2c21a8
+	 * see comments about handling ICMP error messages
2c21a8
+	 */
2c21a8
+	kn_link->transport_connected = 1;
2c21a8
 	return 0;
2c21a8
 }