Blame SOURCES/bz2112923-icmp-faster-link-down-detection.patch

30d765
commit 28ddb87a2f4562c5d1752a778744cc56136f81c1
30d765
Author: Fabio M. Di Nitto <fdinitto@redhat.com>
30d765
Date:   Sun Nov 7 17:02:05 2021 +0100
30d765
30d765
    [udp] use ICMP error messages to trigger faster link down detection
30d765
    
30d765
    this solves a possible race condition when:
30d765
    
30d765
    - node1 is running
30d765
    - node2 restarts very fast
30d765
    - node1 does NOT have enough time to detect that node2 has gone
30d765
      and reset the local seq numbers / buffers
30d765
    - node1 will start rejecting valid packets from node2
30d765
    
30d765
    There is still a potential minor race condition where the app
30d765
    can restart so fast that the kernel / network don't have time
30d765
    to generate an ICMP error. This will be addressed using an
30d765
    instance id in the onwire v2 protocol, as suggested by Jan F.
30d765
    
30d765
    Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>
30d765
30d765
diff --git a/libknet/transport_udp.c b/libknet/transport_udp.c
30d765
index 963340d..32dd032 100644
30d765
--- a/libknet/transport_udp.c
30d765
+++ b/libknet/transport_udp.c
30d765
@@ -364,6 +364,46 @@ static int read_errs_from_sock(knet_handle_t knet_h, int sockfd)
30d765
 									log_debug(knet_h, KNET_SUB_TRANSP_UDP, "Received ICMP error from %s: %s destination unknown", addr_str, strerror(sock_err->ee_errno));
30d765
 								} else {
30d765
 									log_debug(knet_h, KNET_SUB_TRANSP_UDP, "Received ICMP error from %s: %s %s", addr_str, strerror(sock_err->ee_errno), addr_remote_str);
30d765
+									if ((sock_err->ee_errno == ECONNREFUSED) || /* knet is not running on the other node */
30d765
+									    (sock_err->ee_errno == ECONNABORTED) || /* local kernel closed the socket */
30d765
+									    (sock_err->ee_errno == ENONET)       || /* network does not exist */
30d765
+									    (sock_err->ee_errno == ENETUNREACH)  || /* network unreachable */
30d765
+									    (sock_err->ee_errno == EHOSTUNREACH) || /* host unreachable */
30d765
+									    (sock_err->ee_errno == EHOSTDOWN)    || /* host down (from kernel/net/ipv4/icmp.c */
30d765
+									    (sock_err->ee_errno == ENETDOWN)) {     /* network down */
30d765
+										struct knet_host *host = NULL;
30d765
+										struct knet_link *kn_link = NULL;
30d765
+										int link_idx, found = 0;
30d765
+
30d765
+										for (host = knet_h->host_head; host != NULL; host = host->next) {
30d765
+											for (link_idx = 0; link_idx < KNET_MAX_LINK; link_idx++) {
30d765
+												kn_link = &host->link[link_idx];
30d765
+												if (kn_link->outsock == sockfd) {
30d765
+													if (!cmpaddr(&remote, &kn_link->dst_addr)) {
30d765
+														found = 1;
30d765
+														break;
30d765
+													}
30d765
+												}
30d765
+											}
30d765
+											if (found) {
30d765
+												break;
30d765
+											}
30d765
+										}
30d765
+
30d765
+										if ((host) && (kn_link) &&
30d765
+										    (kn_link->status.connected)) {
30d765
+											log_debug(knet_h, KNET_SUB_TRANSP_UDP, "Setting down host %u link %i", host->host_id, kn_link->link_id);
30d765
+											/*
30d765
+											 * setting transport_connected = 0 will trigger
30d765
+											 * thread_heartbeat link_down process.
30d765
+											 *
30d765
+											 * the process terminates calling into transport_link_down
30d765
+											 * below that will set transport_connected = 1
30d765
+											 */
30d765
+											kn_link->transport_connected = 0;
30d765
+										}
30d765
+
30d765
+									}
30d765
 								}
30d765
 							}
30d765
 							break;
30d765
@@ -436,5 +476,9 @@ int udp_transport_link_dyn_connect(knet_handle_t knet_h, int sockfd, struct knet
30d765
 
30d765
 int udp_transport_link_is_down(knet_handle_t knet_h, struct knet_link *kn_link)
30d765
 {
30d765
+	/*
30d765
+	 * see comments about handling ICMP error messages
30d765
+	 */
30d765
+	kn_link->transport_connected = 1;
30d765
 	return 0;
30d765
 }