Blob Blame History Raw
commit 28ddb87a2f4562c5d1752a778744cc56136f81c1
Author: Fabio M. Di Nitto <fdinitto@redhat.com>
Date:   Sun Nov 7 17:02:05 2021 +0100

    [udp] use ICMP error messages to trigger faster link down detection
    
    this solves a possible race condition when:
    
    - node1 is running
    - node2 very fast
    - node1 does NOT have enough time to detect that node2 has gone
      and reset the local seq numbers / buffers
    - node1 will start rejecting valid packets from node2
    
    There is still a potential minor race condition where app
    can restart so fast that kernel / network don't have time
    to generate an ICMP error. This will be addressed using
    instance id in onwire v2 protocol, as suggested by Jan F.
    
    Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>

diff --git a/libknet/transport_udp.c b/libknet/transport_udp.c
index 963340d..32dd032 100644
--- a/libknet/transport_udp.c
+++ b/libknet/transport_udp.c
@@ -364,6 +364,46 @@ static int read_errs_from_sock(knet_handle_t knet_h, int sockfd)
 									log_debug(knet_h, KNET_SUB_TRANSP_UDP, "Received ICMP error from %s: %s destination unknown", addr_str, strerror(sock_err->ee_errno));
 								} else {
 									log_debug(knet_h, KNET_SUB_TRANSP_UDP, "Received ICMP error from %s: %s %s", addr_str, strerror(sock_err->ee_errno), addr_remote_str);
+									if ((sock_err->ee_errno == ECONNREFUSED) || /* knet is not running on the other node */
+									    (sock_err->ee_errno == ECONNABORTED) || /* local kernel closed the socket */
+									    (sock_err->ee_errno == ENONET)       || /* network does not exist */
+									    (sock_err->ee_errno == ENETUNREACH)  || /* network unreachable */
+									    (sock_err->ee_errno == EHOSTUNREACH) || /* host unreachable */
+									    (sock_err->ee_errno == EHOSTDOWN)    || /* host down (from kernel/net/ipv4/icmp.c */
+									    (sock_err->ee_errno == ENETDOWN)) {     /* network down */
+										struct knet_host *host = NULL;
+										struct knet_link *kn_link = NULL;
+										int link_idx, found = 0;
+
+										for (host = knet_h->host_head; host != NULL; host = host->next) {
+											for (link_idx = 0; link_idx < KNET_MAX_LINK; link_idx++) {
+												kn_link = &host->link[link_idx];
+												if (kn_link->outsock == sockfd) {
+													if (!cmpaddr(&remote, &kn_link->dst_addr)) {
+														found = 1;
+														break;
+													}
+												}
+											}
+											if (found) {
+												break;
+											}
+										}
+
+										if ((host) && (kn_link) &&
+										    (kn_link->status.connected)) {
+											log_debug(knet_h, KNET_SUB_TRANSP_UDP, "Setting down host %u link %i", host->host_id, kn_link->link_id);
+											/*
+											 * setting transport_connected = 0 will trigger
+											 * thread_heartbeat link_down process.
+											 *
+											 * the process terminates calling into transport_link_down
+											 * below that will set transport_connected = 1
+											 */
+											kn_link->transport_connected = 0;
+										}
+
+									}
 								}
 							}
 							break;
@@ -436,5 +476,9 @@ int udp_transport_link_dyn_connect(knet_handle_t knet_h, int sockfd, struct knet
 
 int udp_transport_link_is_down(knet_handle_t knet_h, struct knet_link *kn_link)
 {
+	/*
+	 * see comments about handling ICMP error messages
+	 */
+	kn_link->transport_connected = 1;
 	return 0;
 }