commit 28ddb87a2f4562c5d1752a778744cc56136f81c1
Author: Fabio M. Di Nitto <fdinitto@redhat.com>
Date: Sun Nov 7 17:02:05 2021 +0100
[udp] use ICMP error messages to trigger faster link down detection
this solves a possible race condition when:
- node1 is running
- node2 restarts very fast
- node1 does NOT have enough time to detect that node2 has gone
and reset the local seq numbers / buffers
- node1 will start rejecting valid packets from node2
There is still a potential minor race condition where the app
can restart so fast that kernel / network don't have time
to generate an ICMP error. This will be addressed using
instance id in onwire v2 protocol, as suggested by Jan F.
Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>
diff --git a/libknet/transport_udp.c b/libknet/transport_udp.c
index 963340d..32dd032 100644
--- a/libknet/transport_udp.c
+++ b/libknet/transport_udp.c
@@ -364,6 +364,46 @@ static int read_errs_from_sock(knet_handle_t knet_h, int sockfd)
log_debug(knet_h, KNET_SUB_TRANSP_UDP, "Received ICMP error from %s: %s destination unknown", addr_str, strerror(sock_err->ee_errno));
} else {
log_debug(knet_h, KNET_SUB_TRANSP_UDP, "Received ICMP error from %s: %s %s", addr_str, strerror(sock_err->ee_errno), addr_remote_str);
+ if ((sock_err->ee_errno == ECONNREFUSED) || /* knet is not running on the other node */
+ (sock_err->ee_errno == ECONNABORTED) || /* local kernel closed the socket */
+ (sock_err->ee_errno == ENONET) || /* network does not exist */
+ (sock_err->ee_errno == ENETUNREACH) || /* network unreachable */
+ (sock_err->ee_errno == EHOSTUNREACH) || /* host unreachable */
+ (sock_err->ee_errno == EHOSTDOWN) || /* host down (from kernel/net/ipv4/icmp.c */
+ (sock_err->ee_errno == ENETDOWN)) { /* network down */
+ struct knet_host *host = NULL;
+ struct knet_link *kn_link = NULL;
+ int link_idx, found = 0;
+
+ for (host = knet_h->host_head; host != NULL; host = host->next) {
+ for (link_idx = 0; link_idx < KNET_MAX_LINK; link_idx++) {
+ kn_link = &host->link[link_idx];
+ if (kn_link->outsock == sockfd) {
+ if (!cmpaddr(&remote, &kn_link->dst_addr)) {
+ found = 1;
+ break;
+ }
+ }
+ }
+ if (found) {
+ break;
+ }
+ }
+
+ if ((host) && (kn_link) &&
+ (kn_link->status.connected)) {
+ log_debug(knet_h, KNET_SUB_TRANSP_UDP, "Setting down host %u link %i", host->host_id, kn_link->link_id);
+ /*
+ * setting transport_connected = 0 will trigger
+ * thread_heartbeat link_down process.
+ *
+ * the process terminates calling into transport_link_down
+ * below that will set transport_connected = 1
+ */
+ kn_link->transport_connected = 0;
+ }
+
+ }
}
}
break;
@@ -436,5 +476,9 @@ int udp_transport_link_dyn_connect(knet_handle_t knet_h, int sockfd, struct knet
int udp_transport_link_is_down(knet_handle_t knet_h, struct knet_link *kn_link)
{
+ /*
+ * see comments about handling ICMP error messages
+ *
+ * the ICMP error handling in read_errs_from_sock() clears
+ * transport_connected to request a link down. Per the comment
+ * there, that link down process ends by calling into this
+ * function, so the flag is set back to 1 here.
+ */
+ kn_link->transport_connected = 1;
return 0;
}