|
|
30d765 |
commit 28ddb87a2f4562c5d1752a778744cc56136f81c1
|
|
|
30d765 |
Author: Fabio M. Di Nitto <fdinitto@redhat.com>
|
|
|
30d765 |
Date: Sun Nov 7 17:02:05 2021 +0100
|
|
|
30d765 |
|
|
|
30d765 |
[udp] use ICMP error messages to trigger faster link down detection
|
|
|
30d765 |
|
|
|
30d765 |
this solves a possible race condition when:
|
|
|
30d765 |
|
|
|
30d765 |
- node1 is running
|
|
|
30d765 |
- node2 restarts very fast
|
|
|
30d765 |
- node1 does NOT have enough time to detect that node2 has gone
|
|
|
30d765 |
and reset the local seq numbers / buffers
|
|
|
30d765 |
- node1 will start rejecting valid packets from node2
|
|
|
30d765 |
|
|
|
30d765 |
There is still a potential minor race condition where app
|
|
|
30d765 |
can restart so fast that kernel / network don't have time
|
|
|
30d765 |
to generate an ICMP error. This will be addressed using
|
|
|
30d765 |
instance id in onwire v2 protocol, as suggested by Jan F.
|
|
|
30d765 |
|
|
|
30d765 |
Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>
|
|
|
30d765 |
|
|
|
30d765 |
diff --git a/libknet/transport_udp.c b/libknet/transport_udp.c
|
|
|
30d765 |
index 963340d..32dd032 100644
|
|
|
30d765 |
--- a/libknet/transport_udp.c
|
|
|
30d765 |
+++ b/libknet/transport_udp.c
|
|
|
30d765 |
@@ -364,6 +364,46 @@ static int read_errs_from_sock(knet_handle_t knet_h, int sockfd)
|
|
|
30d765 |
log_debug(knet_h, KNET_SUB_TRANSP_UDP, "Received ICMP error from %s: %s destination unknown", addr_str, strerror(sock_err->ee_errno));
|
|
|
30d765 |
} else {
|
|
|
30d765 |
log_debug(knet_h, KNET_SUB_TRANSP_UDP, "Received ICMP error from %s: %s %s", addr_str, strerror(sock_err->ee_errno), addr_remote_str);
|
|
|
30d765 |
+ if ((sock_err->ee_errno == ECONNREFUSED) || /* knet is not running on the other node */
|
|
|
30d765 |
+ (sock_err->ee_errno == ECONNABORTED) || /* local kernel closed the socket */
|
|
|
30d765 |
+ (sock_err->ee_errno == ENONET) || /* network does not exist */
|
|
|
30d765 |
+ (sock_err->ee_errno == ENETUNREACH) || /* network unreachable */
|
|
|
30d765 |
+ (sock_err->ee_errno == EHOSTUNREACH) || /* host unreachable */
|
|
|
30d765 |
+ (sock_err->ee_errno == EHOSTDOWN) || /* host down (from kernel/net/ipv4/icmp.c) */
|
|
|
30d765 |
+ (sock_err->ee_errno == ENETDOWN)) { /* network down */
|
|
|
30d765 |
+ struct knet_host *host = NULL;
|
|
|
30d765 |
+ struct knet_link *kn_link = NULL;
|
|
|
30d765 |
+ int link_idx, found = 0;
|
|
|
30d765 |
+
|
|
|
30d765 |
+ for (host = knet_h->host_head; host != NULL; host = host->next) {
|
|
|
30d765 |
+ for (link_idx = 0; link_idx < KNET_MAX_LINK; link_idx++) {
|
|
|
30d765 |
+ kn_link = &host->link[link_idx];
|
|
|
30d765 |
+ if (kn_link->outsock == sockfd) {
|
|
|
30d765 |
+ if (!cmpaddr(&remote, &kn_link->dst_addr)) {
|
|
|
30d765 |
+ found = 1;
|
|
|
30d765 |
+ break;
|
|
|
30d765 |
+ }
|
|
|
30d765 |
+ }
|
|
|
30d765 |
+ }
|
|
|
30d765 |
+ if (found) {
|
|
|
30d765 |
+ break;
|
|
|
30d765 |
+ }
|
|
|
30d765 |
+ }
|
|
|
30d765 |
+
|
|
|
30d765 |
+ if ((host) && (kn_link) &&
|
|
|
30d765 |
+ (kn_link->status.connected)) {
|
|
|
30d765 |
+ log_debug(knet_h, KNET_SUB_TRANSP_UDP, "Setting down host %u link %i", host->host_id, kn_link->link_id);
|
|
|
30d765 |
+ /*
|
|
|
30d765 |
+ * setting transport_connected = 0 will trigger
|
|
|
30d765 |
+ * thread_heartbeat link_down process.
|
|
|
30d765 |
+ *
|
|
|
30d765 |
+ * the process terminates calling into udp_transport_link_is_down
|
|
|
30d765 |
+ * below that will set transport_connected = 1
|
|
|
30d765 |
+ */
|
|
|
30d765 |
+ kn_link->transport_connected = 0;
|
|
|
30d765 |
+ }
|
|
|
30d765 |
+
|
|
|
30d765 |
+ }
|
|
|
30d765 |
}
|
|
|
30d765 |
}
|
|
|
30d765 |
break;
|
|
|
30d765 |
@@ -436,5 +476,9 @@ int udp_transport_link_dyn_connect(knet_handle_t knet_h, int sockfd, struct knet
|
|
|
30d765 |
|
|
|
30d765 |
int udp_transport_link_is_down(knet_handle_t knet_h, struct knet_link *kn_link)
|
|
|
30d765 |
{
|
|
|
30d765 |
+ /*
|
|
|
30d765 |
+ * see comments about handling ICMP error messages
|
|
|
30d765 |
+ */
|
|
|
30d765 |
+ kn_link->transport_connected = 1;
|
|
|
30d765 |
return 0;
|
|
|
30d765 |
}
|