Blame SOURCES/bz1761711-fix-data-deliver-corruption-from-fragmented-packets.patch

cf5375
commit db21da87bba6017c8343f9c6f255b21813ffd5d0
cf5375
Author: Fabio M. Di Nitto <fdinitto@redhat.com>
cf5375
Date:   Tue Oct 15 06:46:36 2019 +0200
cf5375
cf5375
    [host] rename variables to make it easier to read the code
cf5375
    
cf5375
    Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>
cf5375
cf5375
diff --git a/libknet/host.c b/libknet/host.c
cf5375
index abb1f89..ac26b89 100644
cf5375
--- a/libknet/host.c
cf5375
+++ b/libknet/host.c
cf5375
@@ -569,7 +569,7 @@ static void _clear_cbuffers(struct knet_host *host, seq_num_t rx_seq_num)
cf5375
 
cf5375
 int _seq_num_lookup(struct knet_host *host, seq_num_t seq_num, int defrag_buf, int clear_buf)
cf5375
 {
cf5375
-	size_t i, j; /* circular buffer indexes */
cf5375
+	size_t head, tail; /* circular buffer indexes */
cf5375
 	seq_num_t seq_dist;
cf5375
 	char *dst_cbuf = host->circular_buffer;
cf5375
 	char *dst_cbuf_defrag = host->circular_buffer_defrag;
cf5375
@@ -585,13 +585,13 @@ int _seq_num_lookup(struct knet_host *host, seq_num_t seq_num, int defrag_buf, i
cf5375
 		seq_dist = *dst_seq_num - seq_num;
cf5375
 	}
cf5375
 
cf5375
-	j = seq_num % KNET_CBUFFER_SIZE;
cf5375
+	head = seq_num % KNET_CBUFFER_SIZE;
cf5375
 
cf5375
 	if (seq_dist < KNET_CBUFFER_SIZE) { /* seq num is in ring buffer */
cf5375
 		if (!defrag_buf) {
cf5375
-			return (dst_cbuf[j] == 0) ? 1 : 0;
cf5375
+			return (dst_cbuf[head] == 0) ? 1 : 0;
cf5375
 		} else {
cf5375
-			return (dst_cbuf_defrag[j] == 0) ? 1 : 0;
cf5375
+			return (dst_cbuf_defrag[head] == 0) ? 1 : 0;
cf5375
 		}
cf5375
 	} else if (seq_dist <= SEQ_MAX - KNET_CBUFFER_SIZE) {
cf5375
 		memset(dst_cbuf, 0, KNET_CBUFFER_SIZE);
cf5375
@@ -600,16 +600,16 @@ int _seq_num_lookup(struct knet_host *host, seq_num_t seq_num, int defrag_buf, i
cf5375
 	}
cf5375
 
cf5375
 	/* cleaning up circular buffer */
cf5375
-	i = (*dst_seq_num + 1) % KNET_CBUFFER_SIZE;
cf5375
+	tail = (*dst_seq_num + 1) % KNET_CBUFFER_SIZE;
cf5375
 
cf5375
-	if (i > j) {
cf5375
-		memset(dst_cbuf + i, 0, KNET_CBUFFER_SIZE - i);
cf5375
-		memset(dst_cbuf, 0, j + 1);
cf5375
-		memset(dst_cbuf_defrag + i, 0, KNET_CBUFFER_SIZE - i);
cf5375
-		memset(dst_cbuf_defrag, 0, j + 1);
cf5375
+	if (tail > head) {
cf5375
+		memset(dst_cbuf + tail, 0, KNET_CBUFFER_SIZE - tail);
cf5375
+		memset(dst_cbuf, 0, head + 1);
cf5375
+		memset(dst_cbuf_defrag + tail, 0, KNET_CBUFFER_SIZE - tail);
cf5375
+		memset(dst_cbuf_defrag, 0, head + 1);
cf5375
 	} else {
cf5375
-		memset(dst_cbuf + i, 0, j - i + 1);
cf5375
-		memset(dst_cbuf_defrag + i, 0, j - i + 1);
cf5375
+		memset(dst_cbuf + tail, 0, head - tail + 1);
cf5375
+		memset(dst_cbuf_defrag + tail, 0, head - tail + 1);
cf5375
 	}
cf5375
 
cf5375
 	*dst_seq_num = seq_num;
cf5375
commit 1e473cf26d55c2b6ff8d5bfaa5aa689554de803c
cf5375
Author: Fabio M. Di Nitto <fdinitto@redhat.com>
cf5375
Date:   Tue Oct 15 06:53:24 2019 +0200
cf5375
cf5375
    [host] fix defrag buffers reclaim logic
cf5375
    
cf5375
    The problem:
cf5375
    
cf5375
    - let's assume a 2 nodes (A and B) cluster setup
cf5375
    - node A sends fragmented packets to node B and there is
cf5375
      packet loss on the network.
cf5375
    - node B receives all those fragments and attempts to
cf5375
      reassemble them.
cf5375
    - node A sends packet seq_num X in Y fragments.
cf5375
    - node B receives only part of the fragments and stores
cf5375
      them in a defrag buf.
cf5375
    - packet loss stops.
cf5375
    - node A continues to send packets and a seq_num
cf5375
      roll-over takes place.
cf5375
    - node A sends a new packet seq_num X in Y fragments.
cf5375
    - node B gets confused here because the parts of the old
cf5375
      packet seq_num X are still stored and the buffer
cf5375
      has not been reclaimed.
cf5375
    - node B continues to rebuild packet seq_num X with
cf5375
      old stale data and new data from after the roll-over.
cf5375
    - node B completes reassembling the packet and delivers
cf5375
      junk to the application.
cf5375
    
cf5375
    The solution:
cf5375
    
cf5375
    Add a much stronger buffer reclaim logic that will apply
cf5375
    on each received packet and not only when defrag buffers
cf5375
    are needed, as there might be a mix of fragmented and not
cf5375
    fragmented packets in-flight.
cf5375
    
cf5375
    The new logic creates a window of N packets that can be
cf5375
    handled at the same time (based on the number of buffers)
cf5375
    and clear everything else.
cf5375
    
cf5375
    Fixes https://github.com/kronosnet/kronosnet/issues/261
cf5375
    
cf5375
    Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>
cf5375
cf5375
diff --git a/libknet/host.c b/libknet/host.c
cf5375
index ac26b89..85d4626 100644
cf5375
--- a/libknet/host.c
cf5375
+++ b/libknet/host.c
cf5375
@@ -562,6 +562,35 @@ static void _clear_cbuffers(struct knet_host *host, seq_num_t rx_seq_num)
cf5375
 	}
cf5375
 }
cf5375
 
cf5375
+static void _reclaim_old_defrag_bufs(struct knet_host *host, seq_num_t seq_num)
cf5375
+{
cf5375
+	seq_num_t head, tail; /* seq_num boundaries */
cf5375
+	int i;
cf5375
+
cf5375
+	head = seq_num + 1;
cf5375
+	tail = seq_num - (KNET_MAX_LINK + 1);
cf5375
+
cf5375
+	/*
cf5375
+	 * expire old defrag buffers
cf5375
+	 */
cf5375
+	for (i = 0; i < KNET_MAX_LINK; i++) {
cf5375
+		if (host->defrag_buf[i].in_use) {
cf5375
+			/*
cf5375
+			 * head has done a rollover to 0+
cf5375
+			 */
cf5375
+			if (tail > head) {
cf5375
+				if ((host->defrag_buf[i].pckt_seq >= head) && (host->defrag_buf[i].pckt_seq <= tail)) {
cf5375
+					host->defrag_buf[i].in_use = 0;
cf5375
+				}
cf5375
+			} else {
cf5375
+				if ((host->defrag_buf[i].pckt_seq >= head) || (host->defrag_buf[i].pckt_seq <= tail)){
cf5375
+					host->defrag_buf[i].in_use = 0;
cf5375
+				}
cf5375
+			}
cf5375
+		}
cf5375
+	}
cf5375
+}
cf5375
+
cf5375
 /*
cf5375
  * check if a given packet seq num is in the circular buffers
cf5375
  * defrag_buf = 0 -> use normal cbuf 1 -> use the defrag buffer lookup
cf5375
@@ -579,6 +608,8 @@ int _seq_num_lookup(struct knet_host *host, seq_num_t seq_num, int defrag_buf, i
cf5375
 		_clear_cbuffers(host, seq_num);
cf5375
 	}
cf5375
 
cf5375
+	_reclaim_old_defrag_bufs(host, seq_num);
cf5375
+
cf5375
 	if (seq_num < *dst_seq_num) {
cf5375
 		seq_dist =  (SEQ_MAX - seq_num) + *dst_seq_num;
cf5375
 	} else {
cf5375
commit 5bd88ebd63af20577095c2c98975f0f1781ba46a
cf5375
Author: Fabio M. Di Nitto <fdinitto@redhat.com>
cf5375
Date:   Tue Oct 15 07:02:05 2019 +0200
cf5375
cf5375
    [rx] copy data into the defrag buffer only if we know the size of the frame
cf5375
    
cf5375
    Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>
cf5375
cf5375
diff --git a/libknet/threads_rx.c b/libknet/threads_rx.c
cf5375
index b2a5dad..6c26cdc 100644
cf5375
--- a/libknet/threads_rx.c
cf5375
+++ b/libknet/threads_rx.c
cf5375
@@ -186,8 +186,10 @@ static int pckt_defrag(knet_handle_t knet_h, struct knet_header *inbuf, ssize_t
cf5375
 		defrag_buf->frag_size = *len;
cf5375
 	}
cf5375
 
cf5375
-	memmove(defrag_buf->buf + ((inbuf->khp_data_frag_seq - 1) * defrag_buf->frag_size),
cf5375
-	       inbuf->khp_data_userdata, *len);
cf5375
+	if (defrag_buf->frag_size) {
cf5375
+		memmove(defrag_buf->buf + ((inbuf->khp_data_frag_seq - 1) * defrag_buf->frag_size),
cf5375
+		       inbuf->khp_data_userdata, *len);
cf5375
+	}
cf5375
 
cf5375
 	defrag_buf->frag_recv++;
cf5375
 	defrag_buf->frag_map[inbuf->khp_data_frag_seq] = 1;
cf5375
commit cd59986900510119d8e7b63d33ad35466d480858
cf5375
Author: Fabio M. Di Nitto <fdinitto@redhat.com>
cf5375
Date:   Tue Oct 15 07:16:22 2019 +0200
cf5375
cf5375
    [test] add ability to knet_bench to specify a fixed packet size for perf test
cf5375
    
cf5375
    Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>
cf5375
cf5375
diff --git a/libknet/tests/knet_bench.c b/libknet/tests/knet_bench.c
cf5375
index dc04239..54b5303 100644
cf5375
--- a/libknet/tests/knet_bench.c
cf5375
+++ b/libknet/tests/knet_bench.c
cf5375
@@ -67,6 +67,8 @@ static int test_type = TEST_PING;
cf5375
 static uint64_t perf_by_size_size = 1 * ONE_GIGABYTE;
cf5375
 static uint64_t perf_by_time_secs = 10;
cf5375
 
cf5375
+static uint32_t force_packet_size = 0;
cf5375
+
cf5375
 struct node {
cf5375
 	int nodeid;
cf5375
 	int links;
cf5375
@@ -109,6 +111,7 @@ static void print_help(void)
cf5375
 	printf(" -s                                        nodeid that will generate traffic for benchmarks\n");
cf5375
 	printf(" -S [size|seconds]                         when used in combination with -T perf-by-size it indicates how many GB of traffic to generate for the test. (default: 1GB)\n");
cf5375
 	printf("                                           when used in combination with -T perf-by-time it indicates how many Seconds of traffic to generate for the test. (default: 10 seconds)\n");
cf5375
+	printf(" -x                                        force packet size for perf-by-time or perf-by-size\n");
cf5375
 	printf(" -C                                        repeat the test continously (default: off)\n");
cf5375
 	printf(" -X[XX]                                    show stats at the end of the run (default: 1)\n");
cf5375
 	printf("                                           1: show handle stats, 2: show summary link stats\n");
cf5375
@@ -250,7 +253,7 @@ static void setup_knet(int argc, char *argv[])
cf5375
 
cf5375
 	memset(nodes, 0, sizeof(nodes));
cf5375
 
cf5375
-	while ((rv = getopt(argc, argv, "aCT:S:s:ldfom:wb:t:n:c:p:X::P:z:h")) != EOF) {
cf5375
+	while ((rv = getopt(argc, argv, "aCT:S:s:ldfom:wb:t:n:c:p:x:X::P:z:h")) != EOF) {
cf5375
 		switch(rv) {
cf5375
 			case 'h':
cf5375
 				print_help();
cf5375
@@ -406,6 +409,13 @@ static void setup_knet(int argc, char *argv[])
cf5375
 				perf_by_size_size = (uint64_t)atoi(optarg) * ONE_GIGABYTE;
cf5375
 				perf_by_time_secs = (uint64_t)atoi(optarg);
cf5375
 				break;
cf5375
+			case 'x':
cf5375
+				force_packet_size = (uint32_t)atoi(optarg);
cf5375
+				if ((force_packet_size < 1) || (force_packet_size > 65536)) {
cf5375
+					printf("Unsupported packet size %u (accepted 1 - 65536)\n", force_packet_size);
cf5375
+					exit(FAIL);
cf5375
+				}
cf5375
+				break;
cf5375
 			case 'C':
cf5375
 				continous = 1;
cf5375
 				break;
cf5375
@@ -874,7 +884,7 @@ static int setup_send_buffers_common(struct knet_mmsghdr *msg, struct iovec *iov
cf5375
 			printf("TXT: Unable to malloc!\n");
cf5375
 			return -1;
cf5375
 		}
cf5375
-		memset(tx_buf[i], 0, KNET_MAX_PACKET_SIZE);
cf5375
+		memset(tx_buf[i], i, KNET_MAX_PACKET_SIZE);
cf5375
 		iov_out[i].iov_base = (void *)tx_buf[i];
cf5375
 		memset(&msg[i].msg_hdr, 0, sizeof(struct msghdr));
cf5375
 		msg[i].msg_hdr.msg_iov = &iov_out[i];
cf5375
@@ -898,6 +908,9 @@ static void send_perf_data_by_size(void)
cf5375
 	setup_send_buffers_common(msg, iov_out, tx_buf);
cf5375
 
cf5375
 	while (packetsize <= KNET_MAX_PACKET_SIZE) {
cf5375
+		if (force_packet_size) {
cf5375
+			packetsize = force_packet_size;
cf5375
+		}
cf5375
 		for (i = 0; i < PCKT_FRAG_MAX; i++) {
cf5375
 			iov_out[i].iov_len = packetsize;
cf5375
 		}
cf5375
@@ -926,7 +939,7 @@ static void send_perf_data_by_size(void)
cf5375
 
cf5375
 		knet_send(knet_h, ctrl_message, TEST_STOP, channel);
cf5375
 
cf5375
-		if (packetsize == KNET_MAX_PACKET_SIZE) {
cf5375
+		if ((packetsize == KNET_MAX_PACKET_SIZE) || (force_packet_size)) {
cf5375
 			break;
cf5375
 		}
cf5375
 
cf5375
@@ -1175,6 +1188,9 @@ static void send_perf_data_by_time(void)
cf5375
 	memset(&clock_end, 0, sizeof(clock_start));
cf5375
 
cf5375
 	while (packetsize <= KNET_MAX_PACKET_SIZE) {
cf5375
+		if (force_packet_size) {
cf5375
+			packetsize = force_packet_size;
cf5375
+		}
cf5375
 		for (i = 0; i < PCKT_FRAG_MAX; i++) {
cf5375
 			iov_out[i].iov_len = packetsize;
cf5375
 		}
cf5375
@@ -1205,7 +1221,7 @@ static void send_perf_data_by_time(void)
cf5375
 
cf5375
 		knet_send(knet_h, ctrl_message, TEST_STOP, channel);
cf5375
 
cf5375
-		if (packetsize == KNET_MAX_PACKET_SIZE) {
cf5375
+		if ((packetsize == KNET_MAX_PACKET_SIZE) || (force_packet_size)) {
cf5375
 			break;
cf5375
 		}
cf5375
 
cf5375
commit e28e2ea7c7e8139a6792ec1508215d4560b53e65
cf5375
Author: Fabio M. Di Nitto <fdinitto@redhat.com>
cf5375
Date:   Wed Oct 16 08:10:23 2019 +0200
cf5375
cf5375
    [test] add packet verification option to knet_bench
cf5375
    
cf5375
    Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>
cf5375
cf5375
diff --git a/libknet/tests/knet_bench.c b/libknet/tests/knet_bench.c
cf5375
index 54b5303..c9e1c06 100644
cf5375
--- a/libknet/tests/knet_bench.c
cf5375
+++ b/libknet/tests/knet_bench.c
cf5375
@@ -47,6 +47,7 @@ static char *compresscfg = NULL;
cf5375
 static char *cryptocfg = NULL;
cf5375
 static int machine_output = 0;
cf5375
 static int use_access_lists = 0;
cf5375
+static int use_pckt_verification = 0;
cf5375
 
cf5375
 static int bench_shutdown_in_progress = 0;
cf5375
 static pthread_mutex_t shutdown_mutex = PTHREAD_MUTEX_INITIALIZER;
cf5375
@@ -76,6 +77,11 @@ struct node {
cf5375
 	struct sockaddr_storage address[KNET_MAX_LINK];
cf5375
 };
cf5375
 
cf5375
+struct pckt_ver {
cf5375
+	uint32_t len;
cf5375
+	uint32_t chksum;
cf5375
+};
cf5375
+
cf5375
 static void print_help(void)
cf5375
 {
cf5375
 	printf("knet_bench usage:\n");
cf5375
@@ -117,6 +123,7 @@ static void print_help(void)
cf5375
 	printf("                                           1: show handle stats, 2: show summary link stats\n");
cf5375
 	printf("                                           3: show detailed link stats\n");
cf5375
 	printf(" -a                                        enable machine parsable output (default: off).\n");
cf5375
+	printf(" -v                                        enable packet verification for performance tests (default: off).\n");
cf5375
 }
cf5375
 
cf5375
 static void parse_nodes(char *nodesinfo[MAX_NODES], int onidx, int port, struct node nodes[MAX_NODES], int *thisidx)
cf5375
@@ -253,7 +260,7 @@ static void setup_knet(int argc, char *argv[])
cf5375
 
cf5375
 	memset(nodes, 0, sizeof(nodes));
cf5375
 
cf5375
-	while ((rv = getopt(argc, argv, "aCT:S:s:ldfom:wb:t:n:c:p:x:X::P:z:h")) != EOF) {
cf5375
+	while ((rv = getopt(argc, argv, "aCT:S:s:lvdfom:wb:t:n:c:p:x:X::P:z:h")) != EOF) {
cf5375
 		switch(rv) {
cf5375
 			case 'h':
cf5375
 				print_help();
cf5375
@@ -411,11 +418,14 @@ static void setup_knet(int argc, char *argv[])
cf5375
 				break;
cf5375
 			case 'x':
cf5375
 				force_packet_size = (uint32_t)atoi(optarg);
cf5375
-				if ((force_packet_size < 1) || (force_packet_size > 65536)) {
cf5375
-					printf("Unsupported packet size %u (accepted 1 - 65536)\n", force_packet_size);
cf5375
+				if ((force_packet_size < 64) || (force_packet_size > 65536)) {
cf5375
+					printf("Unsupported packet size %u (accepted 64 - 65536)\n", force_packet_size);
cf5375
 					exit(FAIL);
cf5375
 				}
cf5375
 				break;
cf5375
+			case 'v':
cf5375
+				use_pckt_verification = 1;
cf5375
+				break;
cf5375
 			case 'C':
cf5375
 				continous = 1;
cf5375
 				break;
cf5375
@@ -654,6 +664,24 @@ static void setup_knet(int argc, char *argv[])
cf5375
 	}
cf5375
 }
cf5375
 
cf5375
+/*
cf5375
+ * calculate weak chksum (stole from corosync for debugging purposes)
cf5375
+ */
cf5375
+static uint32_t compute_chsum(const unsigned char *data, uint32_t data_len)
cf5375
+{
cf5375
+	unsigned int i;
cf5375
+	unsigned int checksum = 0;
cf5375
+
cf5375
+	for (i = 0; i < data_len; i++) {
cf5375
+		if (checksum & 1) {
cf5375
+			checksum |= 0x10000;
cf5375
+		}
cf5375
+
cf5375
+		checksum = ((checksum >> 1) + (unsigned char)data[i]) & 0xffff;
cf5375
+	}
cf5375
+	return (checksum);
cf5375
+}
cf5375
+
cf5375
 static void *_rx_thread(void *args)
cf5375
 {
cf5375
 	int rx_epoll;
cf5375
@@ -766,6 +794,20 @@ static void *_rx_thread(void *args)
cf5375
 							}
cf5375
 							continue;
cf5375
 						}
cf5375
+						if (use_pckt_verification) {
cf5375
+							struct pckt_ver *recv_pckt = (struct pckt_ver *)msg[i].msg_hdr.msg_iov->iov_base;
cf5375
+							uint32_t chksum;
cf5375
+
cf5375
+							if (msg[i].msg_len != recv_pckt->len) {
cf5375
+								printf("Wrong packet len received: %u expected: %u!\n", msg[i].msg_len, recv_pckt->len);
cf5375
+								exit(FAIL);
cf5375
+							}
cf5375
+							chksum = compute_chsum((const unsigned char *)msg[i].msg_hdr.msg_iov->iov_base + sizeof(struct pckt_ver), msg[i].msg_len - sizeof(struct pckt_ver));
cf5375
+							if (recv_pckt->chksum != chksum){
cf5375
+								printf("Wrong packet checksum received: %u expected: %u!\n", recv_pckt->chksum, chksum);
cf5375
+								exit(FAIL);
cf5375
+							}
cf5375
+						}
cf5375
 						rx_pkts++;
cf5375
 						rx_bytes = rx_bytes + msg[i].msg_len;
cf5375
 						current_pckt_size = msg[i].msg_len;
cf5375
@@ -913,6 +955,11 @@ static void send_perf_data_by_size(void)
cf5375
 		}
cf5375
 		for (i = 0; i < PCKT_FRAG_MAX; i++) {
cf5375
 			iov_out[i].iov_len = packetsize;
cf5375
+			if (use_pckt_verification) {
cf5375
+				struct pckt_ver *tx_pckt = (struct pckt_ver *)&iov_out[i].iov_base;
cf5375
+				tx_pckt->len = iov_out[i].iov_len;
cf5375
+				tx_pckt->chksum = compute_chsum((const unsigned char *)iov_out[i].iov_base + sizeof(struct pckt_ver), iov_out[i].iov_len - sizeof(struct pckt_ver));
cf5375
+			}
cf5375
 		}
cf5375
 
cf5375
 		total_pkts_to_tx = perf_by_size_size / packetsize;
cf5375
@@ -1193,6 +1240,11 @@ static void send_perf_data_by_time(void)
cf5375
 		}
cf5375
 		for (i = 0; i < PCKT_FRAG_MAX; i++) {
cf5375
 			iov_out[i].iov_len = packetsize;
cf5375
+			if (use_pckt_verification) {
cf5375
+				struct pckt_ver *tx_pckt = (struct pckt_ver *)iov_out[i].iov_base;
cf5375
+				tx_pckt->len = iov_out[i].iov_len;
cf5375
+				tx_pckt->chksum = compute_chsum((const unsigned char *)iov_out[i].iov_base + sizeof(struct pckt_ver), iov_out[i].iov_len - sizeof(struct pckt_ver));
cf5375
+			}
cf5375
 		}
cf5375
 		printf("[info]: testing with %u bytes packet size for %" PRIu64 " seconds.\n", packetsize, perf_by_time_secs);
cf5375