Blame SOURCES/bz1761711-fix-data-deliver-corruption-from-fragmented-packets.patch

0c6670
commit db21da87bba6017c8343f9c6f255b21813ffd5d0
0c6670
Author: Fabio M. Di Nitto <fdinitto@redhat.com>
0c6670
Date:   Tue Oct 15 06:46:36 2019 +0200
0c6670
0c6670
    [host] rename variables to make it easier to read the code
0c6670
    
0c6670
    Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>
0c6670
0c6670
diff --git a/libknet/host.c b/libknet/host.c
0c6670
index abb1f89..ac26b89 100644
0c6670
--- a/libknet/host.c
0c6670
+++ b/libknet/host.c
0c6670
@@ -569,7 +569,7 @@ static void _clear_cbuffers(struct knet_host *host, seq_num_t rx_seq_num)
0c6670
 
0c6670
 int _seq_num_lookup(struct knet_host *host, seq_num_t seq_num, int defrag_buf, int clear_buf)
0c6670
 {
0c6670
-	size_t i, j; /* circular buffer indexes */
0c6670
+	size_t head, tail; /* circular buffer indexes */
0c6670
 	seq_num_t seq_dist;
0c6670
 	char *dst_cbuf = host->circular_buffer;
0c6670
 	char *dst_cbuf_defrag = host->circular_buffer_defrag;
0c6670
@@ -585,13 +585,13 @@ int _seq_num_lookup(struct knet_host *host, seq_num_t seq_num, int defrag_buf, i
0c6670
 		seq_dist = *dst_seq_num - seq_num;
0c6670
 	}
0c6670
 
0c6670
-	j = seq_num % KNET_CBUFFER_SIZE;
0c6670
+	head = seq_num % KNET_CBUFFER_SIZE;
0c6670
 
0c6670
 	if (seq_dist < KNET_CBUFFER_SIZE) { /* seq num is in ring buffer */
0c6670
 		if (!defrag_buf) {
0c6670
-			return (dst_cbuf[j] == 0) ? 1 : 0;
0c6670
+			return (dst_cbuf[head] == 0) ? 1 : 0;
0c6670
 		} else {
0c6670
-			return (dst_cbuf_defrag[j] == 0) ? 1 : 0;
0c6670
+			return (dst_cbuf_defrag[head] == 0) ? 1 : 0;
0c6670
 		}
0c6670
 	} else if (seq_dist <= SEQ_MAX - KNET_CBUFFER_SIZE) {
0c6670
 		memset(dst_cbuf, 0, KNET_CBUFFER_SIZE);
0c6670
@@ -600,16 +600,16 @@ int _seq_num_lookup(struct knet_host *host, seq_num_t seq_num, int defrag_buf, i
0c6670
 	}
0c6670
 
0c6670
 	/* cleaning up circular buffer */
0c6670
-	i = (*dst_seq_num + 1) % KNET_CBUFFER_SIZE;
0c6670
+	tail = (*dst_seq_num + 1) % KNET_CBUFFER_SIZE;
0c6670
 
0c6670
-	if (i > j) {
0c6670
-		memset(dst_cbuf + i, 0, KNET_CBUFFER_SIZE - i);
0c6670
-		memset(dst_cbuf, 0, j + 1);
0c6670
-		memset(dst_cbuf_defrag + i, 0, KNET_CBUFFER_SIZE - i);
0c6670
-		memset(dst_cbuf_defrag, 0, j + 1);
0c6670
+	if (tail > head) {
0c6670
+		memset(dst_cbuf + tail, 0, KNET_CBUFFER_SIZE - tail);
0c6670
+		memset(dst_cbuf, 0, head + 1);
0c6670
+		memset(dst_cbuf_defrag + tail, 0, KNET_CBUFFER_SIZE - tail);
0c6670
+		memset(dst_cbuf_defrag, 0, head + 1);
0c6670
 	} else {
0c6670
-		memset(dst_cbuf + i, 0, j - i + 1);
0c6670
-		memset(dst_cbuf_defrag + i, 0, j - i + 1);
0c6670
+		memset(dst_cbuf + tail, 0, head - tail + 1);
0c6670
+		memset(dst_cbuf_defrag + tail, 0, head - tail + 1);
0c6670
 	}
0c6670
 
0c6670
 	*dst_seq_num = seq_num;
0c6670
commit 1e473cf26d55c2b6ff8d5bfaa5aa689554de803c
0c6670
Author: Fabio M. Di Nitto <fdinitto@redhat.com>
0c6670
Date:   Tue Oct 15 06:53:24 2019 +0200
0c6670
0c6670
    [host] fix defrag buffers reclaim logic
0c6670
    
0c6670
    The problem:
0c6670
    
0c6670
    - let's assume a 2 nodes (A and B) cluster setup
0c6670
    - node A sends fragmented packets to node B and there is
0c6670
      packet loss on the network.
0c6670
    - node B receives all those fragments and attempts to
0c6670
      reassemble them.
0c6670
    - node A sends packet seq_num X in Y fragments.
0c6670
    - node B receives only part of the fragments and stores
0c6670
      them in a defrag buf.
0c6670
    - packet loss stops.
0c6670
    - node A continues to send packets and a seq_num
0c6670
      roll-over takes place.
0c6670
    - node A sends a new packet seq_num X in Y fragments.
0c6670
    - node B gets confused here because the parts of the old
0c6670
      packet seq_num X are still stored and the buffer
0c6670
      has not been reclaimed.
0c6670
    - node B continues to rebuild packet seq_num X with
0c6670
      old stale data and new data from after the roll-over.
0c6670
    - node B completes reassembling the packet and delivers
0c6670
      junk to the application.
0c6670
    
0c6670
    The solution:
0c6670
    
0c6670
    Add a much stronger buffer reclaim logic that will apply
0c6670
    on each received packet and not only when defrag buffers
0c6670
    are needed, as there might be a mix of fragmented and not
0c6670
    fragmented packets in-flight.
0c6670
    
0c6670
    The new logic creates a window of N packets that can be
0c6670
    handled at the same time (based on the number of buffers)
0c6670
    and clears everything else.
0c6670
    
0c6670
    Fixes https://github.com/kronosnet/kronosnet/issues/261
0c6670
    
0c6670
    Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>
0c6670
0c6670
diff --git a/libknet/host.c b/libknet/host.c
0c6670
index ac26b89..85d4626 100644
0c6670
--- a/libknet/host.c
0c6670
+++ b/libknet/host.c
0c6670
@@ -562,6 +562,35 @@ static void _clear_cbuffers(struct knet_host *host, seq_num_t rx_seq_num)
0c6670
 	}
0c6670
 }
0c6670
 
0c6670
+static void _reclaim_old_defrag_bufs(struct knet_host *host, seq_num_t seq_num)
0c6670
+{
0c6670
+	seq_num_t head, tail; /* seq_num boundaries */
0c6670
+	int i;
0c6670
+
0c6670
+	head = seq_num + 1;
0c6670
+	tail = seq_num - (KNET_MAX_LINK + 1);
0c6670
+
0c6670
+	/*
0c6670
+	 * expire old defrag buffers
0c6670
+	 */
0c6670
+	for (i = 0; i < KNET_MAX_LINK; i++) {
0c6670
+		if (host->defrag_buf[i].in_use) {
0c6670
+			/*
0c6670
+			 * head has done a rollover to 0+
0c6670
+			 */
0c6670
+			if (tail > head) {
0c6670
+				if ((host->defrag_buf[i].pckt_seq >= head) && (host->defrag_buf[i].pckt_seq <= tail)) {
0c6670
+					host->defrag_buf[i].in_use = 0;
0c6670
+				}
0c6670
+			} else {
0c6670
+				if ((host->defrag_buf[i].pckt_seq >= head) || (host->defrag_buf[i].pckt_seq <= tail)){
0c6670
+					host->defrag_buf[i].in_use = 0;
0c6670
+				}
0c6670
+			}
0c6670
+		}
0c6670
+	}
0c6670
+}
0c6670
+
0c6670
 /*
0c6670
  * check if a given packet seq num is in the circular buffers
0c6670
  * defrag_buf = 0 -> use normal cbuf 1 -> use the defrag buffer lookup
0c6670
@@ -579,6 +608,8 @@ int _seq_num_lookup(struct knet_host *host, seq_num_t seq_num, int defrag_buf, i
0c6670
 		_clear_cbuffers(host, seq_num);
0c6670
 	}
0c6670
 
0c6670
+	_reclaim_old_defrag_bufs(host, seq_num);
0c6670
+
0c6670
 	if (seq_num < *dst_seq_num) {
0c6670
 		seq_dist =  (SEQ_MAX - seq_num) + *dst_seq_num;
0c6670
 	} else {
0c6670
commit 5bd88ebd63af20577095c2c98975f0f1781ba46a
0c6670
Author: Fabio M. Di Nitto <fdinitto@redhat.com>
0c6670
Date:   Tue Oct 15 07:02:05 2019 +0200
0c6670
0c6670
    [rx] copy data into the defrag buffer only if we know the size of the frame
0c6670
    
0c6670
    Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>
0c6670
0c6670
diff --git a/libknet/threads_rx.c b/libknet/threads_rx.c
0c6670
index b2a5dad..6c26cdc 100644
0c6670
--- a/libknet/threads_rx.c
0c6670
+++ b/libknet/threads_rx.c
0c6670
@@ -186,8 +186,10 @@ static int pckt_defrag(knet_handle_t knet_h, struct knet_header *inbuf, ssize_t
0c6670
 		defrag_buf->frag_size = *len;
0c6670
 	}
0c6670
 
0c6670
-	memmove(defrag_buf->buf + ((inbuf->khp_data_frag_seq - 1) * defrag_buf->frag_size),
0c6670
-	       inbuf->khp_data_userdata, *len);
0c6670
+	if (defrag_buf->frag_size) {
0c6670
+		memmove(defrag_buf->buf + ((inbuf->khp_data_frag_seq - 1) * defrag_buf->frag_size),
0c6670
+		       inbuf->khp_data_userdata, *len);
0c6670
+	}
0c6670
 
0c6670
 	defrag_buf->frag_recv++;
0c6670
 	defrag_buf->frag_map[inbuf->khp_data_frag_seq] = 1;
0c6670
commit cd59986900510119d8e7b63d33ad35466d480858
0c6670
Author: Fabio M. Di Nitto <fdinitto@redhat.com>
0c6670
Date:   Tue Oct 15 07:16:22 2019 +0200
0c6670
0c6670
    [test] add ability to knet_bench to specify a fixed packet size for perf test
0c6670
    
0c6670
    Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>
0c6670
0c6670
diff --git a/libknet/tests/knet_bench.c b/libknet/tests/knet_bench.c
0c6670
index dc04239..54b5303 100644
0c6670
--- a/libknet/tests/knet_bench.c
0c6670
+++ b/libknet/tests/knet_bench.c
0c6670
@@ -67,6 +67,8 @@ static int test_type = TEST_PING;
0c6670
 static uint64_t perf_by_size_size = 1 * ONE_GIGABYTE;
0c6670
 static uint64_t perf_by_time_secs = 10;
0c6670
 
0c6670
+static uint32_t force_packet_size = 0;
0c6670
+
0c6670
 struct node {
0c6670
 	int nodeid;
0c6670
 	int links;
0c6670
@@ -109,6 +111,7 @@ static void print_help(void)
0c6670
 	printf(" -s                                        nodeid that will generate traffic for benchmarks\n");
0c6670
 	printf(" -S [size|seconds]                         when used in combination with -T perf-by-size it indicates how many GB of traffic to generate for the test. (default: 1GB)\n");
0c6670
 	printf("                                           when used in combination with -T perf-by-time it indicates how many Seconds of traffic to generate for the test. (default: 10 seconds)\n");
0c6670
+	printf(" -x                                        force packet size for perf-by-time or perf-by-size\n");
0c6670
 	printf(" -C                                        repeat the test continously (default: off)\n");
0c6670
 	printf(" -X[XX]                                    show stats at the end of the run (default: 1)\n");
0c6670
 	printf("                                           1: show handle stats, 2: show summary link stats\n");
0c6670
@@ -250,7 +253,7 @@ static void setup_knet(int argc, char *argv[])
0c6670
 
0c6670
 	memset(nodes, 0, sizeof(nodes));
0c6670
 
0c6670
-	while ((rv = getopt(argc, argv, "aCT:S:s:ldfom:wb:t:n:c:p:X::P:z:h")) != EOF) {
0c6670
+	while ((rv = getopt(argc, argv, "aCT:S:s:ldfom:wb:t:n:c:p:x:X::P:z:h")) != EOF) {
0c6670
 		switch(rv) {
0c6670
 			case 'h':
0c6670
 				print_help();
0c6670
@@ -406,6 +409,13 @@ static void setup_knet(int argc, char *argv[])
0c6670
 				perf_by_size_size = (uint64_t)atoi(optarg) * ONE_GIGABYTE;
0c6670
 				perf_by_time_secs = (uint64_t)atoi(optarg);
0c6670
 				break;
0c6670
+			case 'x':
0c6670
+				force_packet_size = (uint32_t)atoi(optarg);
0c6670
+				if ((force_packet_size < 1) || (force_packet_size > 65536)) {
0c6670
+					printf("Unsupported packet size %u (accepted 1 - 65536)\n", force_packet_size);
0c6670
+					exit(FAIL);
0c6670
+				}
0c6670
+				break;
0c6670
 			case 'C':
0c6670
 				continous = 1;
0c6670
 				break;
0c6670
@@ -874,7 +884,7 @@ static int setup_send_buffers_common(struct knet_mmsghdr *msg, struct iovec *iov
0c6670
 			printf("TXT: Unable to malloc!\n");
0c6670
 			return -1;
0c6670
 		}
0c6670
-		memset(tx_buf[i], 0, KNET_MAX_PACKET_SIZE);
0c6670
+		memset(tx_buf[i], i, KNET_MAX_PACKET_SIZE);
0c6670
 		iov_out[i].iov_base = (void *)tx_buf[i];
0c6670
 		memset(&msg[i].msg_hdr, 0, sizeof(struct msghdr));
0c6670
 		msg[i].msg_hdr.msg_iov = &iov_out[i];
0c6670
@@ -898,6 +908,9 @@ static void send_perf_data_by_size(void)
0c6670
 	setup_send_buffers_common(msg, iov_out, tx_buf);
0c6670
 
0c6670
 	while (packetsize <= KNET_MAX_PACKET_SIZE) {
0c6670
+		if (force_packet_size) {
0c6670
+			packetsize = force_packet_size;
0c6670
+		}
0c6670
 		for (i = 0; i < PCKT_FRAG_MAX; i++) {
0c6670
 			iov_out[i].iov_len = packetsize;
0c6670
 		}
0c6670
@@ -926,7 +939,7 @@ static void send_perf_data_by_size(void)
0c6670
 
0c6670
 		knet_send(knet_h, ctrl_message, TEST_STOP, channel);
0c6670
 
0c6670
-		if (packetsize == KNET_MAX_PACKET_SIZE) {
0c6670
+		if ((packetsize == KNET_MAX_PACKET_SIZE) || (force_packet_size)) {
0c6670
 			break;
0c6670
 		}
0c6670
 
0c6670
@@ -1175,6 +1188,9 @@ static void send_perf_data_by_time(void)
0c6670
 	memset(&clock_end, 0, sizeof(clock_start));
0c6670
 
0c6670
 	while (packetsize <= KNET_MAX_PACKET_SIZE) {
0c6670
+		if (force_packet_size) {
0c6670
+			packetsize = force_packet_size;
0c6670
+		}
0c6670
 		for (i = 0; i < PCKT_FRAG_MAX; i++) {
0c6670
 			iov_out[i].iov_len = packetsize;
0c6670
 		}
0c6670
@@ -1205,7 +1221,7 @@ static void send_perf_data_by_time(void)
0c6670
 
0c6670
 		knet_send(knet_h, ctrl_message, TEST_STOP, channel);
0c6670
 
0c6670
-		if (packetsize == KNET_MAX_PACKET_SIZE) {
0c6670
+		if ((packetsize == KNET_MAX_PACKET_SIZE) || (force_packet_size)) {
0c6670
 			break;
0c6670
 		}
0c6670
 
0c6670
commit e28e2ea7c7e8139a6792ec1508215d4560b53e65
0c6670
Author: Fabio M. Di Nitto <fdinitto@redhat.com>
0c6670
Date:   Wed Oct 16 08:10:23 2019 +0200
0c6670
0c6670
    [test] add packet verification option to knet_bench
0c6670
    
0c6670
    Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>
0c6670
0c6670
diff --git a/libknet/tests/knet_bench.c b/libknet/tests/knet_bench.c
0c6670
index 54b5303..c9e1c06 100644
0c6670
--- a/libknet/tests/knet_bench.c
0c6670
+++ b/libknet/tests/knet_bench.c
0c6670
@@ -47,6 +47,7 @@ static char *compresscfg = NULL;
0c6670
 static char *cryptocfg = NULL;
0c6670
 static int machine_output = 0;
0c6670
 static int use_access_lists = 0;
0c6670
+static int use_pckt_verification = 0;
0c6670
 
0c6670
 static int bench_shutdown_in_progress = 0;
0c6670
 static pthread_mutex_t shutdown_mutex = PTHREAD_MUTEX_INITIALIZER;
0c6670
@@ -76,6 +77,11 @@ struct node {
0c6670
 	struct sockaddr_storage address[KNET_MAX_LINK];
0c6670
 };
0c6670
 
0c6670
+struct pckt_ver {
0c6670
+	uint32_t len;
0c6670
+	uint32_t chksum;
0c6670
+};
0c6670
+
0c6670
 static void print_help(void)
0c6670
 {
0c6670
 	printf("knet_bench usage:\n");
0c6670
@@ -117,6 +123,7 @@ static void print_help(void)
0c6670
 	printf("                                           1: show handle stats, 2: show summary link stats\n");
0c6670
 	printf("                                           3: show detailed link stats\n");
0c6670
 	printf(" -a                                        enable machine parsable output (default: off).\n");
0c6670
+	printf(" -v                                        enable packet verification for performance tests (default: off).\n");
0c6670
 }
0c6670
 
0c6670
 static void parse_nodes(char *nodesinfo[MAX_NODES], int onidx, int port, struct node nodes[MAX_NODES], int *thisidx)
0c6670
@@ -253,7 +260,7 @@ static void setup_knet(int argc, char *argv[])
0c6670
 
0c6670
 	memset(nodes, 0, sizeof(nodes));
0c6670
 
0c6670
-	while ((rv = getopt(argc, argv, "aCT:S:s:ldfom:wb:t:n:c:p:x:X::P:z:h")) != EOF) {
0c6670
+	while ((rv = getopt(argc, argv, "aCT:S:s:lvdfom:wb:t:n:c:p:x:X::P:z:h")) != EOF) {
0c6670
 		switch(rv) {
0c6670
 			case 'h':
0c6670
 				print_help();
0c6670
@@ -411,11 +418,14 @@ static void setup_knet(int argc, char *argv[])
0c6670
 				break;
0c6670
 			case 'x':
0c6670
 				force_packet_size = (uint32_t)atoi(optarg);
0c6670
-				if ((force_packet_size < 1) || (force_packet_size > 65536)) {
0c6670
-					printf("Unsupported packet size %u (accepted 1 - 65536)\n", force_packet_size);
0c6670
+				if ((force_packet_size < 64) || (force_packet_size > 65536)) {
0c6670
+					printf("Unsupported packet size %u (accepted 64 - 65536)\n", force_packet_size);
0c6670
 					exit(FAIL);
0c6670
 				}
0c6670
 				break;
0c6670
+			case 'v':
0c6670
+				use_pckt_verification = 1;
0c6670
+				break;
0c6670
 			case 'C':
0c6670
 				continous = 1;
0c6670
 				break;
0c6670
@@ -654,6 +664,24 @@ static void setup_knet(int argc, char *argv[])
0c6670
 	}
0c6670
 }
0c6670
 
0c6670
+/*
0c6670
+ * calculate weak chksum (stolen from corosync for debugging purposes)
0c6670
+ */
0c6670
+static uint32_t compute_chsum(const unsigned char *data, uint32_t data_len)
0c6670
+{
0c6670
+	unsigned int i;
0c6670
+	unsigned int checksum = 0;
0c6670
+
0c6670
+	for (i = 0; i < data_len; i++) {
0c6670
+		if (checksum & 1) {
0c6670
+			checksum |= 0x10000;
0c6670
+		}
0c6670
+
0c6670
+		checksum = ((checksum >> 1) + (unsigned char)data[i]) & 0xffff;
0c6670
+	}
0c6670
+	return (checksum);
0c6670
+}
0c6670
+
0c6670
 static void *_rx_thread(void *args)
0c6670
 {
0c6670
 	int rx_epoll;
0c6670
@@ -766,6 +794,20 @@ static void *_rx_thread(void *args)
0c6670
 							}
0c6670
 							continue;
0c6670
 						}
0c6670
+						if (use_pckt_verification) {
0c6670
+							struct pckt_ver *recv_pckt = (struct pckt_ver *)msg[i].msg_hdr.msg_iov->iov_base;
0c6670
+							uint32_t chksum;
0c6670
+
0c6670
+							if (msg[i].msg_len != recv_pckt->len) {
0c6670
+								printf("Wrong packet len received: %u expected: %u!\n", msg[i].msg_len, recv_pckt->len);
0c6670
+								exit(FAIL);
0c6670
+							}
0c6670
+							chksum = compute_chsum((const unsigned char *)msg[i].msg_hdr.msg_iov->iov_base + sizeof(struct pckt_ver), msg[i].msg_len - sizeof(struct pckt_ver));
0c6670
+							if (recv_pckt->chksum != chksum){
0c6670
+								printf("Wrong packet checksum received: %u expected: %u!\n", recv_pckt->chksum, chksum);
0c6670
+								exit(FAIL);
0c6670
+							}
0c6670
+						}
0c6670
 						rx_pkts++;
0c6670
 						rx_bytes = rx_bytes + msg[i].msg_len;
0c6670
 						current_pckt_size = msg[i].msg_len;
0c6670
@@ -913,6 +955,11 @@ static void send_perf_data_by_size(void)
0c6670
 		}
0c6670
 		for (i = 0; i < PCKT_FRAG_MAX; i++) {
0c6670
 			iov_out[i].iov_len = packetsize;
0c6670
+			if (use_pckt_verification) {
0c6670
+				struct pckt_ver *tx_pckt = (struct pckt_ver *)iov_out[i].iov_base; /* header lives at the start of the payload buffer, not in the iovec itself */
0c6670
+				tx_pckt->len = iov_out[i].iov_len;
0c6670
+				tx_pckt->chksum = compute_chsum((const unsigned char *)iov_out[i].iov_base + sizeof(struct pckt_ver), iov_out[i].iov_len - sizeof(struct pckt_ver));
0c6670
+			}
0c6670
 		}
0c6670
 
0c6670
 		total_pkts_to_tx = perf_by_size_size / packetsize;
0c6670
@@ -1193,6 +1240,11 @@ static void send_perf_data_by_time(void)
0c6670
 		}
0c6670
 		for (i = 0; i < PCKT_FRAG_MAX; i++) {
0c6670
 			iov_out[i].iov_len = packetsize;
0c6670
+			if (use_pckt_verification) {
0c6670
+				struct pckt_ver *tx_pckt = (struct pckt_ver *)iov_out[i].iov_base;
0c6670
+				tx_pckt->len = iov_out[i].iov_len;
0c6670
+				tx_pckt->chksum = compute_chsum((const unsigned char *)iov_out[i].iov_base + sizeof(struct pckt_ver), iov_out[i].iov_len - sizeof(struct pckt_ver));
0c6670
+			}
0c6670
 		}
0c6670
 		printf("[info]: testing with %u bytes packet size for %" PRIu64 " seconds.\n", packetsize, perf_by_time_secs);
0c6670