Blame SOURCES/0044-multipathd-Don-t-keep-starting-TUR-threads-if-they-a.patch

ca7c20
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
ca7c20
From: Benjamin Marzinski <bmarzins@redhat.com>
ca7c20
Date: Tue, 29 Mar 2022 22:22:10 -0500
ca7c20
Subject: [PATCH] multipathd: Don't keep starting TUR threads, if they always
ca7c20
 hang.
ca7c20
ca7c20
If a tur thread hangs, multipathd was simply creating a new thread, and
ca7c20
assuming that the old thread would get cleaned up eventually. I have
ca7c20
seen a case recently where there were 26000 multipathd threads on a
ca7c20
system, all stuck trying to send TUR commands to path devices. The root
ca7c20
cause of the issue was a scsi kernel issue, but it shows that the way
ca7c20
multipathd currently deals with stuck threads could use some refinement.
ca7c20
ca7c20
Now, when one tur thread hangs, multipathd will act as it did before.
ca7c20
If a second one in a row hangs, multipathd will instead wait for it to
ca7c20
complete before starting another thread. Once the thread completes, the
ca7c20
count is reset.
ca7c20
ca7c20
Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
ca7c20
Reviewed-by: Martin Wilck 
ca7c20
---
ca7c20
 libmultipath/checkers/tur.c | 23 +++++++++++++++++++++--
ca7c20
 1 file changed, 21 insertions(+), 2 deletions(-)
ca7c20
ca7c20
diff --git a/libmultipath/checkers/tur.c b/libmultipath/checkers/tur.c
ca7c20
index a4b4a213..d82f7dbc 100644
ca7c20
--- a/libmultipath/checkers/tur.c
ca7c20
+++ b/libmultipath/checkers/tur.c
ca7c20
@@ -27,6 +27,7 @@
ca7c20
 
ca7c20
 #define TUR_CMD_LEN 6
ca7c20
 #define HEAVY_CHECK_COUNT       10
ca7c20
+#define MAX_NR_TIMEOUTS 1
ca7c20
 
ca7c20
 enum {
ca7c20
 	MSG_TUR_RUNNING = CHECKER_FIRST_MSGID,
ca7c20
@@ -55,6 +56,7 @@ struct tur_checker_context {
ca7c20
 	int holders; /* uatomic access only */
ca7c20
 	int msgid;
ca7c20
 	struct checker_context ctx;
ca7c20
+	unsigned int nr_timeouts;
ca7c20
 };
ca7c20
 
ca7c20
 int libcheck_init (struct checker * c)
ca7c20
@@ -359,8 +361,23 @@ int libcheck_check(struct checker * c)
ca7c20
 		}
ca7c20
 	} else {
ca7c20
 		if (uatomic_read(&ct->holders) > 1) {
ca7c20
+			/* The thread has been cancelled but hasn't quit. */
ca7c20
+			if (ct->nr_timeouts == MAX_NR_TIMEOUTS) {
ca7c20
+				condlog(2, "%d:%d : waiting for stalled tur thread to finish",
ca7c20
+					major(ct->devt), minor(ct->devt));
ca7c20
+				ct->nr_timeouts++;
ca7c20
+			}
ca7c20
 			/*
ca7c20
-			 * The thread has been cancelled but hasn't quit.
ca7c20
+			 * Don't start new threads until the last once has
ca7c20
+			 * finished.
ca7c20
+			 */
ca7c20
+			if (ct->nr_timeouts > MAX_NR_TIMEOUTS) {
ca7c20
+				c->msgid = MSG_TUR_TIMEOUT;
ca7c20
+				return PATH_TIMEOUT;
ca7c20
+			}
ca7c20
+			ct->nr_timeouts++;
ca7c20
+			/*
ca7c20
+			 * Start a new thread while the old one is stalled.
ca7c20
 			 * We have to prevent it from interfering with the new
ca7c20
 			 * thread. We create a new context and leave the old
ca7c20
 			 * one with the stale thread, hoping it will clean up
ca7c20
@@ -376,13 +393,15 @@ int libcheck_check(struct checker * c)
ca7c20
 			 */
ca7c20
 			if (libcheck_init(c) != 0)
ca7c20
 				return PATH_UNCHECKED;
ca7c20
+			((struct tur_checker_context *)c->context)->nr_timeouts = ct->nr_timeouts;
ca7c20
 
ca7c20
 			if (!uatomic_sub_return(&ct->holders, 1))
ca7c20
 				/* It did terminate, eventually */
ca7c20
 				cleanup_context(ct);
ca7c20
 
ca7c20
 			ct = c->context;
ca7c20
-		}
ca7c20
+		} else
ca7c20
+			ct->nr_timeouts = 0;
ca7c20
 		/* Start new TUR checker */
ca7c20
 		pthread_mutex_lock(&ct->lock);
ca7c20
 		tur_status = ct->state = PATH_PENDING;