|
|
ca7c20 |
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
|
ca7c20 |
From: Benjamin Marzinski <bmarzins@redhat.com>
|
|
|
ca7c20 |
Date: Tue, 29 Mar 2022 22:22:10 -0500
|
|
|
ca7c20 |
Subject: [PATCH] multipathd: Don't keep starting TUR threads, if they always
|
|
|
ca7c20 |
hang.
|
|
|
ca7c20 |
|
|
|
ca7c20 |
If a tur thread hangs, multipathd was simply creating a new thread, and
|
|
|
ca7c20 |
assuming that the old thread would get cleaned up eventually. I have
|
|
|
ca7c20 |
seen a case recently where there were 26000 multipathd threads on a
|
|
|
ca7c20 |
system, all stuck trying to send TUR commands to path devices. The root
|
|
|
ca7c20 |
cause of the issue was a scsi kernel issue, but it shows that the way
|
|
|
ca7c20 |
multipathd currently deals with stuck threads could use some refinement.
|
|
|
ca7c20 |
|
|
|
ca7c20 |
Now, when one tur thread hangs, multipathd will act as it did before.
|
|
|
ca7c20 |
If a second one in a row hangs, multipathd will instead wait for it to
|
|
|
ca7c20 |
complete before starting another thread. Once the thread completes, the
|
|
|
ca7c20 |
count is reset.
|
|
|
ca7c20 |
|
|
|
ca7c20 |
Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
|
|
|
ca7c20 |
Reviewed-by: Martin Wilck
|
|
|
ca7c20 |
---
|
|
|
ca7c20 |
libmultipath/checkers/tur.c | 23 +++++++++++++++++++++--
|
|
|
ca7c20 |
1 file changed, 21 insertions(+), 2 deletions(-)
|
|
|
ca7c20 |
|
|
|
ca7c20 |
diff --git a/libmultipath/checkers/tur.c b/libmultipath/checkers/tur.c
|
|
|
ca7c20 |
index a4b4a213..d82f7dbc 100644
|
|
|
ca7c20 |
--- a/libmultipath/checkers/tur.c
|
|
|
ca7c20 |
+++ b/libmultipath/checkers/tur.c
|
|
|
ca7c20 |
@@ -27,6 +27,7 @@
|
|
|
ca7c20 |
|
|
|
ca7c20 |
#define TUR_CMD_LEN 6
|
|
|
ca7c20 |
#define HEAVY_CHECK_COUNT 10
|
|
|
ca7c20 |
+#define MAX_NR_TIMEOUTS 1
|
|
|
ca7c20 |
|
|
|
ca7c20 |
enum {
|
|
|
ca7c20 |
MSG_TUR_RUNNING = CHECKER_FIRST_MSGID,
|
|
|
ca7c20 |
@@ -55,6 +56,7 @@ struct tur_checker_context {
|
|
|
ca7c20 |
int holders; /* uatomic access only */
|
|
|
ca7c20 |
int msgid;
|
|
|
ca7c20 |
struct checker_context ctx;
|
|
|
ca7c20 |
+ unsigned int nr_timeouts;
|
|
|
ca7c20 |
};
|
|
|
ca7c20 |
|
|
|
ca7c20 |
int libcheck_init (struct checker * c)
|
|
|
ca7c20 |
@@ -359,8 +361,23 @@ int libcheck_check(struct checker * c)
|
|
|
ca7c20 |
}
|
|
|
ca7c20 |
} else {
|
|
|
ca7c20 |
if (uatomic_read(&ct->holders) > 1) {
|
|
|
ca7c20 |
+ /* The thread has been cancelled but hasn't quit. */
|
|
|
ca7c20 |
+ if (ct->nr_timeouts == MAX_NR_TIMEOUTS) {
|
|
|
ca7c20 |
+ condlog(2, "%d:%d : waiting for stalled tur thread to finish",
|
|
|
ca7c20 |
+ major(ct->devt), minor(ct->devt));
|
|
|
ca7c20 |
+ ct->nr_timeouts++;
|
|
|
ca7c20 |
+ }
|
|
|
ca7c20 |
/*
|
|
|
ca7c20 |
- * The thread has been cancelled but hasn't quit.
|
|
|
ca7c20 |
+ * Don't start new threads until the last one has
|
|
|
ca7c20 |
+ * finished.
|
|
|
ca7c20 |
+ */
|
|
|
ca7c20 |
+ if (ct->nr_timeouts > MAX_NR_TIMEOUTS) {
|
|
|
ca7c20 |
+ c->msgid = MSG_TUR_TIMEOUT;
|
|
|
ca7c20 |
+ return PATH_TIMEOUT;
|
|
|
ca7c20 |
+ }
|
|
|
ca7c20 |
+ ct->nr_timeouts++;
|
|
|
ca7c20 |
+ /*
|
|
|
ca7c20 |
+ * Start a new thread while the old one is stalled.
|
|
|
ca7c20 |
* We have to prevent it from interfering with the new
|
|
|
ca7c20 |
* thread. We create a new context and leave the old
|
|
|
ca7c20 |
* one with the stale thread, hoping it will clean up
|
|
|
ca7c20 |
@@ -376,13 +393,15 @@ int libcheck_check(struct checker * c)
|
|
|
ca7c20 |
*/
|
|
|
ca7c20 |
if (libcheck_init(c) != 0)
|
|
|
ca7c20 |
return PATH_UNCHECKED;
|
|
|
ca7c20 |
+ ((struct tur_checker_context *)c->context)->nr_timeouts = ct->nr_timeouts;
|
|
|
ca7c20 |
|
|
|
ca7c20 |
if (!uatomic_sub_return(&ct->holders, 1))
|
|
|
ca7c20 |
/* It did terminate, eventually */
|
|
|
ca7c20 |
cleanup_context(ct);
|
|
|
ca7c20 |
|
|
|
ca7c20 |
ct = c->context;
|
|
|
ca7c20 |
- }
|
|
|
ca7c20 |
+ } else
|
|
|
ca7c20 |
+ ct->nr_timeouts = 0;
|
|
|
ca7c20 |
/* Start new TUR checker */
|
|
|
ca7c20 |
pthread_mutex_lock(&ct->lock);
|
|
|
ca7c20 |
tur_status = ct->state = PATH_PENDING;
|