6c49ac
From b37391fef92548f31822f9df2a9b5fa2a61b4514 Mon Sep 17 00:00:00 2001
6c49ac
From: Ken Gaillot <kgaillot@redhat.com>
6c49ac
Date: Wed, 23 Jun 2021 15:17:54 -0500
6c49ac
Subject: [PATCH] Fix: CTS: handle longer Corosync token timeouts
6c49ac
6c49ac
Previously, startall() would call cluster_stable() immediately after detecting
6c49ac
the "controller successfully started" message. If the Corosync token timeout is
6c49ac
small enough, this will be fine. However, with a token timeout of more than
6c49ac
about 1 second, the controllers will not have formed a membership by this
6c49ac
point, causing cluster_stable() to think there are multiple partitions, and
6c49ac
wait for a DC to be elected in each one, when really they will unite into a
6c49ac
single partition in a short time, and only elect a single DC.
6c49ac
6c49ac
Now, startall() waits until seeing that each node is a cluster member before
6c49ac
calling cluster_stable().
6c49ac
---
6c49ac
 cts/lab/CTS.py.in   | 3 ++-
6c49ac
 cts/lab/patterns.py | 2 ++
6c49ac
 2 files changed, 4 insertions(+), 1 deletion(-)
6c49ac
6c49ac
diff --git a/cts/lab/CTS.py.in b/cts/lab/CTS.py.in
6c49ac
index abcb9d285..d9924437b 100644
6c49ac
--- a/cts/lab/CTS.py.in
6c49ac
+++ b/cts/lab/CTS.py.in
6c49ac
@@ -628,9 +628,10 @@ class ClusterManager(UserDict):
6c49ac
         watchpats = [ ]
6c49ac
         watchpats.append(self.templates["Pat:DC_IDLE"])
6c49ac
         for node in nodelist:
6c49ac
-            watchpats.append(self.templates["Pat:Local_started"] % node)
6c49ac
             watchpats.append(self.templates["Pat:InfraUp"] % node)
6c49ac
             watchpats.append(self.templates["Pat:PacemakerUp"] % node)
6c49ac
+            watchpats.append(self.templates["Pat:Local_started"] % node)
6c49ac
+            watchpats.append(self.templates["Pat:They_up"] % (nodelist[0], node))
6c49ac
 
6c49ac
         #   Start all the nodes - at about the same time...
6c49ac
         watch = LogWatcher(self.Env["LogFileName"], watchpats, "fast-start", self.Env["DeadTime"]+10, hosts=self.Env["nodes"], kind=self.Env["LogWatcher"])
6c49ac
diff --git a/cts/lab/patterns.py b/cts/lab/patterns.py
6c49ac
index e21a016ff..400fd3dc8 100644
6c49ac
--- a/cts/lab/patterns.py
6c49ac
+++ b/cts/lab/patterns.py
6c49ac
@@ -61,6 +61,7 @@ class BasePatterns(object):
6c49ac
             "Pat:We_stopped"    : "%s\W.*OVERRIDE THIS PATTERN",
6c49ac
             "Pat:They_stopped"  : "%s\W.*LOST:.* %s ",
6c49ac
             "Pat:They_dead"     : "node %s.*: is dead",
6c49ac
+            "Pat:They_up"       : "%s %s\W.*OVERRIDE THIS PATTERN",
6c49ac
             "Pat:TransitionComplete" : "Transition status: Complete: complete",
6c49ac
 
6c49ac
             "Pat:Fencing_start"   : r"Requesting peer fencing .* targeting %s",
6c49ac
@@ -130,6 +131,7 @@ class crm_corosync(BasePatterns):
6c49ac
             "Pat:We_stopped"   : "%s\W.*Unloading all Corosync service engines",
6c49ac
             "Pat:They_stopped" : "%s\W.*pacemaker-controld.*Node %s(\[|\s).*state is now lost",
6c49ac
             "Pat:They_dead"    : "pacemaker-controld.*Node %s(\[|\s).*state is now lost",
6c49ac
+            "Pat:They_up"      : "\W%s\W.*pacemaker-controld.*Node %s state is now member",
6c49ac
 
6c49ac
             "Pat:ChildExit"    : r"\[[0-9]+\] exited with status [0-9]+ \(",
6c49ac
             # "with signal 9" == pcmk_child_exit(), "$" == check_active_before_startup_processes()
6c49ac
-- 
6c49ac
2.27.0
6c49ac