From b37391fef92548f31822f9df2a9b5fa2a61b4514 Mon Sep 17 00:00:00 2001 From: Ken Gaillot Date: Wed, 23 Jun 2021 15:17:54 -0500 Subject: [PATCH] Fix: CTS: handle longer Corosync token timeouts Previously, startall() would call cluster_stable() immediately after detecting the "controller successfully started" message. If the Corosync token timeout is small enough, this will be fine. However, with a token timeout of more than about 1 second, the controllers will not have formed a membership by this point, causing cluster_stable() to think there are multiple partitions, and wait for a DC to be elected in each one, when really they will unite into a single partition in a short time, and only elect a single DC. Now, startall() waits until seeing that each node is a cluster member before calling cluster_stable(). --- cts/lab/CTS.py.in | 3 ++- cts/lab/patterns.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/cts/lab/CTS.py.in b/cts/lab/CTS.py.in index abcb9d285..d9924437b 100644 --- a/cts/lab/CTS.py.in +++ b/cts/lab/CTS.py.in @@ -628,9 +628,10 @@ class ClusterManager(UserDict): watchpats = [ ] watchpats.append(self.templates["Pat:DC_IDLE"]) for node in nodelist: - watchpats.append(self.templates["Pat:Local_started"] % node) watchpats.append(self.templates["Pat:InfraUp"] % node) watchpats.append(self.templates["Pat:PacemakerUp"] % node) + watchpats.append(self.templates["Pat:Local_started"] % node) + watchpats.append(self.templates["Pat:They_up"] % (nodelist[0], node)) # Start all the nodes - at about the same time... 
watch = LogWatcher(self.Env["LogFileName"], watchpats, "fast-start", self.Env["DeadTime"]+10, hosts=self.Env["nodes"], kind=self.Env["LogWatcher"]) diff --git a/cts/lab/patterns.py b/cts/lab/patterns.py index e21a016ff..400fd3dc8 100644 --- a/cts/lab/patterns.py +++ b/cts/lab/patterns.py @@ -61,6 +61,7 @@ class BasePatterns(object): "Pat:We_stopped" : "%s\W.*OVERRIDE THIS PATTERN", "Pat:They_stopped" : "%s\W.*LOST:.* %s ", "Pat:They_dead" : "node %s.*: is dead", + "Pat:They_up" : "%s %s\W.*OVERRIDE THIS PATTERN", "Pat:TransitionComplete" : "Transition status: Complete: complete", "Pat:Fencing_start" : r"Requesting peer fencing .* targeting %s", @@ -130,6 +131,7 @@ class crm_corosync(BasePatterns): "Pat:We_stopped" : "%s\W.*Unloading all Corosync service engines", "Pat:They_stopped" : "%s\W.*pacemaker-controld.*Node %s(\[|\s).*state is now lost", "Pat:They_dead" : "pacemaker-controld.*Node %s(\[|\s).*state is now lost", + "Pat:They_up" : "\W%s\W.*pacemaker-controld.*Node %s state is now member", "Pat:ChildExit" : r"\[[0-9]+\] exited with status [0-9]+ \(", # "with signal 9" == pcmk_child_exit(), "$" == check_active_before_startup_processes() -- 2.27.0