Blame SOURCES/sos-bz2082914-collect-pacemaker-cluster.patch

7701ef
From 3b84b4ccfa9e4924a5a3829d3810568dfb69bf63 Mon Sep 17 00:00:00 2001
7701ef
From: Jake Hunsaker <jhunsake@redhat.com>
7701ef
Date: Fri, 18 Mar 2022 16:25:35 -0400
7701ef
Subject: [PATCH 1/2] [pacemaker] Redesign node enumeration logic
7701ef
7701ef
It has been found that `pcs status` output is liable to change, which
7701ef
ends up breaking our parsing of node lists when using it on newer
7701ef
versions.
7701ef
7701ef
Instead, first try to parse through `crm_mon` output, which is what `pcs
7701ef
status` uses under the hood, but as a stable and reliable xml format.
7701ef
7701ef
Failing that, for example if the `--primary` node is not functioning as
7701ef
part of the cluster, source `/etc/corosync/corosync.conf` instead.
7701ef
7701ef
Related: RHBZ2065805
7701ef
Related: RHBZ2065811
7701ef
7701ef
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
7701ef
---
7701ef
 sos/collector/clusters/pacemaker.py | 110 +++++++++++++++++++---------
7701ef
 1 file changed, 76 insertions(+), 34 deletions(-)
7701ef
7701ef
diff --git a/sos/collector/clusters/pacemaker.py b/sos/collector/clusters/pacemaker.py
7701ef
index 55024314..49d0ce51 100644
7701ef
--- a/sos/collector/clusters/pacemaker.py
7701ef
+++ b/sos/collector/clusters/pacemaker.py
7701ef
@@ -8,7 +8,11 @@
7701ef
 #
7701ef
 # See the LICENSE file in the source distribution for further information.
7701ef
 
7701ef
+import re
7701ef
+
7701ef
 from sos.collector.clusters import Cluster
7701ef
+from setuptools._vendor.packaging import version
7701ef
+from xml.etree import ElementTree
7701ef
 
7701ef
 
7701ef
 class pacemaker(Cluster):
7701ef
@@ -18,42 +22,80 @@ class pacemaker(Cluster):
7701ef
     packages = ('pacemaker',)
7701ef
     option_list = [
7701ef
         ('online', True, 'Collect nodes listed as online'),
7701ef
-        ('offline', True, 'Collect nodes listed as offline')
7701ef
+        ('offline', True, 'Collect nodes listed as offline'),
7701ef
+        ('only-corosync', False, 'Only use corosync.conf to enumerate nodes')
7701ef
     ]
7701ef
 
7701ef
     def get_nodes(self):
7701ef
-        self.res = self.exec_primary_cmd('pcs status')
7701ef
-        if self.res['status'] != 0:
7701ef
-            self.log_error('Cluster status could not be determined. Is the '
7701ef
-                           'cluster running on this node?')
7701ef
-            return []
7701ef
-        if 'node names do not match' in self.res['output']:
7701ef
-            self.log_warn('Warning: node name mismatch reported. Attempts to '
7701ef
-                          'connect to some nodes may fail.\n')
7701ef
-        return self.parse_pcs_output()
7701ef
-
7701ef
-    def parse_pcs_output(self):
7701ef
-        nodes = []
7701ef
-        if self.get_option('online'):
7701ef
-            nodes += self.get_online_nodes()
7701ef
-        if self.get_option('offline'):
7701ef
-            nodes += self.get_offline_nodes()
7701ef
-        return nodes
7701ef
-
7701ef
-    def get_online_nodes(self):
7701ef
-        for line in self.res['output'].splitlines():
7701ef
-            if line.startswith('Online:'):
7701ef
-                nodes = line.split('[')[1].split(']')[0]
7701ef
-                return [n for n in nodes.split(' ') if n]
7701ef
-
7701ef
-    def get_offline_nodes(self):
7701ef
-        offline = []
7701ef
-        for line in self.res['output'].splitlines():
7701ef
-            if line.startswith('Node') and line.endswith('(offline)'):
7701ef
-                offline.append(line.split()[1].replace(':', ''))
7701ef
-            if line.startswith('OFFLINE:'):
7701ef
-                nodes = line.split('[')[1].split(']')[0]
7701ef
-                offline.extend([n for n in nodes.split(' ') if n])
7701ef
-        return offline
7701ef
+        self.nodes = []
7701ef
+        # try crm_mon first
7701ef
+        try:
7701ef
+            if not self.get_option('only-corosync'):
7701ef
+                try:
7701ef
+                    self.get_nodes_from_crm()
7701ef
+                except Exception as err:
7701ef
+                    self.log_warn("Falling back to sourcing corosync.conf. "
7701ef
+                                  "Could not parse crm_mon output: %s" % err)
7701ef
+            if not self.nodes:
7701ef
+                # fallback to corosync.conf, in case the node we're inspecting
7701ef
+                # is offline from the cluster
7701ef
+                self.get_nodes_from_corosync()
7701ef
+        except Exception as err:
7701ef
+            self.log_error("Could not determine nodes from cluster: %s" % err)
7701ef
+
7701ef
+        _shorts = [n for n in self.nodes if '.' not in n]
7701ef
+        if _shorts:
7701ef
+            self.log_warn(
7701ef
+                "WARNING: Node addresses '%s' may not resolve locally if you "
7701ef
+                "are not running on a node in the cluster. Try using option "
7701ef
+                "'-c pacemaker.only-corosync' if these connections fail."
7701ef
+                % ','.join(_shorts)
7701ef
+            )
7701ef
+        return self.nodes
7701ef
+
7701ef
+    def get_nodes_from_crm(self):
7701ef
+        """
7701ef
+        Try to parse crm_mon output for node list and status.
7701ef
+        """
7701ef
+        xmlopt = '--output-as=xml'
7701ef
+        # older pacemaker had a different option for xml output
7701ef
+        _ver = self.exec_primary_cmd('crm_mon --version')
7701ef
+        if _ver['status'] == 0:
7701ef
+            cver = _ver['output'].split()[1].split('-')[0]
7701ef
+            if not version.parse(cver) > version.parse('2.0.3'):
7701ef
+                xmlopt = '--as-xml'
7701ef
+        else:
7701ef
+            return
7701ef
+        _out = self.exec_primary_cmd(
7701ef
+            "crm_mon --one-shot --inactive %s" % xmlopt,
7701ef
+            need_root=True
7701ef
+        )
7701ef
+        if _out['status'] == 0:
7701ef
+            self.parse_crm_xml(_out['output'])
7701ef
+
7701ef
+    def parse_crm_xml(self, xmlstring):
7701ef
+        """
7701ef
+        Parse the xml output string provided by crm_mon
7701ef
+        """
7701ef
+        _xml = ElementTree.fromstring(xmlstring)
7701ef
+        nodes = _xml.find('nodes')
7701ef
+        for node in nodes:
7701ef
+            _node = node.attrib
7701ef
+            if self.get_option('online') and _node['online'] == 'true':
7701ef
+                self.nodes.append(_node['name'])
7701ef
+            elif self.get_option('offline') and _node['online'] == 'false':
7701ef
+                self.nodes.append(_node['name'])
7701ef
+
7701ef
+    def get_nodes_from_corosync(self):
7701ef
+        """
7701ef
+        As a fallback measure, read corosync.conf to get the node list. Note
7701ef
+        that this prevents us from separating online nodes from offline nodes.
7701ef
+        """
7701ef
+        self.log_warn("WARNING: unable to distinguish online nodes from "
7701ef
+                      "offline nodes when sourcing from corosync.conf")
7701ef
+        cc = self.primary.read_file('/etc/corosync/corosync.conf')
7701ef
+        nodes = re.findall(r'((\sring0_addr:)(.*))', cc)
7701ef
+        for node in nodes:
7701ef
+            self.nodes.append(node[-1].strip())
7701ef
 
7701ef
 # vim: set et ts=4 sw=4 :
7701ef
-- 
7701ef
2.34.3
7701ef
7701ef
7701ef
From 6701a7d77ecc998b018b54ecc00f9fd102ae9518 Mon Sep 17 00:00:00 2001
7701ef
From: Jake Hunsaker <jhunsake@redhat.com>
7701ef
Date: Mon, 21 Mar 2022 12:05:59 -0400
7701ef
Subject: [PATCH 2/2] [clusters] Allow clusters to not add localhost to node
7701ef
 list
7701ef
7701ef
For most of our supported clusters, we end up needing to add the
7701ef
local host executing `sos collect` to the node list (unless `--no-local`
7701ef
is used) as that accounts for the primary node that may otherwise be
7701ef
left off. However, this is not helpful for clusters that may report
7701ef
node names as something other than resolvable names. In those cases,
7701ef
such as with pacemaker, adding the local hostname may result in
7701ef
duplicate collections.
7701ef
7701ef
Add a toggle to cluster profiles via a new `strict_node_list` class attr
7701ef
that, if True, will skip this addition. This toggle is default `False`
7701ef
to preserve existing behavior, and is now enabled for `pacemaker`
7701ef
specifically.
7701ef
7701ef
Related: RHBZ#2065821
7701ef
7701ef
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
7701ef
---
7701ef
 sos/collector/__init__.py           | 3 ++-
7701ef
 sos/collector/clusters/__init__.py  | 4 ++++
7701ef
 sos/collector/clusters/pacemaker.py | 1 +
7701ef
 3 files changed, 7 insertions(+), 1 deletion(-)
7701ef
7701ef
diff --git a/sos/collector/__init__.py b/sos/collector/__init__.py
7701ef
index a8bb0064..d898ca34 100644
7701ef
--- a/sos/collector/__init__.py
7701ef
+++ b/sos/collector/__init__.py
7701ef
@@ -1073,7 +1073,8 @@ class SoSCollector(SoSComponent):
7701ef
             for node in self.node_list:
7701ef
                 if host == node.split('.')[0]:
7701ef
                     self.node_list.remove(node)
7701ef
-            self.node_list.append(self.hostname)
7701ef
+            if not self.cluster.strict_node_list:
7701ef
+                self.node_list.append(self.hostname)
7701ef
         self.reduce_node_list()
7701ef
         try:
7701ef
             _node_max = len(max(self.node_list, key=len))
7701ef
diff --git a/sos/collector/clusters/__init__.py b/sos/collector/clusters/__init__.py
7701ef
index f3f550ad..f00677b8 100644
7701ef
--- a/sos/collector/clusters/__init__.py
7701ef
+++ b/sos/collector/clusters/__init__.py
7701ef
@@ -57,6 +57,10 @@ class Cluster():
7701ef
     sos_plugin_options = {}
7701ef
     sos_preset = ''
7701ef
     cluster_name = None
7701ef
+    # set this to True if the local host running collect should *not* be
7701ef
+    # forcibly added to the node list. This can be helpful in situations where
7701ef
+    # the host's fqdn and the name the cluster uses are different
7701ef
+    strict_node_list = False
7701ef
 
7701ef
     def __init__(self, commons):
7701ef
         self.primary = None
7701ef
diff --git a/sos/collector/clusters/pacemaker.py b/sos/collector/clusters/pacemaker.py
7701ef
index 49d0ce51..bebcb265 100644
7701ef
--- a/sos/collector/clusters/pacemaker.py
7701ef
+++ b/sos/collector/clusters/pacemaker.py
7701ef
@@ -20,6 +20,7 @@ class pacemaker(Cluster):
7701ef
     cluster_name = 'Pacemaker High Availability Cluster Manager'
7701ef
     sos_plugins = ['pacemaker']
7701ef
     packages = ('pacemaker',)
7701ef
+    strict_node_list = True
7701ef
     option_list = [
7701ef
         ('online', True, 'Collect nodes listed as online'),
7701ef
         ('offline', True, 'Collect nodes listed as offline'),
7701ef
-- 
7701ef
2.34.3
7701ef