|
|
7701ef |
From 3b84b4ccfa9e4924a5a3829d3810568dfb69bf63 Mon Sep 17 00:00:00 2001
|
|
|
7701ef |
From: Jake Hunsaker <jhunsake@redhat.com>
|
|
|
7701ef |
Date: Fri, 18 Mar 2022 16:25:35 -0400
|
|
|
7701ef |
Subject: [PATCH 1/2] [pacemaker] Redesign node enumeration logic
|
|
|
7701ef |
|
|
|
7701ef |
It has been found that `pcs status` output is liable to change, which
|
|
|
7701ef |
ends up breaking our parsing of node lists when using it on newer
|
|
|
7701ef |
versions.
|
|
|
7701ef |
|
|
|
7701ef |
Instead, first try to parse through `crm_mon` output, which is what `pcs
|
|
|
7701ef |
status` uses under the hood, but as a stable and reliable xml format.
|
|
|
7701ef |
|
|
|
7701ef |
Failing that, for example if the `--primary` node is not functioning as
|
|
|
7701ef |
part of the cluster, source `/etc/corosync/corosync.conf` instead.
|
|
|
7701ef |
|
|
|
7701ef |
Related: RHBZ2065805
|
|
|
7701ef |
Related: RHBZ2065811
|
|
|
7701ef |
|
|
|
7701ef |
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
|
|
|
7701ef |
---
|
|
|
7701ef |
sos/collector/clusters/pacemaker.py | 110 +++++++++++++++++++---------
|
|
|
7701ef |
1 file changed, 76 insertions(+), 34 deletions(-)
|
|
|
7701ef |
|
|
|
7701ef |
diff --git a/sos/collector/clusters/pacemaker.py b/sos/collector/clusters/pacemaker.py
|
|
|
7701ef |
index 55024314..49d0ce51 100644
|
|
|
7701ef |
--- a/sos/collector/clusters/pacemaker.py
|
|
|
7701ef |
+++ b/sos/collector/clusters/pacemaker.py
|
|
|
7701ef |
@@ -8,7 +8,11 @@
|
|
|
7701ef |
#
|
|
|
7701ef |
# See the LICENSE file in the source distribution for further information.
|
|
|
7701ef |
|
|
|
7701ef |
+import re
|
|
|
7701ef |
+
|
|
|
7701ef |
from sos.collector.clusters import Cluster
|
|
|
7701ef |
+from setuptools._vendor.packaging import version
|
|
|
7701ef |
+from xml.etree import ElementTree
|
|
|
7701ef |
|
|
|
7701ef |
|
|
|
7701ef |
class pacemaker(Cluster):
|
|
|
7701ef |
@@ -18,42 +22,80 @@ class pacemaker(Cluster):
|
|
|
7701ef |
packages = ('pacemaker',)
|
|
|
7701ef |
option_list = [
|
|
|
7701ef |
('online', True, 'Collect nodes listed as online'),
|
|
|
7701ef |
- ('offline', True, 'Collect nodes listed as offline')
|
|
|
7701ef |
+ ('offline', True, 'Collect nodes listed as offline'),
|
|
|
7701ef |
+ ('only-corosync', False, 'Only use corosync.conf to enumerate nodes')
|
|
|
7701ef |
]
|
|
|
7701ef |
|
|
|
7701ef |
def get_nodes(self):
|
|
|
7701ef |
- self.res = self.exec_primary_cmd('pcs status')
|
|
|
7701ef |
- if self.res['status'] != 0:
|
|
|
7701ef |
- self.log_error('Cluster status could not be determined. Is the '
|
|
|
7701ef |
- 'cluster running on this node?')
|
|
|
7701ef |
- return []
|
|
|
7701ef |
- if 'node names do not match' in self.res['output']:
|
|
|
7701ef |
- self.log_warn('Warning: node name mismatch reported. Attempts to '
|
|
|
7701ef |
- 'connect to some nodes may fail.\n')
|
|
|
7701ef |
- return self.parse_pcs_output()
|
|
|
7701ef |
-
|
|
|
7701ef |
- def parse_pcs_output(self):
|
|
|
7701ef |
- nodes = []
|
|
|
7701ef |
- if self.get_option('online'):
|
|
|
7701ef |
- nodes += self.get_online_nodes()
|
|
|
7701ef |
- if self.get_option('offline'):
|
|
|
7701ef |
- nodes += self.get_offline_nodes()
|
|
|
7701ef |
- return nodes
|
|
|
7701ef |
-
|
|
|
7701ef |
- def get_online_nodes(self):
|
|
|
7701ef |
- for line in self.res['output'].splitlines():
|
|
|
7701ef |
- if line.startswith('Online:'):
|
|
|
7701ef |
- nodes = line.split('[')[1].split(']')[0]
|
|
|
7701ef |
- return [n for n in nodes.split(' ') if n]
|
|
|
7701ef |
-
|
|
|
7701ef |
- def get_offline_nodes(self):
|
|
|
7701ef |
- offline = []
|
|
|
7701ef |
- for line in self.res['output'].splitlines():
|
|
|
7701ef |
- if line.startswith('Node') and line.endswith('(offline)'):
|
|
|
7701ef |
- offline.append(line.split()[1].replace(':', ''))
|
|
|
7701ef |
- if line.startswith('OFFLINE:'):
|
|
|
7701ef |
- nodes = line.split('[')[1].split(']')[0]
|
|
|
7701ef |
- offline.extend([n for n in nodes.split(' ') if n])
|
|
|
7701ef |
- return offline
|
|
|
7701ef |
+ self.nodes = []
|
|
|
7701ef |
+ # try crm_mon first
|
|
|
7701ef |
+ try:
|
|
|
7701ef |
+ if not self.get_option('only-corosync'):
|
|
|
7701ef |
+ try:
|
|
|
7701ef |
+ self.get_nodes_from_crm()
|
|
|
7701ef |
+ except Exception as err:
|
|
|
7701ef |
+ self.log_warn("Falling back to sourcing corosync.conf. "
|
|
|
7701ef |
+ "Could not parse crm_mon output: %s" % err)
|
|
|
7701ef |
+ if not self.nodes:
|
|
|
7701ef |
+ # fallback to corosync.conf, in case the node we're inspecting
|
|
|
7701ef |
+ # is offline from the cluster
|
|
|
7701ef |
+ self.get_nodes_from_corosync()
|
|
|
7701ef |
+ except Exception as err:
|
|
|
7701ef |
+ self.log_error("Could not determine nodes from cluster: %s" % err)
|
|
|
7701ef |
+
|
|
|
7701ef |
+ _shorts = [n for n in self.nodes if '.' not in n]
|
|
|
7701ef |
+ if _shorts:
|
|
|
7701ef |
+ self.log_warn(
|
|
|
7701ef |
+ "WARNING: Node addresses '%s' may not resolve locally if you "
|
|
|
7701ef |
+ "are not running on a node in the cluster. Try using option "
|
|
|
7701ef |
+ "'-c pacemaker.only-corosync' if these connections fail."
|
|
|
7701ef |
+ % ','.join(_shorts)
|
|
|
7701ef |
+ )
|
|
|
7701ef |
+ return self.nodes
|
|
|
7701ef |
+
|
|
|
7701ef |
+ def get_nodes_from_crm(self):
|
|
|
7701ef |
+ """
|
|
|
7701ef |
+ Try to parse crm_mon output for node list and status.
|
|
|
7701ef |
+ """
|
|
|
7701ef |
+ xmlopt = '--output-as=xml'
|
|
|
7701ef |
+ # older pacemaker had a different option for xml output
|
|
|
7701ef |
+ _ver = self.exec_primary_cmd('crm_mon --version')
|
|
|
7701ef |
+ if _ver['status'] == 0:
|
|
|
7701ef |
+ cver = _ver['output'].split()[1].split('-')[0]
|
|
|
7701ef |
+ if not version.parse(cver) > version.parse('2.0.3'):
|
|
|
7701ef |
+ xmlopt = '--as-xml'
|
|
|
7701ef |
+ else:
|
|
|
7701ef |
+ return
|
|
|
7701ef |
+ _out = self.exec_primary_cmd(
|
|
|
7701ef |
+ "crm_mon --one-shot --inactive %s" % xmlopt,
|
|
|
7701ef |
+ need_root=True
|
|
|
7701ef |
+ )
|
|
|
7701ef |
+ if _out['status'] == 0:
|
|
|
7701ef |
+ self.parse_crm_xml(_out['output'])
|
|
|
7701ef |
+
|
|
|
7701ef |
+ def parse_crm_xml(self, xmlstring):
|
|
|
7701ef |
+ """
|
|
|
7701ef |
+ Parse the xml output string provided by crm_mon
|
|
|
7701ef |
+ """
|
|
|
7701ef |
+ _xml = ElementTree.fromstring(xmlstring)
|
|
|
7701ef |
+ nodes = _xml.find('nodes')
|
|
|
7701ef |
+ for node in nodes:
|
|
|
7701ef |
+ _node = node.attrib
|
|
|
7701ef |
+ if self.get_option('online') and _node['online'] == 'true':
|
|
|
7701ef |
+ self.nodes.append(_node['name'])
|
|
|
7701ef |
+ elif self.get_option('offline') and _node['online'] == 'false':
|
|
|
7701ef |
+ self.nodes.append(_node['name'])
|
|
|
7701ef |
+
|
|
|
7701ef |
+ def get_nodes_from_corosync(self):
|
|
|
7701ef |
+ """
|
|
|
7701ef |
+ As a fallback measure, read corosync.conf to get the node list. Note
|
|
|
7701ef |
+ that this prevents us from separating online nodes from offline nodes.
|
|
|
7701ef |
+ """
|
|
|
7701ef |
+ self.log_warn("WARNING: unable to distinguish online nodes from "
|
|
|
7701ef |
+ "offline nodes when sourcing from corosync.conf")
|
|
|
7701ef |
+ cc = self.primary.read_file('/etc/corosync/corosync.conf')
|
|
|
7701ef |
+ nodes = re.findall(r'((\sring0_addr:)(.*))', cc)
|
|
|
7701ef |
+ for node in nodes:
|
|
|
7701ef |
+ self.nodes.append(node[-1].strip())
|
|
|
7701ef |
|
|
|
7701ef |
# vim: set et ts=4 sw=4 :
|
|
|
7701ef |
--
|
|
|
7701ef |
2.34.3
|
|
|
7701ef |
|
|
|
7701ef |
|
|
|
7701ef |
From 6701a7d77ecc998b018b54ecc00f9fd102ae9518 Mon Sep 17 00:00:00 2001
|
|
|
7701ef |
From: Jake Hunsaker <jhunsake@redhat.com>
|
|
|
7701ef |
Date: Mon, 21 Mar 2022 12:05:59 -0400
|
|
|
7701ef |
Subject: [PATCH 2/2] [clusters] Allow clusters to not add localhost to node
|
|
|
7701ef |
list
|
|
|
7701ef |
|
|
|
7701ef |
For most of our supported clusters, we end up needing to add the
|
|
|
7701ef |
local host executing `sos collect` to the node list (unless `--no-local`
|
|
|
7701ef |
is used) as that accounts for the primary node that may otherwise be
|
|
|
7701ef |
left off. However, this is not helpful for clusters that may report
|
|
|
7701ef |
node names as something other than resolvable names. In those cases,
|
|
|
7701ef |
such as with pacemaker, adding the local hostname may result in
|
|
|
7701ef |
duplicate collections.
|
|
|
7701ef |
|
|
|
7701ef |
Add a toggle to cluster profiles via a new `strict_node_list` class attr
|
|
|
7701ef |
that, if True, will skip this addition. This toggle is default `False`
|
|
|
7701ef |
to preserve existing behavior, and is now enabled for `pacemaker`
|
|
|
7701ef |
specifically.
|
|
|
7701ef |
|
|
|
7701ef |
Related: RHBZ#2065821
|
|
|
7701ef |
|
|
|
7701ef |
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
|
|
|
7701ef |
---
|
|
|
7701ef |
sos/collector/__init__.py | 3 ++-
|
|
|
7701ef |
sos/collector/clusters/__init__.py | 4 ++++
|
|
|
7701ef |
sos/collector/clusters/pacemaker.py | 1 +
|
|
|
7701ef |
3 files changed, 7 insertions(+), 1 deletion(-)
|
|
|
7701ef |
|
|
|
7701ef |
diff --git a/sos/collector/__init__.py b/sos/collector/__init__.py
|
|
|
7701ef |
index a8bb0064..d898ca34 100644
|
|
|
7701ef |
--- a/sos/collector/__init__.py
|
|
|
7701ef |
+++ b/sos/collector/__init__.py
|
|
|
7701ef |
@@ -1073,7 +1073,8 @@ class SoSCollector(SoSComponent):
|
|
|
7701ef |
for node in self.node_list:
|
|
|
7701ef |
if host == node.split('.')[0]:
|
|
|
7701ef |
self.node_list.remove(node)
|
|
|
7701ef |
- self.node_list.append(self.hostname)
|
|
|
7701ef |
+ if not self.cluster.strict_node_list:
|
|
|
7701ef |
+ self.node_list.append(self.hostname)
|
|
|
7701ef |
self.reduce_node_list()
|
|
|
7701ef |
try:
|
|
|
7701ef |
_node_max = len(max(self.node_list, key=len))
|
|
|
7701ef |
diff --git a/sos/collector/clusters/__init__.py b/sos/collector/clusters/__init__.py
|
|
|
7701ef |
index f3f550ad..f00677b8 100644
|
|
|
7701ef |
--- a/sos/collector/clusters/__init__.py
|
|
|
7701ef |
+++ b/sos/collector/clusters/__init__.py
|
|
|
7701ef |
@@ -57,6 +57,10 @@ class Cluster():
|
|
|
7701ef |
sos_plugin_options = {}
|
|
|
7701ef |
sos_preset = ''
|
|
|
7701ef |
cluster_name = None
|
|
|
7701ef |
+ # set this to True if the local host running collect should *not* be
|
|
|
7701ef |
+ # forcibly added to the node list. This can be helpful in situations where
|
|
|
7701ef |
+ # the host's fqdn and the name the cluster uses are different
|
|
|
7701ef |
+ strict_node_list = False
|
|
|
7701ef |
|
|
|
7701ef |
def __init__(self, commons):
|
|
|
7701ef |
self.primary = None
|
|
|
7701ef |
diff --git a/sos/collector/clusters/pacemaker.py b/sos/collector/clusters/pacemaker.py
|
|
|
7701ef |
index 49d0ce51..bebcb265 100644
|
|
|
7701ef |
--- a/sos/collector/clusters/pacemaker.py
|
|
|
7701ef |
+++ b/sos/collector/clusters/pacemaker.py
|
|
|
7701ef |
@@ -20,6 +20,7 @@ class pacemaker(Cluster):
|
|
|
7701ef |
cluster_name = 'Pacemaker High Availability Cluster Manager'
|
|
|
7701ef |
sos_plugins = ['pacemaker']
|
|
|
7701ef |
packages = ('pacemaker',)
|
|
|
7701ef |
+ strict_node_list = True
|
|
|
7701ef |
option_list = [
|
|
|
7701ef |
('online', True, 'Collect nodes listed as online'),
|
|
|
7701ef |
('offline', True, 'Collect nodes listed as offline'),
|
|
|
7701ef |
--
|
|
|
7701ef |
2.34.3
|
|
|
7701ef |
|