Blame SOURCES/sos-bz2024893-cleaner-hostnames-improvements.patch

819553
From decd39b7799a0579ea085b0da0728b6eabd49b38 Mon Sep 17 00:00:00 2001
819553
From: Jake Hunsaker <jhunsake@redhat.com>
819553
Date: Wed, 1 Sep 2021 00:28:58 -0400
819553
Subject: [PATCH] [clean] Provide archive abstractions to obfuscate more than
819553
 sos archives
819553
819553
This commit removes the restriction imposed on `sos clean` since its
819553
introduction in sos-4.0 to only work against known sos report archives
819553
or build directories. This is because there has been interest in using
819553
the obfuscation bits of sos in other data-collector projects.
819553
819553
The `SoSObfuscationArchive()` class has been revamped to now be an
819553
abstraction for different types of archives, and the cleaner logic has
819553
been updated to leverage this new abstraction rather than assuming we're
819553
working on an sos archive.
819553
819553
Abstractions are added for our own native use cases - that being `sos
819553
report` and `sos collect` for at-runtime obfuscation, as well as
819553
standalone archives previously generated. Further generic abstractions
819553
are available for plain directories and tarballs; however, these will not
819553
provide the same level of coverage as fully supported archive types, as
819553
is noted in the manpage for sos-clean.
819553
819553
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
819553
---
819553
 man/en/sos-clean.1                            |  25 ++
819553
 sos/cleaner/__init__.py                       | 308 +++++++++---------
819553
 .../__init__.py}                              |  80 ++++-
819553
 sos/cleaner/archives/generic.py               |  52 +++
819553
 sos/cleaner/archives/sos.py                   | 106 ++++++
819553
 sos/cleaner/parsers/__init__.py               |   6 -
819553
 sos/cleaner/parsers/hostname_parser.py        |   1 -
819553
 sos/cleaner/parsers/ip_parser.py              |   1 -
819553
 sos/cleaner/parsers/keyword_parser.py         |   1 -
819553
 sos/cleaner/parsers/mac_parser.py             |   1 -
819553
 sos/cleaner/parsers/username_parser.py        |   8 -
819553
 tests/cleaner_tests/existing_archive.py       |   7 +
819553
 tests/cleaner_tests/full_report_run.py        |   3 +
819553
 tests/cleaner_tests/report_with_mask.py       |   3 +
819553
 14 files changed, 423 insertions(+), 179 deletions(-)
819553
 rename sos/cleaner/{obfuscation_archive.py => archives/__init__.py} (81%)
819553
 create mode 100644 sos/cleaner/archives/generic.py
819553
 create mode 100644 sos/cleaner/archives/sos.py
819553
819553
diff --git a/man/en/sos-clean.1 b/man/en/sos-clean.1
819553
index b77bc63c..54026713 100644
819553
--- a/man/en/sos-clean.1
819553
+++ b/man/en/sos-clean.1
819553
@@ -10,6 +10,7 @@ sos clean - Obfuscate sensitive data from one or more sosreports
819553
     [\-\-jobs]
819553
     [\-\-no-update]
819553
     [\-\-keep-binary-files]
819553
+    [\-\-archive-type]
819553
 
819553
 .SH DESCRIPTION
819553
 \fBsos clean\fR or \fBsos mask\fR is an sos subcommand used to obfuscate sensitive information from
819553
@@ -88,6 +89,30 @@ Users should review any archive that keeps binary files in place before sending
819553
 a third party.
819553
 
819553
 Default: False (remove encountered binary files)
819553
+.TP
819553
+.B \-\-archive-type TYPE
819553
+Specify the type of archive that TARGET was generated as.
819553
+When sos inspects a TARGET archive, it tries to identify what type of archive it is.
819553
+For example, it may be a report generated by \fBsos report\fR, or a collection of those
819553
+reports generated by \fBsos collect\fR, which require separate approaches.
819553
+
819553
+This option may be useful if a given TARGET archive is known to be of a specific type,
819553
+but due to unknown reasons or some malformed/missing information in the archive directly,
819553
+it is not properly identified by sos.
819553
+
819553
+The following are accepted values for this option:
819553
+
819553
+    \fBauto\fR          Automatically detect the archive type
819553
+    \fBreport\fR        An archive generated by \fBsos report\fR
819553
+    \fBcollect\fR       An archive generated by \fBsos collect\fR
819553
+
819553
+The following may also be used, however note that these do not attempt to pre-load
819553
+any information from the archives into the parsers. This means that, among other limitations,
819553
+items like host and domain names may not be obfuscated unless an obfuscated mapping already exists
819553
+on the system from a previous execution.
819553
+
819553
+    \fBdata-dir\fR      A plain directory on the filesystem.
819553
+    \fBtarball\fR       A generic tar archive not associated with any known tool
819553
 
819553
 .SH SEE ALSO
819553
 .BR sos (1)
819553
diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
819553
index 6aadfe79..6d2eb483 100644
819553
--- a/sos/cleaner/__init__.py
819553
+++ b/sos/cleaner/__init__.py
819553
@@ -12,9 +12,7 @@ import hashlib
819553
 import json
819553
 import logging
819553
 import os
819553
-import re
819553
 import shutil
819553
-import tarfile
819553
 import tempfile
819553
 
819553
 from concurrent.futures import ThreadPoolExecutor
819553
@@ -27,7 +25,10 @@ from sos.cleaner.parsers.mac_parser import SoSMacParser
819553
 from sos.cleaner.parsers.hostname_parser import SoSHostnameParser
819553
 from sos.cleaner.parsers.keyword_parser import SoSKeywordParser
819553
 from sos.cleaner.parsers.username_parser import SoSUsernameParser
819553
-from sos.cleaner.obfuscation_archive import SoSObfuscationArchive
819553
+from sos.cleaner.archives.sos import (SoSReportArchive, SoSReportDirectory,
819553
+                                      SoSCollectorArchive,
819553
+                                      SoSCollectorDirectory)
819553
+from sos.cleaner.archives.generic import DataDirArchive, TarballArchive
819553
 from sos.utilities import get_human_readable
819553
 from textwrap import fill
819553
 
819553
@@ -41,6 +42,7 @@ class SoSCleaner(SoSComponent):
819553
     desc = "Obfuscate sensitive networking information in a report"
819553
 
819553
     arg_defaults = {
819553
+        'archive_type': 'auto',
819553
         'domains': [],
819553
         'jobs': 4,
819553
         'keywords': [],
819553
@@ -70,6 +72,7 @@ class SoSCleaner(SoSComponent):
819553
             self.from_cmdline = False
819553
             if not hasattr(self.opts, 'jobs'):
819553
                 self.opts.jobs = 4
819553
+            self.opts.archive_type = 'auto'
819553
             self.soslog = logging.getLogger('sos')
819553
             self.ui_log = logging.getLogger('sos_ui')
819553
             # create the tmp subdir here to avoid a potential race condition
819553
@@ -92,6 +95,17 @@ class SoSCleaner(SoSComponent):
819553
             SoSUsernameParser(self.cleaner_mapping, self.opts.usernames)
819553
         ]
819553
 
819553
+        self.archive_types = [
819553
+            SoSReportDirectory,
819553
+            SoSReportArchive,
819553
+            SoSCollectorDirectory,
819553
+            SoSCollectorArchive,
819553
+            # make sure these two are always last as they are fallbacks
819553
+            DataDirArchive,
819553
+            TarballArchive
819553
+        ]
819553
+        self.nested_archive = None
819553
+
819553
         self.log_info("Cleaner initialized. From cmdline: %s"
819553
                       % self.from_cmdline)
819553
 
819553
@@ -178,6 +192,11 @@ third party.
819553
         )
819553
         clean_grp.add_argument('target', metavar='TARGET',
819553
                                help='The directory or archive to obfuscate')
819553
+        clean_grp.add_argument('--archive-type', default='auto',
819553
+                               choices=['auto', 'report', 'collect',
819553
+                                        'data-dir', 'tarball'],
819553
+                               help=('Specify what kind of archive the target '
819553
+                                     'was generated as'))
819553
         clean_grp.add_argument('--domains', action='extend', default=[],
819553
                                help='List of domain names to obfuscate')
819553
         clean_grp.add_argument('-j', '--jobs', default=4, type=int,
819553
@@ -218,59 +237,28 @@ third party.
819553
 
819553
         In the event the target path is not an archive, abort.
819553
         """
819553
-        if not tarfile.is_tarfile(self.opts.target):
819553
-            self.ui_log.error(
819553
-                "Invalid target: must be directory or tar archive"
819553
-            )
819553
-            self._exit(1)
819553
-
819553
-        archive = tarfile.open(self.opts.target)
819553
-        self.arc_name = self.opts.target.split('/')[-1].split('.')[:-2][0]
819553
-
819553
-        try:
819553
-            archive.getmember(os.path.join(self.arc_name, 'sos_logs'))
819553
-        except Exception:
819553
-            # this is not an sos archive
819553
-            self.ui_log.error("Invalid target: not an sos archive")
819553
-            self._exit(1)
819553
-
819553
-        # see if there are archives within this archive
819553
-        nested_archives = []
819553
-        for _file in archive.getmembers():
819553
-            if (re.match('sosreport-.*.tar', _file.name.split('/')[-1]) and not
819553
-                    (_file.name.endswith(('.md5', '.sha256')))):
819553
-                nested_archives.append(_file.name.split('/')[-1])
819553
-
819553
-        if nested_archives:
819553
-            self.log_info("Found nested archive(s), extracting top level")
819553
-            nested_path = self.extract_archive(archive)
819553
-            for arc_file in os.listdir(nested_path):
819553
-                if re.match('sosreport.*.tar.*', arc_file):
819553
-                    if arc_file.endswith(('.md5', '.sha256')):
819553
-                        continue
819553
-                    self.report_paths.append(os.path.join(nested_path,
819553
-                                                          arc_file))
819553
-            # add the toplevel extracted archive
819553
-            self.report_paths.append(nested_path)
819553
+        _arc = None
819553
+        if self.opts.archive_type != 'auto':
819553
+            check_type = self.opts.archive_type.replace('-', '_')
819553
+            for archive in self.archive_types:
819553
+                if archive.type_name == check_type:
819553
+                    _arc = archive(self.opts.target, self.tmpdir)
819553
         else:
819553
-            self.report_paths.append(self.opts.target)
819553
-
819553
-        archive.close()
819553
-
819553
-    def extract_archive(self, archive):
819553
-        """Extract an archive into our tmpdir so that we may inspect it or
819553
-        iterate through its contents for obfuscation
819553
-
819553
-        Positional arguments:
819553
-
819553
-            :param archive:     An open TarFile object for the archive
819553
-
819553
-        """
819553
-        if not isinstance(archive, tarfile.TarFile):
819553
-            archive = tarfile.open(archive)
819553
-        path = os.path.join(self.tmpdir, 'cleaner')
819553
-        archive.extractall(path)
819553
-        return os.path.join(path, archive.name.split('/')[-1].split('.tar')[0])
819553
+            for arc in self.archive_types:
819553
+                if arc.check_is_type(self.opts.target):
819553
+                    _arc = arc(self.opts.target, self.tmpdir)
819553
+                    break
819553
+        if not _arc:
819553
+            return
819553
+        self.report_paths.append(_arc)
819553
+        if _arc.is_nested:
819553
+            self.report_paths.extend(_arc.get_nested_archives())
819553
+            # We need to preserve the top level archive until all
819553
+            # nested archives are processed
819553
+            self.report_paths.remove(_arc)
819553
+            self.nested_archive = _arc
819553
+        if self.nested_archive:
819553
+            self.nested_archive.ui_name = self.nested_archive.description
819553
 
819553
     def execute(self):
819553
         """SoSCleaner will begin by inspecting the TARGET option to determine
819553
@@ -283,6 +271,7 @@ third party.
819553
         be unpacked, cleaned, and repacked and the final top-level archive will
819553
         then be repacked as well.
819553
         """
819553
+        self.arc_name = self.opts.target.split('/')[-1].split('.tar')[0]
819553
         if self.from_cmdline:
819553
             self.print_disclaimer()
819553
         self.report_paths = []
819553
@@ -290,23 +279,11 @@ third party.
819553
             self.ui_log.error("Invalid target: no such file or directory %s"
819553
                               % self.opts.target)
819553
             self._exit(1)
819553
-        if os.path.isdir(self.opts.target):
819553
-            self.arc_name = self.opts.target.split('/')[-1]
819553
-            for _file in os.listdir(self.opts.target):
819553
-                if _file == 'sos_logs':
819553
-                    self.report_paths.append(self.opts.target)
819553
-                if (_file.startswith('sosreport') and
819553
-                   (_file.endswith(".tar.gz") or _file.endswith(".tar.xz"))):
819553
-                    self.report_paths.append(os.path.join(self.opts.target,
819553
-                                                          _file))
819553
-            if not self.report_paths:
819553
-                self.ui_log.error("Invalid target: not an sos directory")
819553
-                self._exit(1)
819553
-        else:
819553
-            self.inspect_target_archive()
819553
+
819553
+        self.inspect_target_archive()
819553
 
819553
         if not self.report_paths:
819553
-            self.ui_log.error("No valid sos archives or directories found\n")
819553
+            self.ui_log.error("No valid archives or directories found\n")
819553
             self._exit(1)
819553
 
819553
         # we have at least one valid target to obfuscate
819553
@@ -334,33 +311,7 @@ third party.
819553
 
819553
         final_path = None
819553
         if len(self.completed_reports) > 1:
819553
-            # we have an archive of archives, so repack the obfuscated tarball
819553
-            arc_name = self.arc_name + '-obfuscated'
819553
-            self.setup_archive(name=arc_name)
819553
-            for arc in self.completed_reports:
819553
-                if arc.is_tarfile:
819553
-                    arc_dest = self.obfuscate_string(
819553
-                        arc.final_archive_path.split('/')[-1]
819553
-                    )
819553
-                    self.archive.add_file(arc.final_archive_path,
819553
-                                          dest=arc_dest)
819553
-                    checksum = self.get_new_checksum(arc.final_archive_path)
819553
-                    if checksum is not None:
819553
-                        dname = self.obfuscate_string(
819553
-                            "checksums/%s.%s" % (arc_dest, self.hash_name)
819553
-                        )
819553
-                        self.archive.add_string(checksum, dest=dname)
819553
-                else:
819553
-                    for dirname, dirs, files in os.walk(arc.archive_path):
819553
-                        for filename in files:
819553
-                            if filename.startswith('sosreport'):
819553
-                                continue
819553
-                            fname = os.path.join(dirname, filename)
819553
-                            dnm = self.obfuscate_string(
819553
-                                fname.split(arc.archive_name)[-1].lstrip('/')
819553
-                            )
819553
-                            self.archive.add_file(fname, dest=dnm)
819553
-            arc_path = self.archive.finalize(self.opts.compression_type)
819553
+            arc_path = self.rebuild_nested_archive()
819553
         else:
819553
             arc = self.completed_reports[0]
819553
             arc_path = arc.final_archive_path
819553
@@ -371,8 +322,7 @@ third party.
819553
                 )
819553
                 with open(os.path.join(self.sys_tmp, chksum_name), 'w') as cf:
819553
                     cf.write(checksum)
819553
-
819553
-        self.write_cleaner_log()
819553
+            self.write_cleaner_log()
819553
 
819553
         final_path = self.obfuscate_string(
819553
             os.path.join(self.sys_tmp, arc_path.split('/')[-1])
819553
@@ -393,6 +343,30 @@ third party.
819553
 
819553
         self.cleanup()
819553
 
819553
+    def rebuild_nested_archive(self):
819553
+        """Handles repacking the nested tarball, now containing only obfuscated
819553
+        copies of the reports, log files, manifest, etc...
819553
+        """
819553
+        # we have an archive of archives, so repack the obfuscated tarball
819553
+        arc_name = self.arc_name + '-obfuscated'
819553
+        self.setup_archive(name=arc_name)
819553
+        for archive in self.completed_reports:
819553
+            arc_dest = archive.final_archive_path.split('/')[-1]
819553
+            checksum = self.get_new_checksum(archive.final_archive_path)
819553
+            if checksum is not None:
819553
+                dname = "checksums/%s.%s" % (arc_dest, self.hash_name)
819553
+                self.archive.add_string(checksum, dest=dname)
819553
+        for dirn, dirs, files in os.walk(self.nested_archive.extracted_path):
819553
+            for filename in files:
819553
+                fname = os.path.join(dirn, filename)
819553
+                dname = fname.split(self.nested_archive.extracted_path)[-1]
819553
+                dname = dname.lstrip('/')
819553
+                self.archive.add_file(fname, dest=dname)
819553
+                # remove it now so we don't balloon our fs space needs
819553
+                os.remove(fname)
819553
+        self.write_cleaner_log(archive=True)
819553
+        return self.archive.finalize(self.opts.compression_type)
819553
+
819553
     def compile_mapping_dict(self):
819553
         """Build a dict that contains each parser's map as a key, with the
819553
         contents as that key's value. This will then be written to disk in the
819553
@@ -441,7 +415,7 @@ third party.
819553
                 self.log_error("Could not update mapping config file: %s"
819553
                                % err)
819553
 
819553
-    def write_cleaner_log(self):
819553
+    def write_cleaner_log(self, archive=False):
819553
         """When invoked via the command line, the logging from SoSCleaner will
819553
         not be added to the archive(s) it processes, so we need to write it
819553
         separately to disk
819553
@@ -454,6 +428,10 @@ third party.
819553
             for line in self.sos_log_file.readlines():
819553
                 logfile.write(line)
819553
 
819553
+        if archive:
819553
+            self.obfuscate_file(log_name)
819553
+            self.archive.add_file(log_name, dest="sos_logs/cleaner.log")
819553
+
819553
     def get_new_checksum(self, archive_path):
819553
         """Calculate a new checksum for the obfuscated archive, as the previous
819553
         checksum will no longer be valid
819553
@@ -481,11 +459,11 @@ third party.
819553
         be obfuscated concurrently.
819553
         """
819553
         try:
819553
-            if len(self.report_paths) > 1:
819553
-                msg = ("Found %s total reports to obfuscate, processing up to "
819553
-                       "%s concurrently\n"
819553
-                       % (len(self.report_paths), self.opts.jobs))
819553
-                self.ui_log.info(msg)
819553
+            msg = (
819553
+                "Found %s total reports to obfuscate, processing up to %s "
819553
+                "concurrently\n" % (len(self.report_paths), self.opts.jobs)
819553
+            )
819553
+            self.ui_log.info(msg)
819553
             if self.opts.keep_binary_files:
819553
                 self.ui_log.warning(
819553
                     "WARNING: binary files that potentially contain sensitive "
819553
@@ -494,53 +472,67 @@ third party.
819553
             pool = ThreadPoolExecutor(self.opts.jobs)
819553
             pool.map(self.obfuscate_report, self.report_paths, chunksize=1)
819553
             pool.shutdown(wait=True)
819553
+            # finally, obfuscate the nested archive if one exists
819553
+            if self.nested_archive:
819553
+                self._replace_obfuscated_archives()
819553
+                self.obfuscate_report(self.nested_archive)
819553
         except KeyboardInterrupt:
819553
             self.ui_log.info("Exiting on user cancel")
819553
             os._exit(130)
819553
 
819553
+    def _replace_obfuscated_archives(self):
819553
+        """When we have a nested archive, we need to rebuild the original
819553
+        archive, which entails replacing the existing archives with their
819553
+        obfuscated counterparts
819553
+        """
819553
+        for archive in self.completed_reports:
819553
+            os.remove(archive.archive_path)
819553
+            dest = self.nested_archive.extracted_path
819553
+            tarball = archive.final_archive_path.split('/')[-1]
819553
+            dest_name = os.path.join(dest, tarball)
819553
+            shutil.move(archive.final_archive_path, dest)
819553
+            archive.final_archive_path = dest_name
819553
+
819553
     def preload_all_archives_into_maps(self):
819553
         """Before doing the actual obfuscation, if we have multiple archives
819553
         to obfuscate then we need to preload each of them into the mappings
819553
         to ensure that node1 is obfuscated in node2 as well as node2 being
819553
         obfuscated in node1's archive.
819553
         """
819553
-        self.log_info("Pre-loading multiple archives into obfuscation maps")
819553
+        self.log_info("Pre-loading all archives into obfuscation maps")
819553
         for _arc in self.report_paths:
819553
-            is_dir = os.path.isdir(_arc)
819553
-            if is_dir:
819553
-                _arc_name = _arc
819553
-            else:
819553
-                archive = tarfile.open(_arc)
819553
-                _arc_name = _arc.split('/')[-1].split('.tar')[0]
819553
-            # for each parser, load the map_prep_file into memory, and then
819553
-            # send that for obfuscation. We don't actually obfuscate the file
819553
-            # here, do that in the normal archive loop
819553
             for _parser in self.parsers:
819553
-                if not _parser.prep_map_file:
819553
+                try:
819553
+                    pfile = _arc.prep_files[_parser.name.lower().split()[0]]
819553
+                    if not pfile:
819553
+                        continue
819553
+                except (IndexError, KeyError):
819553
                     continue
819553
-                if isinstance(_parser.prep_map_file, str):
819553
-                    _parser.prep_map_file = [_parser.prep_map_file]
819553
-                for parse_file in _parser.prep_map_file:
819553
-                    _arc_path = os.path.join(_arc_name, parse_file)
819553
+                if isinstance(pfile, str):
819553
+                    pfile = [pfile]
819553
+                for parse_file in pfile:
819553
+                    self.log_debug("Attempting to load %s" % parse_file)
819553
                     try:
819553
-                        if is_dir:
819553
-                            _pfile = open(_arc_path, 'r')
819553
-                            content = _pfile.read()
819553
-                        else:
819553
-                            _pfile = archive.extractfile(_arc_path)
819553
-                            content = _pfile.read().decode('utf-8')
819553
-                        _pfile.close()
819553
+                        content = _arc.get_file_content(parse_file)
819553
+                        if not content:
819553
+                            continue
819553
                         if isinstance(_parser, SoSUsernameParser):
819553
                             _parser.load_usernames_into_map(content)
819553
-                        for line in content.splitlines():
819553
-                            if isinstance(_parser, SoSHostnameParser):
819553
-                                _parser.load_hostname_into_map(line)
819553
-                            self.obfuscate_line(line)
819553
+                        elif isinstance(_parser, SoSHostnameParser):
819553
+                            _parser.load_hostname_into_map(
819553
+                                content.splitlines()[0]
819553
+                            )
819553
+                        else:
819553
+                            for line in content.splitlines():
819553
+                                self.obfuscate_line(line)
819553
                     except Exception as err:
819553
-                        self.log_debug("Could not prep %s: %s"
819553
-                                       % (_arc_path, err))
819553
+                        self.log_info(
819553
+                            "Could not prepare %s from %s (archive: %s): %s"
819553
+                            % (_parser.name, parse_file, _arc.archive_name,
819553
+                               err)
819553
+                        )
819553
 
819553
-    def obfuscate_report(self, report):
819553
+    def obfuscate_report(self, archive):
819553
         """Individually handle each archive or directory we've discovered by
819553
         running through each file therein.
819553
 
819553
@@ -549,17 +541,12 @@ third party.
819553
             :param report str:      Filepath to the directory or archive
819553
         """
819553
         try:
819553
-            if not os.access(report, os.W_OK):
819553
-                msg = "Insufficient permissions on %s" % report
819553
-                self.log_info(msg)
819553
-                self.ui_log.error(msg)
819553
-                return
819553
-
819553
-            archive = SoSObfuscationArchive(report, self.tmpdir)
819553
             arc_md = self.cleaner_md.add_section(archive.archive_name)
819553
             start_time = datetime.now()
819553
             arc_md.add_field('start_time', start_time)
819553
-            archive.extract()
819553
+            # don't double extract nested archives
819553
+            if not archive.is_extracted:
819553
+                archive.extract()
819553
             archive.report_msg("Beginning obfuscation...")
819553
 
819553
             file_list = archive.get_file_list()
819553
@@ -586,27 +573,28 @@ third party.
819553
                               caller=archive.archive_name)
819553
 
819553
             # if the archive was already a tarball, repack it
819553
-            method = archive.get_compression()
819553
-            if method:
819553
-                archive.report_msg("Re-compressing...")
819553
-                try:
819553
-                    archive.rename_top_dir(
819553
-                        self.obfuscate_string(archive.archive_name)
819553
-                    )
819553
-                    archive.compress(method)
819553
-                except Exception as err:
819553
-                    self.log_debug("Archive %s failed to compress: %s"
819553
-                                   % (archive.archive_name, err))
819553
-                    archive.report_msg("Failed to re-compress archive: %s"
819553
-                                       % err)
819553
-                    return
819553
+            if not archive.is_nested:
819553
+                method = archive.get_compression()
819553
+                if method:
819553
+                    archive.report_msg("Re-compressing...")
819553
+                    try:
819553
+                        archive.rename_top_dir(
819553
+                            self.obfuscate_string(archive.archive_name)
819553
+                        )
819553
+                        archive.compress(method)
819553
+                    except Exception as err:
819553
+                        self.log_debug("Archive %s failed to compress: %s"
819553
+                                       % (archive.archive_name, err))
819553
+                        archive.report_msg("Failed to re-compress archive: %s"
819553
+                                           % err)
819553
+                        return
819553
+                self.completed_reports.append(archive)
819553
 
819553
             end_time = datetime.now()
819553
             arc_md.add_field('end_time', end_time)
819553
             arc_md.add_field('run_time', end_time - start_time)
819553
             arc_md.add_field('files_obfuscated', len(archive.file_sub_list))
819553
             arc_md.add_field('total_substitutions', archive.total_sub_count)
819553
-            self.completed_reports.append(archive)
819553
             rmsg = ''
819553
             if archive.removed_file_count:
819553
                 rmsg = " [removed %s unprocessable files]"
819553
@@ -615,7 +603,7 @@ third party.
819553
 
819553
         except Exception as err:
819553
             self.ui_log.info("Exception while processing %s: %s"
819553
-                             % (report, err))
819553
+                             % (archive.archive_name, err))
819553
 
819553
     def obfuscate_file(self, filename, short_name=None, arc_name=None):
819553
         """Obfuscate and individual file, line by line.
819553
@@ -635,6 +623,8 @@ third party.
819553
             # the requested file doesn't exist in the archive
819553
             return
819553
         subs = 0
819553
+        if not short_name:
819553
+            short_name = filename.split('/')[-1]
819553
         if not os.path.islink(filename):
819553
             # don't run the obfuscation on the link, but on the actual file
819553
             # at some other point.
819553
@@ -745,3 +735,5 @@ third party.
819553
         for parser in self.parsers:
819553
             _sec = parse_sec.add_section(parser.name.replace(' ', '_').lower())
819553
             _sec.add_field('entries', len(parser.mapping.dataset.keys()))
819553
+
819553
+# vim: set et ts=4 sw=4 :
819553
diff --git a/sos/cleaner/obfuscation_archive.py b/sos/cleaner/archives/__init__.py
819553
similarity index 81%
819553
rename from sos/cleaner/obfuscation_archive.py
819553
rename to sos/cleaner/archives/__init__.py
819553
index ea0b7012..795c5a78 100644
819553
--- a/sos/cleaner/obfuscation_archive.py
819553
+++ b/sos/cleaner/archives/__init__.py
819553
@@ -40,6 +40,10 @@ class SoSObfuscationArchive():
819553
     file_sub_list = []
819553
     total_sub_count = 0
819553
     removed_file_count = 0
819553
+    type_name = 'undetermined'
819553
+    description = 'undetermined'
819553
+    is_nested = False
819553
+    prep_files = {}
819553
 
819553
     def __init__(self, archive_path, tmpdir):
819553
         self.archive_path = archive_path
819553
@@ -50,7 +54,43 @@ class SoSObfuscationArchive():
819553
         self.soslog = logging.getLogger('sos')
819553
         self.ui_log = logging.getLogger('sos_ui')
819553
         self.skip_list = self._load_skip_list()
819553
-        self.log_info("Loaded %s as an archive" % self.archive_path)
819553
+        self.is_extracted = False
819553
+        self._load_self()
819553
+        self.archive_root = ''
819553
+        self.log_info(
819553
+            "Loaded %s as type %s"
819553
+            % (self.archive_path, self.description)
819553
+        )
819553
+
819553
+    @classmethod
819553
+    def check_is_type(cls, arc_path):
819553
+        """Check if the archive is a well-known type we directly support"""
819553
+        return False
819553
+
819553
+    def _load_self(self):
819553
+        if self.is_tarfile:
819553
+            self.tarobj = tarfile.open(self.archive_path)
819553
+
819553
+    def get_nested_archives(self):
819553
+        """Return a list of ObfuscationArchives that represent additional
819553
+        archives found within the target archive. For example, an archive from
819553
+        `sos collect` will return a list of ``SoSReportArchive`` objects.
819553
+
819553
+        This should be overridden by individual types of ObfuscationArchive's
819553
+        """
819553
+        return []
819553
+
819553
+    def get_archive_root(self):
819553
+        """Set the root path for the archive that should be prepended to any
819553
+        filenames given to methods in this class.
819553
+        """
819553
+        if self.is_tarfile:
819553
+            toplevel = self.tarobj.firstmember
819553
+            if toplevel.isdir():
819553
+                return toplevel.name
819553
+            else:
819553
+                return os.sep
819553
+        return os.path.abspath(self.archive_path)
819553
 
819553
     def report_msg(self, msg):
819553
         """Helper to easily format ui messages on a per-report basis"""
819553
@@ -96,10 +136,42 @@ class SoSObfuscationArchive():
819553
             os.remove(full_fname)
819553
             self.removed_file_count += 1
819553
 
819553
-    def extract(self):
819553
+    def format_file_name(self, fname):
819553
+        """Based on the type of archive we're dealing with, do whatever that
819553
+        archive requires to a provided **relative** filepath to be able to
819553
+        access it within the archive
819553
+        """
819553
+        if not self.is_extracted:
819553
+            if not self.archive_root:
819553
+                self.archive_root = self.get_archive_root()
819553
+            return os.path.join(self.archive_root, fname)
819553
+        else:
819553
+            return os.path.join(self.extracted_path, fname)
819553
+
819553
+    def get_file_content(self, fname):
819553
+        """Return the content from the specified fname. Particularly useful for
819553
+        tarball-type archives so we can retrieve prep file contents prior to
819553
+        extracting the entire archive
819553
+        """
819553
+        if self.is_extracted is False and self.is_tarfile:
819553
+            filename = self.format_file_name(fname)
819553
+            try:
819553
+                return self.tarobj.extractfile(filename).read().decode('utf-8')
819553
+            except KeyError:
819553
+                self.log_debug(
819553
+                    "Unable to retrieve %s: no such file in archive" % fname
819553
+                )
819553
+                return ''
819553
+        else:
819553
+            with open(self.format_file_name(fname), 'r') as to_read:
819553
+                return to_read.read()
819553
+
819553
+    def extract(self, quiet=False):
819553
         if self.is_tarfile:
819553
-            self.report_msg("Extracting...")
819553
+            if not quiet:
819553
+                self.report_msg("Extracting...")
819553
             self.extracted_path = self.extract_self()
819553
+            self.is_extracted = True
819553
         else:
819553
             self.extracted_path = self.archive_path
819553
         # if we're running as non-root (e.g. collector), then we can have a
819553
@@ -317,3 +389,5 @@ class SoSObfuscationArchive():
819553
                 return False
819553
             except UnicodeDecodeError:
819553
                 return True
819553
+
819553
+# vim: set et ts=4 sw=4 :
819553
diff --git a/sos/cleaner/archives/generic.py b/sos/cleaner/archives/generic.py
819553
new file mode 100644
819553
index 00000000..2ce6f09b
819553
--- /dev/null
819553
+++ b/sos/cleaner/archives/generic.py
819553
@@ -0,0 +1,52 @@
819553
+# Copyright 2020 Red Hat, Inc. Jake Hunsaker <jhunsake@redhat.com>
819553
+
819553
+# This file is part of the sos project: https://github.com/sosreport/sos
819553
+#
819553
+# This copyrighted material is made available to anyone wishing to use,
819553
+# modify, copy, or redistribute it subject to the terms and conditions of
819553
+# version 2 of the GNU General Public License.
819553
+#
819553
+# See the LICENSE file in the source distribution for further information.
819553
+
819553
+
819553
+from sos.cleaner.archives import SoSObfuscationArchive
819553
+
819553
+import os
819553
+import tarfile
819553
+
819553
+
819553
+class DataDirArchive(SoSObfuscationArchive):
819553
+    """A plain directory on the filesystem that is not directly associated with
819553
+    any known or supported collection utility
819553
+    """
819553
+
819553
+    type_name = 'data_dir'
819553
+    description = 'unassociated directory'
819553
+
819553
+    @classmethod
819553
+    def check_is_type(cls, arc_path):
819553
+        return os.path.isdir(arc_path)
819553
+
819553
+    def set_archive_root(self):
819553
+        return os.path.abspath(self.archive_path)
819553
+
819553
+
819553
+class TarballArchive(SoSObfuscationArchive):
819553
+    """A generic tar archive that is not associated with any known or supported
819553
+    collection utility
819553
+    """
819553
+
819553
+    type_name = 'tarball'
819553
+    description = 'unassociated tarball'
819553
+
819553
+    @classmethod
819553
+    def check_is_type(cls, arc_path):
819553
+        try:
819553
+            return tarfile.is_tarfile(arc_path)
819553
+        except Exception:
819553
+            return False
819553
+
819553
+    def set_archive_root(self):
819553
+        if self.tarobj.firstmember.isdir():
819553
+            return self.tarobj.firstmember.name
819553
+        return ''
819553
diff --git a/sos/cleaner/archives/sos.py b/sos/cleaner/archives/sos.py
819553
new file mode 100644
819553
index 00000000..4401d710
819553
--- /dev/null
819553
+++ b/sos/cleaner/archives/sos.py
819553
@@ -0,0 +1,106 @@
819553
+# Copyright 2021 Red Hat, Inc. Jake Hunsaker <jhunsake@redhat.com>
819553
+
819553
+# This file is part of the sos project: https://github.com/sosreport/sos
819553
+#
819553
+# This copyrighted material is made available to anyone wishing to use,
819553
+# modify, copy, or redistribute it subject to the terms and conditions of
819553
+# version 2 of the GNU General Public License.
819553
+#
819553
+# See the LICENSE file in the source distribution for further information.
819553
+
819553
+
819553
+from sos.cleaner.archives import SoSObfuscationArchive
819553
+
819553
+import os
819553
+import tarfile
819553
+
819553
+
819553
+class SoSReportArchive(SoSObfuscationArchive):
819553
+    """This is the class representing an sos report, or in other words the
819553
+    type of archive the SoS project natively generates
819553
+    """
819553
+
819553
+    type_name = 'report'
819553
+    description = 'sos report archive'
819553
+    prep_files = {
819553
+        'hostname': 'sos_commands/host/hostname',
819553
+        'ip': 'sos_commands/networking/ip_-o_addr',
819553
+        'mac': 'sos_commands/networking/ip_-d_address',
819553
+        'username': [
819553
+            'sos_commands/login/lastlog_-u_1000-60000',
819553
+            'sos_commands/login/lastlog_-u_60001-65536',
819553
+            'sos_commands/login/lastlog_-u_65537-4294967295',
819553
+            # AD users will be reported here, but favor the lastlog files since
819553
+            # those will include local users who have not logged in
819553
+            'sos_commands/login/last'
819553
+        ]
819553
+    }
819553
+
819553
+    @classmethod
819553
+    def check_is_type(cls, arc_path):
819553
+        try:
819553
+            return tarfile.is_tarfile(arc_path) and 'sosreport-' in arc_path
819553
+        except Exception:
819553
+            return False
819553
+
819553
+
819553
+class SoSReportDirectory(SoSReportArchive):
819553
+    """This is the archive class representing a build directory, or in other
819553
+    words what `sos report --clean` will end up using for in-line obfuscation
819553
+    """
819553
+
819553
+    type_name = 'report_dir'
819553
+    description = 'sos report directory'
819553
+
819553
+    @classmethod
819553
+    def check_is_type(cls, arc_path):
819553
+        if os.path.isdir(arc_path):
819553
+            return 'sos_logs' in os.listdir(arc_path)
819553
+        return False
819553
+
819553
+
819553
+class SoSCollectorArchive(SoSObfuscationArchive):
819553
+    """Archive class representing the tarball created by ``sos collect``. It
819553
+    will not provide prep files on its own, however it will provide a list
819553
+    of SoSReportArchive's which will then be used to prep the parsers
819553
+    """
819553
+
819553
+    type_name = 'collect'
819553
+    description = 'sos collect tarball'
819553
+    is_nested = True
819553
+
819553
+    @classmethod
819553
+    def check_is_type(cls, arc_path):
819553
+        try:
819553
+            return (tarfile.is_tarfile(arc_path) and 'sos-collect' in arc_path)
819553
+        except Exception:
819553
+            return False
819553
+
819553
+    def get_nested_archives(self):
819553
+        self.extract(quiet=True)
819553
+        _path = self.extracted_path
819553
+        archives = []
819553
+        for fname in os.listdir(_path):
819553
+            arc_name = os.path.join(_path, fname)
819553
+            if 'sosreport-' in fname and tarfile.is_tarfile(arc_name):
819553
+                archives.append(SoSReportArchive(arc_name, self.tmpdir))
819553
+        return archives
819553
+
819553
+
819553
+class SoSCollectorDirectory(SoSCollectorArchive):
819553
+    """The archive class representing the temp directory used by ``sos
819553
+    collect`` when ``--clean`` is used during runtime.
819553
+    """
819553
+
819553
+    type_name = 'collect_dir'
819553
+    description = 'sos collect directory'
819553
+
819553
+    @classmethod
819553
+    def check_is_type(cls, arc_path):
819553
+        if os.path.isdir(arc_path):
819553
+            for fname in os.listdir(arc_path):
819553
+                if 'sos-collector-' in fname:
819553
+                    return True
819553
+        return False
819553
+
819553
+# vim: set et ts=4 sw=4 :
819553
diff --git a/sos/cleaner/parsers/__init__.py b/sos/cleaner/parsers/__init__.py
819553
index af6e375e..e62fd938 100644
819553
--- a/sos/cleaner/parsers/__init__.py
819553
+++ b/sos/cleaner/parsers/__init__.py
819553
@@ -37,11 +37,6 @@ class SoSCleanerParser():
819553
     :cvar map_file_key: The key in the ``map_file`` to read when loading
819553
                         previous obfuscation matches
819553
     :vartype map_file_key: ``str``
819553
-
819553
-
819553
-    :cvar prep_map_file: File to read from an archive to pre-seed the map with
819553
-                         matches. E.G. ip_addr for loading IP addresses
819553
-    :vartype prep_map_fie: ``str``
819553
     """
819553
 
819553
     name = 'Undefined Parser'
819553
@@ -49,7 +44,6 @@ class SoSCleanerParser():
819553
     skip_line_patterns = []
819553
     skip_files = []
819553
     map_file_key = 'unset'
819553
-    prep_map_file = []
819553
 
819553
     def __init__(self, config={}):
819553
         if self.map_file_key in config:
819553
diff --git a/sos/cleaner/parsers/hostname_parser.py b/sos/cleaner/parsers/hostname_parser.py
819553
index 71e13d3f..daa76a62 100644
819553
--- a/sos/cleaner/parsers/hostname_parser.py
819553
+++ b/sos/cleaner/parsers/hostname_parser.py
819553
@@ -16,7 +16,6 @@ class SoSHostnameParser(SoSCleanerParser):
819553
 
819553
     name = 'Hostname Parser'
819553
     map_file_key = 'hostname_map'
819553
-    prep_map_file = 'sos_commands/host/hostname'
819553
     regex_patterns = [
819553
         r'(((\b|_)[a-zA-Z0-9-\.]{1,200}\.[a-zA-Z]{1,63}(\b|_)))'
819553
     ]
819553
diff --git a/sos/cleaner/parsers/ip_parser.py b/sos/cleaner/parsers/ip_parser.py
819553
index 525139e8..71d38be8 100644
819553
--- a/sos/cleaner/parsers/ip_parser.py
819553
+++ b/sos/cleaner/parsers/ip_parser.py
819553
@@ -41,7 +41,6 @@ class SoSIPParser(SoSCleanerParser):
819553
     ]
819553
 
819553
     map_file_key = 'ip_map'
819553
-    prep_map_file = 'sos_commands/networking/ip_-o_addr'
819553
 
819553
     def __init__(self, config):
819553
         self.mapping = SoSIPMap()
819553
diff --git a/sos/cleaner/parsers/keyword_parser.py b/sos/cleaner/parsers/keyword_parser.py
819553
index 68de3727..694c6073 100644
819553
--- a/sos/cleaner/parsers/keyword_parser.py
819553
+++ b/sos/cleaner/parsers/keyword_parser.py
819553
@@ -20,7 +20,6 @@ class SoSKeywordParser(SoSCleanerParser):
819553
 
819553
     name = 'Keyword Parser'
819553
     map_file_key = 'keyword_map'
819553
-    prep_map_file = ''
819553
 
819553
     def __init__(self, config, keywords=None, keyword_file=None):
819553
         self.mapping = SoSKeywordMap()
819553
diff --git a/sos/cleaner/parsers/mac_parser.py b/sos/cleaner/parsers/mac_parser.py
819553
index 7ca80b8d..c74288cf 100644
819553
--- a/sos/cleaner/parsers/mac_parser.py
819553
+++ b/sos/cleaner/parsers/mac_parser.py
819553
@@ -30,7 +30,6 @@ class SoSMacParser(SoSCleanerParser):
819553
         '534f:53'
819553
     )
819553
     map_file_key = 'mac_map'
819553
-    prep_map_file = 'sos_commands/networking/ip_-d_address'
819553
 
819553
     def __init__(self, config):
819553
         self.mapping = SoSMacMap()
819553
diff --git a/sos/cleaner/parsers/username_parser.py b/sos/cleaner/parsers/username_parser.py
819553
index b142e371..35377a31 100644
819553
--- a/sos/cleaner/parsers/username_parser.py
819553
+++ b/sos/cleaner/parsers/username_parser.py
819553
@@ -25,14 +25,6 @@ class SoSUsernameParser(SoSCleanerParser):
819553
 
819553
     name = 'Username Parser'
819553
     map_file_key = 'username_map'
819553
-    prep_map_file = [
819553
-        'sos_commands/login/lastlog_-u_1000-60000',
819553
-        'sos_commands/login/lastlog_-u_60001-65536',
819553
-        'sos_commands/login/lastlog_-u_65537-4294967295',
819553
-        # AD users will be reported here, but favor the lastlog files since
819553
-        # those will include local users who have not logged in
819553
-        'sos_commands/login/last'
819553
-    ]
819553
     regex_patterns = []
819553
     skip_list = [
819553
         'core',
819553
diff --git a/tests/cleaner_tests/existing_archive.py b/tests/cleaner_tests/existing_archive.py
819553
index 0eaf6c8d..e13d1cae 100644
819553
--- a/tests/cleaner_tests/existing_archive.py
819553
+++ b/tests/cleaner_tests/existing_archive.py
819553
@@ -28,6 +28,13 @@ class ExistingArchiveCleanTest(StageTwoReportTest):
819553
     def test_obfuscation_log_created(self):
819553
         self.assertFileExists(os.path.join(self.tmpdir, '%s-obfuscation.log' % ARCHIVE))
819553
 
819553
+    def test_archive_type_correct(self):
819553
+        with open(os.path.join(self.tmpdir, '%s-obfuscation.log' % ARCHIVE), 'r') as log:
819553
+            for line in log:
819553
+                if "Loaded %s" % ARCHIVE in line:
819553
+                    assert 'as type sos report archive' in line, "Incorrect archive type detected: %s" % line
819553
+                    break
819553
+
819553
     def test_from_cmdline_logged(self):
819553
         with open(os.path.join(self.tmpdir, '%s-obfuscation.log' % ARCHIVE), 'r') as log:
819553
             for line in log:
819553
diff --git a/tests/cleaner_tests/full_report_run.py b/tests/cleaner_tests/full_report_run.py
819553
index 3b28e7a2..2de54946 100644
819553
--- a/tests/cleaner_tests/full_report_run.py
819553
+++ b/tests/cleaner_tests/full_report_run.py
819553
@@ -35,6 +35,9 @@ class FullCleanTest(StageTwoReportTest):
819553
     def test_tarball_named_obfuscated(self):
819553
         self.assertTrue('obfuscated' in self.archive)
819553
 
819553
+    def test_archive_type_correct(self):
819553
+        self.assertSosLogContains('Loaded .* as type sos report directory')
819553
+
819553
     def test_hostname_not_in_any_file(self):
819553
         host = self.sysinfo['pre']['networking']['hostname']
819553
         # much faster to just use grep here
819553
diff --git a/tests/cleaner_tests/report_with_mask.py b/tests/cleaner_tests/report_with_mask.py
819553
index 4f94ba33..08e873d4 100644
819553
--- a/tests/cleaner_tests/report_with_mask.py
819553
+++ b/tests/cleaner_tests/report_with_mask.py
819553
@@ -31,6 +31,9 @@ class ReportWithMask(StageOneReportTest):
819553
     def test_tarball_named_obfuscated(self):
819553
         self.assertTrue('obfuscated' in self.archive)
819553
 
819553
+    def test_archive_type_correct(self):
819553
+        self.assertSosLogContains('Loaded .* as type sos report directory')
819553
+
819553
     def test_localhost_was_obfuscated(self):
819553
         self.assertFileHasContent('/etc/hostname', 'host0')
819553
 
819553
-- 
819553
2.31.1
819553
819553
From 9b119f860eaec089f7ef884ff39c42589a662994 Mon Sep 17 00:00:00 2001
819553
From: Jake Hunsaker <jhunsake@redhat.com>
819553
Date: Wed, 1 Sep 2021 00:34:04 -0400
819553
Subject: [PATCH] [hostname_map] Add a catch for single-character hostnames
819553
819553
If a log file was truncated at a specific boundary in a string of the
819553
FQDN of the host such that we only get a couple characters before the
819553
rest of the domain, we would previously boldly replace all instances of
819553
that character with the obfuscated short name; not very helpful.
819553
819553
When this happens, don't sanitize the short name and instead
819553
obfuscate the whole FQDN as 'unknown.example.com'.
819553
819553
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
819553
---
819553
 sos/cleaner/mappings/hostname_map.py | 9 ++++++++-
819553
 1 file changed, 8 insertions(+), 1 deletion(-)
819553
819553
diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
819553
index d4b2c88e..e70a5530 100644
819553
--- a/sos/cleaner/mappings/hostname_map.py
819553
+++ b/sos/cleaner/mappings/hostname_map.py
819553
@@ -184,7 +184,14 @@ class SoSHostnameMap(SoSMap):
819553
             hostname = host[0]
819553
             domain = host[1:]
819553
             # obfuscate the short name
819553
-            ob_hostname = self.sanitize_short_name(hostname)
819553
+            if len(hostname) > 2:
819553
+                ob_hostname = self.sanitize_short_name(hostname)
819553
+            else:
819553
+                # by best practice it appears the host part of the fqdn was cut
819553
+                # off due to some form of truncating, as such don't obfuscate
819553
+                # short strings that are likely to throw off obfuscation of
819553
+                # unrelated bits and paths
819553
+                ob_hostname = 'unknown'
819553
             ob_domain = self.sanitize_domain(domain)
819553
             self.dataset[item] = ob_domain
819553
             return '.'.join([ob_hostname, ob_domain])
819553
-- 
819553
2.31.1
819553
819553
From f3f3e763d7c31b7b7cafdf8dd4dab87056fb7696 Mon Sep 17 00:00:00 2001
819553
From: Jake Hunsaker <jhunsake@redhat.com>
819553
Date: Wed, 1 Sep 2021 15:54:55 -0400
819553
Subject: [PATCH] [cleaner] Add support for Insights client archives
819553
819553
Adds a new type of `SoSObfuscationArchive` to add support for
819553
obfuscating archives generated by the Insights project.
819553
819553
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
819553
---
819553
 man/en/sos-clean.1               |  1 +
819553
 sos/cleaner/__init__.py          |  4 ++-
819553
 sos/cleaner/archives/insights.py | 42 ++++++++++++++++++++++++++++++++
819553
 3 files changed, 46 insertions(+), 1 deletion(-)
819553
 create mode 100644 sos/cleaner/archives/insights.py
819553
819553
diff --git a/man/en/sos-clean.1 b/man/en/sos-clean.1
819553
index 54026713..358ec0cb 100644
819553
--- a/man/en/sos-clean.1
819553
+++ b/man/en/sos-clean.1
819553
@@ -105,6 +105,7 @@ The following are accepted values for this option:
819553
     \fBauto\fR          Automatically detect the archive type
819553
     \fBreport\fR        An archive generated by \fBsos report\fR
819553
     \fBcollect\fR       An archive generated by \fBsos collect\fR
819553
+    \fBinsights\fR      An archive generated by the \fBinsights-client\fR package
819553
 
819553
 The following may also be used, however note that these do not attempt to pre-load
819553
 any information from the archives into the parsers. This means that, among other limitations,
819553
diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
819553
index 6d2eb483..3e08aa28 100644
819553
--- a/sos/cleaner/__init__.py
819553
+++ b/sos/cleaner/__init__.py
819553
@@ -29,6 +29,7 @@ from sos.cleaner.archives.sos import (SoSReportArchive, SoSReportDirectory,
819553
                                       SoSCollectorArchive,
819553
                                       SoSCollectorDirectory)
819553
 from sos.cleaner.archives.generic import DataDirArchive, TarballArchive
819553
+from sos.cleaner.archives.insights import InsightsArchive
819553
 from sos.utilities import get_human_readable
819553
 from textwrap import fill
819553
 
819553
@@ -100,6 +101,7 @@ class SoSCleaner(SoSComponent):
819553
             SoSReportArchive,
819553
             SoSCollectorDirectory,
819553
             SoSCollectorArchive,
819553
+            InsightsArchive,
819553
             # make sure these two are always last as they are fallbacks
819553
             DataDirArchive,
819553
             TarballArchive
819553
@@ -194,7 +196,7 @@ third party.
819553
                                help='The directory or archive to obfuscate')
819553
         clean_grp.add_argument('--archive-type', default='auto',
819553
                                choices=['auto', 'report', 'collect',
819553
-                                        'data-dir', 'tarball'],
819553
+                                        'insights', 'data-dir', 'tarball'],
819553
                                help=('Specify what kind of archive the target '
819553
                                      'was generated as'))
819553
         clean_grp.add_argument('--domains', action='extend', default=[],
819553
diff --git a/sos/cleaner/archives/insights.py b/sos/cleaner/archives/insights.py
819553
new file mode 100644
819553
index 00000000..dab48b16
819553
--- /dev/null
819553
+++ b/sos/cleaner/archives/insights.py
819553
@@ -0,0 +1,42 @@
819553
+# Copyright 2021 Red Hat, Inc. Jake Hunsaker <jhunsake@redhat.com>
819553
+
819553
+# This file is part of the sos project: https://github.com/sosreport/sos
819553
+#
819553
+# This copyrighted material is made available to anyone wishing to use,
819553
+# modify, copy, or redistribute it subject to the terms and conditions of
819553
+# version 2 of the GNU General Public License.
819553
+#
819553
+# See the LICENSE file in the source distribution for further information.
819553
+
819553
+
819553
+from sos.cleaner.archives import SoSObfuscationArchive
819553
+
819553
+import tarfile
819553
+
819553
+
819553
+class InsightsArchive(SoSObfuscationArchive):
819553
+    """This class represents archives generated by the insights-client utility
819553
+    for RHEL systems.
819553
+    """
819553
+
819553
+    type_name = 'insights'
819553
+    description = 'insights-client archive'
819553
+
819553
+    prep_files = {
819553
+        'hostname': 'data/insights_commands/hostname_-f',
819553
+        'ip': 'data/insights_commands/ip_addr',
819553
+        'mac': 'data/insights_commands/ip_addr'
819553
+    }
819553
+
819553
+    @classmethod
819553
+    def check_is_type(cls, arc_path):
819553
+        try:
819553
+            return tarfile.is_tarfile(arc_path) and 'insights-' in arc_path
819553
+        except Exception:
819553
+            return False
819553
+
819553
+    def get_archive_root(self):
819553
+        top = self.archive_path.split('/')[-1].split('.tar')[0]
819553
+        if self.tarobj.firstmember.name == '.':
819553
+            top = './' + top
819553
+        return top
819553
-- 
819553
2.31.1
819553
819553
From 9639dc3d240076b55f2a1d04b43ea42bebd09215 Mon Sep 17 00:00:00 2001
819553
From: Jake Hunsaker <jhunsake@redhat.com>
819553
Date: Tue, 16 Nov 2021 17:50:42 -0500
819553
Subject: [PATCH] [clean,hostname_parser] Source /etc/hosts for obfuscation
819553
819553
Up until now, our sourcing of hostnames/domains for obfuscation has been
819553
dependent upon the output of the `hostname` command. However, some
819553
scenarios have come up where sourcing `/etc/hosts` is advantageous for
819553
several reasons:
819553
819553
First, if `hostname` output is unavailable, this provides a fallback
819553
measure.
819553
819553
Second, `/etc/hosts` is a common place to have short names defined which
819553
would otherwise not be detected (or at the very least would result in a
819553
race condition based on where/if the short name was elsewhere able to be
819553
gleaned from an FQDN), thus leaving the potential for unobfuscated data
819553
in an archive.
819553
819553
Due to both the nature of hostname obfuscation and the malleable syntax
819553
of `/etc/hosts`, the parsing of this file needs special handling not
819553
covered by our more generic parsing and obfuscation methods.
819553
819553
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
819553
---
819553
 sos/cleaner/__init__.py                | 11 ++++++++---
819553
 sos/cleaner/archives/sos.py            |  5 ++++-
819553
 sos/cleaner/parsers/hostname_parser.py | 19 +++++++++++++++++++
819553
 3 files changed, 31 insertions(+), 4 deletions(-)
819553
819553
diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
819553
index ed461a8f..3f530d44 100644
819553
--- a/sos/cleaner/__init__.py
819553
+++ b/sos/cleaner/__init__.py
819553
@@ -523,9 +523,14 @@ third party.
819553
                         if isinstance(_parser, SoSUsernameParser):
819553
                             _parser.load_usernames_into_map(content)
819553
                         elif isinstance(_parser, SoSHostnameParser):
819553
-                            _parser.load_hostname_into_map(
819553
-                                content.splitlines()[0]
819553
-                            )
819553
+                            if 'hostname' in parse_file:
819553
+                                _parser.load_hostname_into_map(
819553
+                                    content.splitlines()[0]
819553
+                                )
819553
+                            elif 'etc/hosts' in parse_file:
819553
+                                _parser.load_hostname_from_etc_hosts(
819553
+                                    content
819553
+                                )
819553
                         else:
819553
                             for line in content.splitlines():
819553
                                 self.obfuscate_line(line)
819553
diff --git a/sos/cleaner/archives/sos.py b/sos/cleaner/archives/sos.py
819553
index 4401d710..f8720c88 100644
819553
--- a/sos/cleaner/archives/sos.py
819553
+++ b/sos/cleaner/archives/sos.py
819553
@@ -23,7 +23,10 @@ class SoSReportArchive(SoSObfuscationArchive):
819553
     type_name = 'report'
819553
     description = 'sos report archive'
819553
     prep_files = {
819553
-        'hostname': 'sos_commands/host/hostname',
819553
+        'hostname': [
819553
+            'sos_commands/host/hostname',
819553
+            'etc/hosts'
819553
+        ],
819553
         'ip': 'sos_commands/networking/ip_-o_addr',
819553
         'mac': 'sos_commands/networking/ip_-d_address',
819553
         'username': [
819553
diff --git a/sos/cleaner/parsers/hostname_parser.py b/sos/cleaner/parsers/hostname_parser.py
819553
index daa76a62..0a733bee 100644
819553
--- a/sos/cleaner/parsers/hostname_parser.py
819553
+++ b/sos/cleaner/parsers/hostname_parser.py
819553
@@ -61,6 +61,25 @@ class SoSHostnameParser(SoSCleanerParser):
819553
             self.mapping.add(high_domain)
819553
         self.mapping.add(hostname_string)
819553
 
819553
+    def load_hostname_from_etc_hosts(self, content):
819553
+        """Parse an archive's copy of /etc/hosts, which requires handling that
819553
+        is separate from the output of the `hostname` command. Just like
819553
+        load_hostname_into_map(), this has to be done explicitly and we
819553
+        cannot rely upon the more generic methods to do this reliably.
819553
+        """
819553
+        lines = content.splitlines()
819553
+        for line in lines:
819553
+            if line.startswith('#') or 'localhost' in line:
819553
+                continue
819553
+            hostln = line.split()[1:]
819553
+            for host in hostln:
819553
+                if len(host.split('.')) == 1:
819553
+                    # only generate a mapping for fqdns but still record the
819553
+                    # short name here for later obfuscation with parse_line()
819553
+                    self.short_names.append(host)
819553
+                else:
819553
+                    self.mapping.add(host)
819553
+
819553
     def parse_line(self, line):
819553
         """Override the default parse_line() method to also check for the
819553
         shortname of the host derived from the hostname.
819553
-- 
819553
2.31.1
819553
819553
From c1680226b53452b18f27f2e76c3e0e03e521f935 Mon Sep 17 00:00:00 2001
819553
From: Jake Hunsaker <jhunsake@redhat.com>
819553
Date: Wed, 17 Nov 2021 13:11:33 -0500
819553
Subject: [PATCH] [clean, hostname] Fix unintentionally case sensitive
819553
 shortname handling
819553
819553
It was discovered that our extra handling for shortnames was
819553
unintentionally case sensitive. Fix this to ensure that shortnames are
819553
obfuscated regardless of case in all collected text.
819553
819553
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
819553
---
819553
 sos/cleaner/mappings/hostname_map.py   |  6 +++---
819553
 sos/cleaner/parsers/hostname_parser.py |  8 +++++---
819553
 tests/cleaner_tests/full_report_run.py | 21 ++++++++++++++++++++-
819553
 3 files changed, 28 insertions(+), 7 deletions(-)
819553
819553
diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
819553
index e70a5530..0fe78fb1 100644
819553
--- a/sos/cleaner/mappings/hostname_map.py
819553
+++ b/sos/cleaner/mappings/hostname_map.py
819553
@@ -169,13 +169,13 @@ class SoSHostnameMap(SoSMap):
819553
 
819553
     def sanitize_item(self, item):
819553
         host = item.split('.')
819553
-        if all([h.isupper() for h in host]):
819553
+        if len(host) > 1 and all([h.isupper() for h in host]):
819553
             # by convention we have just a domain
819553
             _host = [h.lower() for h in host]
819553
             return self.sanitize_domain(_host).upper()
819553
         if len(host) == 1:
819553
             # we have a shortname for a host
819553
-            return self.sanitize_short_name(host[0])
819553
+            return self.sanitize_short_name(host[0].lower())
819553
         if len(host) == 2:
819553
             # we have just a domain name, e.g. example.com
819553
             return self.sanitize_domain(host)
819553
@@ -185,7 +185,7 @@ class SoSHostnameMap(SoSMap):
819553
             domain = host[1:]
819553
             # obfuscate the short name
819553
             if len(hostname) > 2:
819553
-                ob_hostname = self.sanitize_short_name(hostname)
819553
+                ob_hostname = self.sanitize_short_name(hostname.lower())
819553
             else:
819553
                 # by best practice it appears the host part of the fqdn was cut
819553
                 # off due to some form of truncating, as such don't obfuscate
819553
diff --git a/sos/cleaner/parsers/hostname_parser.py b/sos/cleaner/parsers/hostname_parser.py
819553
index 0a733bee..7fd0e698 100644
819553
--- a/sos/cleaner/parsers/hostname_parser.py
819553
+++ b/sos/cleaner/parsers/hostname_parser.py
819553
@@ -8,6 +8,8 @@
819553
 #
819553
 # See the LICENSE file in the source distribution for further information.
819553
 
819553
+import re
819553
+
819553
 from sos.cleaner.parsers import SoSCleanerParser
819553
 from sos.cleaner.mappings.hostname_map import SoSHostnameMap
819553
 
819553
@@ -91,9 +93,9 @@ class SoSHostnameParser(SoSCleanerParser):
819553
             """
819553
             if search in self.mapping.skip_keys:
819553
                 return ln, count
819553
-            if search in ln:
819553
-                count += ln.count(search)
819553
-                ln = ln.replace(search, self.mapping.get(repl or search))
819553
+            _reg = re.compile(search, re.I)
819553
+            if _reg.search(ln):
819553
+                return _reg.subn(self.mapping.get(repl or search), ln)
819553
             return ln, count
819553
 
819553
         count = 0
819553
diff --git a/tests/cleaner_tests/full_report_run.py b/tests/cleaner_tests/full_report_run.py
819553
index 2de54946..0b23acaf 100644
819553
--- a/tests/cleaner_tests/full_report_run.py
819553
+++ b/tests/cleaner_tests/full_report_run.py
819553
@@ -26,6 +26,24 @@ class FullCleanTest(StageTwoReportTest):
819553
     # replace with an empty placeholder, make sure that this test case is not
819553
     # influenced by previous clean runs
819553
     files = ['/etc/sos/cleaner/default_mapping']
819553
+    packages = {
819553
+        'rhel': ['python3-systemd'],
819553
+        'ubuntu': ['python3-systemd']
819553
+    }
819553
+
819553
+    def pre_sos_setup(self):
819553
+        # ensure that case-insensitive matching of FQDNs and shortnames work
819553
+        from systemd import journal
819553
+        from socket import gethostname
819553
+        host = gethostname()
819553
+        short = host.split('.')[0]
819553
+        sosfd = journal.stream('sos-testing')
819553
+        sosfd.write(
819553
+            "This is a test line from sos clean testing. The hostname %s "
819553
+            "should not appear, nor should %s in an obfuscated archive. The "
819553
+            "shortnames of %s and %s should also not appear."
819553
+            % (host.lower(), host.upper(), short.lower(), short.upper())
819553
+        )
819553
 
819553
     def test_private_map_was_generated(self):
819553
         self.assertOutputContains('A mapping of obfuscated elements is available at')
819553
@@ -40,8 +58,9 @@ class FullCleanTest(StageTwoReportTest):
819553
 
819553
     def test_hostname_not_in_any_file(self):
819553
         host = self.sysinfo['pre']['networking']['hostname']
819553
+        short = host.split('.')[0]
819553
         # much faster to just use grep here
819553
-        content = self.grep_for_content(host)
819553
+        content = self.grep_for_content(host) + self.grep_for_content(short)
819553
         if not content:
819553
             assert True
819553
         else:
819553
-- 
819553
2.31.1
819553
819553
From aaeb8cb57ed55598ab744b96d4f127aedebcb292 Mon Sep 17 00:00:00 2001
819553
From: Jake Hunsaker <jhunsake@redhat.com>
819553
Date: Tue, 21 Sep 2021 15:23:20 -0400
819553
Subject: [PATCH] [build] Add archives to setup.py packages
819553
819553
Adds the newly abstracted `sos.cleaner.archives` package to `setup.py`
819553
so that manual builds will properly include it.
819553
819553
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
819553
---
819553
 setup.py | 2 +-
819553
 1 file changed, 1 insertion(+), 1 deletion(-)
819553
819553
diff --git a/setup.py b/setup.py
819553
index 1e8d8e2dc5..7653b59de3 100644
819553
--- a/setup.py
819553
+++ b/setup.py
819553
@@ -102,7 +102,7 @@ def copy_file (self, filename, dirname):
819553
         'sos.policies.package_managers', 'sos.policies.init_systems',
819553
         'sos.report', 'sos.report.plugins', 'sos.collector',
819553
         'sos.collector.clusters', 'sos.cleaner', 'sos.cleaner.mappings',
819553
-        'sos.cleaner.parsers'
819553
+        'sos.cleaner.parsers', 'sos.cleaner.archives'
819553
     ],
819553
     cmdclass=cmdclass,
819553
     command_options=command_options,
819553
-- 
819553
2.31.1
819553
819553
From ba3528230256429a4394f155a9ca1fdb91cf3560 Mon Sep 17 00:00:00 2001
819553
From: Jake Hunsaker <jhunsake@redhat.com>
819553
Date: Tue, 30 Nov 2021 12:46:34 -0500
819553
Subject: [PATCH 1/2] [hostname] Simplify case matching for domains
819553
819553
Instead of special-casing all-uppercase domain conventions, use our
819553
normal flow for obfuscation and just match the casing at the end of the
819553
sanitization routine.
819553
819553
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
819553
---
819553
 sos/cleaner/mappings/hostname_map.py | 14 ++++++++------
819553
 1 file changed, 8 insertions(+), 6 deletions(-)
819553
819553
diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
819553
index 0fe78fb1..5cd8e985 100644
819553
--- a/sos/cleaner/mappings/hostname_map.py
819553
+++ b/sos/cleaner/mappings/hostname_map.py
819553
@@ -169,16 +169,15 @@ class SoSHostnameMap(SoSMap):
819553
 
819553
     def sanitize_item(self, item):
819553
         host = item.split('.')
819553
-        if len(host) > 1 and all([h.isupper() for h in host]):
819553
-            # by convention we have just a domain
819553
-            _host = [h.lower() for h in host]
819553
-            return self.sanitize_domain(_host).upper()
819553
         if len(host) == 1:
819553
             # we have a shortname for a host
819553
             return self.sanitize_short_name(host[0].lower())
819553
         if len(host) == 2:
819553
             # we have just a domain name, e.g. example.com
819553
-            return self.sanitize_domain(host)
819553
+            dname = self.sanitize_domain(host)
819553
+            if all([h.isupper() for h in host]):
819553
+                dname = dname.upper()
819553
+            return dname
819553
         if len(host) > 2:
819553
             # we have an FQDN, e.g. foo.example.com
819553
             hostname = host[0]
819553
@@ -194,7 +193,10 @@ class SoSHostnameMap(SoSMap):
819553
                 ob_hostname = 'unknown'
819553
             ob_domain = self.sanitize_domain(domain)
819553
             self.dataset[item] = ob_domain
819553
-            return '.'.join([ob_hostname, ob_domain])
819553
+            _fqdn = '.'.join([ob_hostname, ob_domain])
819553
+            if all([h.isupper() for h in host]):
819553
+                _fqdn = _fqdn.upper()
819553
+            return _fqdn
819553
 
819553
     def sanitize_short_name(self, hostname):
819553
         """Obfuscate the short name of the host with an incremented counter
819553
-- 
819553
2.31.1
819553
819553
819553
From 189586728de22dd55122c1f7e06b19590f9a788f Mon Sep 17 00:00:00 2001
819553
From: Jake Hunsaker <jhunsake@redhat.com>
819553
Date: Tue, 30 Nov 2021 12:47:58 -0500
819553
Subject: [PATCH 2/2] [username] Improve username sourcing and remove case
819553
 sensitivity
819553
819553
First, don't skip the first line of `last` output, and instead add the
819553
header from lastlog to the skip list. Additionally, add
819553
`/etc/cron.allow` and `/etc/cron.deny` as sources for usernames that
819553
might not appear in other locations in certain environments.
819553
819553
Also, make matching and replacement case insensitive.
819553
819553
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
819553
---
819553
 sos/cleaner/archives/sos.py            |  4 +++-
819553
 sos/cleaner/mappings/username_map.py   |  2 +-
819553
 sos/cleaner/parsers/username_parser.py | 14 +++++++++-----
819553
 3 files changed, 13 insertions(+), 7 deletions(-)
819553
819553
diff --git a/sos/cleaner/archives/sos.py b/sos/cleaner/archives/sos.py
819553
index f8720c88..12766496 100644
819553
--- a/sos/cleaner/archives/sos.py
819553
+++ b/sos/cleaner/archives/sos.py
819553
@@ -35,7 +35,9 @@ class SoSReportArchive(SoSObfuscationArchive):
819553
             'sos_commands/login/lastlog_-u_65537-4294967295',
819553
             # AD users will be reported here, but favor the lastlog files since
819553
             # those will include local users who have not logged in
819553
-            'sos_commands/login/last'
819553
+            'sos_commands/login/last',
819553
+            'etc/cron.allow',
819553
+            'etc/cron.deny'
819553
         ]
819553
     }
819553
 
819553
diff --git a/sos/cleaner/mappings/username_map.py b/sos/cleaner/mappings/username_map.py
819553
index cdbf36fe..7ecccd7b 100644
819553
--- a/sos/cleaner/mappings/username_map.py
819553
+++ b/sos/cleaner/mappings/username_map.py
819553
@@ -33,5 +33,5 @@ class SoSUsernameMap(SoSMap):
819553
         ob_name = "obfuscateduser%s" % self.name_count
819553
         self.name_count += 1
819553
         if ob_name in self.dataset.values():
819553
-            return self.sanitize_item(username)
819553
+            return self.sanitize_item(username.lower())
819553
         return ob_name
819553
diff --git a/sos/cleaner/parsers/username_parser.py b/sos/cleaner/parsers/username_parser.py
819553
index 35377a31..229c7de4 100644
819553
--- a/sos/cleaner/parsers/username_parser.py
819553
+++ b/sos/cleaner/parsers/username_parser.py
819553
@@ -8,6 +8,7 @@
819553
 #
819553
 # See the LICENSE file in the source distribution for further information.
819553
 
819553
+import re
819553
 
819553
 from sos.cleaner.parsers import SoSCleanerParser
819553
 from sos.cleaner.mappings.username_map import SoSUsernameMap
819553
@@ -34,6 +35,7 @@ class SoSUsernameParser(SoSCleanerParser):
819553
         'reboot',
819553
         'root',
819553
         'ubuntu',
819553
+        'username',
819553
         'wtmp'
819553
     ]
819553
 
819553
@@ -47,12 +49,12 @@ class SoSUsernameParser(SoSCleanerParser):
819553
         this parser, we need to override the initial parser prepping here.
819553
         """
819553
         users = set()
819553
-        for line in content.splitlines()[1:]:
819553
+        for line in content.splitlines():
819553
             try:
819553
                 user = line.split()[0]
819553
             except Exception:
819553
                 continue
819553
-            if user in self.skip_list:
819553
+            if user.lower() in self.skip_list:
819553
                 continue
819553
             users.add(user)
819553
         for each in users:
819553
@@ -61,7 +63,9 @@ class SoSUsernameParser(SoSCleanerParser):
819553
     def parse_line(self, line):
819553
         count = 0
819553
         for username in sorted(self.mapping.dataset.keys(), reverse=True):
819553
-            if username in line:
819553
-                count = line.count(username)
819553
-                line = line.replace(username, self.mapping.get(username))
819553
+            _reg = re.compile(username, re.I)
819553
+            if _reg.search(line):
819553
+                line, count = _reg.subn(
819553
+                    self.mapping.get(username.lower()), line
819553
+                )
819553
         return line, count
819553
-- 
819553
2.31.1
819553
819553
From cafd0f3a52436a3966576e7db21e5dd17c06f0cc Mon Sep 17 00:00:00 2001
819553
From: Jake Hunsaker <jhunsake@redhat.com>
819553
Date: Sun, 12 Dec 2021 11:10:46 -0500
819553
Subject: [PATCH] [hostname] Fix edge case for new hosts in a known subdomain
819553
819553
Fixes an edge case that would cause us to at first not recognize that a
819553
given hostname string is a new host in a known subdomain, but then on
819553
the obfuscation attempt properly recognize it as such, resulting in an
819553
incomplete obfuscation.
819553
819553
This was mostly triggered by specific patterns for build hosts within
819553
`sos_commands/rpm/package-data`. With this refined check, these types of
819553
matches are properly obfuscated.
819553
819553
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
819553
---
819553
 sos/cleaner/mappings/hostname_map.py | 9 +++++----
819553
 1 file changed, 5 insertions(+), 4 deletions(-)
819553
819553
diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
819553
index 5cd8e9857..33b0e6c80 100644
819553
--- a/sos/cleaner/mappings/hostname_map.py
819553
+++ b/sos/cleaner/mappings/hostname_map.py
819553
@@ -129,7 +129,7 @@ def get(self, item):
819553
             item = item[0:-1]
819553
         if not self.domain_name_in_loaded_domains(item.lower()):
819553
             return item
819553
-        if item.endswith(('.yaml', '.yml', '.crt', '.key', '.pem')):
819553
+        if item.endswith(('.yaml', '.yml', '.crt', '.key', '.pem', '.log')):
819553
             ext = '.' + item.split('.')[-1]
819553
             item = item.replace(ext, '')
819553
             suffix += ext
819553
@@ -148,7 +148,8 @@ def get(self, item):
819553
                 if len(_test) == 1 or not _test[0]:
819553
                     # does not match existing obfuscation
819553
                     continue
819553
-                elif _test[0].endswith('.') and not _host_substr:
819553
+                elif not _host_substr and (_test[0].endswith('.') or
819553
+                                           item.endswith(_existing)):
819553
                     # new hostname in known domain
819553
                     final = super(SoSHostnameMap, self).get(item)
819553
                     break
819553
@@ -219,8 +220,8 @@ def sanitize_domain(self, domain):
819553
             # don't obfuscate vendor domains
819553
             if re.match(_skip, '.'.join(domain)):
819553
                 return '.'.join(domain)
819553
-        top_domain = domain[-1]
819553
-        dname = '.'.join(domain[0:-1])
819553
+        top_domain = domain[-1].lower()
819553
+        dname = '.'.join(domain[0:-1]).lower()
819553
         ob_domain = self._new_obfuscated_domain(dname)
819553
         ob_domain = '.'.join([ob_domain, top_domain])
819553
         self.dataset['.'.join(domain)] = ob_domain
819553
From f5e1298162a9393ea2d9f5c4df40dfece50f5f88 Mon Sep 17 00:00:00 2001
819553
From: Jake Hunsaker <jhunsake@redhat.com>
819553
Date: Thu, 6 Jan 2022 13:15:15 -0500
819553
Subject: [PATCH 1/3] [hostname] Fix loading and detection of long base domains
819553
819553
Our domain matching has up to now assumed that users would be providing
819553
'base' domains such as 'example.com' whereby something like
819553
'foo.bar.example.com' is a subdomain (or host) within that base domain.
819553
819553
However, the use case exists to provide 'foo.bar.example.com' as the
819553
base domain, without wanting to obfuscate 'example.com' directly.
819553
819553
This commit fixes our handling of both loading these longer domains and
819553
doing the 'domain is part of a domain we want to obfuscate' check.
819553
819553
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
819553
---
819553
 sos/cleaner/mappings/hostname_map.py | 9 ++++++++-
819553
 1 file changed, 8 insertions(+), 1 deletion(-)
819553
819553
diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
819553
index 33b0e6c8..7a7cf6b8 100644
819553
--- a/sos/cleaner/mappings/hostname_map.py
819553
+++ b/sos/cleaner/mappings/hostname_map.py
819553
@@ -50,10 +50,14 @@ class SoSHostnameMap(SoSMap):
819553
         in this parser, we need to re-inject entries from the map_file into
819553
         these dicts and not just the underlying 'dataset' dict
819553
         """
819553
-        for domain in self.dataset:
819553
+        for domain, ob_pair in self.dataset.items():
819553
             if len(domain.split('.')) == 1:
819553
                 self.hosts[domain.split('.')[0]] = self.dataset[domain]
819553
             else:
819553
+                if ob_pair.startswith('obfuscateddomain'):
819553
+                    # directly exact domain matches
819553
+                    self._domains[domain] = ob_pair.split('.')[0]
819553
+                    continue
819553
                 # strip the host name and trailing top-level domain so that
819553
                 # we in inject the domain properly for later string matching
819553
 
819553
@@ -102,9 +106,12 @@ class SoSHostnameMap(SoSMap):
819553
         and should be obfuscated
819553
         """
819553
         host = domain.split('.')
819553
+        no_tld = '.'.join(domain.split('.')[0:-1])
819553
         if len(host) == 1:
819553
             # don't block on host's shortname
819553
             return host[0] in self.hosts.keys()
819553
+        elif any([no_tld.endswith(_d) for _d in self._domains]):
819553
+            return True
819553
         else:
819553
             domain = host[0:-1]
819553
             for known_domain in self._domains:
819553
-- 
819553
2.31.1
819553
819553
819553
From e241cf33a14ecd4e848a5fd857c5d3d7d07fbd71 Mon Sep 17 00:00:00 2001
819553
From: Jake Hunsaker <jhunsake@redhat.com>
819553
Date: Thu, 6 Jan 2022 13:18:44 -0500
819553
Subject: [PATCH 2/3] [cleaner] Improve parser-specific file skipping
819553
819553
This commit improves our handling of skipping files on a per-parser
819553
basis, by first filtering the list of parsers that `obfuscate_line()`
819553
will iterate over by the parser's `skip_file` class attr, rather than
819553
relying on higher-level checks.
819553
819553
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
819553
---
819553
 sos/cleaner/__init__.py | 17 ++++++++++++++---
819553
 1 file changed, 14 insertions(+), 3 deletions(-)
819553
819553
diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
819553
index 3f530d44..5686e213 100644
819553
--- a/sos/cleaner/__init__.py
819553
+++ b/sos/cleaner/__init__.py
819553
@@ -12,6 +12,7 @@ import hashlib
819553
 import json
819553
 import logging
819553
 import os
819553
+import re
819553
 import shutil
819553
 import tempfile
819553
 
819553
@@ -640,10 +641,16 @@ third party.
819553
             self.log_debug("Obfuscating %s" % short_name or filename,
819553
                            caller=arc_name)
819553
             tfile = tempfile.NamedTemporaryFile(mode='w', dir=self.tmpdir)
819553
+            _parsers = [
819553
+                _p for _p in self.parsers if not
819553
+                any([
819553
+                    re.match(p, short_name) for p in _p.skip_files
819553
+                ])
819553
+            ]
819553
             with open(filename, 'r') as fname:
819553
                 for line in fname:
819553
                     try:
819553
-                        line, count = self.obfuscate_line(line)
819553
+                        line, count = self.obfuscate_line(line, _parsers)
819553
                         subs += count
819553
                         tfile.write(line)
819553
                     except Exception as err:
819553
@@ -713,7 +720,7 @@ third party.
819553
                 pass
819553
         return string_data
819553
 
819553
-    def obfuscate_line(self, line):
819553
+    def obfuscate_line(self, line, parsers=None):
819553
         """Run a line through each of the obfuscation parsers, keeping a
819553
         cumulative total of substitutions done on that particular line.
819553
 
819553
@@ -721,6 +728,8 @@ third party.
819553
 
819553
             :param line str:        The raw line as read from the file being
819553
                                     processed
819553
+            :param parsers:         A list of parser objects to obfuscate
819553
+                                    with. If None, use all.
819553
 
819553
         Returns the fully obfuscated line and the number of substitutions made
819553
         """
819553
@@ -729,7 +738,9 @@ third party.
819553
         count = 0
819553
         if not line.strip():
819553
             return line, count
819553
-        for parser in self.parsers:
819553
+        if parsers is None:
819553
+            parsers = self.parsers
819553
+        for parser in parsers:
819553
             try:
819553
                 line, _count = parser.parse_line(line)
819553
                 count += _count
819553
-- 
819553
2.31.1
819553
819553
819553
From 96c9a833e77639a853b7d3d6f1df68bbbbe5e9cb Mon Sep 17 00:00:00 2001
819553
From: Jake Hunsaker <jhunsake@redhat.com>
819553
Date: Thu, 6 Jan 2022 13:20:32 -0500
819553
Subject: [PATCH 3/3] [cleaner] Add skips for known files and usernames
819553
819553
Adds skips for `/proc/kallsyms`, which should never be obfuscated, as
819553
well as any packaging-related log file for the IP parser. Further, do
819553
not obfuscate the `stack` user, as that is a well-known user for many
819553
configurations that, if obfuscated, could result in undesired string
819553
substitutions in normal logging.
819553
819553
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
819553
---
819553
 sos/cleaner/archives/__init__.py       | 2 ++
819553
 sos/cleaner/parsers/ip_parser.py       | 3 ++-
819553
 sos/cleaner/parsers/username_parser.py | 1 +
819553
 3 files changed, 5 insertions(+), 1 deletion(-)
819553
819553
diff --git a/sos/cleaner/archives/__init__.py b/sos/cleaner/archives/__init__.py
819553
index 795c5a78..cbf1f809 100644
819553
--- a/sos/cleaner/archives/__init__.py
819553
+++ b/sos/cleaner/archives/__init__.py
819553
@@ -43,6 +43,7 @@ class SoSObfuscationArchive():
819553
     type_name = 'undetermined'
819553
     description = 'undetermined'
819553
     is_nested = False
819553
+    skip_files = []
819553
     prep_files = {}
819553
 
819553
     def __init__(self, archive_path, tmpdir):
819553
@@ -111,6 +112,7 @@ class SoSObfuscationArchive():
819553
         Returns: list of files and file regexes
819553
         """
819553
         return [
819553
+            'proc/kallsyms',
819553
             'sosreport-',
819553
             'sys/firmware',
819553
             'sys/fs',
819553
diff --git a/sos/cleaner/parsers/ip_parser.py b/sos/cleaner/parsers/ip_parser.py
819553
index 71d38be8..b007368c 100644
819553
--- a/sos/cleaner/parsers/ip_parser.py
819553
+++ b/sos/cleaner/parsers/ip_parser.py
819553
@@ -37,7 +37,8 @@ class SoSIPParser(SoSCleanerParser):
819553
         'sos_commands/snappy/snap_list_--all',
819553
         'sos_commands/snappy/snap_--version',
819553
         'sos_commands/vulkan/vulkaninfo',
819553
-        'var/log/.*dnf.*'
819553
+        'var/log/.*dnf.*',
819553
+        'var/log/.*packag.*'  # get 'packages' and 'packaging' logs
819553
     ]
819553
 
819553
     map_file_key = 'ip_map'
819553
diff --git a/sos/cleaner/parsers/username_parser.py b/sos/cleaner/parsers/username_parser.py
819553
index 229c7de4..3208a655 100644
819553
--- a/sos/cleaner/parsers/username_parser.py
819553
+++ b/sos/cleaner/parsers/username_parser.py
819553
@@ -32,6 +32,7 @@ class SoSUsernameParser(SoSCleanerParser):
819553
         'nobody',
819553
         'nfsnobody',
819553
         'shutdown',
819553
+        'stack',
819553
         'reboot',
819553
         'root',
819553
         'ubuntu',
819553
-- 
819553
2.31.1
819553
819553
From 7ebb2ce0bcd13c1b3aada648aceb20b5aff636d9 Mon Sep 17 00:00:00 2001
819553
From: Jake Hunsaker <jhunsake@redhat.com>
819553
Date: Tue, 15 Feb 2022 14:18:02 -0500
819553
Subject: [PATCH] [host] Skip entire /etc/sos/cleaner directory
819553
819553
While `default_mapping` is typically the only file expected under
819553
`/etc/sos/cleaner/`, it is possible for other mapping files (such as
819553
backups) to appear there.
819553
819553
Make the `add_forbidden_path()` spec here target the entire cleaner
819553
directory to avoid ever capturing these map files.
819553
819553
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
819553
---
819553
 sos/report/plugins/host.py | 2 +-
819553
 1 file changed, 1 insertion(+), 1 deletion(-)
819553
819553
diff --git a/sos/report/plugins/host.py b/sos/report/plugins/host.py
819553
index 5e21da7b8e..95a3b9cd95 100644
819553
--- a/sos/report/plugins/host.py
819553
+++ b/sos/report/plugins/host.py
819553
@@ -20,7 +20,7 @@ class Host(Plugin, IndependentPlugin):
819553
 
819553
     def setup(self):
819553
 
819553
-        self.add_forbidden_path('/etc/sos/cleaner/default_mapping')
819553
+        self.add_forbidden_path('/etc/sos/cleaner')
819553
 
819553
         self.add_cmd_output('hostname', root_symlink='hostname')
819553
         self.add_cmd_output('uptime', root_symlink='uptime')