Blame SOURCES/sos-bz2023867-cleaner-hostnames-improvements.patch

003633
From decd39b7799a0579ea085b0da0728b6eabd49b38 Mon Sep 17 00:00:00 2001
003633
From: Jake Hunsaker <jhunsake@redhat.com>
003633
Date: Wed, 1 Sep 2021 00:28:58 -0400
003633
Subject: [PATCH] [clean] Provide archive abstractions to obfuscate more than
003633
 sos archives
003633
003633
This commit removes the restriction imposed on `sos clean` since its
003633
introduction in sos-4.0 to only work against known sos report archives
003633
or build directories. This is because there has been interest in using
003633
the obfuscation bits of sos in other data-collector projects.
003633
003633
The `SoSObfuscationArchive()` class has been revamped to now be an
003633
abstraction for different types of archives, and the cleaner logic has
003633
been updated to leverage this new abstraction rather than assuming we're
003633
working on an sos archive.
003633
003633
Abstractions are added for our own native use cases - those being `sos
003633
report` and `sos collect` for at-runtime obfuscation, as well as
003633
standalone archives previously generated. Further generic abstractions
003633
are available for plain directories and tarballs; however, these will not
003633
provide the same level of coverage as fully supported archive types, as
003633
is noted in the manpage for sos-clean.
003633
003633
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
003633
---
003633
 man/en/sos-clean.1                            |  25 ++
003633
 sos/cleaner/__init__.py                       | 308 +++++++++---------
003633
 .../__init__.py}                              |  80 ++++-
003633
 sos/cleaner/archives/generic.py               |  52 +++
003633
 sos/cleaner/archives/sos.py                   | 106 ++++++
003633
 sos/cleaner/parsers/__init__.py               |   6 -
003633
 sos/cleaner/parsers/hostname_parser.py        |   1 -
003633
 sos/cleaner/parsers/ip_parser.py              |   1 -
003633
 sos/cleaner/parsers/keyword_parser.py         |   1 -
003633
 sos/cleaner/parsers/mac_parser.py             |   1 -
003633
 sos/cleaner/parsers/username_parser.py        |   8 -
003633
 tests/cleaner_tests/existing_archive.py       |   7 +
003633
 tests/cleaner_tests/full_report_run.py        |   3 +
003633
 tests/cleaner_tests/report_with_mask.py       |   3 +
003633
 14 files changed, 423 insertions(+), 179 deletions(-)
003633
 rename sos/cleaner/{obfuscation_archive.py => archives/__init__.py} (81%)
003633
 create mode 100644 sos/cleaner/archives/generic.py
003633
 create mode 100644 sos/cleaner/archives/sos.py
003633
003633
diff --git a/man/en/sos-clean.1 b/man/en/sos-clean.1
003633
index b77bc63c..54026713 100644
003633
--- a/man/en/sos-clean.1
003633
+++ b/man/en/sos-clean.1
003633
@@ -10,6 +10,7 @@ sos clean - Obfuscate sensitive data from one or more sosreports
003633
     [\-\-jobs]
003633
     [\-\-no-update]
003633
     [\-\-keep-binary-files]
003633
+    [\-\-archive-type]
003633
 
003633
 .SH DESCRIPTION
003633
 \fBsos clean\fR or \fBsos mask\fR is an sos subcommand used to obfuscate sensitive information from
003633
@@ -88,6 +89,30 @@ Users should review any archive that keeps binary files in place before sending
003633
 a third party.
003633
 
003633
 Default: False (remove encountered binary files)
003633
+.TP
003633
+.B \-\-archive-type TYPE
003633
+Specify the type of archive that TARGET was generated as.
003633
+When sos inspects a TARGET archive, it tries to identify what type of archive it is.
003633
+For example, it may be a report generated by \fBsos report\fR, or a collection of those
003633
+reports generated by \fBsos collect\fR, which require separate approaches.
003633
+
003633
+This option may be useful if a given TARGET archive is known to be of a specific type
003633
+but, due to unknown reasons or malformed/missing information in the archive itself,
003633
+is not properly identified as such by sos.
003633
+
003633
+The following are accepted values for this option:
003633
+
003633
+    \fBauto\fR          Automatically detect the archive type
003633
+    \fBreport\fR        An archive generated by \fBsos report\fR
003633
+    \fBcollect\fR       An archive generated by \fBsos collect\fR
003633
+
003633
+The following may also be used; note, however, that these do not attempt to pre-load
003633
+any information from the archives into the parsers. This means that, among other limitations,
003633
+items like host and domain names may not be obfuscated unless an obfuscated mapping already exists
003633
+on the system from a previous execution.
003633
+
003633
+    \fBdata-dir\fR      A plain directory on the filesystem.
003633
+    \fBtarball\fR       A generic tar archive not associated with any known tool
003633
 
003633
 .SH SEE ALSO
003633
 .BR sos (1)
003633
diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
003633
index 6aadfe79..6d2eb483 100644
003633
--- a/sos/cleaner/__init__.py
003633
+++ b/sos/cleaner/__init__.py
003633
@@ -12,9 +12,7 @@ import hashlib
003633
 import json
003633
 import logging
003633
 import os
003633
-import re
003633
 import shutil
003633
-import tarfile
003633
 import tempfile
003633
 
003633
 from concurrent.futures import ThreadPoolExecutor
003633
@@ -27,7 +25,10 @@ from sos.cleaner.parsers.mac_parser import SoSMacParser
003633
 from sos.cleaner.parsers.hostname_parser import SoSHostnameParser
003633
 from sos.cleaner.parsers.keyword_parser import SoSKeywordParser
003633
 from sos.cleaner.parsers.username_parser import SoSUsernameParser
003633
-from sos.cleaner.obfuscation_archive import SoSObfuscationArchive
003633
+from sos.cleaner.archives.sos import (SoSReportArchive, SoSReportDirectory,
003633
+                                      SoSCollectorArchive,
003633
+                                      SoSCollectorDirectory)
003633
+from sos.cleaner.archives.generic import DataDirArchive, TarballArchive
003633
 from sos.utilities import get_human_readable
003633
 from textwrap import fill
003633
 
003633
@@ -41,6 +42,7 @@ class SoSCleaner(SoSComponent):
003633
     desc = "Obfuscate sensitive networking information in a report"
003633
 
003633
     arg_defaults = {
003633
+        'archive_type': 'auto',
003633
         'domains': [],
003633
         'jobs': 4,
003633
         'keywords': [],
003633
@@ -70,6 +72,7 @@ class SoSCleaner(SoSComponent):
003633
             self.from_cmdline = False
003633
             if not hasattr(self.opts, 'jobs'):
003633
                 self.opts.jobs = 4
003633
+            self.opts.archive_type = 'auto'
003633
             self.soslog = logging.getLogger('sos')
003633
             self.ui_log = logging.getLogger('sos_ui')
003633
             # create the tmp subdir here to avoid a potential race condition
003633
@@ -92,6 +95,17 @@ class SoSCleaner(SoSComponent):
003633
             SoSUsernameParser(self.cleaner_mapping, self.opts.usernames)
003633
         ]
003633
 
003633
+        self.archive_types = [
003633
+            SoSReportDirectory,
003633
+            SoSReportArchive,
003633
+            SoSCollectorDirectory,
003633
+            SoSCollectorArchive,
003633
+            # make sure these two are always last as they are fallbacks
003633
+            DataDirArchive,
003633
+            TarballArchive
003633
+        ]
003633
+        self.nested_archive = None
003633
+
003633
         self.log_info("Cleaner initialized. From cmdline: %s"
003633
                       % self.from_cmdline)
003633
 
003633
@@ -178,6 +192,11 @@ third party.
003633
         )
003633
         clean_grp.add_argument('target', metavar='TARGET',
003633
                                help='The directory or archive to obfuscate')
003633
+        clean_grp.add_argument('--archive-type', default='auto',
003633
+                               choices=['auto', 'report', 'collect',
003633
+                                        'data-dir', 'tarball'],
003633
+                               help=('Specify what kind of archive the target '
003633
+                                     'was generated as'))
003633
         clean_grp.add_argument('--domains', action='extend', default=[],
003633
                                help='List of domain names to obfuscate')
003633
         clean_grp.add_argument('-j', '--jobs', default=4, type=int,
003633
@@ -218,59 +237,28 @@ third party.
003633
 
003633
         In the event the target path is not an archive, abort.
003633
         """
003633
-        if not tarfile.is_tarfile(self.opts.target):
003633
-            self.ui_log.error(
003633
-                "Invalid target: must be directory or tar archive"
003633
-            )
003633
-            self._exit(1)
003633
-
003633
-        archive = tarfile.open(self.opts.target)
003633
-        self.arc_name = self.opts.target.split('/')[-1].split('.')[:-2][0]
003633
-
003633
-        try:
003633
-            archive.getmember(os.path.join(self.arc_name, 'sos_logs'))
003633
-        except Exception:
003633
-            # this is not an sos archive
003633
-            self.ui_log.error("Invalid target: not an sos archive")
003633
-            self._exit(1)
003633
-
003633
-        # see if there are archives within this archive
003633
-        nested_archives = []
003633
-        for _file in archive.getmembers():
003633
-            if (re.match('sosreport-.*.tar', _file.name.split('/')[-1]) and not
003633
-                    (_file.name.endswith(('.md5', '.sha256')))):
003633
-                nested_archives.append(_file.name.split('/')[-1])
003633
-
003633
-        if nested_archives:
003633
-            self.log_info("Found nested archive(s), extracting top level")
003633
-            nested_path = self.extract_archive(archive)
003633
-            for arc_file in os.listdir(nested_path):
003633
-                if re.match('sosreport.*.tar.*', arc_file):
003633
-                    if arc_file.endswith(('.md5', '.sha256')):
003633
-                        continue
003633
-                    self.report_paths.append(os.path.join(nested_path,
003633
-                                                          arc_file))
003633
-            # add the toplevel extracted archive
003633
-            self.report_paths.append(nested_path)
003633
+        _arc = None
003633
+        if self.opts.archive_type != 'auto':
003633
+            check_type = self.opts.archive_type.replace('-', '_')
003633
+            for archive in self.archive_types:
003633
+                if archive.type_name == check_type:
003633
+                    _arc = archive(self.opts.target, self.tmpdir)
003633
         else:
003633
-            self.report_paths.append(self.opts.target)
003633
-
003633
-        archive.close()
003633
-
003633
-    def extract_archive(self, archive):
003633
-        """Extract an archive into our tmpdir so that we may inspect it or
003633
-        iterate through its contents for obfuscation
003633
-
003633
-        Positional arguments:
003633
-
003633
-            :param archive:     An open TarFile object for the archive
003633
-
003633
-        """
003633
-        if not isinstance(archive, tarfile.TarFile):
003633
-            archive = tarfile.open(archive)
003633
-        path = os.path.join(self.tmpdir, 'cleaner')
003633
-        archive.extractall(path)
003633
-        return os.path.join(path, archive.name.split('/')[-1].split('.tar')[0])
003633
+            for arc in self.archive_types:
003633
+                if arc.check_is_type(self.opts.target):
003633
+                    _arc = arc(self.opts.target, self.tmpdir)
003633
+                    break
003633
+        if not _arc:
003633
+            return
003633
+        self.report_paths.append(_arc)
003633
+        if _arc.is_nested:
003633
+            self.report_paths.extend(_arc.get_nested_archives())
003633
+            # We need to preserve the top level archive until all
003633
+            # nested archives are processed
003633
+            self.report_paths.remove(_arc)
003633
+            self.nested_archive = _arc
003633
+        if self.nested_archive:
003633
+            self.nested_archive.ui_name = self.nested_archive.description
003633
 
003633
     def execute(self):
003633
         """SoSCleaner will begin by inspecting the TARGET option to determine
003633
@@ -283,6 +271,7 @@ third party.
003633
         be unpacked, cleaned, and repacked and the final top-level archive will
003633
         then be repacked as well.
003633
         """
003633
+        self.arc_name = self.opts.target.split('/')[-1].split('.tar')[0]
003633
         if self.from_cmdline:
003633
             self.print_disclaimer()
003633
         self.report_paths = []
003633
@@ -290,23 +279,11 @@ third party.
003633
             self.ui_log.error("Invalid target: no such file or directory %s"
003633
                               % self.opts.target)
003633
             self._exit(1)
003633
-        if os.path.isdir(self.opts.target):
003633
-            self.arc_name = self.opts.target.split('/')[-1]
003633
-            for _file in os.listdir(self.opts.target):
003633
-                if _file == 'sos_logs':
003633
-                    self.report_paths.append(self.opts.target)
003633
-                if (_file.startswith('sosreport') and
003633
-                   (_file.endswith(".tar.gz") or _file.endswith(".tar.xz"))):
003633
-                    self.report_paths.append(os.path.join(self.opts.target,
003633
-                                                          _file))
003633
-            if not self.report_paths:
003633
-                self.ui_log.error("Invalid target: not an sos directory")
003633
-                self._exit(1)
003633
-        else:
003633
-            self.inspect_target_archive()
003633
+
003633
+        self.inspect_target_archive()
003633
 
003633
         if not self.report_paths:
003633
-            self.ui_log.error("No valid sos archives or directories found\n")
003633
+            self.ui_log.error("No valid archives or directories found\n")
003633
             self._exit(1)
003633
 
003633
         # we have at least one valid target to obfuscate
003633
@@ -334,33 +311,7 @@ third party.
003633
 
003633
         final_path = None
003633
         if len(self.completed_reports) > 1:
003633
-            # we have an archive of archives, so repack the obfuscated tarball
003633
-            arc_name = self.arc_name + '-obfuscated'
003633
-            self.setup_archive(name=arc_name)
003633
-            for arc in self.completed_reports:
003633
-                if arc.is_tarfile:
003633
-                    arc_dest = self.obfuscate_string(
003633
-                        arc.final_archive_path.split('/')[-1]
003633
-                    )
003633
-                    self.archive.add_file(arc.final_archive_path,
003633
-                                          dest=arc_dest)
003633
-                    checksum = self.get_new_checksum(arc.final_archive_path)
003633
-                    if checksum is not None:
003633
-                        dname = self.obfuscate_string(
003633
-                            "checksums/%s.%s" % (arc_dest, self.hash_name)
003633
-                        )
003633
-                        self.archive.add_string(checksum, dest=dname)
003633
-                else:
003633
-                    for dirname, dirs, files in os.walk(arc.archive_path):
003633
-                        for filename in files:
003633
-                            if filename.startswith('sosreport'):
003633
-                                continue
003633
-                            fname = os.path.join(dirname, filename)
003633
-                            dnm = self.obfuscate_string(
003633
-                                fname.split(arc.archive_name)[-1].lstrip('/')
003633
-                            )
003633
-                            self.archive.add_file(fname, dest=dnm)
003633
-            arc_path = self.archive.finalize(self.opts.compression_type)
003633
+            arc_path = self.rebuild_nested_archive()
003633
         else:
003633
             arc = self.completed_reports[0]
003633
             arc_path = arc.final_archive_path
003633
@@ -371,8 +322,7 @@ third party.
003633
                 )
003633
                 with open(os.path.join(self.sys_tmp, chksum_name), 'w') as cf:
003633
                     cf.write(checksum)
003633
-
003633
-        self.write_cleaner_log()
003633
+            self.write_cleaner_log()
003633
 
003633
         final_path = self.obfuscate_string(
003633
             os.path.join(self.sys_tmp, arc_path.split('/')[-1])
003633
@@ -393,6 +343,30 @@ third party.
003633
 
003633
         self.cleanup()
003633
 
003633
+    def rebuild_nested_archive(self):
003633
+        """Handles repacking the nested tarball, now containing only obfuscated
003633
+        copies of the reports, log files, manifest, etc...
003633
+        """
003633
+        # we have an archive of archives, so repack the obfuscated tarball
003633
+        arc_name = self.arc_name + '-obfuscated'
003633
+        self.setup_archive(name=arc_name)
003633
+        for archive in self.completed_reports:
003633
+            arc_dest = archive.final_archive_path.split('/')[-1]
003633
+            checksum = self.get_new_checksum(archive.final_archive_path)
003633
+            if checksum is not None:
003633
+                dname = "checksums/%s.%s" % (arc_dest, self.hash_name)
003633
+                self.archive.add_string(checksum, dest=dname)
003633
+        for dirn, dirs, files in os.walk(self.nested_archive.extracted_path):
003633
+            for filename in files:
003633
+                fname = os.path.join(dirn, filename)
003633
+                dname = fname.split(self.nested_archive.extracted_path)[-1]
003633
+                dname = dname.lstrip('/')
003633
+                self.archive.add_file(fname, dest=dname)
003633
+                # remove it now so we don't balloon our fs space needs
003633
+                os.remove(fname)
003633
+        self.write_cleaner_log(archive=True)
003633
+        return self.archive.finalize(self.opts.compression_type)
003633
+
003633
     def compile_mapping_dict(self):
003633
         """Build a dict that contains each parser's map as a key, with the
003633
         contents as that key's value. This will then be written to disk in the
003633
@@ -441,7 +415,7 @@ third party.
003633
                 self.log_error("Could not update mapping config file: %s"
003633
                                % err)
003633
 
003633
-    def write_cleaner_log(self):
003633
+    def write_cleaner_log(self, archive=False):
003633
         """When invoked via the command line, the logging from SoSCleaner will
003633
         not be added to the archive(s) it processes, so we need to write it
003633
         separately to disk
003633
@@ -454,6 +428,10 @@ third party.
003633
             for line in self.sos_log_file.readlines():
003633
                 logfile.write(line)
003633
 
003633
+        if archive:
003633
+            self.obfuscate_file(log_name)
003633
+            self.archive.add_file(log_name, dest="sos_logs/cleaner.log")
003633
+
003633
     def get_new_checksum(self, archive_path):
003633
         """Calculate a new checksum for the obfuscated archive, as the previous
003633
         checksum will no longer be valid
003633
@@ -481,11 +459,11 @@ third party.
003633
         be obfuscated concurrently.
003633
         """
003633
         try:
003633
-            if len(self.report_paths) > 1:
003633
-                msg = ("Found %s total reports to obfuscate, processing up to "
003633
-                       "%s concurrently\n"
003633
-                       % (len(self.report_paths), self.opts.jobs))
003633
-                self.ui_log.info(msg)
003633
+            msg = (
003633
+                "Found %s total reports to obfuscate, processing up to %s "
003633
+                "concurrently\n" % (len(self.report_paths), self.opts.jobs)
003633
+            )
003633
+            self.ui_log.info(msg)
003633
             if self.opts.keep_binary_files:
003633
                 self.ui_log.warning(
003633
                     "WARNING: binary files that potentially contain sensitive "
003633
@@ -494,53 +472,67 @@ third party.
003633
             pool = ThreadPoolExecutor(self.opts.jobs)
003633
             pool.map(self.obfuscate_report, self.report_paths, chunksize=1)
003633
             pool.shutdown(wait=True)
003633
+            # finally, obfuscate the nested archive if one exists
003633
+            if self.nested_archive:
003633
+                self._replace_obfuscated_archives()
003633
+                self.obfuscate_report(self.nested_archive)
003633
         except KeyboardInterrupt:
003633
             self.ui_log.info("Exiting on user cancel")
003633
             os._exit(130)
003633
 
003633
+    def _replace_obfuscated_archives(self):
003633
+        """When we have a nested archive, we need to rebuild the original
003633
+        archive, which entails replacing the existing archives with their
003633
+        obfuscated counterparts
003633
+        """
003633
+        for archive in self.completed_reports:
003633
+            os.remove(archive.archive_path)
003633
+            dest = self.nested_archive.extracted_path
003633
+            tarball = archive.final_archive_path.split('/')[-1]
003633
+            dest_name = os.path.join(dest, tarball)
003633
+            shutil.move(archive.final_archive_path, dest)
003633
+            archive.final_archive_path = dest_name
003633
+
003633
     def preload_all_archives_into_maps(self):
003633
         """Before doing the actual obfuscation, if we have multiple archives
003633
         to obfuscate then we need to preload each of them into the mappings
003633
         to ensure that node1 is obfuscated in node2 as well as node2 being
003633
         obfuscated in node1's archive.
003633
         """
003633
-        self.log_info("Pre-loading multiple archives into obfuscation maps")
003633
+        self.log_info("Pre-loading all archives into obfuscation maps")
003633
         for _arc in self.report_paths:
003633
-            is_dir = os.path.isdir(_arc)
003633
-            if is_dir:
003633
-                _arc_name = _arc
003633
-            else:
003633
-                archive = tarfile.open(_arc)
003633
-                _arc_name = _arc.split('/')[-1].split('.tar')[0]
003633
-            # for each parser, load the map_prep_file into memory, and then
003633
-            # send that for obfuscation. We don't actually obfuscate the file
003633
-            # here, do that in the normal archive loop
003633
             for _parser in self.parsers:
003633
-                if not _parser.prep_map_file:
003633
+                try:
003633
+                    pfile = _arc.prep_files[_parser.name.lower().split()[0]]
003633
+                    if not pfile:
003633
+                        continue
003633
+                except (IndexError, KeyError):
003633
                     continue
003633
-                if isinstance(_parser.prep_map_file, str):
003633
-                    _parser.prep_map_file = [_parser.prep_map_file]
003633
-                for parse_file in _parser.prep_map_file:
003633
-                    _arc_path = os.path.join(_arc_name, parse_file)
003633
+                if isinstance(pfile, str):
003633
+                    pfile = [pfile]
003633
+                for parse_file in pfile:
003633
+                    self.log_debug("Attempting to load %s" % parse_file)
003633
                     try:
003633
-                        if is_dir:
003633
-                            _pfile = open(_arc_path, 'r')
003633
-                            content = _pfile.read()
003633
-                        else:
003633
-                            _pfile = archive.extractfile(_arc_path)
003633
-                            content = _pfile.read().decode('utf-8')
003633
-                        _pfile.close()
003633
+                        content = _arc.get_file_content(parse_file)
003633
+                        if not content:
003633
+                            continue
003633
                         if isinstance(_parser, SoSUsernameParser):
003633
                             _parser.load_usernames_into_map(content)
003633
-                        for line in content.splitlines():
003633
-                            if isinstance(_parser, SoSHostnameParser):
003633
-                                _parser.load_hostname_into_map(line)
003633
-                            self.obfuscate_line(line)
003633
+                        elif isinstance(_parser, SoSHostnameParser):
003633
+                            _parser.load_hostname_into_map(
003633
+                                content.splitlines()[0]
003633
+                            )
003633
+                        else:
003633
+                            for line in content.splitlines():
003633
+                                self.obfuscate_line(line)
003633
                     except Exception as err:
003633
-                        self.log_debug("Could not prep %s: %s"
003633
-                                       % (_arc_path, err))
003633
+                        self.log_info(
003633
+                            "Could not prepare %s from %s (archive: %s): %s"
003633
+                            % (_parser.name, parse_file, _arc.archive_name,
003633
+                               err)
003633
+                        )
003633
 
003633
-    def obfuscate_report(self, report):
003633
+    def obfuscate_report(self, archive):
003633
         """Individually handle each archive or directory we've discovered by
003633
         running through each file therein.
003633
 
003633
@@ -549,17 +541,12 @@ third party.
003633
             :param report str:      Filepath to the directory or archive
003633
         """
003633
         try:
003633
-            if not os.access(report, os.W_OK):
003633
-                msg = "Insufficient permissions on %s" % report
003633
-                self.log_info(msg)
003633
-                self.ui_log.error(msg)
003633
-                return
003633
-
003633
-            archive = SoSObfuscationArchive(report, self.tmpdir)
003633
             arc_md = self.cleaner_md.add_section(archive.archive_name)
003633
             start_time = datetime.now()
003633
             arc_md.add_field('start_time', start_time)
003633
-            archive.extract()
003633
+            # don't double extract nested archives
003633
+            if not archive.is_extracted:
003633
+                archive.extract()
003633
             archive.report_msg("Beginning obfuscation...")
003633
 
003633
             file_list = archive.get_file_list()
003633
@@ -586,27 +573,28 @@ third party.
003633
                               caller=archive.archive_name)
003633
 
003633
             # if the archive was already a tarball, repack it
003633
-            method = archive.get_compression()
003633
-            if method:
003633
-                archive.report_msg("Re-compressing...")
003633
-                try:
003633
-                    archive.rename_top_dir(
003633
-                        self.obfuscate_string(archive.archive_name)
003633
-                    )
003633
-                    archive.compress(method)
003633
-                except Exception as err:
003633
-                    self.log_debug("Archive %s failed to compress: %s"
003633
-                                   % (archive.archive_name, err))
003633
-                    archive.report_msg("Failed to re-compress archive: %s"
003633
-                                       % err)
003633
-                    return
003633
+            if not archive.is_nested:
003633
+                method = archive.get_compression()
003633
+                if method:
003633
+                    archive.report_msg("Re-compressing...")
003633
+                    try:
003633
+                        archive.rename_top_dir(
003633
+                            self.obfuscate_string(archive.archive_name)
003633
+                        )
003633
+                        archive.compress(method)
003633
+                    except Exception as err:
003633
+                        self.log_debug("Archive %s failed to compress: %s"
003633
+                                       % (archive.archive_name, err))
003633
+                        archive.report_msg("Failed to re-compress archive: %s"
003633
+                                           % err)
003633
+                        return
003633
+                self.completed_reports.append(archive)
003633
 
003633
             end_time = datetime.now()
003633
             arc_md.add_field('end_time', end_time)
003633
             arc_md.add_field('run_time', end_time - start_time)
003633
             arc_md.add_field('files_obfuscated', len(archive.file_sub_list))
003633
             arc_md.add_field('total_substitutions', archive.total_sub_count)
003633
-            self.completed_reports.append(archive)
003633
             rmsg = ''
003633
             if archive.removed_file_count:
003633
                 rmsg = " [removed %s unprocessable files]"
003633
@@ -615,7 +603,7 @@ third party.
003633
 
003633
         except Exception as err:
003633
             self.ui_log.info("Exception while processing %s: %s"
003633
-                             % (report, err))
003633
+                             % (archive.archive_name, err))
003633
 
003633
     def obfuscate_file(self, filename, short_name=None, arc_name=None):
003633
         """Obfuscate and individual file, line by line.
003633
@@ -635,6 +623,8 @@ third party.
003633
             # the requested file doesn't exist in the archive
003633
             return
003633
         subs = 0
003633
+        if not short_name:
003633
+            short_name = filename.split('/')[-1]
003633
         if not os.path.islink(filename):
003633
             # don't run the obfuscation on the link, but on the actual file
003633
             # at some other point.
003633
@@ -745,3 +735,5 @@ third party.
003633
         for parser in self.parsers:
003633
             _sec = parse_sec.add_section(parser.name.replace(' ', '_').lower())
003633
             _sec.add_field('entries', len(parser.mapping.dataset.keys()))
003633
+
003633
+# vim: set et ts=4 sw=4 :
003633
diff --git a/sos/cleaner/obfuscation_archive.py b/sos/cleaner/archives/__init__.py
003633
similarity index 81%
003633
rename from sos/cleaner/obfuscation_archive.py
003633
rename to sos/cleaner/archives/__init__.py
003633
index ea0b7012..795c5a78 100644
003633
--- a/sos/cleaner/obfuscation_archive.py
003633
+++ b/sos/cleaner/archives/__init__.py
003633
@@ -40,6 +40,10 @@ class SoSObfuscationArchive():
003633
     file_sub_list = []
003633
     total_sub_count = 0
003633
     removed_file_count = 0
003633
+    type_name = 'undetermined'
003633
+    description = 'undetermined'
003633
+    is_nested = False
003633
+    prep_files = {}
003633
 
003633
     def __init__(self, archive_path, tmpdir):
003633
         self.archive_path = archive_path
003633
@@ -50,7 +54,43 @@ class SoSObfuscationArchive():
003633
         self.soslog = logging.getLogger('sos')
003633
         self.ui_log = logging.getLogger('sos_ui')
003633
         self.skip_list = self._load_skip_list()
003633
-        self.log_info("Loaded %s as an archive" % self.archive_path)
003633
+        self.is_extracted = False
003633
+        self._load_self()
003633
+        self.archive_root = ''
003633
+        self.log_info(
003633
+            "Loaded %s as type %s"
003633
+            % (self.archive_path, self.description)
003633
+        )
003633
+
003633
+    @classmethod
003633
+    def check_is_type(cls, arc_path):
003633
+        """Check if the archive is a well-known type we directly support"""
003633
+        return False
003633
+
003633
+    def _load_self(self):
003633
+        if self.is_tarfile:
003633
+            self.tarobj = tarfile.open(self.archive_path)
003633
+
003633
+    def get_nested_archives(self):
003633
+        """Return a list of ObfuscationArchives that represent additional
003633
+        archives found within the target archive. For example, an archive from
003633
+        `sos collect` will return a list of ``SoSReportArchive`` objects.
003633
+
003633
+        This should be overridden by individual types of ObfuscationArchive's
003633
+        """
003633
+        return []
003633
+
003633
+    def get_archive_root(self):
003633
+        """Set the root path for the archive that should be prepended to any
003633
+        filenames given to methods in this class.
003633
+        """
003633
+        if self.is_tarfile:
003633
+            toplevel = self.tarobj.firstmember
003633
+            if toplevel.isdir():
003633
+                return toplevel.name
003633
+            else:
003633
+                return os.sep
003633
+        return os.path.abspath(self.archive_path)
003633
 
003633
     def report_msg(self, msg):
003633
         """Helper to easily format ui messages on a per-report basis"""
003633
@@ -96,10 +136,42 @@ class SoSObfuscationArchive():
003633
             os.remove(full_fname)
003633
             self.removed_file_count += 1
003633
 
003633
-    def extract(self):
003633
+    def format_file_name(self, fname):
003633
+        """Based on the type of archive we're dealing with, do whatever that
003633
+        archive requires to a provided **relative** filepath to be able to
003633
+        access it within the archive
003633
+        """
003633
+        if not self.is_extracted:
003633
+            if not self.archive_root:
003633
+                self.archive_root = self.get_archive_root()
003633
+            return os.path.join(self.archive_root, fname)
003633
+        else:
003633
+            return os.path.join(self.extracted_path, fname)
003633
+
003633
+    def get_file_content(self, fname):
003633
+        """Return the content from the specified fname. Particularly useful for
003633
+        tarball-type archives so we can retrieve prep file contents prior to
003633
+        extracting the entire archive
003633
+        """
003633
+        if self.is_extracted is False and self.is_tarfile:
003633
+            filename = self.format_file_name(fname)
003633
+            try:
003633
+                return self.tarobj.extractfile(filename).read().decode('utf-8')
003633
+            except KeyError:
003633
+                self.log_debug(
003633
+                    "Unable to retrieve %s: no such file in archive" % fname
003633
+                )
003633
+                return ''
003633
+        else:
003633
+            with open(self.format_file_name(fname), 'r') as to_read:
003633
+                return to_read.read()
003633
+
003633
+    def extract(self, quiet=False):
003633
         if self.is_tarfile:
003633
-            self.report_msg("Extracting...")
003633
+            if not quiet:
003633
+                self.report_msg("Extracting...")
003633
             self.extracted_path = self.extract_self()
003633
+            self.is_extracted = True
003633
         else:
003633
             self.extracted_path = self.archive_path
003633
         # if we're running as non-root (e.g. collector), then we can have a
003633
@@ -317,3 +389,5 @@ class SoSObfuscationArchive():
003633
                 return False
003633
             except UnicodeDecodeError:
003633
                 return True
003633
+
003633
+# vim: set et ts=4 sw=4 :
003633
diff --git a/sos/cleaner/archives/generic.py b/sos/cleaner/archives/generic.py
003633
new file mode 100644
003633
index 00000000..2ce6f09b
003633
--- /dev/null
003633
+++ b/sos/cleaner/archives/generic.py
003633
@@ -0,0 +1,52 @@
003633
+# Copyright 2020 Red Hat, Inc. Jake Hunsaker <jhunsake@redhat.com>
003633
+
003633
+# This file is part of the sos project: https://github.com/sosreport/sos
003633
+#
003633
+# This copyrighted material is made available to anyone wishing to use,
003633
+# modify, copy, or redistribute it subject to the terms and conditions of
003633
+# version 2 of the GNU General Public License.
003633
+#
003633
+# See the LICENSE file in the source distribution for further information.
003633
+
003633
+
003633
+from sos.cleaner.archives import SoSObfuscationArchive
003633
+
003633
+import os
003633
+import tarfile
003633
+
003633
+
003633
+class DataDirArchive(SoSObfuscationArchive):
003633
+    """A plain directory on the filesystem that is not directly associated with
003633
+    any known or supported collection utility
003633
+    """
003633
+
003633
+    type_name = 'data_dir'
003633
+    description = 'unassociated directory'
003633
+
003633
+    @classmethod
003633
+    def check_is_type(cls, arc_path):
003633
+        return os.path.isdir(arc_path)
003633
+
003633
+    def set_archive_root(self):
003633
+        return os.path.abspath(self.archive_path)
003633
+
003633
+
003633
+class TarballArchive(SoSObfuscationArchive):
003633
+    """A generic tar archive that is not associated with any known or supported
003633
+    collection utility
003633
+    """
003633
+
003633
+    type_name = 'tarball'
003633
+    description = 'unassociated tarball'
003633
+
003633
+    @classmethod
003633
+    def check_is_type(cls, arc_path):
003633
+        try:
003633
+            return tarfile.is_tarfile(arc_path)
003633
+        except Exception:
003633
+            return False
003633
+
003633
+    def set_archive_root(self):
003633
+        if self.tarobj.firstmember.isdir():
003633
+            return self.tarobj.firstmember.name
003633
+        return ''
003633
diff --git a/sos/cleaner/archives/sos.py b/sos/cleaner/archives/sos.py
003633
new file mode 100644
003633
index 00000000..4401d710
003633
--- /dev/null
003633
+++ b/sos/cleaner/archives/sos.py
003633
@@ -0,0 +1,106 @@
003633
+# Copyright 2021 Red Hat, Inc. Jake Hunsaker <jhunsake@redhat.com>
003633
+
003633
+# This file is part of the sos project: https://github.com/sosreport/sos
003633
+#
003633
+# This copyrighted material is made available to anyone wishing to use,
003633
+# modify, copy, or redistribute it subject to the terms and conditions of
003633
+# version 2 of the GNU General Public License.
003633
+#
003633
+# See the LICENSE file in the source distribution for further information.
003633
+
003633
+
003633
+from sos.cleaner.archives import SoSObfuscationArchive
003633
+
003633
+import os
003633
+import tarfile
003633
+
003633
+
003633
+class SoSReportArchive(SoSObfuscationArchive):
003633
+    """This is the class representing an sos report, or in other words the
003633
+    type the archive the SoS project natively generates
003633
+    """
003633
+
003633
+    type_name = 'report'
003633
+    description = 'sos report archive'
003633
+    prep_files = {
003633
+        'hostname': 'sos_commands/host/hostname',
003633
+        'ip': 'sos_commands/networking/ip_-o_addr',
003633
+        'mac': 'sos_commands/networking/ip_-d_address',
003633
+        'username': [
003633
+            'sos_commands/login/lastlog_-u_1000-60000',
003633
+            'sos_commands/login/lastlog_-u_60001-65536',
003633
+            'sos_commands/login/lastlog_-u_65537-4294967295',
003633
+            # AD users will be reported here, but favor the lastlog files since
003633
+            # those will include local users who have not logged in
003633
+            'sos_commands/login/last'
003633
+        ]
003633
+    }
003633
+
003633
+    @classmethod
003633
+    def check_is_type(cls, arc_path):
003633
+        try:
003633
+            return tarfile.is_tarfile(arc_path) and 'sosreport-' in arc_path
003633
+        except Exception:
003633
+            return False
003633
+
003633
+
003633
+class SoSReportDirectory(SoSReportArchive):
003633
+    """This is the archive class representing a build directory, or in other
003633
+    words what `sos report --clean` will end up using for in-line obfuscation
003633
+    """
003633
+
003633
+    type_name = 'report_dir'
003633
+    description = 'sos report directory'
003633
+
003633
+    @classmethod
003633
+    def check_is_type(cls, arc_path):
003633
+        if os.path.isdir(arc_path):
003633
+            return 'sos_logs' in os.listdir(arc_path)
003633
+        return False
003633
+
003633
+
003633
+class SoSCollectorArchive(SoSObfuscationArchive):
003633
+    """Archive class representing the tarball created by ``sos collect``. It
003633
+    will not provide prep files on its own, however it will provide a list
003633
+    of SoSReportArchive's which will then be used to prep the parsers
003633
+    """
003633
+
003633
+    type_name = 'collect'
003633
+    description = 'sos collect tarball'
003633
+    is_nested = True
003633
+
003633
+    @classmethod
003633
+    def check_is_type(cls, arc_path):
003633
+        try:
003633
+            return (tarfile.is_tarfile(arc_path) and 'sos-collect' in arc_path)
003633
+        except Exception:
003633
+            return False
003633
+
003633
+    def get_nested_archives(self):
003633
+        self.extract(quiet=True)
003633
+        _path = self.extracted_path
003633
+        archives = []
003633
+        for fname in os.listdir(_path):
003633
+            arc_name = os.path.join(_path, fname)
003633
+            if 'sosreport-' in fname and tarfile.is_tarfile(arc_name):
003633
+                archives.append(SoSReportArchive(arc_name, self.tmpdir))
003633
+        return archives
003633
+
003633
+
003633
+class SoSCollectorDirectory(SoSCollectorArchive):
003633
+    """The archive class representing the temp directory used by ``sos
003633
+    collect`` when ``--clean`` is used during runtime.
003633
+    """
003633
+
003633
+    type_name = 'collect_dir'
003633
+    description = 'sos collect directory'
003633
+
003633
+    @classmethod
003633
+    def check_is_type(cls, arc_path):
003633
+        if os.path.isdir(arc_path):
003633
+            for fname in os.listdir(arc_path):
003633
+                if 'sos-collector-' in fname:
003633
+                    return True
003633
+        return False
003633
+
003633
+# vim: set et ts=4 sw=4 :
003633
diff --git a/sos/cleaner/parsers/__init__.py b/sos/cleaner/parsers/__init__.py
003633
index af6e375e..e62fd938 100644
003633
--- a/sos/cleaner/parsers/__init__.py
003633
+++ b/sos/cleaner/parsers/__init__.py
003633
@@ -37,11 +37,6 @@ class SoSCleanerParser():
003633
     :cvar map_file_key: The key in the ``map_file`` to read when loading
003633
                         previous obfuscation matches
003633
     :vartype map_file_key: ``str``
003633
-
003633
-
003633
-    :cvar prep_map_file: File to read from an archive to pre-seed the map with
003633
-                         matches. E.G. ip_addr for loading IP addresses
003633
-    :vartype prep_map_fie: ``str``
003633
     """
003633
 
003633
     name = 'Undefined Parser'
003633
@@ -49,7 +44,6 @@ class SoSCleanerParser():
003633
     skip_line_patterns = []
003633
     skip_files = []
003633
     map_file_key = 'unset'
003633
-    prep_map_file = []
003633
 
003633
     def __init__(self, config={}):
003633
         if self.map_file_key in config:
003633
diff --git a/sos/cleaner/parsers/hostname_parser.py b/sos/cleaner/parsers/hostname_parser.py
003633
index 71e13d3f..daa76a62 100644
003633
--- a/sos/cleaner/parsers/hostname_parser.py
003633
+++ b/sos/cleaner/parsers/hostname_parser.py
003633
@@ -16,7 +16,6 @@ class SoSHostnameParser(SoSCleanerParser):
003633
 
003633
     name = 'Hostname Parser'
003633
     map_file_key = 'hostname_map'
003633
-    prep_map_file = 'sos_commands/host/hostname'
003633
     regex_patterns = [
003633
         r'(((\b|_)[a-zA-Z0-9-\.]{1,200}\.[a-zA-Z]{1,63}(\b|_)))'
003633
     ]
003633
diff --git a/sos/cleaner/parsers/ip_parser.py b/sos/cleaner/parsers/ip_parser.py
003633
index 525139e8..71d38be8 100644
003633
--- a/sos/cleaner/parsers/ip_parser.py
003633
+++ b/sos/cleaner/parsers/ip_parser.py
003633
@@ -41,7 +41,6 @@ class SoSIPParser(SoSCleanerParser):
003633
     ]
003633
 
003633
     map_file_key = 'ip_map'
003633
-    prep_map_file = 'sos_commands/networking/ip_-o_addr'
003633
 
003633
     def __init__(self, config):
003633
         self.mapping = SoSIPMap()
003633
diff --git a/sos/cleaner/parsers/keyword_parser.py b/sos/cleaner/parsers/keyword_parser.py
003633
index 68de3727..694c6073 100644
003633
--- a/sos/cleaner/parsers/keyword_parser.py
003633
+++ b/sos/cleaner/parsers/keyword_parser.py
003633
@@ -20,7 +20,6 @@ class SoSKeywordParser(SoSCleanerParser):
003633
 
003633
     name = 'Keyword Parser'
003633
     map_file_key = 'keyword_map'
003633
-    prep_map_file = ''
003633
 
003633
     def __init__(self, config, keywords=None, keyword_file=None):
003633
         self.mapping = SoSKeywordMap()
003633
diff --git a/sos/cleaner/parsers/mac_parser.py b/sos/cleaner/parsers/mac_parser.py
003633
index 7ca80b8d..c74288cf 100644
003633
--- a/sos/cleaner/parsers/mac_parser.py
003633
+++ b/sos/cleaner/parsers/mac_parser.py
003633
@@ -30,7 +30,6 @@ class SoSMacParser(SoSCleanerParser):
003633
         '534f:53'
003633
     )
003633
     map_file_key = 'mac_map'
003633
-    prep_map_file = 'sos_commands/networking/ip_-d_address'
003633
 
003633
     def __init__(self, config):
003633
         self.mapping = SoSMacMap()
003633
diff --git a/sos/cleaner/parsers/username_parser.py b/sos/cleaner/parsers/username_parser.py
003633
index b142e371..35377a31 100644
003633
--- a/sos/cleaner/parsers/username_parser.py
003633
+++ b/sos/cleaner/parsers/username_parser.py
003633
@@ -25,14 +25,6 @@ class SoSUsernameParser(SoSCleanerParser):
003633
 
003633
     name = 'Username Parser'
003633
     map_file_key = 'username_map'
003633
-    prep_map_file = [
003633
-        'sos_commands/login/lastlog_-u_1000-60000',
003633
-        'sos_commands/login/lastlog_-u_60001-65536',
003633
-        'sos_commands/login/lastlog_-u_65537-4294967295',
003633
-        # AD users will be reported here, but favor the lastlog files since
003633
-        # those will include local users who have not logged in
003633
-        'sos_commands/login/last'
003633
-    ]
003633
     regex_patterns = []
003633
     skip_list = [
003633
         'core',
003633
diff --git a/tests/cleaner_tests/existing_archive.py b/tests/cleaner_tests/existing_archive.py
003633
index 0eaf6c8d..e13d1cae 100644
003633
--- a/tests/cleaner_tests/existing_archive.py
003633
+++ b/tests/cleaner_tests/existing_archive.py
003633
@@ -28,6 +28,13 @@ class ExistingArchiveCleanTest(StageTwoReportTest):
003633
     def test_obfuscation_log_created(self):
003633
         self.assertFileExists(os.path.join(self.tmpdir, '%s-obfuscation.log' % ARCHIVE))
003633
 
003633
+    def test_archive_type_correct(self):
003633
+        with open(os.path.join(self.tmpdir, '%s-obfuscation.log' % ARCHIVE), 'r') as log:
003633
+            for line in log:
003633
+                if "Loaded %s" % ARCHIVE in line:
003633
+                    assert 'as type sos report archive' in line, "Incorrect archive type detected: %s" % line
003633
+                    break
003633
+
003633
     def test_from_cmdline_logged(self):
003633
         with open(os.path.join(self.tmpdir, '%s-obfuscation.log' % ARCHIVE), 'r') as log:
003633
             for line in log:
003633
diff --git a/tests/cleaner_tests/full_report_run.py b/tests/cleaner_tests/full_report_run.py
003633
index 3b28e7a2..2de54946 100644
003633
--- a/tests/cleaner_tests/full_report_run.py
003633
+++ b/tests/cleaner_tests/full_report_run.py
003633
@@ -35,6 +35,9 @@ class FullCleanTest(StageTwoReportTest):
003633
     def test_tarball_named_obfuscated(self):
003633
         self.assertTrue('obfuscated' in self.archive)
003633
 
003633
+    def test_archive_type_correct(self):
003633
+        self.assertSosLogContains('Loaded .* as type sos report directory')
003633
+
003633
     def test_hostname_not_in_any_file(self):
003633
         host = self.sysinfo['pre']['networking']['hostname']
003633
         # much faster to just use grep here
003633
diff --git a/tests/cleaner_tests/report_with_mask.py b/tests/cleaner_tests/report_with_mask.py
003633
index 4f94ba33..08e873d4 100644
003633
--- a/tests/cleaner_tests/report_with_mask.py
003633
+++ b/tests/cleaner_tests/report_with_mask.py
003633
@@ -31,6 +31,9 @@ class ReportWithMask(StageOneReportTest):
003633
     def test_tarball_named_obfuscated(self):
003633
         self.assertTrue('obfuscated' in self.archive)
003633
 
003633
+    def test_archive_type_correct(self):
003633
+        self.assertSosLogContains('Loaded .* as type sos report directory')
003633
+
003633
     def test_localhost_was_obfuscated(self):
003633
         self.assertFileHasContent('/etc/hostname', 'host0')
003633
 
003633
-- 
003633
2.31.1
003633
003633
From 9b119f860eaec089f7ef884ff39c42589a662994 Mon Sep 17 00:00:00 2001
003633
From: Jake Hunsaker <jhunsake@redhat.com>
003633
Date: Wed, 1 Sep 2021 00:34:04 -0400
003633
Subject: [PATCH] [hostname_map] Add a catch for single-character hostnames
003633
003633
If a log file was truncated at a specific boundary in a string of the
003633
FQDN of the host such that we only get a couple characters before the
003633
rest of the domain, we would previously boldly replace all instances of
003633
that character with the obfuscated short name; not very helpful.
003633
003633
If this happens, don't sanitize the short name and instead
003633
obfuscate the whole FQDN as 'unknown.example.com'.
003633
003633
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
003633
---
003633
 sos/cleaner/mappings/hostname_map.py | 9 ++++++++-
003633
 1 file changed, 8 insertions(+), 1 deletion(-)
003633
003633
diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
003633
index d4b2c88e..e70a5530 100644
003633
--- a/sos/cleaner/mappings/hostname_map.py
003633
+++ b/sos/cleaner/mappings/hostname_map.py
003633
@@ -184,7 +184,14 @@ class SoSHostnameMap(SoSMap):
003633
             hostname = host[0]
003633
             domain = host[1:]
003633
             # obfuscate the short name
003633
-            ob_hostname = self.sanitize_short_name(hostname)
003633
+            if len(hostname) > 2:
003633
+                ob_hostname = self.sanitize_short_name(hostname)
003633
+            else:
003633
+                # by best practice it appears the host part of the fqdn was cut
003633
+                # off due to some form of truncating, as such don't obfuscate
003633
+                # short strings that are likely to throw off obfuscation of
003633
+                # unrelated bits and paths
003633
+                ob_hostname = 'unknown'
003633
             ob_domain = self.sanitize_domain(domain)
003633
             self.dataset[item] = ob_domain
003633
             return '.'.join([ob_hostname, ob_domain])
003633
-- 
003633
2.31.1
003633
003633
From f3f3e763d7c31b7b7cafdf8dd4dab87056fb7696 Mon Sep 17 00:00:00 2001
003633
From: Jake Hunsaker <jhunsake@redhat.com>
003633
Date: Wed, 1 Sep 2021 15:54:55 -0400
003633
Subject: [PATCH] [cleaner] Add support for Insights client archives
003633
003633
Adds a new type of `SoSObfuscationArchive` to add support for
003633
obfuscating archives generated by the Insights project.
003633
003633
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
003633
---
003633
 man/en/sos-clean.1               |  1 +
003633
 sos/cleaner/__init__.py          |  4 ++-
003633
 sos/cleaner/archives/insights.py | 42 ++++++++++++++++++++++++++++++++
003633
 3 files changed, 46 insertions(+), 1 deletion(-)
003633
 create mode 100644 sos/cleaner/archives/insights.py
003633
003633
diff --git a/man/en/sos-clean.1 b/man/en/sos-clean.1
003633
index 54026713..358ec0cb 100644
003633
--- a/man/en/sos-clean.1
003633
+++ b/man/en/sos-clean.1
003633
@@ -105,6 +105,7 @@ The following are accepted values for this option:
003633
     \fBauto\fR          Automatically detect the archive type
003633
     \fBreport\fR        An archive generated by \fBsos report\fR
003633
     \fBcollect\fR       An archive generated by \fBsos collect\fR
003633
+    \fBinsights\fR      An archive generated by the \fBinsights-client\fR package
003633
 
003633
 The following may also be used, however note that these do not attempt to pre-load
003633
 any information from the archives into the parsers. This means that, among other limitations,
003633
diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
003633
index 6d2eb483..3e08aa28 100644
003633
--- a/sos/cleaner/__init__.py
003633
+++ b/sos/cleaner/__init__.py
003633
@@ -29,6 +29,7 @@ from sos.cleaner.archives.sos import (SoSReportArchive, SoSReportDirectory,
003633
                                       SoSCollectorArchive,
003633
                                       SoSCollectorDirectory)
003633
 from sos.cleaner.archives.generic import DataDirArchive, TarballArchive
003633
+from sos.cleaner.archives.insights import InsightsArchive
003633
 from sos.utilities import get_human_readable
003633
 from textwrap import fill
003633
 
003633
@@ -100,6 +101,7 @@ class SoSCleaner(SoSComponent):
003633
             SoSReportArchive,
003633
             SoSCollectorDirectory,
003633
             SoSCollectorArchive,
003633
+            InsightsArchive,
003633
             # make sure these two are always last as they are fallbacks
003633
             DataDirArchive,
003633
             TarballArchive
003633
@@ -194,7 +196,7 @@ third party.
003633
                                help='The directory or archive to obfuscate')
003633
         clean_grp.add_argument('--archive-type', default='auto',
003633
                                choices=['auto', 'report', 'collect',
003633
-                                        'data-dir', 'tarball'],
003633
+                                        'insights', 'data-dir', 'tarball'],
003633
                                help=('Specify what kind of archive the target '
003633
                                      'was generated as'))
003633
         clean_grp.add_argument('--domains', action='extend', default=[],
003633
diff --git a/sos/cleaner/archives/insights.py b/sos/cleaner/archives/insights.py
003633
new file mode 100644
003633
index 00000000..dab48b16
003633
--- /dev/null
003633
+++ b/sos/cleaner/archives/insights.py
003633
@@ -0,0 +1,42 @@
003633
+# Copyright 2021 Red Hat, Inc. Jake Hunsaker <jhunsake@redhat.com>
003633
+
003633
+# This file is part of the sos project: https://github.com/sosreport/sos
003633
+#
003633
+# This copyrighted material is made available to anyone wishing to use,
003633
+# modify, copy, or redistribute it subject to the terms and conditions of
003633
+# version 2 of the GNU General Public License.
003633
+#
003633
+# See the LICENSE file in the source distribution for further information.
003633
+
003633
+
003633
+from sos.cleaner.archives import SoSObfuscationArchive
003633
+
003633
+import tarfile
003633
+
003633
+
003633
+class InsightsArchive(SoSObfuscationArchive):
003633
+    """This class represents archives generated by the insights-client utility
003633
+    for RHEL systems.
003633
+    """
003633
+
003633
+    type_name = 'insights'
003633
+    description = 'insights-client archive'
003633
+
003633
+    prep_files = {
003633
+        'hostname': 'data/insights_commands/hostname_-f',
003633
+        'ip': 'data/insights_commands/ip_addr',
003633
+        'mac': 'data/insights_commands/ip_addr'
003633
+    }
003633
+
003633
+    @classmethod
003633
+    def check_is_type(cls, arc_path):
003633
+        try:
003633
+            return tarfile.is_tarfile(arc_path) and 'insights-' in arc_path
003633
+        except Exception:
003633
+            return False
003633
+
003633
+    def get_archive_root(self):
003633
+        top = self.archive_path.split('/')[-1].split('.tar')[0]
003633
+        if self.tarobj.firstmember.name == '.':
003633
+            top = './' + top
003633
+        return top
003633
-- 
003633
2.31.1
003633
003633
From 9639dc3d240076b55f2a1d04b43ea42bebd09215 Mon Sep 17 00:00:00 2001
003633
From: Jake Hunsaker <jhunsake@redhat.com>
003633
Date: Tue, 16 Nov 2021 17:50:42 -0500
003633
Subject: [PATCH] [clean,hostname_parser] Source /etc/hosts for obfuscation
003633
003633
Up until now, our sourcing of hostnames/domains for obfuscation has been
003633
dependent upon the output of the `hostname` command. However, some
003633
scenarios have come up where sourcing `/etc/hosts` is advantageous for
003633
several reasons:
003633
003633
First, if `hostname` output is unavailable, this provides a fallback
003633
measure.
003633
003633
Second, `/etc/hosts` is a common place to have short names defined which
003633
would otherwise not be detected (or at the very least would result in a
003633
race condition based on where/if the short name was elsewhere able to be
003633
gleaned from an FQDN), thus leaving the potential for unobfuscated data
003633
in an archive.
003633
003633
Due to both the nature of hostname obfuscation and the malleable syntax
003633
of `/etc/hosts`, the parsing of this file needs special handling not
003633
covered by our more generic parsing and obfuscation methods.
003633
003633
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
003633
---
003633
 sos/cleaner/__init__.py                | 11 ++++++++---
003633
 sos/cleaner/archives/sos.py            |  5 ++++-
003633
 sos/cleaner/parsers/hostname_parser.py | 19 +++++++++++++++++++
003633
 3 files changed, 31 insertions(+), 4 deletions(-)
003633
003633
diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
003633
index ed461a8f..3f530d44 100644
003633
--- a/sos/cleaner/__init__.py
003633
+++ b/sos/cleaner/__init__.py
003633
@@ -523,9 +523,14 @@ third party.
003633
                         if isinstance(_parser, SoSUsernameParser):
003633
                             _parser.load_usernames_into_map(content)
003633
                         elif isinstance(_parser, SoSHostnameParser):
003633
-                            _parser.load_hostname_into_map(
003633
-                                content.splitlines()[0]
003633
-                            )
003633
+                            if 'hostname' in parse_file:
003633
+                                _parser.load_hostname_into_map(
003633
+                                    content.splitlines()[0]
003633
+                                )
003633
+                            elif 'etc/hosts' in parse_file:
003633
+                                _parser.load_hostname_from_etc_hosts(
003633
+                                    content
003633
+                                )
003633
                         else:
003633
                             for line in content.splitlines():
003633
                                 self.obfuscate_line(line)
003633
diff --git a/sos/cleaner/archives/sos.py b/sos/cleaner/archives/sos.py
003633
index 4401d710..f8720c88 100644
003633
--- a/sos/cleaner/archives/sos.py
003633
+++ b/sos/cleaner/archives/sos.py
003633
@@ -23,7 +23,10 @@ class SoSReportArchive(SoSObfuscationArchive):
003633
     type_name = 'report'
003633
     description = 'sos report archive'
003633
     prep_files = {
003633
-        'hostname': 'sos_commands/host/hostname',
003633
+        'hostname': [
003633
+            'sos_commands/host/hostname',
003633
+            'etc/hosts'
003633
+        ],
003633
         'ip': 'sos_commands/networking/ip_-o_addr',
003633
         'mac': 'sos_commands/networking/ip_-d_address',
003633
         'username': [
003633
diff --git a/sos/cleaner/parsers/hostname_parser.py b/sos/cleaner/parsers/hostname_parser.py
003633
index daa76a62..0a733bee 100644
003633
--- a/sos/cleaner/parsers/hostname_parser.py
003633
+++ b/sos/cleaner/parsers/hostname_parser.py
003633
@@ -61,6 +61,25 @@ class SoSHostnameParser(SoSCleanerParser):
003633
             self.mapping.add(high_domain)
003633
         self.mapping.add(hostname_string)
003633
 
003633
+    def load_hostname_from_etc_hosts(self, content):
003633
+        """Parse an archive's copy of /etc/hosts, which requires handling that
003633
+        is separate from the output of the `hostname` command. Just like
003633
+        load_hostname_into_map(), this has to be done explicitly and we
003633
+        cannot rely upon the more generic methods to do this reliably.
003633
+        """
003633
+        lines = content.splitlines()
003633
+        for line in lines:
003633
+            if line.startswith('#') or 'localhost' in line:
003633
+                continue
003633
+            hostln = line.split()[1:]
003633
+            for host in hostln:
003633
+                if len(host.split('.')) == 1:
003633
+                    # only generate a mapping for fqdns but still record the
003633
+                    # short name here for later obfuscation with parse_line()
003633
+                    self.short_names.append(host)
003633
+                else:
003633
+                    self.mapping.add(host)
003633
+
003633
     def parse_line(self, line):
003633
         """Override the default parse_line() method to also check for the
003633
         shortname of the host derived from the hostname.
003633
-- 
003633
2.31.1
003633
003633
From c1680226b53452b18f27f2e76c3e0e03e521f935 Mon Sep 17 00:00:00 2001
003633
From: Jake Hunsaker <jhunsake@redhat.com>
003633
Date: Wed, 17 Nov 2021 13:11:33 -0500
003633
Subject: [PATCH] [clean, hostname] Fix unintentionally case sensitive
003633
 shortname handling
003633
003633
It was discovered that our extra handling for shortnames was
003633
unintentionally case sensitive. Fix this to ensure that shortnames are
003633
obfuscated regardless of case in all collected text.
003633
003633
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
003633
---
003633
 sos/cleaner/mappings/hostname_map.py   |  6 +++---
003633
 sos/cleaner/parsers/hostname_parser.py |  8 +++++---
003633
 tests/cleaner_tests/full_report_run.py | 21 ++++++++++++++++++++-
003633
 3 files changed, 28 insertions(+), 7 deletions(-)
003633
003633
diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
003633
index e70a5530..0fe78fb1 100644
003633
--- a/sos/cleaner/mappings/hostname_map.py
003633
+++ b/sos/cleaner/mappings/hostname_map.py
003633
@@ -169,13 +169,13 @@ class SoSHostnameMap(SoSMap):
003633
 
003633
     def sanitize_item(self, item):
003633
         host = item.split('.')
003633
-        if all([h.isupper() for h in host]):
003633
+        if len(host) > 1 and all([h.isupper() for h in host]):
003633
             # by convention we have just a domain
003633
             _host = [h.lower() for h in host]
003633
             return self.sanitize_domain(_host).upper()
003633
         if len(host) == 1:
003633
             # we have a shortname for a host
003633
-            return self.sanitize_short_name(host[0])
003633
+            return self.sanitize_short_name(host[0].lower())
003633
         if len(host) == 2:
003633
             # we have just a domain name, e.g. example.com
003633
             return self.sanitize_domain(host)
003633
@@ -185,7 +185,7 @@ class SoSHostnameMap(SoSMap):
003633
             domain = host[1:]
003633
             # obfuscate the short name
003633
             if len(hostname) > 2:
003633
-                ob_hostname = self.sanitize_short_name(hostname)
003633
+                ob_hostname = self.sanitize_short_name(hostname.lower())
003633
             else:
003633
                 # by best practice it appears the host part of the fqdn was cut
003633
                 # off due to some form of truncating, as such don't obfuscate
003633
diff --git a/sos/cleaner/parsers/hostname_parser.py b/sos/cleaner/parsers/hostname_parser.py
003633
index 0a733bee..7fd0e698 100644
003633
--- a/sos/cleaner/parsers/hostname_parser.py
003633
+++ b/sos/cleaner/parsers/hostname_parser.py
003633
@@ -8,6 +8,8 @@
003633
 #
003633
 # See the LICENSE file in the source distribution for further information.
003633
 
003633
+import re
003633
+
003633
 from sos.cleaner.parsers import SoSCleanerParser
003633
 from sos.cleaner.mappings.hostname_map import SoSHostnameMap
003633
 
003633
@@ -91,9 +93,9 @@ class SoSHostnameParser(SoSCleanerParser):
003633
             """
003633
             if search in self.mapping.skip_keys:
003633
                 return ln, count
003633
-            if search in ln:
003633
-                count += ln.count(search)
003633
-                ln = ln.replace(search, self.mapping.get(repl or search))
003633
+            _reg = re.compile(search, re.I)
003633
+            if _reg.search(ln):
003633
+                return _reg.subn(self.mapping.get(repl or search), ln)
003633
             return ln, count
003633
 
003633
         count = 0
003633
diff --git a/tests/cleaner_tests/full_report_run.py b/tests/cleaner_tests/full_report_run.py
003633
index 2de54946..0b23acaf 100644
003633
--- a/tests/cleaner_tests/full_report_run.py
003633
+++ b/tests/cleaner_tests/full_report_run.py
003633
@@ -26,6 +26,24 @@ class FullCleanTest(StageTwoReportTest):
003633
     # replace with an empty placeholder, make sure that this test case is not
003633
     # influenced by previous clean runs
003633
     files = ['/etc/sos/cleaner/default_mapping']
003633
+    packages = {
003633
+        'rhel': ['python3-systemd'],
003633
+        'ubuntu': ['python3-systemd']
003633
+    }
003633
+
003633
+    def pre_sos_setup(self):
003633
+        # ensure that case-insensitive matching of FQDNs and shortnames work
003633
+        from systemd import journal
003633
+        from socket import gethostname
003633
+        host = gethostname()
003633
+        short = host.split('.')[0]
003633
+        sosfd = journal.stream('sos-testing')
003633
+        sosfd.write(
003633
+            "This is a test line from sos clean testing. The hostname %s "
003633
+            "should not appear, nor should %s in an obfuscated archive. The "
003633
+            "shortnames of %s and %s should also not appear."
003633
+            % (host.lower(), host.upper(), short.lower(), short.upper())
003633
+        )
003633
 
003633
     def test_private_map_was_generated(self):
003633
         self.assertOutputContains('A mapping of obfuscated elements is available at')
003633
@@ -40,8 +58,9 @@ class FullCleanTest(StageTwoReportTest):
003633
 
003633
     def test_hostname_not_in_any_file(self):
003633
         host = self.sysinfo['pre']['networking']['hostname']
003633
+        short = host.split('.')[0]
003633
         # much faster to just use grep here
003633
-        content = self.grep_for_content(host)
003633
+        content = self.grep_for_content(host) + self.grep_for_content(short)
003633
         if not content:
003633
             assert True
003633
         else:
003633
-- 
003633
2.31.1
003633
003633
From aaeb8cb57ed55598ab744b96d4f127aedebcb292 Mon Sep 17 00:00:00 2001
003633
From: Jake Hunsaker <jhunsake@redhat.com>
003633
Date: Tue, 21 Sep 2021 15:23:20 -0400
003633
Subject: [PATCH] [build] Add archives to setup.py packages
003633
003633
Adds the newly abstracted `sos.cleaner.archives` package to `setup.py`
003633
so that manual builds will properly include it.
003633
003633
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
003633
---
003633
 setup.py | 2 +-
003633
 1 file changed, 1 insertion(+), 1 deletion(-)
003633
003633
diff --git a/setup.py b/setup.py
003633
index 1e8d8e2dc5..7653b59de3 100644
003633
--- a/setup.py
003633
+++ b/setup.py
003633
@@ -102,7 +102,7 @@ def copy_file (self, filename, dirname):
003633
         'sos.policies.package_managers', 'sos.policies.init_systems',
003633
         'sos.report', 'sos.report.plugins', 'sos.collector',
003633
         'sos.collector.clusters', 'sos.cleaner', 'sos.cleaner.mappings',
003633
-        'sos.cleaner.parsers'
003633
+        'sos.cleaner.parsers', 'sos.cleaner.archives'
003633
     ],
003633
     cmdclass=cmdclass,
003633
     command_options=command_options,
003633
-- 
003633
2.31.1
003633
003633
From ba3528230256429a4394f155a9ca1fdb91cf3560 Mon Sep 17 00:00:00 2001
003633
From: Jake Hunsaker <jhunsake@redhat.com>
003633
Date: Tue, 30 Nov 2021 12:46:34 -0500
003633
Subject: [PATCH 1/2] [hostname] Simplify case matching for domains
003633
003633
Instead of special handling all uppercase domain conventions, use our
003633
normal flow for obfuscation and just match the casing at the end of the
003633
sanitization routine.
003633
003633
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
003633
---
003633
 sos/cleaner/mappings/hostname_map.py | 14 ++++++++------
003633
 1 file changed, 8 insertions(+), 6 deletions(-)
003633
003633
diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
003633
index 0fe78fb1..5cd8e985 100644
003633
--- a/sos/cleaner/mappings/hostname_map.py
003633
+++ b/sos/cleaner/mappings/hostname_map.py
003633
@@ -169,16 +169,15 @@ class SoSHostnameMap(SoSMap):
003633
 
003633
     def sanitize_item(self, item):
003633
         host = item.split('.')
003633
-        if len(host) > 1 and all([h.isupper() for h in host]):
003633
-            # by convention we have just a domain
003633
-            _host = [h.lower() for h in host]
003633
-            return self.sanitize_domain(_host).upper()
003633
         if len(host) == 1:
003633
             # we have a shortname for a host
003633
             return self.sanitize_short_name(host[0].lower())
003633
         if len(host) == 2:
003633
             # we have just a domain name, e.g. example.com
003633
-            return self.sanitize_domain(host)
003633
+            dname = self.sanitize_domain(host)
003633
+            if all([h.isupper() for h in host]):
003633
+                dname = dname.upper()
003633
+            return dname
003633
         if len(host) > 2:
003633
             # we have an FQDN, e.g. foo.example.com
003633
             hostname = host[0]
003633
@@ -194,7 +193,10 @@ class SoSHostnameMap(SoSMap):
003633
                 ob_hostname = 'unknown'
003633
             ob_domain = self.sanitize_domain(domain)
003633
             self.dataset[item] = ob_domain
003633
-            return '.'.join([ob_hostname, ob_domain])
003633
+            _fqdn = '.'.join([ob_hostname, ob_domain])
003633
+            if all([h.isupper() for h in host]):
003633
+                _fqdn = _fqdn.upper()
003633
+            return _fqdn
003633
 
003633
     def sanitize_short_name(self, hostname):
003633
         """Obfuscate the short name of the host with an incremented counter
003633
-- 
003633
2.31.1
003633
003633
003633
From 189586728de22dd55122c1f7e06b19590f9a788f Mon Sep 17 00:00:00 2001
003633
From: Jake Hunsaker <jhunsake@redhat.com>
003633
Date: Tue, 30 Nov 2021 12:47:58 -0500
003633
Subject: [PATCH 2/2] [username] Improve username sourcing and remove case
003633
 sensitivity
003633
003633
First, don't skip the first line of `last` output, and instead add the
003633
header from lastlog to the skip list. Additionally, add
003633
`/etc/cron.allow` and `/etc/cron.deny` as sources for usernames that
003633
might not appear in other locations in certain environments.
003633
003633
Also, make matching and replacement case insensitive.
003633
003633
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
003633
---
003633
 sos/cleaner/archives/sos.py            |  4 +++-
003633
 sos/cleaner/mappings/username_map.py   |  2 +-
003633
 sos/cleaner/parsers/username_parser.py | 14 +++++++++-----
003633
 3 files changed, 13 insertions(+), 7 deletions(-)
003633
003633
diff --git a/sos/cleaner/archives/sos.py b/sos/cleaner/archives/sos.py
003633
index f8720c88..12766496 100644
003633
--- a/sos/cleaner/archives/sos.py
003633
+++ b/sos/cleaner/archives/sos.py
003633
@@ -35,7 +35,9 @@ class SoSReportArchive(SoSObfuscationArchive):
003633
             'sos_commands/login/lastlog_-u_65537-4294967295',
003633
             # AD users will be reported here, but favor the lastlog files since
003633
             # those will include local users who have not logged in
003633
-            'sos_commands/login/last'
003633
+            'sos_commands/login/last',
003633
+            'etc/cron.allow',
003633
+            'etc/cron.deny'
003633
         ]
003633
     }
003633
 
003633
diff --git a/sos/cleaner/mappings/username_map.py b/sos/cleaner/mappings/username_map.py
003633
index cdbf36fe..7ecccd7b 100644
003633
--- a/sos/cleaner/mappings/username_map.py
003633
+++ b/sos/cleaner/mappings/username_map.py
003633
@@ -33,5 +33,5 @@ class SoSUsernameMap(SoSMap):
003633
         ob_name = "obfuscateduser%s" % self.name_count
003633
         self.name_count += 1
003633
         if ob_name in self.dataset.values():
003633
-            return self.sanitize_item(username)
003633
+            return self.sanitize_item(username.lower())
003633
         return ob_name
003633
diff --git a/sos/cleaner/parsers/username_parser.py b/sos/cleaner/parsers/username_parser.py
003633
index 35377a31..229c7de4 100644
003633
--- a/sos/cleaner/parsers/username_parser.py
003633
+++ b/sos/cleaner/parsers/username_parser.py
003633
@@ -8,6 +8,7 @@
003633
 #
003633
 # See the LICENSE file in the source distribution for further information.
003633
 
003633
+import re
003633
 
003633
 from sos.cleaner.parsers import SoSCleanerParser
003633
 from sos.cleaner.mappings.username_map import SoSUsernameMap
003633
@@ -34,6 +35,7 @@ class SoSUsernameParser(SoSCleanerParser):
003633
         'reboot',
003633
         'root',
003633
         'ubuntu',
003633
+        'username',
003633
         'wtmp'
003633
     ]
003633
 
003633
@@ -47,12 +49,12 @@ class SoSUsernameParser(SoSCleanerParser):
003633
         this parser, we need to override the initial parser prepping here.
003633
         """
003633
         users = set()
003633
-        for line in content.splitlines()[1:]:
003633
+        for line in content.splitlines():
003633
             try:
003633
                 user = line.split()[0]
003633
             except Exception:
003633
                 continue
003633
-            if user in self.skip_list:
003633
+            if user.lower() in self.skip_list:
003633
                 continue
003633
             users.add(user)
003633
         for each in users:
003633
@@ -61,7 +63,9 @@ class SoSUsernameParser(SoSCleanerParser):
003633
     def parse_line(self, line):
003633
         count = 0
003633
         for username in sorted(self.mapping.dataset.keys(), reverse=True):
003633
-            if username in line:
003633
-                count = line.count(username)
003633
-                line = line.replace(username, self.mapping.get(username))
003633
+            _reg = re.compile(username, re.I)
003633
+            if _reg.search(line):
003633
+                line, count = _reg.subn(
003633
+                    self.mapping.get(username.lower()), line
003633
+                )
003633
         return line, count
003633
-- 
003633
2.31.1
003633
003633
From cafd0f3a52436a3966576e7db21e5dd17c06f0cc Mon Sep 17 00:00:00 2001
003633
From: Jake Hunsaker <jhunsake@redhat.com>
003633
Date: Sun, 12 Dec 2021 11:10:46 -0500
003633
Subject: [PATCH] [hostname] Fix edge case for new hosts in a known subdomain
003633
003633
Fixes an edge case that would cause us to at first not recognize that a
003633
given hostname string is a new host in a known subdomain, but then on
003633
the obfuscation attempt properly recognize it as such and result in an
003633
incomplete obfuscation.
003633
003633
This was mostly triggered by specific patterns for build hosts within
003633
`sos_commands/rpm/package-data`. With this refined check, these types of
003633
matches are properly obfuscated.
003633
003633
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
003633
---
003633
 sos/cleaner/mappings/hostname_map.py | 9 +++++----
003633
 1 file changed, 5 insertions(+), 4 deletions(-)
003633
003633
diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
003633
index 5cd8e9857..33b0e6c80 100644
003633
--- a/sos/cleaner/mappings/hostname_map.py
003633
+++ b/sos/cleaner/mappings/hostname_map.py
003633
@@ -129,7 +129,7 @@ def get(self, item):
003633
             item = item[0:-1]
003633
         if not self.domain_name_in_loaded_domains(item.lower()):
003633
             return item
003633
-        if item.endswith(('.yaml', '.yml', '.crt', '.key', '.pem')):
003633
+        if item.endswith(('.yaml', '.yml', '.crt', '.key', '.pem', '.log')):
003633
             ext = '.' + item.split('.')[-1]
003633
             item = item.replace(ext, '')
003633
             suffix += ext
003633
@@ -148,7 +148,8 @@ def get(self, item):
003633
                 if len(_test) == 1 or not _test[0]:
003633
                     # does not match existing obfuscation
003633
                     continue
003633
-                elif _test[0].endswith('.') and not _host_substr:
003633
+                elif not _host_substr and (_test[0].endswith('.') or
003633
+                                           item.endswith(_existing)):
003633
                     # new hostname in known domain
003633
                     final = super(SoSHostnameMap, self).get(item)
003633
                     break
003633
@@ -219,8 +220,8 @@ def sanitize_domain(self, domain):
003633
             # don't obfuscate vendor domains
003633
             if re.match(_skip, '.'.join(domain)):
003633
                 return '.'.join(domain)
003633
-        top_domain = domain[-1]
003633
-        dname = '.'.join(domain[0:-1])
003633
+        top_domain = domain[-1].lower()
003633
+        dname = '.'.join(domain[0:-1]).lower()
003633
         ob_domain = self._new_obfuscated_domain(dname)
003633
         ob_domain = '.'.join([ob_domain, top_domain])
003633
         self.dataset['.'.join(domain)] = ob_domain
003633
From f5e1298162a9393ea2d9f5c4df40dfece50f5f88 Mon Sep 17 00:00:00 2001
003633
From: Jake Hunsaker <jhunsake@redhat.com>
003633
Date: Thu, 6 Jan 2022 13:15:15 -0500
003633
Subject: [PATCH 1/3] [hostname] Fix loading and detection of long base domains
003633
003633
Our domain matching has up to now assumed that users would be providing
003633
'base' domains such as 'example.com' whereby something like
003633
'foo.bar.example.com' is a subdomain (or host) within that base domain.
003633
003633
However, the use case exists to provide 'foo.bar.example.com' as the
003633
base domain, without wanting to obfuscate 'example.com' directly.
003633
003633
This commit fixes our handling of both loading these longer domains and
003633
doing the 'domain is part of a domain we want to obfuscate' check.
003633
003633
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
003633
---
003633
 sos/cleaner/mappings/hostname_map.py | 9 ++++++++-
003633
 1 file changed, 8 insertions(+), 1 deletion(-)
003633
003633
diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
003633
index 33b0e6c8..7a7cf6b8 100644
003633
--- a/sos/cleaner/mappings/hostname_map.py
003633
+++ b/sos/cleaner/mappings/hostname_map.py
003633
@@ -50,10 +50,14 @@ class SoSHostnameMap(SoSMap):
003633
         in this parser, we need to re-inject entries from the map_file into
003633
         these dicts and not just the underlying 'dataset' dict
003633
         """
003633
-        for domain in self.dataset:
003633
+        for domain, ob_pair in self.dataset.items():
003633
             if len(domain.split('.')) == 1:
003633
                 self.hosts[domain.split('.')[0]] = self.dataset[domain]
003633
             else:
003633
+                if ob_pair.startswith('obfuscateddomain'):
003633
+                    # directly exact domain matches
003633
+                    self._domains[domain] = ob_pair.split('.')[0]
003633
+                    continue
003633
                 # strip the host name and trailing top-level domain so that
003633
                 # we in inject the domain properly for later string matching
003633
 
003633
@@ -102,9 +106,12 @@ class SoSHostnameMap(SoSMap):
003633
         and should be obfuscated
003633
         """
003633
         host = domain.split('.')
003633
+        no_tld = '.'.join(domain.split('.')[0:-1])
003633
         if len(host) == 1:
003633
             # don't block on host's shortname
003633
             return host[0] in self.hosts.keys()
003633
+        elif any([no_tld.endswith(_d) for _d in self._domains]):
003633
+            return True
003633
         else:
003633
             domain = host[0:-1]
003633
             for known_domain in self._domains:
003633
-- 
003633
2.31.1
003633
003633
003633
From e241cf33a14ecd4e848a5fd857c5d3d7d07fbd71 Mon Sep 17 00:00:00 2001
003633
From: Jake Hunsaker <jhunsake@redhat.com>
003633
Date: Thu, 6 Jan 2022 13:18:44 -0500
003633
Subject: [PATCH 2/3] [cleaner] Improve parser-specific file skipping
003633
003633
This commit improves our handling of skipping files on a per-parser
003633
basis, by first filtering the list of parsers that `obfuscate_line()`
003633
will iterate over by the parser's `skip_files` class attr, rather than
003633
relying on higher-level checks.
003633
003633
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
003633
---
003633
 sos/cleaner/__init__.py | 17 ++++++++++++++---
003633
 1 file changed, 14 insertions(+), 3 deletions(-)
003633
003633
diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
003633
index 3f530d44..5686e213 100644
003633
--- a/sos/cleaner/__init__.py
003633
+++ b/sos/cleaner/__init__.py
003633
@@ -12,6 +12,7 @@ import hashlib
003633
 import json
003633
 import logging
003633
 import os
003633
+import re
003633
 import shutil
003633
 import tempfile
003633
 
003633
@@ -640,10 +641,16 @@ third party.
003633
             self.log_debug("Obfuscating %s" % short_name or filename,
003633
                            caller=arc_name)
003633
             tfile = tempfile.NamedTemporaryFile(mode='w', dir=self.tmpdir)
003633
+            _parsers = [
003633
+                _p for _p in self.parsers if not
003633
+                any([
003633
+                    re.match(p, short_name) for p in _p.skip_files
003633
+                ])
003633
+            ]
003633
             with open(filename, 'r') as fname:
003633
                 for line in fname:
003633
                     try:
003633
-                        line, count = self.obfuscate_line(line)
003633
+                        line, count = self.obfuscate_line(line, _parsers)
003633
                         subs += count
003633
                         tfile.write(line)
003633
                     except Exception as err:
003633
@@ -713,7 +720,7 @@ third party.
003633
                 pass
003633
         return string_data
003633
 
003633
-    def obfuscate_line(self, line):
003633
+    def obfuscate_line(self, line, parsers=None):
003633
         """Run a line through each of the obfuscation parsers, keeping a
003633
         cumulative total of substitutions done on that particular line.
003633
 
003633
@@ -721,6 +728,8 @@ third party.
003633
 
003633
             :param line str:        The raw line as read from the file being
003633
                                     processed
003633
+            :param parsers:         A list of parser objects to obfuscate
003633
+                                    with. If None, use all.
003633
 
003633
         Returns the fully obfuscated line and the number of substitutions made
003633
         """
003633
@@ -729,7 +738,9 @@ third party.
003633
         count = 0
003633
         if not line.strip():
003633
             return line, count
003633
-        for parser in self.parsers:
003633
+        if parsers is None:
003633
+            parsers = self.parsers
003633
+        for parser in parsers:
003633
             try:
003633
                 line, _count = parser.parse_line(line)
003633
                 count += _count
003633
-- 
003633
2.31.1
003633
003633
003633
From 96c9a833e77639a853b7d3d6f1df68bbbbe5e9cb Mon Sep 17 00:00:00 2001
003633
From: Jake Hunsaker <jhunsake@redhat.com>
003633
Date: Thu, 6 Jan 2022 13:20:32 -0500
003633
Subject: [PATCH 3/3] [cleaner] Add skips for known files and usernames
003633
003633
Adds skips for `/proc/kallsyms` which should never be obfuscated, as
003633
well as any packaging-related log file for the IP parser. Further, do
003633
not obfuscate the `stack` user, as that is a well-known user for many
003633
configurations that, if obfuscated, could result in undesired string
003633
substitutions in normal logging.
003633
003633
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
003633
---
003633
 sos/cleaner/archives/__init__.py       | 2 ++
003633
 sos/cleaner/parsers/ip_parser.py       | 3 ++-
003633
 sos/cleaner/parsers/username_parser.py | 1 +
003633
 3 files changed, 5 insertions(+), 1 deletion(-)
003633
003633
diff --git a/sos/cleaner/archives/__init__.py b/sos/cleaner/archives/__init__.py
003633
index 795c5a78..cbf1f809 100644
003633
--- a/sos/cleaner/archives/__init__.py
003633
+++ b/sos/cleaner/archives/__init__.py
003633
@@ -43,6 +43,7 @@ class SoSObfuscationArchive():
003633
     type_name = 'undetermined'
003633
     description = 'undetermined'
003633
     is_nested = False
003633
+    skip_files = []
003633
     prep_files = {}
003633
 
003633
     def __init__(self, archive_path, tmpdir):
003633
@@ -111,6 +112,7 @@ class SoSObfuscationArchive():
003633
         Returns: list of files and file regexes
003633
         """
003633
         return [
003633
+            'proc/kallsyms',
003633
             'sosreport-',
003633
             'sys/firmware',
003633
             'sys/fs',
003633
diff --git a/sos/cleaner/parsers/ip_parser.py b/sos/cleaner/parsers/ip_parser.py
003633
index 71d38be8..b007368c 100644
003633
--- a/sos/cleaner/parsers/ip_parser.py
003633
+++ b/sos/cleaner/parsers/ip_parser.py
003633
@@ -37,7 +37,8 @@ class SoSIPParser(SoSCleanerParser):
003633
         'sos_commands/snappy/snap_list_--all',
003633
         'sos_commands/snappy/snap_--version',
003633
         'sos_commands/vulkan/vulkaninfo',
003633
-        'var/log/.*dnf.*'
003633
+        'var/log/.*dnf.*',
003633
+        'var/log/.*packag.*'  # get 'packages' and 'packaging' logs
003633
     ]
003633
 
003633
     map_file_key = 'ip_map'
003633
diff --git a/sos/cleaner/parsers/username_parser.py b/sos/cleaner/parsers/username_parser.py
003633
index 229c7de4..3208a655 100644
003633
--- a/sos/cleaner/parsers/username_parser.py
003633
+++ b/sos/cleaner/parsers/username_parser.py
003633
@@ -32,6 +32,7 @@ class SoSUsernameParser(SoSCleanerParser):
003633
         'nobody',
003633
         'nfsnobody',
003633
         'shutdown',
003633
+        'stack',
003633
         'reboot',
003633
         'root',
003633
         'ubuntu',
003633
-- 
003633
2.31.1
003633
003633
From 7ebb2ce0bcd13c1b3aada648aceb20b5aff636d9 Mon Sep 17 00:00:00 2001
003633
From: Jake Hunsaker <jhunsake@redhat.com>
003633
Date: Tue, 15 Feb 2022 14:18:02 -0500
003633
Subject: [PATCH] [host] Skip entire /etc/sos/cleaner directory
003633
003633
While `default_mapping` is typically the only file expected under
003633
`/etc/sos/cleaner/`, it is possible for other mapping files (such as
003633
backups) to appear there.
003633
003633
Make the `add_forbidden_path()` spec here target the entire cleaner
003633
directory to avoid ever capturing these map files.
003633
003633
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
003633
---
003633
 sos/report/plugins/host.py | 2 +-
003633
 1 file changed, 1 insertion(+), 1 deletion(-)
003633
003633
diff --git a/sos/report/plugins/host.py b/sos/report/plugins/host.py
003633
index 5e21da7b8e..95a3b9cd95 100644
003633
--- a/sos/report/plugins/host.py
003633
+++ b/sos/report/plugins/host.py
003633
@@ -20,7 +20,7 @@ class Host(Plugin, IndependentPlugin):
003633
 
003633
     def setup(self):
003633
 
003633
-        self.add_forbidden_path('/etc/sos/cleaner/default_mapping')
003633
+        self.add_forbidden_path('/etc/sos/cleaner')
003633
 
003633
         self.add_cmd_output('hostname', root_symlink='hostname')
003633
         self.add_cmd_output('uptime', root_symlink='uptime')