Blob Blame History Raw
From decd39b7799a0579ea085b0da0728b6eabd49b38 Mon Sep 17 00:00:00 2001
From: Jake Hunsaker <jhunsake@redhat.com>
Date: Wed, 1 Sep 2021 00:28:58 -0400
Subject: [PATCH] [clean] Provide archive abstractions to obfuscate more than
 sos archives

This commit removes the restriction imposed on `sos clean` since its
introduction in sos-4.0 to only work against known sos report archives
or build directories. This is because there has been interest in using
the obfuscation bits of sos in other data-collector projects.

The `SoSObfuscationArchive()` class has been revamped to now be an
abstraction for different types of archives, and the cleaner logic has
been updated to leverage this new abstraction rather than assuming we're
working on an sos archive.

Abstractions are added for our own native use cases - those being `sos
report` and `sos collect` for at-runtime obfuscation, as well as
previously generated standalone archives. Further generic abstractions
are available for plain directories and tarballs; however, these will
not provide the same level of coverage as fully supported archive types,
as is noted in the manpage for sos-clean.

Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
---
 man/en/sos-clean.1                            |  25 ++
 sos/cleaner/__init__.py                       | 308 +++++++++---------
 .../__init__.py}                              |  80 ++++-
 sos/cleaner/archives/generic.py               |  52 +++
 sos/cleaner/archives/sos.py                   | 106 ++++++
 sos/cleaner/parsers/__init__.py               |   6 -
 sos/cleaner/parsers/hostname_parser.py        |   1 -
 sos/cleaner/parsers/ip_parser.py              |   1 -
 sos/cleaner/parsers/keyword_parser.py         |   1 -
 sos/cleaner/parsers/mac_parser.py             |   1 -
 sos/cleaner/parsers/username_parser.py        |   8 -
 tests/cleaner_tests/existing_archive.py       |   7 +
 tests/cleaner_tests/full_report_run.py        |   3 +
 tests/cleaner_tests/report_with_mask.py       |   3 +
 14 files changed, 423 insertions(+), 179 deletions(-)
 rename sos/cleaner/{obfuscation_archive.py => archives/__init__.py} (81%)
 create mode 100644 sos/cleaner/archives/generic.py
 create mode 100644 sos/cleaner/archives/sos.py

diff --git a/man/en/sos-clean.1 b/man/en/sos-clean.1
index b77bc63c..54026713 100644
--- a/man/en/sos-clean.1
+++ b/man/en/sos-clean.1
@@ -10,6 +10,7 @@ sos clean - Obfuscate sensitive data from one or more sosreports
     [\-\-jobs]
     [\-\-no-update]
     [\-\-keep-binary-files]
+    [\-\-archive-type]
 
 .SH DESCRIPTION
 \fBsos clean\fR or \fBsos mask\fR is an sos subcommand used to obfuscate sensitive information from
@@ -88,6 +89,30 @@ Users should review any archive that keeps binary files in place before sending
 a third party.
 
 Default: False (remove encountered binary files)
+.TP
+.B \-\-archive-type TYPE
+Specify the type of archive that TARGET was generated as.
+When sos inspects a TARGET archive, it tries to identify what type of archive it is.
+For example, it may be a report generated by \fBsos report\fR, or a collection of those
+reports generated by \fBsos collect\fR, which require separate approaches.
+
+This option may be useful if a given TARGET archive is known to be of a specific type
+but, due to unknown reasons or malformed/missing information in the archive itself,
+is not properly identified by sos.
+
+The following are accepted values for this option:
+
+    \fBauto\fR          Automatically detect the archive type
+    \fBreport\fR        An archive generated by \fBsos report\fR
+    \fBcollect\fR       An archive generated by \fBsos collect\fR
+
+The following may also be used; however, note that these do not attempt to pre-load
+any information from the archives into the parsers. This means that, among other limitations,
+items like host and domain names may not be obfuscated unless an obfuscation mapping already exists
+on the system from a previous execution.
+
+    \fBdata-dir\fR      A plain directory on the filesystem.
+    \fBtarball\fR       A generic tar archive not associated with any known tool
 
 .SH SEE ALSO
 .BR sos (1)
diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
index 6aadfe79..6d2eb483 100644
--- a/sos/cleaner/__init__.py
+++ b/sos/cleaner/__init__.py
@@ -12,9 +12,7 @@ import hashlib
 import json
 import logging
 import os
-import re
 import shutil
-import tarfile
 import tempfile
 
 from concurrent.futures import ThreadPoolExecutor
@@ -27,7 +25,10 @@ from sos.cleaner.parsers.mac_parser import SoSMacParser
 from sos.cleaner.parsers.hostname_parser import SoSHostnameParser
 from sos.cleaner.parsers.keyword_parser import SoSKeywordParser
 from sos.cleaner.parsers.username_parser import SoSUsernameParser
-from sos.cleaner.obfuscation_archive import SoSObfuscationArchive
+from sos.cleaner.archives.sos import (SoSReportArchive, SoSReportDirectory,
+                                      SoSCollectorArchive,
+                                      SoSCollectorDirectory)
+from sos.cleaner.archives.generic import DataDirArchive, TarballArchive
 from sos.utilities import get_human_readable
 from textwrap import fill
 
@@ -41,6 +42,7 @@ class SoSCleaner(SoSComponent):
     desc = "Obfuscate sensitive networking information in a report"
 
     arg_defaults = {
+        'archive_type': 'auto',
         'domains': [],
         'jobs': 4,
         'keywords': [],
@@ -70,6 +72,7 @@ class SoSCleaner(SoSComponent):
             self.from_cmdline = False
             if not hasattr(self.opts, 'jobs'):
                 self.opts.jobs = 4
+            self.opts.archive_type = 'auto'
             self.soslog = logging.getLogger('sos')
             self.ui_log = logging.getLogger('sos_ui')
             # create the tmp subdir here to avoid a potential race condition
@@ -92,6 +95,17 @@ class SoSCleaner(SoSComponent):
             SoSUsernameParser(self.cleaner_mapping, self.opts.usernames)
         ]
 
+        self.archive_types = [
+            SoSReportDirectory,
+            SoSReportArchive,
+            SoSCollectorDirectory,
+            SoSCollectorArchive,
+            # make sure these two are always last as they are fallbacks
+            DataDirArchive,
+            TarballArchive
+        ]
+        self.nested_archive = None
+
         self.log_info("Cleaner initialized. From cmdline: %s"
                       % self.from_cmdline)
 
@@ -178,6 +192,11 @@ third party.
         )
         clean_grp.add_argument('target', metavar='TARGET',
                                help='The directory or archive to obfuscate')
+        clean_grp.add_argument('--archive-type', default='auto',
+                               choices=['auto', 'report', 'collect',
+                                        'data-dir', 'tarball'],
+                               help=('Specify what kind of archive the target '
+                                     'was generated as'))
         clean_grp.add_argument('--domains', action='extend', default=[],
                                help='List of domain names to obfuscate')
         clean_grp.add_argument('-j', '--jobs', default=4, type=int,
@@ -218,59 +237,28 @@ third party.
 
         In the event the target path is not an archive, abort.
         """
-        if not tarfile.is_tarfile(self.opts.target):
-            self.ui_log.error(
-                "Invalid target: must be directory or tar archive"
-            )
-            self._exit(1)
-
-        archive = tarfile.open(self.opts.target)
-        self.arc_name = self.opts.target.split('/')[-1].split('.')[:-2][0]
-
-        try:
-            archive.getmember(os.path.join(self.arc_name, 'sos_logs'))
-        except Exception:
-            # this is not an sos archive
-            self.ui_log.error("Invalid target: not an sos archive")
-            self._exit(1)
-
-        # see if there are archives within this archive
-        nested_archives = []
-        for _file in archive.getmembers():
-            if (re.match('sosreport-.*.tar', _file.name.split('/')[-1]) and not
-                    (_file.name.endswith(('.md5', '.sha256')))):
-                nested_archives.append(_file.name.split('/')[-1])
-
-        if nested_archives:
-            self.log_info("Found nested archive(s), extracting top level")
-            nested_path = self.extract_archive(archive)
-            for arc_file in os.listdir(nested_path):
-                if re.match('sosreport.*.tar.*', arc_file):
-                    if arc_file.endswith(('.md5', '.sha256')):
-                        continue
-                    self.report_paths.append(os.path.join(nested_path,
-                                                          arc_file))
-            # add the toplevel extracted archive
-            self.report_paths.append(nested_path)
+        _arc = None
+        if self.opts.archive_type != 'auto':
+            check_type = self.opts.archive_type.replace('-', '_')
+            for archive in self.archive_types:
+                if archive.type_name == check_type:
+                    _arc = archive(self.opts.target, self.tmpdir)
         else:
-            self.report_paths.append(self.opts.target)
-
-        archive.close()
-
-    def extract_archive(self, archive):
-        """Extract an archive into our tmpdir so that we may inspect it or
-        iterate through its contents for obfuscation
-
-        Positional arguments:
-
-            :param archive:     An open TarFile object for the archive
-
-        """
-        if not isinstance(archive, tarfile.TarFile):
-            archive = tarfile.open(archive)
-        path = os.path.join(self.tmpdir, 'cleaner')
-        archive.extractall(path)
-        return os.path.join(path, archive.name.split('/')[-1].split('.tar')[0])
+            for arc in self.archive_types:
+                if arc.check_is_type(self.opts.target):
+                    _arc = arc(self.opts.target, self.tmpdir)
+                    break
+        if not _arc:
+            return
+        self.report_paths.append(_arc)
+        if _arc.is_nested:
+            self.report_paths.extend(_arc.get_nested_archives())
+            # We need to preserve the top level archive until all
+            # nested archives are processed
+            self.report_paths.remove(_arc)
+            self.nested_archive = _arc
+        if self.nested_archive:
+            self.nested_archive.ui_name = self.nested_archive.description
 
     def execute(self):
         """SoSCleaner will begin by inspecting the TARGET option to determine
@@ -283,6 +271,7 @@ third party.
         be unpacked, cleaned, and repacked and the final top-level archive will
         then be repacked as well.
         """
+        self.arc_name = self.opts.target.split('/')[-1].split('.tar')[0]
         if self.from_cmdline:
             self.print_disclaimer()
         self.report_paths = []
@@ -290,23 +279,11 @@ third party.
             self.ui_log.error("Invalid target: no such file or directory %s"
                               % self.opts.target)
             self._exit(1)
-        if os.path.isdir(self.opts.target):
-            self.arc_name = self.opts.target.split('/')[-1]
-            for _file in os.listdir(self.opts.target):
-                if _file == 'sos_logs':
-                    self.report_paths.append(self.opts.target)
-                if (_file.startswith('sosreport') and
-                   (_file.endswith(".tar.gz") or _file.endswith(".tar.xz"))):
-                    self.report_paths.append(os.path.join(self.opts.target,
-                                                          _file))
-            if not self.report_paths:
-                self.ui_log.error("Invalid target: not an sos directory")
-                self._exit(1)
-        else:
-            self.inspect_target_archive()
+
+        self.inspect_target_archive()
 
         if not self.report_paths:
-            self.ui_log.error("No valid sos archives or directories found\n")
+            self.ui_log.error("No valid archives or directories found\n")
             self._exit(1)
 
         # we have at least one valid target to obfuscate
@@ -334,33 +311,7 @@ third party.
 
         final_path = None
         if len(self.completed_reports) > 1:
-            # we have an archive of archives, so repack the obfuscated tarball
-            arc_name = self.arc_name + '-obfuscated'
-            self.setup_archive(name=arc_name)
-            for arc in self.completed_reports:
-                if arc.is_tarfile:
-                    arc_dest = self.obfuscate_string(
-                        arc.final_archive_path.split('/')[-1]
-                    )
-                    self.archive.add_file(arc.final_archive_path,
-                                          dest=arc_dest)
-                    checksum = self.get_new_checksum(arc.final_archive_path)
-                    if checksum is not None:
-                        dname = self.obfuscate_string(
-                            "checksums/%s.%s" % (arc_dest, self.hash_name)
-                        )
-                        self.archive.add_string(checksum, dest=dname)
-                else:
-                    for dirname, dirs, files in os.walk(arc.archive_path):
-                        for filename in files:
-                            if filename.startswith('sosreport'):
-                                continue
-                            fname = os.path.join(dirname, filename)
-                            dnm = self.obfuscate_string(
-                                fname.split(arc.archive_name)[-1].lstrip('/')
-                            )
-                            self.archive.add_file(fname, dest=dnm)
-            arc_path = self.archive.finalize(self.opts.compression_type)
+            arc_path = self.rebuild_nested_archive()
         else:
             arc = self.completed_reports[0]
             arc_path = arc.final_archive_path
@@ -371,8 +322,7 @@ third party.
                 )
                 with open(os.path.join(self.sys_tmp, chksum_name), 'w') as cf:
                     cf.write(checksum)
-
-        self.write_cleaner_log()
+            self.write_cleaner_log()
 
         final_path = self.obfuscate_string(
             os.path.join(self.sys_tmp, arc_path.split('/')[-1])
@@ -393,6 +343,30 @@ third party.
 
         self.cleanup()
 
+    def rebuild_nested_archive(self):
+        """Handles repacking the nested tarball, now containing only obfuscated
+        copies of the reports, log files, manifest, etc...
+        """
+        # we have an archive of archives, so repack the obfuscated tarball
+        arc_name = self.arc_name + '-obfuscated'
+        self.setup_archive(name=arc_name)
+        for archive in self.completed_reports:
+            arc_dest = archive.final_archive_path.split('/')[-1]
+            checksum = self.get_new_checksum(archive.final_archive_path)
+            if checksum is not None:
+                dname = "checksums/%s.%s" % (arc_dest, self.hash_name)
+                self.archive.add_string(checksum, dest=dname)
+        for dirn, dirs, files in os.walk(self.nested_archive.extracted_path):
+            for filename in files:
+                fname = os.path.join(dirn, filename)
+                dname = fname.split(self.nested_archive.extracted_path)[-1]
+                dname = dname.lstrip('/')
+                self.archive.add_file(fname, dest=dname)
+                # remove it now so we don't balloon our fs space needs
+                os.remove(fname)
+        self.write_cleaner_log(archive=True)
+        return self.archive.finalize(self.opts.compression_type)
+
     def compile_mapping_dict(self):
         """Build a dict that contains each parser's map as a key, with the
         contents as that key's value. This will then be written to disk in the
@@ -441,7 +415,7 @@ third party.
                 self.log_error("Could not update mapping config file: %s"
                                % err)
 
-    def write_cleaner_log(self):
+    def write_cleaner_log(self, archive=False):
         """When invoked via the command line, the logging from SoSCleaner will
         not be added to the archive(s) it processes, so we need to write it
         separately to disk
@@ -454,6 +428,10 @@ third party.
             for line in self.sos_log_file.readlines():
                 logfile.write(line)
 
+        if archive:
+            self.obfuscate_file(log_name)
+            self.archive.add_file(log_name, dest="sos_logs/cleaner.log")
+
     def get_new_checksum(self, archive_path):
         """Calculate a new checksum for the obfuscated archive, as the previous
         checksum will no longer be valid
@@ -481,11 +459,11 @@ third party.
         be obfuscated concurrently.
         """
         try:
-            if len(self.report_paths) > 1:
-                msg = ("Found %s total reports to obfuscate, processing up to "
-                       "%s concurrently\n"
-                       % (len(self.report_paths), self.opts.jobs))
-                self.ui_log.info(msg)
+            msg = (
+                "Found %s total reports to obfuscate, processing up to %s "
+                "concurrently\n" % (len(self.report_paths), self.opts.jobs)
+            )
+            self.ui_log.info(msg)
             if self.opts.keep_binary_files:
                 self.ui_log.warning(
                     "WARNING: binary files that potentially contain sensitive "
@@ -494,53 +472,67 @@ third party.
             pool = ThreadPoolExecutor(self.opts.jobs)
             pool.map(self.obfuscate_report, self.report_paths, chunksize=1)
             pool.shutdown(wait=True)
+            # finally, obfuscate the nested archive if one exists
+            if self.nested_archive:
+                self._replace_obfuscated_archives()
+                self.obfuscate_report(self.nested_archive)
         except KeyboardInterrupt:
             self.ui_log.info("Exiting on user cancel")
             os._exit(130)
 
+    def _replace_obfuscated_archives(self):
+        """When we have a nested archive, we need to rebuild the original
+        archive, which entails replacing the existing archives with their
+        obfuscated counterparts
+        """
+        for archive in self.completed_reports:
+            os.remove(archive.archive_path)
+            dest = self.nested_archive.extracted_path
+            tarball = archive.final_archive_path.split('/')[-1]
+            dest_name = os.path.join(dest, tarball)
+            shutil.move(archive.final_archive_path, dest)
+            archive.final_archive_path = dest_name
+
     def preload_all_archives_into_maps(self):
         """Before doing the actual obfuscation, if we have multiple archives
         to obfuscate then we need to preload each of them into the mappings
         to ensure that node1 is obfuscated in node2 as well as node2 being
         obfuscated in node1's archive.
         """
-        self.log_info("Pre-loading multiple archives into obfuscation maps")
+        self.log_info("Pre-loading all archives into obfuscation maps")
         for _arc in self.report_paths:
-            is_dir = os.path.isdir(_arc)
-            if is_dir:
-                _arc_name = _arc
-            else:
-                archive = tarfile.open(_arc)
-                _arc_name = _arc.split('/')[-1].split('.tar')[0]
-            # for each parser, load the map_prep_file into memory, and then
-            # send that for obfuscation. We don't actually obfuscate the file
-            # here, do that in the normal archive loop
             for _parser in self.parsers:
-                if not _parser.prep_map_file:
+                try:
+                    pfile = _arc.prep_files[_parser.name.lower().split()[0]]
+                    if not pfile:
+                        continue
+                except (IndexError, KeyError):
                     continue
-                if isinstance(_parser.prep_map_file, str):
-                    _parser.prep_map_file = [_parser.prep_map_file]
-                for parse_file in _parser.prep_map_file:
-                    _arc_path = os.path.join(_arc_name, parse_file)
+                if isinstance(pfile, str):
+                    pfile = [pfile]
+                for parse_file in pfile:
+                    self.log_debug("Attempting to load %s" % parse_file)
                     try:
-                        if is_dir:
-                            _pfile = open(_arc_path, 'r')
-                            content = _pfile.read()
-                        else:
-                            _pfile = archive.extractfile(_arc_path)
-                            content = _pfile.read().decode('utf-8')
-                        _pfile.close()
+                        content = _arc.get_file_content(parse_file)
+                        if not content:
+                            continue
                         if isinstance(_parser, SoSUsernameParser):
                             _parser.load_usernames_into_map(content)
-                        for line in content.splitlines():
-                            if isinstance(_parser, SoSHostnameParser):
-                                _parser.load_hostname_into_map(line)
-                            self.obfuscate_line(line)
+                        elif isinstance(_parser, SoSHostnameParser):
+                            _parser.load_hostname_into_map(
+                                content.splitlines()[0]
+                            )
+                        else:
+                            for line in content.splitlines():
+                                self.obfuscate_line(line)
                     except Exception as err:
-                        self.log_debug("Could not prep %s: %s"
-                                       % (_arc_path, err))
+                        self.log_info(
+                            "Could not prepare %s from %s (archive: %s): %s"
+                            % (_parser.name, parse_file, _arc.archive_name,
+                               err)
+                        )
 
-    def obfuscate_report(self, report):
+    def obfuscate_report(self, archive):
         """Individually handle each archive or directory we've discovered by
         running through each file therein.
 
@@ -549,17 +541,12 @@ third party.
             :param report str:      Filepath to the directory or archive
         """
         try:
-            if not os.access(report, os.W_OK):
-                msg = "Insufficient permissions on %s" % report
-                self.log_info(msg)
-                self.ui_log.error(msg)
-                return
-
-            archive = SoSObfuscationArchive(report, self.tmpdir)
             arc_md = self.cleaner_md.add_section(archive.archive_name)
             start_time = datetime.now()
             arc_md.add_field('start_time', start_time)
-            archive.extract()
+            # don't double extract nested archives
+            if not archive.is_extracted:
+                archive.extract()
             archive.report_msg("Beginning obfuscation...")
 
             file_list = archive.get_file_list()
@@ -586,27 +573,28 @@ third party.
                               caller=archive.archive_name)
 
             # if the archive was already a tarball, repack it
-            method = archive.get_compression()
-            if method:
-                archive.report_msg("Re-compressing...")
-                try:
-                    archive.rename_top_dir(
-                        self.obfuscate_string(archive.archive_name)
-                    )
-                    archive.compress(method)
-                except Exception as err:
-                    self.log_debug("Archive %s failed to compress: %s"
-                                   % (archive.archive_name, err))
-                    archive.report_msg("Failed to re-compress archive: %s"
-                                       % err)
-                    return
+            if not archive.is_nested:
+                method = archive.get_compression()
+                if method:
+                    archive.report_msg("Re-compressing...")
+                    try:
+                        archive.rename_top_dir(
+                            self.obfuscate_string(archive.archive_name)
+                        )
+                        archive.compress(method)
+                    except Exception as err:
+                        self.log_debug("Archive %s failed to compress: %s"
+                                       % (archive.archive_name, err))
+                        archive.report_msg("Failed to re-compress archive: %s"
+                                           % err)
+                        return
+                self.completed_reports.append(archive)
 
             end_time = datetime.now()
             arc_md.add_field('end_time', end_time)
             arc_md.add_field('run_time', end_time - start_time)
             arc_md.add_field('files_obfuscated', len(archive.file_sub_list))
             arc_md.add_field('total_substitutions', archive.total_sub_count)
-            self.completed_reports.append(archive)
             rmsg = ''
             if archive.removed_file_count:
                 rmsg = " [removed %s unprocessable files]"
@@ -615,7 +603,7 @@ third party.
 
         except Exception as err:
             self.ui_log.info("Exception while processing %s: %s"
-                             % (report, err))
+                             % (archive.archive_name, err))
 
     def obfuscate_file(self, filename, short_name=None, arc_name=None):
         """Obfuscate and individual file, line by line.
@@ -635,6 +623,8 @@ third party.
             # the requested file doesn't exist in the archive
             return
         subs = 0
+        if not short_name:
+            short_name = filename.split('/')[-1]
         if not os.path.islink(filename):
             # don't run the obfuscation on the link, but on the actual file
             # at some other point.
@@ -745,3 +735,5 @@ third party.
         for parser in self.parsers:
             _sec = parse_sec.add_section(parser.name.replace(' ', '_').lower())
             _sec.add_field('entries', len(parser.mapping.dataset.keys()))
+
+# vim: set et ts=4 sw=4 :
diff --git a/sos/cleaner/obfuscation_archive.py b/sos/cleaner/archives/__init__.py
similarity index 81%
rename from sos/cleaner/obfuscation_archive.py
rename to sos/cleaner/archives/__init__.py
index ea0b7012..795c5a78 100644
--- a/sos/cleaner/obfuscation_archive.py
+++ b/sos/cleaner/archives/__init__.py
@@ -40,6 +40,10 @@ class SoSObfuscationArchive():
     file_sub_list = []
     total_sub_count = 0
     removed_file_count = 0
+    type_name = 'undetermined'
+    description = 'undetermined'
+    is_nested = False
+    prep_files = {}
 
     def __init__(self, archive_path, tmpdir):
         self.archive_path = archive_path
@@ -50,7 +54,43 @@ class SoSObfuscationArchive():
         self.soslog = logging.getLogger('sos')
         self.ui_log = logging.getLogger('sos_ui')
         self.skip_list = self._load_skip_list()
-        self.log_info("Loaded %s as an archive" % self.archive_path)
+        self.is_extracted = False
+        self._load_self()
+        self.archive_root = ''
+        self.log_info(
+            "Loaded %s as type %s"
+            % (self.archive_path, self.description)
+        )
+
+    @classmethod
+    def check_is_type(cls, arc_path):
+        """Check if the archive is a well-known type we directly support"""
+        return False
+
+    def _load_self(self):
+        if self.is_tarfile:
+            self.tarobj = tarfile.open(self.archive_path)
+
+    def get_nested_archives(self):
+        """Return a list of ObfuscationArchives that represent additional
+        archives found within the target archive. For example, an archive from
+        `sos collect` will return a list of ``SoSReportArchive`` objects.
+
+        This should be overridden by individual types of ObfuscationArchive's
+        """
+        return []
+
+    def get_archive_root(self):
+        """Set the root path for the archive that should be prepended to any
+        filenames given to methods in this class.
+        """
+        if self.is_tarfile:
+            toplevel = self.tarobj.firstmember
+            if toplevel.isdir():
+                return toplevel.name
+            else:
+                return os.sep
+        return os.path.abspath(self.archive_path)
 
     def report_msg(self, msg):
         """Helper to easily format ui messages on a per-report basis"""
@@ -96,10 +136,42 @@ class SoSObfuscationArchive():
             os.remove(full_fname)
             self.removed_file_count += 1
 
-    def extract(self):
+    def format_file_name(self, fname):
+        """Based on the type of archive we're dealing with, do whatever that
+        archive requires to a provided **relative** filepath to be able to
+        access it within the archive
+        """
+        if not self.is_extracted:
+            if not self.archive_root:
+                self.archive_root = self.get_archive_root()
+            return os.path.join(self.archive_root, fname)
+        else:
+            return os.path.join(self.extracted_path, fname)
+
+    def get_file_content(self, fname):
+        """Return the content from the specified fname. Particularly useful for
+        tarball-type archives so we can retrieve prep file contents prior to
+        extracting the entire archive
+        """
+        if self.is_extracted is False and self.is_tarfile:
+            filename = self.format_file_name(fname)
+            try:
+                return self.tarobj.extractfile(filename).read().decode('utf-8')
+            except KeyError:
+                self.log_debug(
+                    "Unable to retrieve %s: no such file in archive" % fname
+                )
+                return ''
+        else:
+            with open(self.format_file_name(fname), 'r') as to_read:
+                return to_read.read()
+
+    def extract(self, quiet=False):
         if self.is_tarfile:
-            self.report_msg("Extracting...")
+            if not quiet:
+                self.report_msg("Extracting...")
             self.extracted_path = self.extract_self()
+            self.is_extracted = True
         else:
             self.extracted_path = self.archive_path
         # if we're running as non-root (e.g. collector), then we can have a
@@ -317,3 +389,5 @@ class SoSObfuscationArchive():
                 return False
             except UnicodeDecodeError:
                 return True
+
+# vim: set et ts=4 sw=4 :
diff --git a/sos/cleaner/archives/generic.py b/sos/cleaner/archives/generic.py
new file mode 100644
index 00000000..2ce6f09b
--- /dev/null
+++ b/sos/cleaner/archives/generic.py
@@ -0,0 +1,52 @@
+# Copyright 2020 Red Hat, Inc. Jake Hunsaker <jhunsake@redhat.com>
+
+# This file is part of the sos project: https://github.com/sosreport/sos
+#
+# This copyrighted material is made available to anyone wishing to use,
+# modify, copy, or redistribute it subject to the terms and conditions of
+# version 2 of the GNU General Public License.
+#
+# See the LICENSE file in the source distribution for further information.
+
+
+from sos.cleaner.archives import SoSObfuscationArchive
+
+import os
+import tarfile
+
+
+class DataDirArchive(SoSObfuscationArchive):
+    """A plain directory on the filesystem that is not directly associated with
+    any known or supported collection utility
+    """
+
+    type_name = 'data_dir'
+    description = 'unassociated directory'
+
+    @classmethod
+    def check_is_type(cls, arc_path):
+        return os.path.isdir(arc_path)
+
+    def set_archive_root(self):
+        return os.path.abspath(self.archive_path)
+
+
+class TarballArchive(SoSObfuscationArchive):
+    """A generic tar archive that is not associated with any known or supported
+    collection utility
+    """
+
+    type_name = 'tarball'
+    description = 'unassociated tarball'
+
+    @classmethod
+    def check_is_type(cls, arc_path):
+        try:
+            return tarfile.is_tarfile(arc_path)
+        except Exception:
+            return False
+
+    def set_archive_root(self):
+        if self.tarobj.firstmember.isdir():
+            return self.tarobj.firstmember.name
+        return ''
diff --git a/sos/cleaner/archives/sos.py b/sos/cleaner/archives/sos.py
new file mode 100644
index 00000000..4401d710
--- /dev/null
+++ b/sos/cleaner/archives/sos.py
@@ -0,0 +1,106 @@
+# Copyright 2021 Red Hat, Inc. Jake Hunsaker <jhunsake@redhat.com>
+
+# This file is part of the sos project: https://github.com/sosreport/sos
+#
+# This copyrighted material is made available to anyone wishing to use,
+# modify, copy, or redistribute it subject to the terms and conditions of
+# version 2 of the GNU General Public License.
+#
+# See the LICENSE file in the source distribution for further information.
+
+
+from sos.cleaner.archives import SoSObfuscationArchive
+
+import os
+import tarfile
+
+
+class SoSReportArchive(SoSObfuscationArchive):
+    """This is the class representing an sos report, or in other words the
+    type the archive the SoS project natively generates
+    """
+
+    type_name = 'report'
+    description = 'sos report archive'
+    prep_files = {
+        'hostname': 'sos_commands/host/hostname',
+        'ip': 'sos_commands/networking/ip_-o_addr',
+        'mac': 'sos_commands/networking/ip_-d_address',
+        'username': [
+            'sos_commands/login/lastlog_-u_1000-60000',
+            'sos_commands/login/lastlog_-u_60001-65536',
+            'sos_commands/login/lastlog_-u_65537-4294967295',
+            # AD users will be reported here, but favor the lastlog files since
+            # those will include local users who have not logged in
+            'sos_commands/login/last'
+        ]
+    }
+
+    @classmethod
+    def check_is_type(cls, arc_path):
+        try:
+            return tarfile.is_tarfile(arc_path) and 'sosreport-' in arc_path
+        except Exception:
+            return False
+
+
+class SoSReportDirectory(SoSReportArchive):
+    """This is the archive class representing a build directory, or in other
+    words what `sos report --clean` will end up using for in-line obfuscation
+    """
+
+    type_name = 'report_dir'
+    description = 'sos report directory'
+
+    @classmethod
+    def check_is_type(cls, arc_path):
+        if os.path.isdir(arc_path):
+            return 'sos_logs' in os.listdir(arc_path)
+        return False
+
+
+class SoSCollectorArchive(SoSObfuscationArchive):
+    """Archive class representing the tarball created by ``sos collect``. It
+    will not provide prep files on its own, however it will provide a list
+    of SoSReportArchive's which will then be used to prep the parsers
+    """
+
+    type_name = 'collect'
+    description = 'sos collect tarball'
+    is_nested = True
+
+    @classmethod
+    def check_is_type(cls, arc_path):
+        try:
+            return (tarfile.is_tarfile(arc_path) and 'sos-collect' in arc_path)
+        except Exception:
+            return False
+
+    def get_nested_archives(self):
+        self.extract(quiet=True)
+        _path = self.extracted_path
+        archives = []
+        for fname in os.listdir(_path):
+            arc_name = os.path.join(_path, fname)
+            if 'sosreport-' in fname and tarfile.is_tarfile(arc_name):
+                archives.append(SoSReportArchive(arc_name, self.tmpdir))
+        return archives
+
+
+class SoSCollectorDirectory(SoSCollectorArchive):
+    """The archive class representing the temp directory used by ``sos
+    collect`` when ``--clean`` is used during runtime.
+    """
+
+    type_name = 'collect_dir'
+    description = 'sos collect directory'
+
+    @classmethod
+    def check_is_type(cls, arc_path):
+        if os.path.isdir(arc_path):
+            for fname in os.listdir(arc_path):
+                if 'sos-collector-' in fname:
+                    return True
+        return False
+
+# vim: set et ts=4 sw=4 :
diff --git a/sos/cleaner/parsers/__init__.py b/sos/cleaner/parsers/__init__.py
index af6e375e..e62fd938 100644
--- a/sos/cleaner/parsers/__init__.py
+++ b/sos/cleaner/parsers/__init__.py
@@ -37,11 +37,6 @@ class SoSCleanerParser():
     :cvar map_file_key: The key in the ``map_file`` to read when loading
                         previous obfuscation matches
     :vartype map_file_key: ``str``
-
-
-    :cvar prep_map_file: File to read from an archive to pre-seed the map with
-                         matches. E.G. ip_addr for loading IP addresses
-    :vartype prep_map_fie: ``str``
     """
 
     name = 'Undefined Parser'
@@ -49,7 +44,6 @@ class SoSCleanerParser():
     skip_line_patterns = []
     skip_files = []
     map_file_key = 'unset'
-    prep_map_file = []
 
     def __init__(self, config={}):
         if self.map_file_key in config:
diff --git a/sos/cleaner/parsers/hostname_parser.py b/sos/cleaner/parsers/hostname_parser.py
index 71e13d3f..daa76a62 100644
--- a/sos/cleaner/parsers/hostname_parser.py
+++ b/sos/cleaner/parsers/hostname_parser.py
@@ -16,7 +16,6 @@ class SoSHostnameParser(SoSCleanerParser):
 
     name = 'Hostname Parser'
     map_file_key = 'hostname_map'
-    prep_map_file = 'sos_commands/host/hostname'
     regex_patterns = [
         r'(((\b|_)[a-zA-Z0-9-\.]{1,200}\.[a-zA-Z]{1,63}(\b|_)))'
     ]
diff --git a/sos/cleaner/parsers/ip_parser.py b/sos/cleaner/parsers/ip_parser.py
index 525139e8..71d38be8 100644
--- a/sos/cleaner/parsers/ip_parser.py
+++ b/sos/cleaner/parsers/ip_parser.py
@@ -41,7 +41,6 @@ class SoSIPParser(SoSCleanerParser):
     ]
 
     map_file_key = 'ip_map'
-    prep_map_file = 'sos_commands/networking/ip_-o_addr'
 
     def __init__(self, config):
         self.mapping = SoSIPMap()
diff --git a/sos/cleaner/parsers/keyword_parser.py b/sos/cleaner/parsers/keyword_parser.py
index 68de3727..694c6073 100644
--- a/sos/cleaner/parsers/keyword_parser.py
+++ b/sos/cleaner/parsers/keyword_parser.py
@@ -20,7 +20,6 @@ class SoSKeywordParser(SoSCleanerParser):
 
     name = 'Keyword Parser'
     map_file_key = 'keyword_map'
-    prep_map_file = ''
 
     def __init__(self, config, keywords=None, keyword_file=None):
         self.mapping = SoSKeywordMap()
diff --git a/sos/cleaner/parsers/mac_parser.py b/sos/cleaner/parsers/mac_parser.py
index 7ca80b8d..c74288cf 100644
--- a/sos/cleaner/parsers/mac_parser.py
+++ b/sos/cleaner/parsers/mac_parser.py
@@ -30,7 +30,6 @@ class SoSMacParser(SoSCleanerParser):
         '534f:53'
     )
     map_file_key = 'mac_map'
-    prep_map_file = 'sos_commands/networking/ip_-d_address'
 
     def __init__(self, config):
         self.mapping = SoSMacMap()
diff --git a/sos/cleaner/parsers/username_parser.py b/sos/cleaner/parsers/username_parser.py
index b142e371..35377a31 100644
--- a/sos/cleaner/parsers/username_parser.py
+++ b/sos/cleaner/parsers/username_parser.py
@@ -25,14 +25,6 @@ class SoSUsernameParser(SoSCleanerParser):
 
     name = 'Username Parser'
     map_file_key = 'username_map'
-    prep_map_file = [
-        'sos_commands/login/lastlog_-u_1000-60000',
-        'sos_commands/login/lastlog_-u_60001-65536',
-        'sos_commands/login/lastlog_-u_65537-4294967295',
-        # AD users will be reported here, but favor the lastlog files since
-        # those will include local users who have not logged in
-        'sos_commands/login/last'
-    ]
     regex_patterns = []
     skip_list = [
         'core',
diff --git a/tests/cleaner_tests/existing_archive.py b/tests/cleaner_tests/existing_archive.py
index 0eaf6c8d..e13d1cae 100644
--- a/tests/cleaner_tests/existing_archive.py
+++ b/tests/cleaner_tests/existing_archive.py
@@ -28,6 +28,13 @@ class ExistingArchiveCleanTest(StageTwoReportTest):
     def test_obfuscation_log_created(self):
         self.assertFileExists(os.path.join(self.tmpdir, '%s-obfuscation.log' % ARCHIVE))
 
+    def test_archive_type_correct(self):
+        with open(os.path.join(self.tmpdir, '%s-obfuscation.log' % ARCHIVE), 'r') as log:
+            for line in log:
+                if "Loaded %s" % ARCHIVE in line:
+                    assert 'as type sos report archive' in line, "Incorrect archive type detected: %s" % line
+                    break
+
     def test_from_cmdline_logged(self):
         with open(os.path.join(self.tmpdir, '%s-obfuscation.log' % ARCHIVE), 'r') as log:
             for line in log:
diff --git a/tests/cleaner_tests/full_report_run.py b/tests/cleaner_tests/full_report_run.py
index 3b28e7a2..2de54946 100644
--- a/tests/cleaner_tests/full_report_run.py
+++ b/tests/cleaner_tests/full_report_run.py
@@ -35,6 +35,9 @@ class FullCleanTest(StageTwoReportTest):
     def test_tarball_named_obfuscated(self):
         self.assertTrue('obfuscated' in self.archive)
 
+    def test_archive_type_correct(self):
+        self.assertSosLogContains('Loaded .* as type sos report directory')
+
     def test_hostname_not_in_any_file(self):
         host = self.sysinfo['pre']['networking']['hostname']
         # much faster to just use grep here
diff --git a/tests/cleaner_tests/report_with_mask.py b/tests/cleaner_tests/report_with_mask.py
index 4f94ba33..08e873d4 100644
--- a/tests/cleaner_tests/report_with_mask.py
+++ b/tests/cleaner_tests/report_with_mask.py
@@ -31,6 +31,9 @@ class ReportWithMask(StageOneReportTest):
     def test_tarball_named_obfuscated(self):
         self.assertTrue('obfuscated' in self.archive)
 
+    def test_archive_type_correct(self):
+        self.assertSosLogContains('Loaded .* as type sos report directory')
+
     def test_localhost_was_obfuscated(self):
         self.assertFileHasContent('/etc/hostname', 'host0')
 
-- 
2.31.1

From 9b119f860eaec089f7ef884ff39c42589a662994 Mon Sep 17 00:00:00 2001
From: Jake Hunsaker <jhunsake@redhat.com>
Date: Wed, 1 Sep 2021 00:34:04 -0400
Subject: [PATCH] [hostname_map] Add a catch for single-character hostnames

If a log file was truncated at a specific boundary in a string of the
FQDN of the host such that we only get a couple characters before the
rest of the domain, we would previously boldly replace all instances of
that character with the obfuscated short name; not very helpful.

To avoid this, don't sanitize the short name when this happens; instead,
obfuscate the whole FQDN as 'unknown.example.com'.

Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
---
 sos/cleaner/mappings/hostname_map.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
index d4b2c88e..e70a5530 100644
--- a/sos/cleaner/mappings/hostname_map.py
+++ b/sos/cleaner/mappings/hostname_map.py
@@ -184,7 +184,14 @@ class SoSHostnameMap(SoSMap):
             hostname = host[0]
             domain = host[1:]
             # obfuscate the short name
-            ob_hostname = self.sanitize_short_name(hostname)
+            if len(hostname) > 2:
+                ob_hostname = self.sanitize_short_name(hostname)
+            else:
+                # by best practice it appears the host part of the fqdn was cut
+                # off due to some form of truncating, as such don't obfuscate
+                # short strings that are likely to throw off obfuscation of
+                # unrelated bits and paths
+                ob_hostname = 'unknown'
             ob_domain = self.sanitize_domain(domain)
             self.dataset[item] = ob_domain
             return '.'.join([ob_hostname, ob_domain])
-- 
2.31.1

From f3f3e763d7c31b7b7cafdf8dd4dab87056fb7696 Mon Sep 17 00:00:00 2001
From: Jake Hunsaker <jhunsake@redhat.com>
Date: Wed, 1 Sep 2021 15:54:55 -0400
Subject: [PATCH] [cleaner] Add support for Insights client archives

Adds a new type of `SoSObfuscationArchive` to add support for
obfuscating archives generated by the Insights project.

Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
---
 man/en/sos-clean.1               |  1 +
 sos/cleaner/__init__.py          |  4 ++-
 sos/cleaner/archives/insights.py | 42 ++++++++++++++++++++++++++++++++
 3 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 sos/cleaner/archives/insights.py

diff --git a/man/en/sos-clean.1 b/man/en/sos-clean.1
index 54026713..358ec0cb 100644
--- a/man/en/sos-clean.1
+++ b/man/en/sos-clean.1
@@ -105,6 +105,7 @@ The following are accepted values for this option:
     \fBauto\fR          Automatically detect the archive type
     \fBreport\fR        An archive generated by \fBsos report\fR
     \fBcollect\fR       An archive generated by \fBsos collect\fR
+    \fBinsights\fR      An archive generated by the \fBinsights-client\fR package
 
 The following may also be used, however note that these do not attempt to pre-load
 any information from the archives into the parsers. This means that, among other limitations,
diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
index 6d2eb483..3e08aa28 100644
--- a/sos/cleaner/__init__.py
+++ b/sos/cleaner/__init__.py
@@ -29,6 +29,7 @@ from sos.cleaner.archives.sos import (SoSReportArchive, SoSReportDirectory,
                                       SoSCollectorArchive,
                                       SoSCollectorDirectory)
 from sos.cleaner.archives.generic import DataDirArchive, TarballArchive
+from sos.cleaner.archives.insights import InsightsArchive
 from sos.utilities import get_human_readable
 from textwrap import fill
 
@@ -100,6 +101,7 @@ class SoSCleaner(SoSComponent):
             SoSReportArchive,
             SoSCollectorDirectory,
             SoSCollectorArchive,
+            InsightsArchive,
             # make sure these two are always last as they are fallbacks
             DataDirArchive,
             TarballArchive
@@ -194,7 +196,7 @@ third party.
                                help='The directory or archive to obfuscate')
         clean_grp.add_argument('--archive-type', default='auto',
                                choices=['auto', 'report', 'collect',
-                                        'data-dir', 'tarball'],
+                                        'insights', 'data-dir', 'tarball'],
                                help=('Specify what kind of archive the target '
                                      'was generated as'))
         clean_grp.add_argument('--domains', action='extend', default=[],
diff --git a/sos/cleaner/archives/insights.py b/sos/cleaner/archives/insights.py
new file mode 100644
index 00000000..dab48b16
--- /dev/null
+++ b/sos/cleaner/archives/insights.py
@@ -0,0 +1,42 @@
+# Copyright 2021 Red Hat, Inc. Jake Hunsaker <jhunsake@redhat.com>
+
+# This file is part of the sos project: https://github.com/sosreport/sos
+#
+# This copyrighted material is made available to anyone wishing to use,
+# modify, copy, or redistribute it subject to the terms and conditions of
+# version 2 of the GNU General Public License.
+#
+# See the LICENSE file in the source distribution for further information.
+
+
+from sos.cleaner.archives import SoSObfuscationArchive
+
+import tarfile
+
+
+class InsightsArchive(SoSObfuscationArchive):
+    """This class represents archives generated by the insights-client utility
+    for RHEL systems.
+    """
+
+    type_name = 'insights'
+    description = 'insights-client archive'
+
+    prep_files = {
+        'hostname': 'data/insights_commands/hostname_-f',
+        'ip': 'data/insights_commands/ip_addr',
+        'mac': 'data/insights_commands/ip_addr'
+    }
+
+    @classmethod
+    def check_is_type(cls, arc_path):
+        try:
+            return tarfile.is_tarfile(arc_path) and 'insights-' in arc_path
+        except Exception:
+            return False
+
+    def get_archive_root(self):
+        top = self.archive_path.split('/')[-1].split('.tar')[0]
+        if self.tarobj.firstmember.name == '.':
+            top = './' + top
+        return top
-- 
2.31.1

From 9639dc3d240076b55f2a1d04b43ea42bebd09215 Mon Sep 17 00:00:00 2001
From: Jake Hunsaker <jhunsake@redhat.com>
Date: Tue, 16 Nov 2021 17:50:42 -0500
Subject: [PATCH] [clean,hostname_parser] Source /etc/hosts for obfuscation

Up until now, our sourcing of hostnames/domains for obfuscation has been
dependent upon the output of the `hostname` command. However, some
scenarios have come up where sourcing `/etc/hosts` is advantageous for
several reasons:

First, if `hostname` output is unavailable, this provides a fallback
measure.

Second, `/etc/hosts` is a common place to have short names defined which
would otherwise not be detected (or at the very least would result in a
race condition based on where/if the short name was elsewhere able to be
gleaned from an FQDN), thus leaving the potential for unobfuscated data
in an archive.

Due to both the nature of hostname obfuscation and the malleable syntax
of `/etc/hosts`, the parsing of this file needs special handling not
covered by our more generic parsing and obfuscation methods.

Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
---
 sos/cleaner/__init__.py                | 11 ++++++++---
 sos/cleaner/archives/sos.py            |  5 ++++-
 sos/cleaner/parsers/hostname_parser.py | 19 +++++++++++++++++++
 3 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
index ed461a8f..3f530d44 100644
--- a/sos/cleaner/__init__.py
+++ b/sos/cleaner/__init__.py
@@ -523,9 +523,14 @@ third party.
                         if isinstance(_parser, SoSUsernameParser):
                             _parser.load_usernames_into_map(content)
                         elif isinstance(_parser, SoSHostnameParser):
-                            _parser.load_hostname_into_map(
-                                content.splitlines()[0]
-                            )
+                            if 'hostname' in parse_file:
+                                _parser.load_hostname_into_map(
+                                    content.splitlines()[0]
+                                )
+                            elif 'etc/hosts' in parse_file:
+                                _parser.load_hostname_from_etc_hosts(
+                                    content
+                                )
                         else:
                             for line in content.splitlines():
                                 self.obfuscate_line(line)
diff --git a/sos/cleaner/archives/sos.py b/sos/cleaner/archives/sos.py
index 4401d710..f8720c88 100644
--- a/sos/cleaner/archives/sos.py
+++ b/sos/cleaner/archives/sos.py
@@ -23,7 +23,10 @@ class SoSReportArchive(SoSObfuscationArchive):
     type_name = 'report'
     description = 'sos report archive'
     prep_files = {
-        'hostname': 'sos_commands/host/hostname',
+        'hostname': [
+            'sos_commands/host/hostname',
+            'etc/hosts'
+        ],
         'ip': 'sos_commands/networking/ip_-o_addr',
         'mac': 'sos_commands/networking/ip_-d_address',
         'username': [
diff --git a/sos/cleaner/parsers/hostname_parser.py b/sos/cleaner/parsers/hostname_parser.py
index daa76a62..0a733bee 100644
--- a/sos/cleaner/parsers/hostname_parser.py
+++ b/sos/cleaner/parsers/hostname_parser.py
@@ -61,6 +61,25 @@ class SoSHostnameParser(SoSCleanerParser):
             self.mapping.add(high_domain)
         self.mapping.add(hostname_string)
 
+    def load_hostname_from_etc_hosts(self, content):
+        """Parse an archive's copy of /etc/hosts, which requires handling that
+        is separate from the output of the `hostname` command. Just like
+        load_hostname_into_map(), this has to be done explicitly and we
+        cannot rely upon the more generic methods to do this reliably.
+        """
+        lines = content.splitlines()
+        for line in lines:
+            if line.startswith('#') or 'localhost' in line:
+                continue
+            hostln = line.split()[1:]
+            for host in hostln:
+                if len(host.split('.')) == 1:
+                    # only generate a mapping for fqdns but still record the
+                    # short name here for later obfuscation with parse_line()
+                    self.short_names.append(host)
+                else:
+                    self.mapping.add(host)
+
     def parse_line(self, line):
         """Override the default parse_line() method to also check for the
         shortname of the host derived from the hostname.
-- 
2.31.1

From c1680226b53452b18f27f2e76c3e0e03e521f935 Mon Sep 17 00:00:00 2001
From: Jake Hunsaker <jhunsake@redhat.com>
Date: Wed, 17 Nov 2021 13:11:33 -0500
Subject: [PATCH] [clean, hostname] Fix unintentionally case sensitive
 shortname handling

It was discovered that our extra handling for shortnames was
unintentionally case sensitive. Fix this to ensure that shortnames are
obfuscated regardless of case in all collected text.

Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
---
 sos/cleaner/mappings/hostname_map.py   |  6 +++---
 sos/cleaner/parsers/hostname_parser.py |  8 +++++---
 tests/cleaner_tests/full_report_run.py | 21 ++++++++++++++++++++-
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
index e70a5530..0fe78fb1 100644
--- a/sos/cleaner/mappings/hostname_map.py
+++ b/sos/cleaner/mappings/hostname_map.py
@@ -169,13 +169,13 @@ class SoSHostnameMap(SoSMap):
 
     def sanitize_item(self, item):
         host = item.split('.')
-        if all([h.isupper() for h in host]):
+        if len(host) > 1 and all([h.isupper() for h in host]):
             # by convention we have just a domain
             _host = [h.lower() for h in host]
             return self.sanitize_domain(_host).upper()
         if len(host) == 1:
             # we have a shortname for a host
-            return self.sanitize_short_name(host[0])
+            return self.sanitize_short_name(host[0].lower())
         if len(host) == 2:
             # we have just a domain name, e.g. example.com
             return self.sanitize_domain(host)
@@ -185,7 +185,7 @@ class SoSHostnameMap(SoSMap):
             domain = host[1:]
             # obfuscate the short name
             if len(hostname) > 2:
-                ob_hostname = self.sanitize_short_name(hostname)
+                ob_hostname = self.sanitize_short_name(hostname.lower())
             else:
                 # by best practice it appears the host part of the fqdn was cut
                 # off due to some form of truncating, as such don't obfuscate
diff --git a/sos/cleaner/parsers/hostname_parser.py b/sos/cleaner/parsers/hostname_parser.py
index 0a733bee..7fd0e698 100644
--- a/sos/cleaner/parsers/hostname_parser.py
+++ b/sos/cleaner/parsers/hostname_parser.py
@@ -8,6 +8,8 @@
 #
 # See the LICENSE file in the source distribution for further information.
 
+import re
+
 from sos.cleaner.parsers import SoSCleanerParser
 from sos.cleaner.mappings.hostname_map import SoSHostnameMap
 
@@ -91,9 +93,9 @@ class SoSHostnameParser(SoSCleanerParser):
             """
             if search in self.mapping.skip_keys:
                 return ln, count
-            if search in ln:
-                count += ln.count(search)
-                ln = ln.replace(search, self.mapping.get(repl or search))
+            _reg = re.compile(search, re.I)
+            if _reg.search(ln):
+                return _reg.subn(self.mapping.get(repl or search), ln)
             return ln, count
 
         count = 0
diff --git a/tests/cleaner_tests/full_report_run.py b/tests/cleaner_tests/full_report_run.py
index 2de54946..0b23acaf 100644
--- a/tests/cleaner_tests/full_report_run.py
+++ b/tests/cleaner_tests/full_report_run.py
@@ -26,6 +26,24 @@ class FullCleanTest(StageTwoReportTest):
     # replace with an empty placeholder, make sure that this test case is not
     # influenced by previous clean runs
     files = ['/etc/sos/cleaner/default_mapping']
+    packages = {
+        'rhel': ['python3-systemd'],
+        'ubuntu': ['python3-systemd']
+    }
+
+    def pre_sos_setup(self):
+        # ensure that case-insensitive matching of FQDNs and shortnames work
+        from systemd import journal
+        from socket import gethostname
+        host = gethostname()
+        short = host.split('.')[0]
+        sosfd = journal.stream('sos-testing')
+        sosfd.write(
+            "This is a test line from sos clean testing. The hostname %s "
+            "should not appear, nor should %s in an obfuscated archive. The "
+            "shortnames of %s and %s should also not appear."
+            % (host.lower(), host.upper(), short.lower(), short.upper())
+        )
 
     def test_private_map_was_generated(self):
         self.assertOutputContains('A mapping of obfuscated elements is available at')
@@ -40,8 +58,9 @@ class FullCleanTest(StageTwoReportTest):
 
     def test_hostname_not_in_any_file(self):
         host = self.sysinfo['pre']['networking']['hostname']
+        short = host.split('.')[0]
         # much faster to just use grep here
-        content = self.grep_for_content(host)
+        content = self.grep_for_content(host) + self.grep_for_content(short)
         if not content:
             assert True
         else:
-- 
2.31.1

From aaeb8cb57ed55598ab744b96d4f127aedebcb292 Mon Sep 17 00:00:00 2001
From: Jake Hunsaker <jhunsake@redhat.com>
Date: Tue, 21 Sep 2021 15:23:20 -0400
Subject: [PATCH] [build] Add archives to setup.py packages

Adds the newly abstracted `sos.cleaner.archives` package to `setup.py`
so that manual builds will properly include it.

Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 1e8d8e2dc5..7653b59de3 100644
--- a/setup.py
+++ b/setup.py
@@ -102,7 +102,7 @@ def copy_file (self, filename, dirname):
         'sos.policies.package_managers', 'sos.policies.init_systems',
         'sos.report', 'sos.report.plugins', 'sos.collector',
         'sos.collector.clusters', 'sos.cleaner', 'sos.cleaner.mappings',
-        'sos.cleaner.parsers'
+        'sos.cleaner.parsers', 'sos.cleaner.archives'
     ],
     cmdclass=cmdclass,
     command_options=command_options,
-- 
2.31.1

From ba3528230256429a4394f155a9ca1fdb91cf3560 Mon Sep 17 00:00:00 2001
From: Jake Hunsaker <jhunsake@redhat.com>
Date: Tue, 30 Nov 2021 12:46:34 -0500
Subject: [PATCH 1/2] [hostname] Simplify case matching for domains

Instead of special handling all uppercase domain conventions, use our
normal flow for obfuscation and just match the casing at the end of the
sanitization routine.

Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
---
 sos/cleaner/mappings/hostname_map.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
index 0fe78fb1..5cd8e985 100644
--- a/sos/cleaner/mappings/hostname_map.py
+++ b/sos/cleaner/mappings/hostname_map.py
@@ -169,16 +169,15 @@ class SoSHostnameMap(SoSMap):
 
     def sanitize_item(self, item):
         host = item.split('.')
-        if len(host) > 1 and all([h.isupper() for h in host]):
-            # by convention we have just a domain
-            _host = [h.lower() for h in host]
-            return self.sanitize_domain(_host).upper()
         if len(host) == 1:
             # we have a shortname for a host
             return self.sanitize_short_name(host[0].lower())
         if len(host) == 2:
             # we have just a domain name, e.g. example.com
-            return self.sanitize_domain(host)
+            dname = self.sanitize_domain(host)
+            if all([h.isupper() for h in host]):
+                dname = dname.upper()
+            return dname
         if len(host) > 2:
             # we have an FQDN, e.g. foo.example.com
             hostname = host[0]
@@ -194,7 +193,10 @@ class SoSHostnameMap(SoSMap):
                 ob_hostname = 'unknown'
             ob_domain = self.sanitize_domain(domain)
             self.dataset[item] = ob_domain
-            return '.'.join([ob_hostname, ob_domain])
+            _fqdn = '.'.join([ob_hostname, ob_domain])
+            if all([h.isupper() for h in host]):
+                _fqdn = _fqdn.upper()
+            return _fqdn
 
     def sanitize_short_name(self, hostname):
         """Obfuscate the short name of the host with an incremented counter
-- 
2.31.1


From 189586728de22dd55122c1f7e06b19590f9a788f Mon Sep 17 00:00:00 2001
From: Jake Hunsaker <jhunsake@redhat.com>
Date: Tue, 30 Nov 2021 12:47:58 -0500
Subject: [PATCH 2/2] [username] Improve username sourcing and remove case
 sensitivity

First, don't skip the first line of `last` output, and instead add the
header from lastlog to the skip list. Additionally, add
`/etc/cron.allow` and `/etc/cron.deny` as sources for usernames that
might not appear in other locations in certain environments.

Also, make matching and replacement case insensitive.

Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
---
 sos/cleaner/archives/sos.py            |  4 +++-
 sos/cleaner/mappings/username_map.py   |  2 +-
 sos/cleaner/parsers/username_parser.py | 14 +++++++++-----
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/sos/cleaner/archives/sos.py b/sos/cleaner/archives/sos.py
index f8720c88..12766496 100644
--- a/sos/cleaner/archives/sos.py
+++ b/sos/cleaner/archives/sos.py
@@ -35,7 +35,9 @@ class SoSReportArchive(SoSObfuscationArchive):
             'sos_commands/login/lastlog_-u_65537-4294967295',
             # AD users will be reported here, but favor the lastlog files since
             # those will include local users who have not logged in
-            'sos_commands/login/last'
+            'sos_commands/login/last',
+            'etc/cron.allow',
+            'etc/cron.deny'
         ]
     }
 
diff --git a/sos/cleaner/mappings/username_map.py b/sos/cleaner/mappings/username_map.py
index cdbf36fe..7ecccd7b 100644
--- a/sos/cleaner/mappings/username_map.py
+++ b/sos/cleaner/mappings/username_map.py
@@ -33,5 +33,5 @@ class SoSUsernameMap(SoSMap):
         ob_name = "obfuscateduser%s" % self.name_count
         self.name_count += 1
         if ob_name in self.dataset.values():
-            return self.sanitize_item(username)
+            return self.sanitize_item(username.lower())
         return ob_name
diff --git a/sos/cleaner/parsers/username_parser.py b/sos/cleaner/parsers/username_parser.py
index 35377a31..229c7de4 100644
--- a/sos/cleaner/parsers/username_parser.py
+++ b/sos/cleaner/parsers/username_parser.py
@@ -8,6 +8,7 @@
 #
 # See the LICENSE file in the source distribution for further information.
 
+import re
 
 from sos.cleaner.parsers import SoSCleanerParser
 from sos.cleaner.mappings.username_map import SoSUsernameMap
@@ -34,6 +35,7 @@ class SoSUsernameParser(SoSCleanerParser):
         'reboot',
         'root',
         'ubuntu',
+        'username',
         'wtmp'
     ]
 
@@ -47,12 +49,12 @@ class SoSUsernameParser(SoSCleanerParser):
         this parser, we need to override the initial parser prepping here.
         """
         users = set()
-        for line in content.splitlines()[1:]:
+        for line in content.splitlines():
             try:
                 user = line.split()[0]
             except Exception:
                 continue
-            if user in self.skip_list:
+            if user.lower() in self.skip_list:
                 continue
             users.add(user)
         for each in users:
@@ -61,7 +63,9 @@ class SoSUsernameParser(SoSCleanerParser):
     def parse_line(self, line):
         count = 0
         for username in sorted(self.mapping.dataset.keys(), reverse=True):
-            if username in line:
-                count = line.count(username)
-                line = line.replace(username, self.mapping.get(username))
+            _reg = re.compile(username, re.I)
+            if _reg.search(line):
+                line, count = _reg.subn(
+                    self.mapping.get(username.lower()), line
+                )
         return line, count
-- 
2.31.1

From cafd0f3a52436a3966576e7db21e5dd17c06f0cc Mon Sep 17 00:00:00 2001
From: Jake Hunsaker <jhunsake@redhat.com>
Date: Sun, 12 Dec 2021 11:10:46 -0500
Subject: [PATCH] [hostname] Fix edge case for new hosts in a known subdomain

Fixes an edge case that would cause us to at first not recognize that a
given hostname string is a new host in a known subdomain, but then on
the obfuscation attempt properly recognize it as such and result in an
incomplete obfuscation.

This was mostly triggered by specific patterns for build hosts within
`sos_commands/rpm/package-data`. With this refined check, these types of
matches are properly obfuscated.

Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
---
 sos/cleaner/mappings/hostname_map.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
index 5cd8e9857..33b0e6c80 100644
--- a/sos/cleaner/mappings/hostname_map.py
+++ b/sos/cleaner/mappings/hostname_map.py
@@ -129,7 +129,7 @@ def get(self, item):
             item = item[0:-1]
         if not self.domain_name_in_loaded_domains(item.lower()):
             return item
-        if item.endswith(('.yaml', '.yml', '.crt', '.key', '.pem')):
+        if item.endswith(('.yaml', '.yml', '.crt', '.key', '.pem', '.log')):
             ext = '.' + item.split('.')[-1]
             item = item.replace(ext, '')
             suffix += ext
@@ -148,7 +148,8 @@ def get(self, item):
                 if len(_test) == 1 or not _test[0]:
                     # does not match existing obfuscation
                     continue
-                elif _test[0].endswith('.') and not _host_substr:
+                elif not _host_substr and (_test[0].endswith('.') or
+                                           item.endswith(_existing)):
                     # new hostname in known domain
                     final = super(SoSHostnameMap, self).get(item)
                     break
@@ -219,8 +220,8 @@ def sanitize_domain(self, domain):
             # don't obfuscate vendor domains
             if re.match(_skip, '.'.join(domain)):
                 return '.'.join(domain)
-        top_domain = domain[-1]
-        dname = '.'.join(domain[0:-1])
+        top_domain = domain[-1].lower()
+        dname = '.'.join(domain[0:-1]).lower()
         ob_domain = self._new_obfuscated_domain(dname)
         ob_domain = '.'.join([ob_domain, top_domain])
         self.dataset['.'.join(domain)] = ob_domain
From f5e1298162a9393ea2d9f5c4df40dfece50f5f88 Mon Sep 17 00:00:00 2001
From: Jake Hunsaker <jhunsake@redhat.com>
Date: Thu, 6 Jan 2022 13:15:15 -0500
Subject: [PATCH 1/3] [hostname] Fix loading and detection of long base domains

Our domain matching has up to now assumed that users would be providing
'base' domains such as 'example.com' whereby something like
'foo.bar.example.com' is a subdomain (or host) within that base domain.

However, the use case exists to provide 'foo.bar.example.com' as the
base domain, without wanting to obfuscate 'example.com' directly.

This commit fixes our handling of both loading these longer domains and
doing the 'domain is part of a domain we want to obfuscate' check.

Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
---
 sos/cleaner/mappings/hostname_map.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
index 33b0e6c8..7a7cf6b8 100644
--- a/sos/cleaner/mappings/hostname_map.py
+++ b/sos/cleaner/mappings/hostname_map.py
@@ -50,10 +50,14 @@ class SoSHostnameMap(SoSMap):
         in this parser, we need to re-inject entries from the map_file into
         these dicts and not just the underlying 'dataset' dict
         """
-        for domain in self.dataset:
+        for domain, ob_pair in self.dataset.items():
             if len(domain.split('.')) == 1:
                 self.hosts[domain.split('.')[0]] = self.dataset[domain]
             else:
+                if ob_pair.startswith('obfuscateddomain'):
+                    # directly exact domain matches
+                    self._domains[domain] = ob_pair.split('.')[0]
+                    continue
                 # strip the host name and trailing top-level domain so that
                 # we in inject the domain properly for later string matching
 
@@ -102,9 +106,12 @@ class SoSHostnameMap(SoSMap):
         and should be obfuscated
         """
         host = domain.split('.')
+        no_tld = '.'.join(domain.split('.')[0:-1])
         if len(host) == 1:
             # don't block on host's shortname
             return host[0] in self.hosts.keys()
+        elif any([no_tld.endswith(_d) for _d in self._domains]):
+            return True
         else:
             domain = host[0:-1]
             for known_domain in self._domains:
-- 
2.31.1


From e241cf33a14ecd4e848a5fd857c5d3d7d07fbd71 Mon Sep 17 00:00:00 2001
From: Jake Hunsaker <jhunsake@redhat.com>
Date: Thu, 6 Jan 2022 13:18:44 -0500
Subject: [PATCH 2/3] [cleaner] Improve parser-specific file skipping

This commit improves our handling of skipping files on a per-parser
basis, by first filtering the list of parsers that `obfuscate_line()`
will iterate over by the parser's `skip_file` class attr, rather than
relying on higher-level checks.

Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
---
 sos/cleaner/__init__.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
index 3f530d44..5686e213 100644
--- a/sos/cleaner/__init__.py
+++ b/sos/cleaner/__init__.py
@@ -12,6 +12,7 @@ import hashlib
 import json
 import logging
 import os
+import re
 import shutil
 import tempfile
 
@@ -640,10 +641,16 @@ third party.
             self.log_debug("Obfuscating %s" % short_name or filename,
                            caller=arc_name)
             tfile = tempfile.NamedTemporaryFile(mode='w', dir=self.tmpdir)
+            _parsers = [
+                _p for _p in self.parsers if not
+                any([
+                    re.match(p, short_name) for p in _p.skip_files
+                ])
+            ]
             with open(filename, 'r') as fname:
                 for line in fname:
                     try:
-                        line, count = self.obfuscate_line(line)
+                        line, count = self.obfuscate_line(line, _parsers)
                         subs += count
                         tfile.write(line)
                     except Exception as err:
@@ -713,7 +720,7 @@ third party.
                 pass
         return string_data
 
-    def obfuscate_line(self, line):
+    def obfuscate_line(self, line, parsers=None):
         """Run a line through each of the obfuscation parsers, keeping a
         cumulative total of substitutions done on that particular line.
 
@@ -721,6 +728,8 @@ third party.
 
             :param line str:        The raw line as read from the file being
                                     processed
+            :param parsers:         A list of parser objects to obfuscate
+                                    with. If None, use all.
 
         Returns the fully obfuscated line and the number of substitutions made
         """
@@ -729,7 +738,9 @@ third party.
         count = 0
         if not line.strip():
             return line, count
-        for parser in self.parsers:
+        if parsers is None:
+            parsers = self.parsers
+        for parser in parsers:
             try:
                 line, _count = parser.parse_line(line)
                 count += _count
-- 
2.31.1


From 96c9a833e77639a853b7d3d6f1df68bbbbe5e9cb Mon Sep 17 00:00:00 2001
From: Jake Hunsaker <jhunsake@redhat.com>
Date: Thu, 6 Jan 2022 13:20:32 -0500
Subject: [PATCH 3/3] [cleaner] Add skips for known files and usernames

Adds skips for `/proc/kallsyms`, which should never be obfuscated, as
well as any packaging-related log file for the IP parser. Further, do
not obfuscate the `stack` user, as that is a well-known user for many
configurations that, if obfuscated, could result in undesired string
substitutions in normal logging.

Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
---
 sos/cleaner/archives/__init__.py       | 2 ++
 sos/cleaner/parsers/ip_parser.py       | 3 ++-
 sos/cleaner/parsers/username_parser.py | 1 +
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/sos/cleaner/archives/__init__.py b/sos/cleaner/archives/__init__.py
index 795c5a78..cbf1f809 100644
--- a/sos/cleaner/archives/__init__.py
+++ b/sos/cleaner/archives/__init__.py
@@ -43,6 +43,7 @@ class SoSObfuscationArchive():
     type_name = 'undetermined'
     description = 'undetermined'
     is_nested = False
+    skip_files = []
     prep_files = {}
 
     def __init__(self, archive_path, tmpdir):
@@ -111,6 +112,7 @@ class SoSObfuscationArchive():
         Returns: list of files and file regexes
         """
         return [
+            'proc/kallsyms',
             'sosreport-',
             'sys/firmware',
             'sys/fs',
diff --git a/sos/cleaner/parsers/ip_parser.py b/sos/cleaner/parsers/ip_parser.py
index 71d38be8..b007368c 100644
--- a/sos/cleaner/parsers/ip_parser.py
+++ b/sos/cleaner/parsers/ip_parser.py
@@ -37,7 +37,8 @@ class SoSIPParser(SoSCleanerParser):
         'sos_commands/snappy/snap_list_--all',
         'sos_commands/snappy/snap_--version',
         'sos_commands/vulkan/vulkaninfo',
-        'var/log/.*dnf.*'
+        'var/log/.*dnf.*',
+        'var/log/.*packag.*'  # get 'packages' and 'packaging' logs
     ]
 
     map_file_key = 'ip_map'
diff --git a/sos/cleaner/parsers/username_parser.py b/sos/cleaner/parsers/username_parser.py
index 229c7de4..3208a655 100644
--- a/sos/cleaner/parsers/username_parser.py
+++ b/sos/cleaner/parsers/username_parser.py
@@ -32,6 +32,7 @@ class SoSUsernameParser(SoSCleanerParser):
         'nobody',
         'nfsnobody',
         'shutdown',
+        'stack',
         'reboot',
         'root',
         'ubuntu',
-- 
2.31.1

From 7ebb2ce0bcd13c1b3aada648aceb20b5aff636d9 Mon Sep 17 00:00:00 2001
From: Jake Hunsaker <jhunsake@redhat.com>
Date: Tue, 15 Feb 2022 14:18:02 -0500
Subject: [PATCH] [host] Skip entire /etc/sos/cleaner directory

While `default_mapping` is typically the only file expected under
`/etc/sos/cleaner/`, it is possible for other mapping files (such as
backups) to appear there.

Make the `add_forbidden_path()` spec here target the entire cleaner
directory to avoid ever capturing these map files.

Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
---
 sos/report/plugins/host.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sos/report/plugins/host.py b/sos/report/plugins/host.py
index 5e21da7b8e..95a3b9cd95 100644
--- a/sos/report/plugins/host.py
+++ b/sos/report/plugins/host.py
@@ -20,7 +20,7 @@ class Host(Plugin, IndependentPlugin):
 
     def setup(self):
 
-        self.add_forbidden_path('/etc/sos/cleaner/default_mapping')
+        self.add_forbidden_path('/etc/sos/cleaner')
 
         self.add_cmd_output('hostname', root_symlink='hostname')
         self.add_cmd_output('uptime', root_symlink='uptime')