Blame SOURCES/sos-bz2024893-cleaner-hostnames-improvements.patch

1dc99f
From decd39b7799a0579ea085b0da0728b6eabd49b38 Mon Sep 17 00:00:00 2001
1dc99f
From: Jake Hunsaker <jhunsake@redhat.com>
1dc99f
Date: Wed, 1 Sep 2021 00:28:58 -0400
1dc99f
Subject: [PATCH] [clean] Provide archive abstractions to obfuscate more than
1dc99f
 sos archives
1dc99f
1dc99f
This commit removes the restriction imposed on `sos clean` since its
1dc99f
introduction in sos-4.0 to only work against known sos report archives
1dc99f
or build directories. This is because there has been interest in using
1dc99f
the obfuscation bits of sos in other data-collector projects.
1dc99f
1dc99f
The `SoSObfuscationArchive()` class has been revamped to now be an
1dc99f
abstraction for different types of archives, and the cleaner logic has
1dc99f
been updated to leverage this new abstraction rather than assuming we're
1dc99f
working on an sos archive.
1dc99f
1dc99f
Abstractions are added for our own native use cases - that being `sos
1dc99f
report` and `sos collect` for at-runtime obfuscation, as well as
1dc99f
standalone archives previously generated. Further generic abstractions
1dc99f
are available for plain directories and tarballs; however, these will not
1dc99f
provide the same level of coverage as fully supported archive types, as
1dc99f
is noted in the manpage for sos-clean.
1dc99f
1dc99f
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
1dc99f
---
1dc99f
 man/en/sos-clean.1                            |  25 ++
1dc99f
 sos/cleaner/__init__.py                       | 308 +++++++++---------
1dc99f
 .../__init__.py}                              |  80 ++++-
1dc99f
 sos/cleaner/archives/generic.py               |  52 +++
1dc99f
 sos/cleaner/archives/sos.py                   | 106 ++++++
1dc99f
 sos/cleaner/parsers/__init__.py               |   6 -
1dc99f
 sos/cleaner/parsers/hostname_parser.py        |   1 -
1dc99f
 sos/cleaner/parsers/ip_parser.py              |   1 -
1dc99f
 sos/cleaner/parsers/keyword_parser.py         |   1 -
1dc99f
 sos/cleaner/parsers/mac_parser.py             |   1 -
1dc99f
 sos/cleaner/parsers/username_parser.py        |   8 -
1dc99f
 tests/cleaner_tests/existing_archive.py       |   7 +
1dc99f
 tests/cleaner_tests/full_report_run.py        |   3 +
1dc99f
 tests/cleaner_tests/report_with_mask.py       |   3 +
1dc99f
 14 files changed, 423 insertions(+), 179 deletions(-)
1dc99f
 rename sos/cleaner/{obfuscation_archive.py => archives/__init__.py} (81%)
1dc99f
 create mode 100644 sos/cleaner/archives/generic.py
1dc99f
 create mode 100644 sos/cleaner/archives/sos.py
1dc99f
1dc99f
diff --git a/man/en/sos-clean.1 b/man/en/sos-clean.1
1dc99f
index b77bc63c..54026713 100644
1dc99f
--- a/man/en/sos-clean.1
1dc99f
+++ b/man/en/sos-clean.1
1dc99f
@@ -10,6 +10,7 @@ sos clean - Obfuscate sensitive data from one or more sosreports
1dc99f
     [\-\-jobs]
1dc99f
     [\-\-no-update]
1dc99f
     [\-\-keep-binary-files]
1dc99f
+    [\-\-archive-type]
1dc99f
 
1dc99f
 .SH DESCRIPTION
1dc99f
 \fBsos clean\fR or \fBsos mask\fR is an sos subcommand used to obfuscate sensitive information from
1dc99f
@@ -88,6 +89,30 @@ Users should review any archive that keeps binary files in place before sending
1dc99f
 a third party.
1dc99f
 
1dc99f
 Default: False (remove encountered binary files)
1dc99f
+.TP
1dc99f
+.B \-\-archive-type TYPE
1dc99f
+Specify the type of archive that TARGET was generated as.
1dc99f
+When sos inspects a TARGET archive, it tries to identify what type of archive it is.
1dc99f
+For example, it may be a report generated by \fBsos report\fR, or a collection of those
1dc99f
+reports generated by \fBsos collect\fR, which require separate approaches.
1dc99f
+
1dc99f
+This option may be useful if a given TARGET archive is known to be of a specific type,
1dc99f
+but due to unknown reasons or some malformed/missing information in the archive itself,
1dc99f
+that type is not properly identified by sos.
1dc99f
+
1dc99f
+The following are accepted values for this option:
1dc99f
+
1dc99f
+    \fBauto\fR          Automatically detect the archive type
1dc99f
+    \fBreport\fR        An archive generated by \fBsos report\fR
1dc99f
+    \fBcollect\fR       An archive generated by \fBsos collect\fR
1dc99f
+
1dc99f
+The following may also be used; however, note that these do not attempt to pre-load
1dc99f
+any information from the archives into the parsers. This means that, among other limitations,
1dc99f
+items like host and domain names may not be obfuscated unless an obfuscated mapping already exists
1dc99f
+on the system from a previous execution.
1dc99f
+
1dc99f
+    \fBdata-dir\fR      A plain directory on the filesystem
1dc99f
+    \fBtarball\fR       A generic tar archive not associated with any known tool
1dc99f
 
1dc99f
 .SH SEE ALSO
1dc99f
 .BR sos (1)
1dc99f
diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
1dc99f
index 6aadfe79..6d2eb483 100644
1dc99f
--- a/sos/cleaner/__init__.py
1dc99f
+++ b/sos/cleaner/__init__.py
1dc99f
@@ -12,9 +12,7 @@ import hashlib
1dc99f
 import json
1dc99f
 import logging
1dc99f
 import os
1dc99f
-import re
1dc99f
 import shutil
1dc99f
-import tarfile
1dc99f
 import tempfile
1dc99f
 
1dc99f
 from concurrent.futures import ThreadPoolExecutor
1dc99f
@@ -27,7 +25,10 @@ from sos.cleaner.parsers.mac_parser import SoSMacParser
1dc99f
 from sos.cleaner.parsers.hostname_parser import SoSHostnameParser
1dc99f
 from sos.cleaner.parsers.keyword_parser import SoSKeywordParser
1dc99f
 from sos.cleaner.parsers.username_parser import SoSUsernameParser
1dc99f
-from sos.cleaner.obfuscation_archive import SoSObfuscationArchive
1dc99f
+from sos.cleaner.archives.sos import (SoSReportArchive, SoSReportDirectory,
1dc99f
+                                      SoSCollectorArchive,
1dc99f
+                                      SoSCollectorDirectory)
1dc99f
+from sos.cleaner.archives.generic import DataDirArchive, TarballArchive
1dc99f
 from sos.utilities import get_human_readable
1dc99f
 from textwrap import fill
1dc99f
 
1dc99f
@@ -41,6 +42,7 @@ class SoSCleaner(SoSComponent):
1dc99f
     desc = "Obfuscate sensitive networking information in a report"
1dc99f
 
1dc99f
     arg_defaults = {
1dc99f
+        'archive_type': 'auto',
1dc99f
         'domains': [],
1dc99f
         'jobs': 4,
1dc99f
         'keywords': [],
1dc99f
@@ -70,6 +72,7 @@ class SoSCleaner(SoSComponent):
1dc99f
             self.from_cmdline = False
1dc99f
             if not hasattr(self.opts, 'jobs'):
1dc99f
                 self.opts.jobs = 4
1dc99f
+            self.opts.archive_type = 'auto'
1dc99f
             self.soslog = logging.getLogger('sos')
1dc99f
             self.ui_log = logging.getLogger('sos_ui')
1dc99f
             # create the tmp subdir here to avoid a potential race condition
1dc99f
@@ -92,6 +95,17 @@ class SoSCleaner(SoSComponent):
1dc99f
             SoSUsernameParser(self.cleaner_mapping, self.opts.usernames)
1dc99f
         ]
1dc99f
 
1dc99f
+        self.archive_types = [
1dc99f
+            SoSReportDirectory,
1dc99f
+            SoSReportArchive,
1dc99f
+            SoSCollectorDirectory,
1dc99f
+            SoSCollectorArchive,
1dc99f
+            # make sure these two are always last as they are fallbacks
1dc99f
+            DataDirArchive,
1dc99f
+            TarballArchive
1dc99f
+        ]
1dc99f
+        self.nested_archive = None
1dc99f
+
1dc99f
         self.log_info("Cleaner initialized. From cmdline: %s"
1dc99f
                       % self.from_cmdline)
1dc99f
 
1dc99f
@@ -178,6 +192,11 @@ third party.
1dc99f
         )
1dc99f
         clean_grp.add_argument('target', metavar='TARGET',
1dc99f
                                help='The directory or archive to obfuscate')
1dc99f
+        clean_grp.add_argument('--archive-type', default='auto',
1dc99f
+                               choices=['auto', 'report', 'collect',
1dc99f
+                                        'data-dir', 'tarball'],
1dc99f
+                               help=('Specify what kind of archive the target '
1dc99f
+                                     'was generated as'))
1dc99f
         clean_grp.add_argument('--domains', action='extend', default=[],
1dc99f
                                help='List of domain names to obfuscate')
1dc99f
         clean_grp.add_argument('-j', '--jobs', default=4, type=int,
1dc99f
@@ -218,59 +237,28 @@ third party.
1dc99f
 
1dc99f
         In the event the target path is not an archive, abort.
1dc99f
         """
1dc99f
-        if not tarfile.is_tarfile(self.opts.target):
1dc99f
-            self.ui_log.error(
1dc99f
-                "Invalid target: must be directory or tar archive"
1dc99f
-            )
1dc99f
-            self._exit(1)
1dc99f
-
1dc99f
-        archive = tarfile.open(self.opts.target)
1dc99f
-        self.arc_name = self.opts.target.split('/')[-1].split('.')[:-2][0]
1dc99f
-
1dc99f
-        try:
1dc99f
-            archive.getmember(os.path.join(self.arc_name, 'sos_logs'))
1dc99f
-        except Exception:
1dc99f
-            # this is not an sos archive
1dc99f
-            self.ui_log.error("Invalid target: not an sos archive")
1dc99f
-            self._exit(1)
1dc99f
-
1dc99f
-        # see if there are archives within this archive
1dc99f
-        nested_archives = []
1dc99f
-        for _file in archive.getmembers():
1dc99f
-            if (re.match('sosreport-.*.tar', _file.name.split('/')[-1]) and not
1dc99f
-                    (_file.name.endswith(('.md5', '.sha256')))):
1dc99f
-                nested_archives.append(_file.name.split('/')[-1])
1dc99f
-
1dc99f
-        if nested_archives:
1dc99f
-            self.log_info("Found nested archive(s), extracting top level")
1dc99f
-            nested_path = self.extract_archive(archive)
1dc99f
-            for arc_file in os.listdir(nested_path):
1dc99f
-                if re.match('sosreport.*.tar.*', arc_file):
1dc99f
-                    if arc_file.endswith(('.md5', '.sha256')):
1dc99f
-                        continue
1dc99f
-                    self.report_paths.append(os.path.join(nested_path,
1dc99f
-                                                          arc_file))
1dc99f
-            # add the toplevel extracted archive
1dc99f
-            self.report_paths.append(nested_path)
1dc99f
+        _arc = None
1dc99f
+        if self.opts.archive_type != 'auto':
1dc99f
+            check_type = self.opts.archive_type.replace('-', '_')
1dc99f
+            for archive in self.archive_types:
1dc99f
+                if archive.type_name == check_type:
1dc99f
+                    _arc = archive(self.opts.target, self.tmpdir)
1dc99f
         else:
1dc99f
-            self.report_paths.append(self.opts.target)
1dc99f
-
1dc99f
-        archive.close()
1dc99f
-
1dc99f
-    def extract_archive(self, archive):
1dc99f
-        """Extract an archive into our tmpdir so that we may inspect it or
1dc99f
-        iterate through its contents for obfuscation
1dc99f
-
1dc99f
-        Positional arguments:
1dc99f
-
1dc99f
-            :param archive:     An open TarFile object for the archive
1dc99f
-
1dc99f
-        """
1dc99f
-        if not isinstance(archive, tarfile.TarFile):
1dc99f
-            archive = tarfile.open(archive)
1dc99f
-        path = os.path.join(self.tmpdir, 'cleaner')
1dc99f
-        archive.extractall(path)
1dc99f
-        return os.path.join(path, archive.name.split('/')[-1].split('.tar')[0])
1dc99f
+            for arc in self.archive_types:
1dc99f
+                if arc.check_is_type(self.opts.target):
1dc99f
+                    _arc = arc(self.opts.target, self.tmpdir)
1dc99f
+                    break
1dc99f
+        if not _arc:
1dc99f
+            return
1dc99f
+        self.report_paths.append(_arc)
1dc99f
+        if _arc.is_nested:
1dc99f
+            self.report_paths.extend(_arc.get_nested_archives())
1dc99f
+            # We need to preserve the top level archive until all
1dc99f
+            # nested archives are processed
1dc99f
+            self.report_paths.remove(_arc)
1dc99f
+            self.nested_archive = _arc
1dc99f
+        if self.nested_archive:
1dc99f
+            self.nested_archive.ui_name = self.nested_archive.description
1dc99f
 
1dc99f
     def execute(self):
1dc99f
         """SoSCleaner will begin by inspecting the TARGET option to determine
1dc99f
@@ -283,6 +271,7 @@ third party.
1dc99f
         be unpacked, cleaned, and repacked and the final top-level archive will
1dc99f
         then be repacked as well.
1dc99f
         """
1dc99f
+        self.arc_name = self.opts.target.split('/')[-1].split('.tar')[0]
1dc99f
         if self.from_cmdline:
1dc99f
             self.print_disclaimer()
1dc99f
         self.report_paths = []
1dc99f
@@ -290,23 +279,11 @@ third party.
1dc99f
             self.ui_log.error("Invalid target: no such file or directory %s"
1dc99f
                               % self.opts.target)
1dc99f
             self._exit(1)
1dc99f
-        if os.path.isdir(self.opts.target):
1dc99f
-            self.arc_name = self.opts.target.split('/')[-1]
1dc99f
-            for _file in os.listdir(self.opts.target):
1dc99f
-                if _file == 'sos_logs':
1dc99f
-                    self.report_paths.append(self.opts.target)
1dc99f
-                if (_file.startswith('sosreport') and
1dc99f
-                   (_file.endswith(".tar.gz") or _file.endswith(".tar.xz"))):
1dc99f
-                    self.report_paths.append(os.path.join(self.opts.target,
1dc99f
-                                                          _file))
1dc99f
-            if not self.report_paths:
1dc99f
-                self.ui_log.error("Invalid target: not an sos directory")
1dc99f
-                self._exit(1)
1dc99f
-        else:
1dc99f
-            self.inspect_target_archive()
1dc99f
+
1dc99f
+        self.inspect_target_archive()
1dc99f
 
1dc99f
         if not self.report_paths:
1dc99f
-            self.ui_log.error("No valid sos archives or directories found\n")
1dc99f
+            self.ui_log.error("No valid archives or directories found\n")
1dc99f
             self._exit(1)
1dc99f
 
1dc99f
         # we have at least one valid target to obfuscate
1dc99f
@@ -334,33 +311,7 @@ third party.
1dc99f
 
1dc99f
         final_path = None
1dc99f
         if len(self.completed_reports) > 1:
1dc99f
-            # we have an archive of archives, so repack the obfuscated tarball
1dc99f
-            arc_name = self.arc_name + '-obfuscated'
1dc99f
-            self.setup_archive(name=arc_name)
1dc99f
-            for arc in self.completed_reports:
1dc99f
-                if arc.is_tarfile:
1dc99f
-                    arc_dest = self.obfuscate_string(
1dc99f
-                        arc.final_archive_path.split('/')[-1]
1dc99f
-                    )
1dc99f
-                    self.archive.add_file(arc.final_archive_path,
1dc99f
-                                          dest=arc_dest)
1dc99f
-                    checksum = self.get_new_checksum(arc.final_archive_path)
1dc99f
-                    if checksum is not None:
1dc99f
-                        dname = self.obfuscate_string(
1dc99f
-                            "checksums/%s.%s" % (arc_dest, self.hash_name)
1dc99f
-                        )
1dc99f
-                        self.archive.add_string(checksum, dest=dname)
1dc99f
-                else:
1dc99f
-                    for dirname, dirs, files in os.walk(arc.archive_path):
1dc99f
-                        for filename in files:
1dc99f
-                            if filename.startswith('sosreport'):
1dc99f
-                                continue
1dc99f
-                            fname = os.path.join(dirname, filename)
1dc99f
-                            dnm = self.obfuscate_string(
1dc99f
-                                fname.split(arc.archive_name)[-1].lstrip('/')
1dc99f
-                            )
1dc99f
-                            self.archive.add_file(fname, dest=dnm)
1dc99f
-            arc_path = self.archive.finalize(self.opts.compression_type)
1dc99f
+            arc_path = self.rebuild_nested_archive()
1dc99f
         else:
1dc99f
             arc = self.completed_reports[0]
1dc99f
             arc_path = arc.final_archive_path
1dc99f
@@ -371,8 +322,7 @@ third party.
1dc99f
                 )
1dc99f
                 with open(os.path.join(self.sys_tmp, chksum_name), 'w') as cf:
1dc99f
                     cf.write(checksum)
1dc99f
-
1dc99f
-        self.write_cleaner_log()
1dc99f
+            self.write_cleaner_log()
1dc99f
 
1dc99f
         final_path = self.obfuscate_string(
1dc99f
             os.path.join(self.sys_tmp, arc_path.split('/')[-1])
1dc99f
@@ -393,6 +343,30 @@ third party.
1dc99f
 
1dc99f
         self.cleanup()
1dc99f
 
1dc99f
+    def rebuild_nested_archive(self):
1dc99f
+        """Handles repacking the nested tarball, now containing only obfuscated
1dc99f
+        copies of the reports, log files, manifest, etc...
1dc99f
+        """
1dc99f
+        # we have an archive of archives, so repack the obfuscated tarball
1dc99f
+        arc_name = self.arc_name + '-obfuscated'
1dc99f
+        self.setup_archive(name=arc_name)
1dc99f
+        for archive in self.completed_reports:
1dc99f
+            arc_dest = archive.final_archive_path.split('/')[-1]
1dc99f
+            checksum = self.get_new_checksum(archive.final_archive_path)
1dc99f
+            if checksum is not None:
1dc99f
+                dname = "checksums/%s.%s" % (arc_dest, self.hash_name)
1dc99f
+                self.archive.add_string(checksum, dest=dname)
1dc99f
+        for dirn, dirs, files in os.walk(self.nested_archive.extracted_path):
1dc99f
+            for filename in files:
1dc99f
+                fname = os.path.join(dirn, filename)
1dc99f
+                dname = fname.split(self.nested_archive.extracted_path)[-1]
1dc99f
+                dname = dname.lstrip('/')
1dc99f
+                self.archive.add_file(fname, dest=dname)
1dc99f
+                # remove it now so we don't balloon our fs space needs
1dc99f
+                os.remove(fname)
1dc99f
+        self.write_cleaner_log(archive=True)
1dc99f
+        return self.archive.finalize(self.opts.compression_type)
1dc99f
+
1dc99f
     def compile_mapping_dict(self):
1dc99f
         """Build a dict that contains each parser's map as a key, with the
1dc99f
         contents as that key's value. This will then be written to disk in the
1dc99f
@@ -441,7 +415,7 @@ third party.
1dc99f
                 self.log_error("Could not update mapping config file: %s"
1dc99f
                                % err)
1dc99f
 
1dc99f
-    def write_cleaner_log(self):
1dc99f
+    def write_cleaner_log(self, archive=False):
1dc99f
         """When invoked via the command line, the logging from SoSCleaner will
1dc99f
         not be added to the archive(s) it processes, so we need to write it
1dc99f
         separately to disk
1dc99f
@@ -454,6 +428,10 @@ third party.
1dc99f
             for line in self.sos_log_file.readlines():
1dc99f
                 logfile.write(line)
1dc99f
 
1dc99f
+        if archive:
1dc99f
+            self.obfuscate_file(log_name)
1dc99f
+            self.archive.add_file(log_name, dest="sos_logs/cleaner.log")
1dc99f
+
1dc99f
     def get_new_checksum(self, archive_path):
1dc99f
         """Calculate a new checksum for the obfuscated archive, as the previous
1dc99f
         checksum will no longer be valid
1dc99f
@@ -481,11 +459,11 @@ third party.
1dc99f
         be obfuscated concurrently.
1dc99f
         """
1dc99f
         try:
1dc99f
-            if len(self.report_paths) > 1:
1dc99f
-                msg = ("Found %s total reports to obfuscate, processing up to "
1dc99f
-                       "%s concurrently\n"
1dc99f
-                       % (len(self.report_paths), self.opts.jobs))
1dc99f
-                self.ui_log.info(msg)
1dc99f
+            msg = (
1dc99f
+                "Found %s total reports to obfuscate, processing up to %s "
1dc99f
+                "concurrently\n" % (len(self.report_paths), self.opts.jobs)
1dc99f
+            )
1dc99f
+            self.ui_log.info(msg)
1dc99f
             if self.opts.keep_binary_files:
1dc99f
                 self.ui_log.warning(
1dc99f
                     "WARNING: binary files that potentially contain sensitive "
1dc99f
@@ -494,53 +472,67 @@ third party.
1dc99f
             pool = ThreadPoolExecutor(self.opts.jobs)
1dc99f
             pool.map(self.obfuscate_report, self.report_paths, chunksize=1)
1dc99f
             pool.shutdown(wait=True)
1dc99f
+            # finally, obfuscate the nested archive if one exists
1dc99f
+            if self.nested_archive:
1dc99f
+                self._replace_obfuscated_archives()
1dc99f
+                self.obfuscate_report(self.nested_archive)
1dc99f
         except KeyboardInterrupt:
1dc99f
             self.ui_log.info("Exiting on user cancel")
1dc99f
             os._exit(130)
1dc99f
 
1dc99f
+    def _replace_obfuscated_archives(self):
1dc99f
+        """When we have a nested archive, we need to rebuild the original
1dc99f
+        archive, which entails replacing the existing archives with their
1dc99f
+        obfuscated counterparts
1dc99f
+        """
1dc99f
+        for archive in self.completed_reports:
1dc99f
+            os.remove(archive.archive_path)
1dc99f
+            dest = self.nested_archive.extracted_path
1dc99f
+            tarball = archive.final_archive_path.split('/')[-1]
1dc99f
+            dest_name = os.path.join(dest, tarball)
1dc99f
+            shutil.move(archive.final_archive_path, dest)
1dc99f
+            archive.final_archive_path = dest_name
1dc99f
+
1dc99f
     def preload_all_archives_into_maps(self):
1dc99f
         """Before doing the actual obfuscation, if we have multiple archives
1dc99f
         to obfuscate then we need to preload each of them into the mappings
1dc99f
         to ensure that node1 is obfuscated in node2 as well as node2 being
1dc99f
         obfuscated in node1's archive.
1dc99f
         """
1dc99f
-        self.log_info("Pre-loading multiple archives into obfuscation maps")
1dc99f
+        self.log_info("Pre-loading all archives into obfuscation maps")
1dc99f
         for _arc in self.report_paths:
1dc99f
-            is_dir = os.path.isdir(_arc)
1dc99f
-            if is_dir:
1dc99f
-                _arc_name = _arc
1dc99f
-            else:
1dc99f
-                archive = tarfile.open(_arc)
1dc99f
-                _arc_name = _arc.split('/')[-1].split('.tar')[0]
1dc99f
-            # for each parser, load the map_prep_file into memory, and then
1dc99f
-            # send that for obfuscation. We don't actually obfuscate the file
1dc99f
-            # here, do that in the normal archive loop
1dc99f
             for _parser in self.parsers:
1dc99f
-                if not _parser.prep_map_file:
1dc99f
+                try:
1dc99f
+                    pfile = _arc.prep_files[_parser.name.lower().split()[0]]
1dc99f
+                    if not pfile:
1dc99f
+                        continue
1dc99f
+                except (IndexError, KeyError):
1dc99f
                     continue
1dc99f
-                if isinstance(_parser.prep_map_file, str):
1dc99f
-                    _parser.prep_map_file = [_parser.prep_map_file]
1dc99f
-                for parse_file in _parser.prep_map_file:
1dc99f
-                    _arc_path = os.path.join(_arc_name, parse_file)
1dc99f
+                if isinstance(pfile, str):
1dc99f
+                    pfile = [pfile]
1dc99f
+                for parse_file in pfile:
1dc99f
+                    self.log_debug("Attempting to load %s" % parse_file)
1dc99f
                     try:
1dc99f
-                        if is_dir:
1dc99f
-                            _pfile = open(_arc_path, 'r')
1dc99f
-                            content = _pfile.read()
1dc99f
-                        else:
1dc99f
-                            _pfile = archive.extractfile(_arc_path)
1dc99f
-                            content = _pfile.read().decode('utf-8')
1dc99f
-                        _pfile.close()
1dc99f
+                        content = _arc.get_file_content(parse_file)
1dc99f
+                        if not content:
1dc99f
+                            continue
1dc99f
                         if isinstance(_parser, SoSUsernameParser):
1dc99f
                             _parser.load_usernames_into_map(content)
1dc99f
-                        for line in content.splitlines():
1dc99f
-                            if isinstance(_parser, SoSHostnameParser):
1dc99f
-                                _parser.load_hostname_into_map(line)
1dc99f
-                            self.obfuscate_line(line)
1dc99f
+                        elif isinstance(_parser, SoSHostnameParser):
1dc99f
+                            _parser.load_hostname_into_map(
1dc99f
+                                content.splitlines()[0]
1dc99f
+                            )
1dc99f
+                        else:
1dc99f
+                            for line in content.splitlines():
1dc99f
+                                self.obfuscate_line(line)
1dc99f
                     except Exception as err:
1dc99f
-                        self.log_debug("Could not prep %s: %s"
1dc99f
-                                       % (_arc_path, err))
1dc99f
+                        self.log_info(
1dc99f
+                            "Could not prepare %s from %s (archive: %s): %s"
1dc99f
+                            % (_parser.name, parse_file, _arc.archive_name,
1dc99f
+                               err)
1dc99f
+                        )
1dc99f
 
1dc99f
-    def obfuscate_report(self, report):
1dc99f
+    def obfuscate_report(self, archive):
1dc99f
         """Individually handle each archive or directory we've discovered by
1dc99f
         running through each file therein.
1dc99f
 
1dc99f
@@ -549,17 +541,12 @@ third party.
1dc99f
             :param report str:      Filepath to the directory or archive
1dc99f
         """
1dc99f
         try:
1dc99f
-            if not os.access(report, os.W_OK):
1dc99f
-                msg = "Insufficient permissions on %s" % report
1dc99f
-                self.log_info(msg)
1dc99f
-                self.ui_log.error(msg)
1dc99f
-                return
1dc99f
-
1dc99f
-            archive = SoSObfuscationArchive(report, self.tmpdir)
1dc99f
             arc_md = self.cleaner_md.add_section(archive.archive_name)
1dc99f
             start_time = datetime.now()
1dc99f
             arc_md.add_field('start_time', start_time)
1dc99f
-            archive.extract()
1dc99f
+            # don't double extract nested archives
1dc99f
+            if not archive.is_extracted:
1dc99f
+                archive.extract()
1dc99f
             archive.report_msg("Beginning obfuscation...")
1dc99f
 
1dc99f
             file_list = archive.get_file_list()
1dc99f
@@ -586,27 +573,28 @@ third party.
1dc99f
                               caller=archive.archive_name)
1dc99f
 
1dc99f
             # if the archive was already a tarball, repack it
1dc99f
-            method = archive.get_compression()
1dc99f
-            if method:
1dc99f
-                archive.report_msg("Re-compressing...")
1dc99f
-                try:
1dc99f
-                    archive.rename_top_dir(
1dc99f
-                        self.obfuscate_string(archive.archive_name)
1dc99f
-                    )
1dc99f
-                    archive.compress(method)
1dc99f
-                except Exception as err:
1dc99f
-                    self.log_debug("Archive %s failed to compress: %s"
1dc99f
-                                   % (archive.archive_name, err))
1dc99f
-                    archive.report_msg("Failed to re-compress archive: %s"
1dc99f
-                                       % err)
1dc99f
-                    return
1dc99f
+            if not archive.is_nested:
1dc99f
+                method = archive.get_compression()
1dc99f
+                if method:
1dc99f
+                    archive.report_msg("Re-compressing...")
1dc99f
+                    try:
1dc99f
+                        archive.rename_top_dir(
1dc99f
+                            self.obfuscate_string(archive.archive_name)
1dc99f
+                        )
1dc99f
+                        archive.compress(method)
1dc99f
+                    except Exception as err:
1dc99f
+                        self.log_debug("Archive %s failed to compress: %s"
1dc99f
+                                       % (archive.archive_name, err))
1dc99f
+                        archive.report_msg("Failed to re-compress archive: %s"
1dc99f
+                                           % err)
1dc99f
+                        return
1dc99f
+                self.completed_reports.append(archive)
1dc99f
 
1dc99f
             end_time = datetime.now()
1dc99f
             arc_md.add_field('end_time', end_time)
1dc99f
             arc_md.add_field('run_time', end_time - start_time)
1dc99f
             arc_md.add_field('files_obfuscated', len(archive.file_sub_list))
1dc99f
             arc_md.add_field('total_substitutions', archive.total_sub_count)
1dc99f
-            self.completed_reports.append(archive)
1dc99f
             rmsg = ''
1dc99f
             if archive.removed_file_count:
1dc99f
                 rmsg = " [removed %s unprocessable files]"
1dc99f
@@ -615,7 +603,7 @@ third party.
1dc99f
 
1dc99f
         except Exception as err:
1dc99f
             self.ui_log.info("Exception while processing %s: %s"
1dc99f
-                             % (report, err))
1dc99f
+                             % (archive.archive_name, err))
1dc99f
 
1dc99f
     def obfuscate_file(self, filename, short_name=None, arc_name=None):
1dc99f
         """Obfuscate and individual file, line by line.
1dc99f
@@ -635,6 +623,8 @@ third party.
1dc99f
             # the requested file doesn't exist in the archive
1dc99f
             return
1dc99f
         subs = 0
1dc99f
+        if not short_name:
1dc99f
+            short_name = filename.split('/')[-1]
1dc99f
         if not os.path.islink(filename):
1dc99f
             # don't run the obfuscation on the link, but on the actual file
1dc99f
             # at some other point.
1dc99f
@@ -745,3 +735,5 @@ third party.
1dc99f
         for parser in self.parsers:
1dc99f
             _sec = parse_sec.add_section(parser.name.replace(' ', '_').lower())
1dc99f
             _sec.add_field('entries', len(parser.mapping.dataset.keys()))
1dc99f
+
1dc99f
+# vim: set et ts=4 sw=4 :
1dc99f
diff --git a/sos/cleaner/obfuscation_archive.py b/sos/cleaner/archives/__init__.py
1dc99f
similarity index 81%
1dc99f
rename from sos/cleaner/obfuscation_archive.py
1dc99f
rename to sos/cleaner/archives/__init__.py
1dc99f
index ea0b7012..795c5a78 100644
1dc99f
--- a/sos/cleaner/obfuscation_archive.py
1dc99f
+++ b/sos/cleaner/archives/__init__.py
1dc99f
@@ -40,6 +40,10 @@ class SoSObfuscationArchive():
1dc99f
     file_sub_list = []
1dc99f
     total_sub_count = 0
1dc99f
     removed_file_count = 0
1dc99f
+    type_name = 'undetermined'
1dc99f
+    description = 'undetermined'
1dc99f
+    is_nested = False
1dc99f
+    prep_files = {}
1dc99f
 
1dc99f
     def __init__(self, archive_path, tmpdir):
1dc99f
         self.archive_path = archive_path
1dc99f
@@ -50,7 +54,43 @@ class SoSObfuscationArchive():
1dc99f
         self.soslog = logging.getLogger('sos')
1dc99f
         self.ui_log = logging.getLogger('sos_ui')
1dc99f
         self.skip_list = self._load_skip_list()
1dc99f
-        self.log_info("Loaded %s as an archive" % self.archive_path)
1dc99f
+        self.is_extracted = False
1dc99f
+        self._load_self()
1dc99f
+        self.archive_root = ''
1dc99f
+        self.log_info(
1dc99f
+            "Loaded %s as type %s"
1dc99f
+            % (self.archive_path, self.description)
1dc99f
+        )
1dc99f
+
1dc99f
+    @classmethod
1dc99f
+    def check_is_type(cls, arc_path):
1dc99f
+        """Check if the archive is a well-known type we directly support"""
1dc99f
+        return False
1dc99f
+
1dc99f
+    def _load_self(self):
1dc99f
+        if self.is_tarfile:
1dc99f
+            self.tarobj = tarfile.open(self.archive_path)
1dc99f
+
1dc99f
+    def get_nested_archives(self):
1dc99f
+        """Return a list of ObfuscationArchives that represent additional
1dc99f
+        archives found within the target archive. For example, an archive from
1dc99f
+        `sos collect` will return a list of ``SoSReportArchive`` objects.
1dc99f
+
1dc99f
+        This should be overridden by individual types of ObfuscationArchive's
1dc99f
+        """
1dc99f
+        return []
1dc99f
+
1dc99f
+    def get_archive_root(self):
1dc99f
+        """Set the root path for the archive that should be prepended to any
1dc99f
+        filenames given to methods in this class.
1dc99f
+        """
1dc99f
+        if self.is_tarfile:
1dc99f
+            toplevel = self.tarobj.firstmember
1dc99f
+            if toplevel.isdir():
1dc99f
+                return toplevel.name
1dc99f
+            else:
1dc99f
+                return os.sep
1dc99f
+        return os.path.abspath(self.archive_path)
1dc99f
 
1dc99f
     def report_msg(self, msg):
1dc99f
         """Helper to easily format ui messages on a per-report basis"""
1dc99f
@@ -96,10 +136,42 @@ class SoSObfuscationArchive():
1dc99f
             os.remove(full_fname)
1dc99f
             self.removed_file_count += 1
1dc99f
 
1dc99f
-    def extract(self):
1dc99f
+    def format_file_name(self, fname):
1dc99f
+        """Based on the type of archive we're dealing with, do whatever that
1dc99f
+        archive requires to a provided **relative** filepath to be able to
1dc99f
+        access it within the archive
1dc99f
+        """
1dc99f
+        if not self.is_extracted:
1dc99f
+            if not self.archive_root:
1dc99f
+                self.archive_root = self.get_archive_root()
1dc99f
+            return os.path.join(self.archive_root, fname)
1dc99f
+        else:
1dc99f
+            return os.path.join(self.extracted_path, fname)
1dc99f
+
1dc99f
+    def get_file_content(self, fname):
1dc99f
+        """Return the content from the specified fname. Particularly useful for
1dc99f
+        tarball-type archives so we can retrieve prep file contents prior to
1dc99f
+        extracting the entire archive
1dc99f
+        """
1dc99f
+        if self.is_extracted is False and self.is_tarfile:
1dc99f
+            filename = self.format_file_name(fname)
1dc99f
+            try:
1dc99f
+                return self.tarobj.extractfile(filename).read().decode('utf-8')
1dc99f
+            except KeyError:
1dc99f
+                self.log_debug(
1dc99f
+                    "Unable to retrieve %s: no such file in archive" % fname
1dc99f
+                )
1dc99f
+                return ''
1dc99f
+        else:
1dc99f
+            with open(self.format_file_name(fname), 'r') as to_read:
1dc99f
+                return to_read.read()
1dc99f
+
1dc99f
+    def extract(self, quiet=False):
1dc99f
         if self.is_tarfile:
1dc99f
-            self.report_msg("Extracting...")
1dc99f
+            if not quiet:
1dc99f
+                self.report_msg("Extracting...")
1dc99f
             self.extracted_path = self.extract_self()
1dc99f
+            self.is_extracted = True
1dc99f
         else:
1dc99f
             self.extracted_path = self.archive_path
1dc99f
         # if we're running as non-root (e.g. collector), then we can have a
1dc99f
@@ -317,3 +389,5 @@ class SoSObfuscationArchive():
1dc99f
                 return False
1dc99f
             except UnicodeDecodeError:
1dc99f
                 return True
1dc99f
+
1dc99f
+# vim: set et ts=4 sw=4 :
1dc99f
diff --git a/sos/cleaner/archives/generic.py b/sos/cleaner/archives/generic.py
1dc99f
new file mode 100644
1dc99f
index 00000000..2ce6f09b
1dc99f
--- /dev/null
1dc99f
+++ b/sos/cleaner/archives/generic.py
1dc99f
@@ -0,0 +1,52 @@
1dc99f
+# Copyright 2020 Red Hat, Inc. Jake Hunsaker <jhunsake@redhat.com>
1dc99f
+
1dc99f
+# This file is part of the sos project: https://github.com/sosreport/sos
1dc99f
+#
1dc99f
+# This copyrighted material is made available to anyone wishing to use,
1dc99f
+# modify, copy, or redistribute it subject to the terms and conditions of
1dc99f
+# version 2 of the GNU General Public License.
1dc99f
+#
1dc99f
+# See the LICENSE file in the source distribution for further information.
1dc99f
+
1dc99f
+
1dc99f
+from sos.cleaner.archives import SoSObfuscationArchive
1dc99f
+
1dc99f
+import os
1dc99f
+import tarfile
1dc99f
+
1dc99f
+
1dc99f
+class DataDirArchive(SoSObfuscationArchive):
1dc99f
+    """A plain directory on the filesystem that is not directly associated with
1dc99f
+    any known or supported collection utility
1dc99f
+    """
1dc99f
+
1dc99f
+    type_name = 'data_dir'
1dc99f
+    description = 'unassociated directory'
1dc99f
+
1dc99f
+    @classmethod
1dc99f
+    def check_is_type(cls, arc_path):
1dc99f
+        return os.path.isdir(arc_path)
1dc99f
+
1dc99f
+    def set_archive_root(self):
1dc99f
+        return os.path.abspath(self.archive_path)
1dc99f
+
1dc99f
+
1dc99f
+class TarballArchive(SoSObfuscationArchive):
1dc99f
+    """A generic tar archive that is not associated with any known or supported
1dc99f
+    collection utility
1dc99f
+    """
1dc99f
+
1dc99f
+    type_name = 'tarball'
1dc99f
+    description = 'unassociated tarball'
1dc99f
+
1dc99f
+    @classmethod
1dc99f
+    def check_is_type(cls, arc_path):
1dc99f
+        try:
1dc99f
+            return tarfile.is_tarfile(arc_path)
1dc99f
+        except Exception:
1dc99f
+            return False
1dc99f
+
1dc99f
+    def set_archive_root(self):
1dc99f
+        if self.tarobj.firstmember.isdir():
1dc99f
+            return self.tarobj.firstmember.name
1dc99f
+        return ''
1dc99f
diff --git a/sos/cleaner/archives/sos.py b/sos/cleaner/archives/sos.py
1dc99f
new file mode 100644
1dc99f
index 00000000..4401d710
1dc99f
--- /dev/null
1dc99f
+++ b/sos/cleaner/archives/sos.py
1dc99f
@@ -0,0 +1,106 @@
1dc99f
+# Copyright 2021 Red Hat, Inc. Jake Hunsaker <jhunsake@redhat.com>
1dc99f
+
1dc99f
+# This file is part of the sos project: https://github.com/sosreport/sos
1dc99f
+#
1dc99f
+# This copyrighted material is made available to anyone wishing to use,
1dc99f
+# modify, copy, or redistribute it subject to the terms and conditions of
1dc99f
+# version 2 of the GNU General Public License.
1dc99f
+#
1dc99f
+# See the LICENSE file in the source distribution for further information.
1dc99f
+
1dc99f
+
1dc99f
+from sos.cleaner.archives import SoSObfuscationArchive
1dc99f
+
1dc99f
+import os
1dc99f
+import tarfile
1dc99f
+
1dc99f
+
1dc99f
+class SoSReportArchive(SoSObfuscationArchive):
1dc99f
+    """This is the class representing an sos report, or in other words the
1dc99f
+    type the archive the SoS project natively generates
1dc99f
+    """
1dc99f
+
1dc99f
+    type_name = 'report'
1dc99f
+    description = 'sos report archive'
1dc99f
+    prep_files = {
1dc99f
+        'hostname': 'sos_commands/host/hostname',
1dc99f
+        'ip': 'sos_commands/networking/ip_-o_addr',
1dc99f
+        'mac': 'sos_commands/networking/ip_-d_address',
1dc99f
+        'username': [
1dc99f
+            'sos_commands/login/lastlog_-u_1000-60000',
1dc99f
+            'sos_commands/login/lastlog_-u_60001-65536',
1dc99f
+            'sos_commands/login/lastlog_-u_65537-4294967295',
1dc99f
+            # AD users will be reported here, but favor the lastlog files since
1dc99f
+            # those will include local users who have not logged in
1dc99f
+            'sos_commands/login/last'
1dc99f
+        ]
1dc99f
+    }
1dc99f
+
1dc99f
+    @classmethod
1dc99f
+    def check_is_type(cls, arc_path):
1dc99f
+        try:
1dc99f
+            return tarfile.is_tarfile(arc_path) and 'sosreport-' in arc_path
1dc99f
+        except Exception:
1dc99f
+            return False
1dc99f
+
1dc99f
+
1dc99f
+class SoSReportDirectory(SoSReportArchive):
1dc99f
+    """This is the archive class representing a build directory, or in other
1dc99f
+    words what `sos report --clean` will end up using for in-line obfuscation
1dc99f
+    """
1dc99f
+
1dc99f
+    type_name = 'report_dir'
1dc99f
+    description = 'sos report directory'
1dc99f
+
1dc99f
+    @classmethod
1dc99f
+    def check_is_type(cls, arc_path):
1dc99f
+        if os.path.isdir(arc_path):
1dc99f
+            return 'sos_logs' in os.listdir(arc_path)
1dc99f
+        return False
1dc99f
+
1dc99f
+
1dc99f
+class SoSCollectorArchive(SoSObfuscationArchive):
1dc99f
+    """Archive class representing the tarball created by ``sos collect``. It
1dc99f
+    will not provide prep files on its own, however it will provide a list
1dc99f
+    of SoSReportArchive's which will then be used to prep the parsers
1dc99f
+    """
1dc99f
+
1dc99f
+    type_name = 'collect'
1dc99f
+    description = 'sos collect tarball'
1dc99f
+    is_nested = True
1dc99f
+
1dc99f
+    @classmethod
1dc99f
+    def check_is_type(cls, arc_path):
1dc99f
+        try:
1dc99f
+            return (tarfile.is_tarfile(arc_path) and 'sos-collect' in arc_path)
1dc99f
+        except Exception:
1dc99f
+            return False
1dc99f
+
1dc99f
+    def get_nested_archives(self):
1dc99f
+        self.extract(quiet=True)
1dc99f
+        _path = self.extracted_path
1dc99f
+        archives = []
1dc99f
+        for fname in os.listdir(_path):
1dc99f
+            arc_name = os.path.join(_path, fname)
1dc99f
+            if 'sosreport-' in fname and tarfile.is_tarfile(arc_name):
1dc99f
+                archives.append(SoSReportArchive(arc_name, self.tmpdir))
1dc99f
+        return archives
1dc99f
+
1dc99f
+
1dc99f
+class SoSCollectorDirectory(SoSCollectorArchive):
1dc99f
+    """The archive class representing the temp directory used by ``sos
1dc99f
+    collect`` when ``--clean`` is used during runtime.
1dc99f
+    """
1dc99f
+
1dc99f
+    type_name = 'collect_dir'
1dc99f
+    description = 'sos collect directory'
1dc99f
+
1dc99f
+    @classmethod
1dc99f
+    def check_is_type(cls, arc_path):
1dc99f
+        if os.path.isdir(arc_path):
1dc99f
+            for fname in os.listdir(arc_path):
1dc99f
+                if 'sos-collector-' in fname:
1dc99f
+                    return True
1dc99f
+        return False
1dc99f
+
1dc99f
+# vim: set et ts=4 sw=4 :
1dc99f
diff --git a/sos/cleaner/parsers/__init__.py b/sos/cleaner/parsers/__init__.py
1dc99f
index af6e375e..e62fd938 100644
1dc99f
--- a/sos/cleaner/parsers/__init__.py
1dc99f
+++ b/sos/cleaner/parsers/__init__.py
1dc99f
@@ -37,11 +37,6 @@ class SoSCleanerParser():
1dc99f
     :cvar map_file_key: The key in the ``map_file`` to read when loading
1dc99f
                         previous obfuscation matches
1dc99f
     :vartype map_file_key: ``str``
1dc99f
-
1dc99f
-
1dc99f
-    :cvar prep_map_file: File to read from an archive to pre-seed the map with
1dc99f
-                         matches. E.G. ip_addr for loading IP addresses
1dc99f
-    :vartype prep_map_fie: ``str``
1dc99f
     """
1dc99f
 
1dc99f
     name = 'Undefined Parser'
1dc99f
@@ -49,7 +44,6 @@ class SoSCleanerParser():
1dc99f
     skip_line_patterns = []
1dc99f
     skip_files = []
1dc99f
     map_file_key = 'unset'
1dc99f
-    prep_map_file = []
1dc99f
 
1dc99f
     def __init__(self, config={}):
1dc99f
         if self.map_file_key in config:
1dc99f
diff --git a/sos/cleaner/parsers/hostname_parser.py b/sos/cleaner/parsers/hostname_parser.py
1dc99f
index 71e13d3f..daa76a62 100644
1dc99f
--- a/sos/cleaner/parsers/hostname_parser.py
1dc99f
+++ b/sos/cleaner/parsers/hostname_parser.py
1dc99f
@@ -16,7 +16,6 @@ class SoSHostnameParser(SoSCleanerParser):
1dc99f
 
1dc99f
     name = 'Hostname Parser'
1dc99f
     map_file_key = 'hostname_map'
1dc99f
-    prep_map_file = 'sos_commands/host/hostname'
1dc99f
     regex_patterns = [
1dc99f
         r'(((\b|_)[a-zA-Z0-9-\.]{1,200}\.[a-zA-Z]{1,63}(\b|_)))'
1dc99f
     ]
1dc99f
diff --git a/sos/cleaner/parsers/ip_parser.py b/sos/cleaner/parsers/ip_parser.py
1dc99f
index 525139e8..71d38be8 100644
1dc99f
--- a/sos/cleaner/parsers/ip_parser.py
1dc99f
+++ b/sos/cleaner/parsers/ip_parser.py
1dc99f
@@ -41,7 +41,6 @@ class SoSIPParser(SoSCleanerParser):
1dc99f
     ]
1dc99f
 
1dc99f
     map_file_key = 'ip_map'
1dc99f
-    prep_map_file = 'sos_commands/networking/ip_-o_addr'
1dc99f
 
1dc99f
     def __init__(self, config):
1dc99f
         self.mapping = SoSIPMap()
1dc99f
diff --git a/sos/cleaner/parsers/keyword_parser.py b/sos/cleaner/parsers/keyword_parser.py
1dc99f
index 68de3727..694c6073 100644
1dc99f
--- a/sos/cleaner/parsers/keyword_parser.py
1dc99f
+++ b/sos/cleaner/parsers/keyword_parser.py
1dc99f
@@ -20,7 +20,6 @@ class SoSKeywordParser(SoSCleanerParser):
1dc99f
 
1dc99f
     name = 'Keyword Parser'
1dc99f
     map_file_key = 'keyword_map'
1dc99f
-    prep_map_file = ''
1dc99f
 
1dc99f
     def __init__(self, config, keywords=None, keyword_file=None):
1dc99f
         self.mapping = SoSKeywordMap()
1dc99f
diff --git a/sos/cleaner/parsers/mac_parser.py b/sos/cleaner/parsers/mac_parser.py
1dc99f
index 7ca80b8d..c74288cf 100644
1dc99f
--- a/sos/cleaner/parsers/mac_parser.py
1dc99f
+++ b/sos/cleaner/parsers/mac_parser.py
1dc99f
@@ -30,7 +30,6 @@ class SoSMacParser(SoSCleanerParser):
1dc99f
         '534f:53'
1dc99f
     )
1dc99f
     map_file_key = 'mac_map'
1dc99f
-    prep_map_file = 'sos_commands/networking/ip_-d_address'
1dc99f
 
1dc99f
     def __init__(self, config):
1dc99f
         self.mapping = SoSMacMap()
1dc99f
diff --git a/sos/cleaner/parsers/username_parser.py b/sos/cleaner/parsers/username_parser.py
1dc99f
index b142e371..35377a31 100644
1dc99f
--- a/sos/cleaner/parsers/username_parser.py
1dc99f
+++ b/sos/cleaner/parsers/username_parser.py
1dc99f
@@ -25,14 +25,6 @@ class SoSUsernameParser(SoSCleanerParser):
1dc99f
 
1dc99f
     name = 'Username Parser'
1dc99f
     map_file_key = 'username_map'
1dc99f
-    prep_map_file = [
1dc99f
-        'sos_commands/login/lastlog_-u_1000-60000',
1dc99f
-        'sos_commands/login/lastlog_-u_60001-65536',
1dc99f
-        'sos_commands/login/lastlog_-u_65537-4294967295',
1dc99f
-        # AD users will be reported here, but favor the lastlog files since
1dc99f
-        # those will include local users who have not logged in
1dc99f
-        'sos_commands/login/last'
1dc99f
-    ]
1dc99f
     regex_patterns = []
1dc99f
     skip_list = [
1dc99f
         'core',
1dc99f
diff --git a/tests/cleaner_tests/existing_archive.py b/tests/cleaner_tests/existing_archive.py
1dc99f
index 0eaf6c8d..e13d1cae 100644
1dc99f
--- a/tests/cleaner_tests/existing_archive.py
1dc99f
+++ b/tests/cleaner_tests/existing_archive.py
1dc99f
@@ -28,6 +28,13 @@ class ExistingArchiveCleanTest(StageTwoReportTest):
1dc99f
     def test_obfuscation_log_created(self):
1dc99f
         self.assertFileExists(os.path.join(self.tmpdir, '%s-obfuscation.log' % ARCHIVE))
1dc99f
 
1dc99f
+    def test_archive_type_correct(self):
1dc99f
+        with open(os.path.join(self.tmpdir, '%s-obfuscation.log' % ARCHIVE), 'r') as log:
1dc99f
+            for line in log:
1dc99f
+                if "Loaded %s" % ARCHIVE in line:
1dc99f
+                    assert 'as type sos report archive' in line, "Incorrect archive type detected: %s" % line
1dc99f
+                    break
1dc99f
+
1dc99f
     def test_from_cmdline_logged(self):
1dc99f
         with open(os.path.join(self.tmpdir, '%s-obfuscation.log' % ARCHIVE), 'r') as log:
1dc99f
             for line in log:
1dc99f
diff --git a/tests/cleaner_tests/full_report_run.py b/tests/cleaner_tests/full_report_run.py
1dc99f
index 3b28e7a2..2de54946 100644
1dc99f
--- a/tests/cleaner_tests/full_report_run.py
1dc99f
+++ b/tests/cleaner_tests/full_report_run.py
1dc99f
@@ -35,6 +35,9 @@ class FullCleanTest(StageTwoReportTest):
1dc99f
     def test_tarball_named_obfuscated(self):
1dc99f
         self.assertTrue('obfuscated' in self.archive)
1dc99f
 
1dc99f
+    def test_archive_type_correct(self):
1dc99f
+        self.assertSosLogContains('Loaded .* as type sos report directory')
1dc99f
+
1dc99f
     def test_hostname_not_in_any_file(self):
1dc99f
         host = self.sysinfo['pre']['networking']['hostname']
1dc99f
         # much faster to just use grep here
1dc99f
diff --git a/tests/cleaner_tests/report_with_mask.py b/tests/cleaner_tests/report_with_mask.py
1dc99f
index 4f94ba33..08e873d4 100644
1dc99f
--- a/tests/cleaner_tests/report_with_mask.py
1dc99f
+++ b/tests/cleaner_tests/report_with_mask.py
1dc99f
@@ -31,6 +31,9 @@ class ReportWithMask(StageOneReportTest):
1dc99f
     def test_tarball_named_obfuscated(self):
1dc99f
         self.assertTrue('obfuscated' in self.archive)
1dc99f
 
1dc99f
+    def test_archive_type_correct(self):
1dc99f
+        self.assertSosLogContains('Loaded .* as type sos report directory')
1dc99f
+
1dc99f
     def test_localhost_was_obfuscated(self):
1dc99f
         self.assertFileHasContent('/etc/hostname', 'host0')
1dc99f
 
1dc99f
-- 
1dc99f
2.31.1
1dc99f
1dc99f
From 9b119f860eaec089f7ef884ff39c42589a662994 Mon Sep 17 00:00:00 2001
1dc99f
From: Jake Hunsaker <jhunsake@redhat.com>
1dc99f
Date: Wed, 1 Sep 2021 00:34:04 -0400
1dc99f
Subject: [PATCH] [hostname_map] Add a catch for single-character hostnames
1dc99f
1dc99f
If a log file was truncated at a specific boundary in a string of the
1dc99f
FQDN of the host such that we only get a couple characters before the
1dc99f
rest of the domain, we would previously boldly replace all instances of
1dc99f
that character with the obfuscated short name; not very helpful.
1dc99f
1dc99f
If this happens, don't sanitize the short name and instead
1dc99f
obfuscate the whole FQDN as 'unknown.example.com'.
1dc99f
1dc99f
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
1dc99f
---
1dc99f
 sos/cleaner/mappings/hostname_map.py | 9 ++++++++-
1dc99f
 1 file changed, 8 insertions(+), 1 deletion(-)
1dc99f
1dc99f
diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
1dc99f
index d4b2c88e..e70a5530 100644
1dc99f
--- a/sos/cleaner/mappings/hostname_map.py
1dc99f
+++ b/sos/cleaner/mappings/hostname_map.py
1dc99f
@@ -184,7 +184,14 @@ class SoSHostnameMap(SoSMap):
1dc99f
             hostname = host[0]
1dc99f
             domain = host[1:]
1dc99f
             # obfuscate the short name
1dc99f
-            ob_hostname = self.sanitize_short_name(hostname)
1dc99f
+            if len(hostname) > 2:
1dc99f
+                ob_hostname = self.sanitize_short_name(hostname)
1dc99f
+            else:
1dc99f
+                # by best practice it appears the host part of the fqdn was cut
1dc99f
+                # off due to some form of truncating, as such don't obfuscate
1dc99f
+                # short strings that are likely to throw off obfuscation of
1dc99f
+                # unrelated bits and paths
1dc99f
+                ob_hostname = 'unknown'
1dc99f
             ob_domain = self.sanitize_domain(domain)
1dc99f
             self.dataset[item] = ob_domain
1dc99f
             return '.'.join([ob_hostname, ob_domain])
1dc99f
-- 
1dc99f
2.31.1
1dc99f
1dc99f
From f3f3e763d7c31b7b7cafdf8dd4dab87056fb7696 Mon Sep 17 00:00:00 2001
1dc99f
From: Jake Hunsaker <jhunsake@redhat.com>
1dc99f
Date: Wed, 1 Sep 2021 15:54:55 -0400
1dc99f
Subject: [PATCH] [cleaner] Add support for Insights client archives
1dc99f
1dc99f
Adds a new type of `SoSObfuscationArchive` to add support for
1dc99f
obfuscating archives generated by the Insights project.
1dc99f
1dc99f
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
1dc99f
---
1dc99f
 man/en/sos-clean.1               |  1 +
1dc99f
 sos/cleaner/__init__.py          |  4 ++-
1dc99f
 sos/cleaner/archives/insights.py | 42 ++++++++++++++++++++++++++++++++
1dc99f
 3 files changed, 46 insertions(+), 1 deletion(-)
1dc99f
 create mode 100644 sos/cleaner/archives/insights.py
1dc99f
1dc99f
diff --git a/man/en/sos-clean.1 b/man/en/sos-clean.1
1dc99f
index 54026713..358ec0cb 100644
1dc99f
--- a/man/en/sos-clean.1
1dc99f
+++ b/man/en/sos-clean.1
1dc99f
@@ -105,6 +105,7 @@ The following are accepted values for this option:
1dc99f
     \fBauto\fR          Automatically detect the archive type
1dc99f
     \fBreport\fR        An archive generated by \fBsos report\fR
1dc99f
     \fBcollect\fR       An archive generated by \fBsos collect\fR
1dc99f
+    \fBinsights\fR      An archive generated by the \fBinsights-client\fR package
1dc99f
 
1dc99f
 The following may also be used, however note that these do not attempt to pre-load
1dc99f
 any information from the archives into the parsers. This means that, among other limitations,
1dc99f
diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
1dc99f
index 6d2eb483..3e08aa28 100644
1dc99f
--- a/sos/cleaner/__init__.py
1dc99f
+++ b/sos/cleaner/__init__.py
1dc99f
@@ -29,6 +29,7 @@ from sos.cleaner.archives.sos import (SoSReportArchive, SoSReportDirectory,
1dc99f
                                       SoSCollectorArchive,
1dc99f
                                       SoSCollectorDirectory)
1dc99f
 from sos.cleaner.archives.generic import DataDirArchive, TarballArchive
1dc99f
+from sos.cleaner.archives.insights import InsightsArchive
1dc99f
 from sos.utilities import get_human_readable
1dc99f
 from textwrap import fill
1dc99f
 
1dc99f
@@ -100,6 +101,7 @@ class SoSCleaner(SoSComponent):
1dc99f
             SoSReportArchive,
1dc99f
             SoSCollectorDirectory,
1dc99f
             SoSCollectorArchive,
1dc99f
+            InsightsArchive,
1dc99f
             # make sure these two are always last as they are fallbacks
1dc99f
             DataDirArchive,
1dc99f
             TarballArchive
1dc99f
@@ -194,7 +196,7 @@ third party.
1dc99f
                                help='The directory or archive to obfuscate')
1dc99f
         clean_grp.add_argument('--archive-type', default='auto',
1dc99f
                                choices=['auto', 'report', 'collect',
1dc99f
-                                        'data-dir', 'tarball'],
1dc99f
+                                        'insights', 'data-dir', 'tarball'],
1dc99f
                                help=('Specify what kind of archive the target '
1dc99f
                                      'was generated as'))
1dc99f
         clean_grp.add_argument('--domains', action='extend', default=[],
1dc99f
diff --git a/sos/cleaner/archives/insights.py b/sos/cleaner/archives/insights.py
1dc99f
new file mode 100644
1dc99f
index 00000000..dab48b16
1dc99f
--- /dev/null
1dc99f
+++ b/sos/cleaner/archives/insights.py
1dc99f
@@ -0,0 +1,42 @@
1dc99f
+# Copyright 2021 Red Hat, Inc. Jake Hunsaker <jhunsake@redhat.com>
1dc99f
+
1dc99f
+# This file is part of the sos project: https://github.com/sosreport/sos
1dc99f
+#
1dc99f
+# This copyrighted material is made available to anyone wishing to use,
1dc99f
+# modify, copy, or redistribute it subject to the terms and conditions of
1dc99f
+# version 2 of the GNU General Public License.
1dc99f
+#
1dc99f
+# See the LICENSE file in the source distribution for further information.
1dc99f
+
1dc99f
+
1dc99f
+from sos.cleaner.archives import SoSObfuscationArchive
1dc99f
+
1dc99f
+import tarfile
1dc99f
+
1dc99f
+
1dc99f
+class InsightsArchive(SoSObfuscationArchive):
1dc99f
+    """This class represents archives generated by the insights-client utility
1dc99f
+    for RHEL systems.
1dc99f
+    """
1dc99f
+
1dc99f
+    type_name = 'insights'
1dc99f
+    description = 'insights-client archive'
1dc99f
+
1dc99f
+    prep_files = {
1dc99f
+        'hostname': 'data/insights_commands/hostname_-f',
1dc99f
+        'ip': 'data/insights_commands/ip_addr',
1dc99f
+        'mac': 'data/insights_commands/ip_addr'
1dc99f
+    }
1dc99f
+
1dc99f
+    @classmethod
1dc99f
+    def check_is_type(cls, arc_path):
1dc99f
+        try:
1dc99f
+            return tarfile.is_tarfile(arc_path) and 'insights-' in arc_path
1dc99f
+        except Exception:
1dc99f
+            return False
1dc99f
+
1dc99f
+    def get_archive_root(self):
1dc99f
+        top = self.archive_path.split('/')[-1].split('.tar')[0]
1dc99f
+        if self.tarobj.firstmember.name == '.':
1dc99f
+            top = './' + top
1dc99f
+        return top
1dc99f
-- 
1dc99f
2.31.1
1dc99f
1dc99f
From 9639dc3d240076b55f2a1d04b43ea42bebd09215 Mon Sep 17 00:00:00 2001
1dc99f
From: Jake Hunsaker <jhunsake@redhat.com>
1dc99f
Date: Tue, 16 Nov 2021 17:50:42 -0500
1dc99f
Subject: [PATCH] [clean,hostname_parser] Source /etc/hosts for obfuscation
1dc99f
1dc99f
Up until now, our sourcing of hostnames/domains for obfuscation has been
1dc99f
dependent upon the output of the `hostname` command. However, some
1dc99f
scenarios have come up where sourcing `/etc/hosts` is advantageous for
1dc99f
several reasons:
1dc99f
1dc99f
First, if `hostname` output is unavailable, this provides a fallback
1dc99f
measure.
1dc99f
1dc99f
Second, `/etc/hosts` is a common place to have short names defined which
1dc99f
would otherwise not be detected (or at the very least would result in a
1dc99f
race condition based on where/if the short name was elsewhere able to be
1dc99f
gleaned from an FQDN), thus leaving the potential for unobfuscated data
1dc99f
in an archive.
1dc99f
1dc99f
Due to both the nature of hostname obfuscation and the malleable syntax
1dc99f
of `/etc/hosts`, the parsing of this file needs special handling not
1dc99f
covered by our more generic parsing and obfuscation methods.
1dc99f
1dc99f
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
1dc99f
---
1dc99f
 sos/cleaner/__init__.py                | 11 ++++++++---
1dc99f
 sos/cleaner/archives/sos.py            |  5 ++++-
1dc99f
 sos/cleaner/parsers/hostname_parser.py | 19 +++++++++++++++++++
1dc99f
 3 files changed, 31 insertions(+), 4 deletions(-)
1dc99f
1dc99f
diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
1dc99f
index ed461a8f..3f530d44 100644
1dc99f
--- a/sos/cleaner/__init__.py
1dc99f
+++ b/sos/cleaner/__init__.py
1dc99f
@@ -523,9 +523,14 @@ third party.
1dc99f
                         if isinstance(_parser, SoSUsernameParser):
1dc99f
                             _parser.load_usernames_into_map(content)
1dc99f
                         elif isinstance(_parser, SoSHostnameParser):
1dc99f
-                            _parser.load_hostname_into_map(
1dc99f
-                                content.splitlines()[0]
1dc99f
-                            )
1dc99f
+                            if 'hostname' in parse_file:
1dc99f
+                                _parser.load_hostname_into_map(
1dc99f
+                                    content.splitlines()[0]
1dc99f
+                                )
1dc99f
+                            elif 'etc/hosts' in parse_file:
1dc99f
+                                _parser.load_hostname_from_etc_hosts(
1dc99f
+                                    content
1dc99f
+                                )
1dc99f
                         else:
1dc99f
                             for line in content.splitlines():
1dc99f
                                 self.obfuscate_line(line)
1dc99f
diff --git a/sos/cleaner/archives/sos.py b/sos/cleaner/archives/sos.py
1dc99f
index 4401d710..f8720c88 100644
1dc99f
--- a/sos/cleaner/archives/sos.py
1dc99f
+++ b/sos/cleaner/archives/sos.py
1dc99f
@@ -23,7 +23,10 @@ class SoSReportArchive(SoSObfuscationArchive):
1dc99f
     type_name = 'report'
1dc99f
     description = 'sos report archive'
1dc99f
     prep_files = {
1dc99f
-        'hostname': 'sos_commands/host/hostname',
1dc99f
+        'hostname': [
1dc99f
+            'sos_commands/host/hostname',
1dc99f
+            'etc/hosts'
1dc99f
+        ],
1dc99f
         'ip': 'sos_commands/networking/ip_-o_addr',
1dc99f
         'mac': 'sos_commands/networking/ip_-d_address',
1dc99f
         'username': [
1dc99f
diff --git a/sos/cleaner/parsers/hostname_parser.py b/sos/cleaner/parsers/hostname_parser.py
1dc99f
index daa76a62..0a733bee 100644
1dc99f
--- a/sos/cleaner/parsers/hostname_parser.py
1dc99f
+++ b/sos/cleaner/parsers/hostname_parser.py
1dc99f
@@ -61,6 +61,25 @@ class SoSHostnameParser(SoSCleanerParser):
1dc99f
             self.mapping.add(high_domain)
1dc99f
         self.mapping.add(hostname_string)
1dc99f
 
1dc99f
+    def load_hostname_from_etc_hosts(self, content):
1dc99f
+        """Parse an archive's copy of /etc/hosts, which requires handling that
1dc99f
+        is separate from the output of the `hostname` command. Just like
1dc99f
+        load_hostname_into_map(), this has to be done explicitly and we
1dc99f
+        cannot rely upon the more generic methods to do this reliably.
1dc99f
+        """
1dc99f
+        lines = content.splitlines()
1dc99f
+        for line in lines:
1dc99f
+            if line.startswith('#') or 'localhost' in line:
1dc99f
+                continue
1dc99f
+            hostln = line.split()[1:]
1dc99f
+            for host in hostln:
1dc99f
+                if len(host.split('.')) == 1:
1dc99f
+                    # only generate a mapping for fqdns but still record the
1dc99f
+                    # short name here for later obfuscation with parse_line()
1dc99f
+                    self.short_names.append(host)
1dc99f
+                else:
1dc99f
+                    self.mapping.add(host)
1dc99f
+
1dc99f
     def parse_line(self, line):
1dc99f
         """Override the default parse_line() method to also check for the
1dc99f
         shortname of the host derived from the hostname.
1dc99f
-- 
1dc99f
2.31.1
1dc99f
1dc99f
From c1680226b53452b18f27f2e76c3e0e03e521f935 Mon Sep 17 00:00:00 2001
1dc99f
From: Jake Hunsaker <jhunsake@redhat.com>
1dc99f
Date: Wed, 17 Nov 2021 13:11:33 -0500
1dc99f
Subject: [PATCH] [clean, hostname] Fix unintentionally case sensitive
1dc99f
 shortname handling
1dc99f
1dc99f
It was discovered that our extra handling for shortnames was
1dc99f
unintentionally case sensitive. Fix this to ensure that shortnames are
1dc99f
obfuscated regardless of case in all collected text.
1dc99f
1dc99f
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
1dc99f
---
1dc99f
 sos/cleaner/mappings/hostname_map.py   |  6 +++---
1dc99f
 sos/cleaner/parsers/hostname_parser.py |  8 +++++---
1dc99f
 tests/cleaner_tests/full_report_run.py | 21 ++++++++++++++++++++-
1dc99f
 3 files changed, 28 insertions(+), 7 deletions(-)
1dc99f
1dc99f
diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
1dc99f
index e70a5530..0fe78fb1 100644
1dc99f
--- a/sos/cleaner/mappings/hostname_map.py
1dc99f
+++ b/sos/cleaner/mappings/hostname_map.py
1dc99f
@@ -169,13 +169,13 @@ class SoSHostnameMap(SoSMap):
1dc99f
 
1dc99f
     def sanitize_item(self, item):
1dc99f
         host = item.split('.')
1dc99f
-        if all([h.isupper() for h in host]):
1dc99f
+        if len(host) > 1 and all([h.isupper() for h in host]):
1dc99f
             # by convention we have just a domain
1dc99f
             _host = [h.lower() for h in host]
1dc99f
             return self.sanitize_domain(_host).upper()
1dc99f
         if len(host) == 1:
1dc99f
             # we have a shortname for a host
1dc99f
-            return self.sanitize_short_name(host[0])
1dc99f
+            return self.sanitize_short_name(host[0].lower())
1dc99f
         if len(host) == 2:
1dc99f
             # we have just a domain name, e.g. example.com
1dc99f
             return self.sanitize_domain(host)
1dc99f
@@ -185,7 +185,7 @@ class SoSHostnameMap(SoSMap):
1dc99f
             domain = host[1:]
1dc99f
             # obfuscate the short name
1dc99f
             if len(hostname) > 2:
1dc99f
-                ob_hostname = self.sanitize_short_name(hostname)
1dc99f
+                ob_hostname = self.sanitize_short_name(hostname.lower())
1dc99f
             else:
1dc99f
                 # by best practice it appears the host part of the fqdn was cut
1dc99f
                 # off due to some form of truncating, as such don't obfuscate
1dc99f
diff --git a/sos/cleaner/parsers/hostname_parser.py b/sos/cleaner/parsers/hostname_parser.py
1dc99f
index 0a733bee..7fd0e698 100644
1dc99f
--- a/sos/cleaner/parsers/hostname_parser.py
1dc99f
+++ b/sos/cleaner/parsers/hostname_parser.py
1dc99f
@@ -8,6 +8,8 @@
1dc99f
 #
1dc99f
 # See the LICENSE file in the source distribution for further information.
1dc99f
 
1dc99f
+import re
1dc99f
+
1dc99f
 from sos.cleaner.parsers import SoSCleanerParser
1dc99f
 from sos.cleaner.mappings.hostname_map import SoSHostnameMap
1dc99f
 
1dc99f
@@ -91,9 +93,9 @@ class SoSHostnameParser(SoSCleanerParser):
1dc99f
             """
1dc99f
             if search in self.mapping.skip_keys:
1dc99f
                 return ln, count
1dc99f
-            if search in ln:
1dc99f
-                count += ln.count(search)
1dc99f
-                ln = ln.replace(search, self.mapping.get(repl or search))
1dc99f
+            _reg = re.compile(search, re.I)
1dc99f
+            if _reg.search(ln):
1dc99f
+                return _reg.subn(self.mapping.get(repl or search), ln)
1dc99f
             return ln, count
1dc99f
 
1dc99f
         count = 0
1dc99f
diff --git a/tests/cleaner_tests/full_report_run.py b/tests/cleaner_tests/full_report_run.py
1dc99f
index 2de54946..0b23acaf 100644
1dc99f
--- a/tests/cleaner_tests/full_report_run.py
1dc99f
+++ b/tests/cleaner_tests/full_report_run.py
1dc99f
@@ -26,6 +26,24 @@ class FullCleanTest(StageTwoReportTest):
1dc99f
     # replace with an empty placeholder, make sure that this test case is not
1dc99f
     # influenced by previous clean runs
1dc99f
     files = ['/etc/sos/cleaner/default_mapping']
1dc99f
+    packages = {
1dc99f
+        'rhel': ['python3-systemd'],
1dc99f
+        'ubuntu': ['python3-systemd']
1dc99f
+    }
1dc99f
+
1dc99f
+    def pre_sos_setup(self):
1dc99f
+        # ensure that case-insensitive matching of FQDNs and shortnames works
1dc99f
+        from systemd import journal
1dc99f
+        from socket import gethostname
1dc99f
+        host = gethostname()
1dc99f
+        short = host.split('.')[0]
1dc99f
+        sosfd = journal.stream('sos-testing')
1dc99f
+        sosfd.write(
1dc99f
+            "This is a test line from sos clean testing. The hostname %s "
1dc99f
+            "should not appear, nor should %s in an obfuscated archive. The "
1dc99f
+            "shortnames of %s and %s should also not appear."
1dc99f
+            % (host.lower(), host.upper(), short.lower(), short.upper())
1dc99f
+        )
1dc99f
 
1dc99f
     def test_private_map_was_generated(self):
1dc99f
         self.assertOutputContains('A mapping of obfuscated elements is available at')
1dc99f
@@ -40,8 +58,9 @@ class FullCleanTest(StageTwoReportTest):
1dc99f
 
1dc99f
     def test_hostname_not_in_any_file(self):
1dc99f
         host = self.sysinfo['pre']['networking']['hostname']
1dc99f
+        short = host.split('.')[0]
1dc99f
         # much faster to just use grep here
1dc99f
-        content = self.grep_for_content(host)
1dc99f
+        content = self.grep_for_content(host) + self.grep_for_content(short)
1dc99f
         if not content:
1dc99f
             assert True
1dc99f
         else:
1dc99f
-- 
1dc99f
2.31.1
1dc99f
1dc99f
From aaeb8cb57ed55598ab744b96d4f127aedebcb292 Mon Sep 17 00:00:00 2001
1dc99f
From: Jake Hunsaker <jhunsake@redhat.com>
1dc99f
Date: Tue, 21 Sep 2021 15:23:20 -0400
1dc99f
Subject: [PATCH] [build] Add archives to setup.py packages
1dc99f
1dc99f
Adds the newly abstracted `sos.cleaner.archives` package to `setup.py`
1dc99f
so that manual builds will properly include it.
1dc99f
1dc99f
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
1dc99f
---
1dc99f
 setup.py | 2 +-
1dc99f
 1 file changed, 1 insertion(+), 1 deletion(-)
1dc99f
1dc99f
diff --git a/setup.py b/setup.py
1dc99f
index 1e8d8e2dc5..7653b59de3 100644
1dc99f
--- a/setup.py
1dc99f
+++ b/setup.py
1dc99f
@@ -102,7 +102,7 @@ def copy_file (self, filename, dirname):
1dc99f
         'sos.policies.package_managers', 'sos.policies.init_systems',
1dc99f
         'sos.report', 'sos.report.plugins', 'sos.collector',
1dc99f
         'sos.collector.clusters', 'sos.cleaner', 'sos.cleaner.mappings',
1dc99f
-        'sos.cleaner.parsers'
1dc99f
+        'sos.cleaner.parsers', 'sos.cleaner.archives'
1dc99f
     ],
1dc99f
     cmdclass=cmdclass,
1dc99f
     command_options=command_options,
1dc99f
-- 
1dc99f
2.31.1
1dc99f
1dc99f
From ba3528230256429a4394f155a9ca1fdb91cf3560 Mon Sep 17 00:00:00 2001
1dc99f
From: Jake Hunsaker <jhunsake@redhat.com>
1dc99f
Date: Tue, 30 Nov 2021 12:46:34 -0500
1dc99f
Subject: [PATCH 1/2] [hostname] Simplify case matching for domains
1dc99f
1dc99f
Instead of special handling all uppercase domain conventions, use our
1dc99f
normal flow for obfuscation and just match the casing at the end of the
1dc99f
sanitization routine.
1dc99f
1dc99f
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
1dc99f
---
1dc99f
 sos/cleaner/mappings/hostname_map.py | 14 ++++++++------
1dc99f
 1 file changed, 8 insertions(+), 6 deletions(-)
1dc99f
1dc99f
diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
1dc99f
index 0fe78fb1..5cd8e985 100644
1dc99f
--- a/sos/cleaner/mappings/hostname_map.py
1dc99f
+++ b/sos/cleaner/mappings/hostname_map.py
1dc99f
@@ -169,16 +169,15 @@ class SoSHostnameMap(SoSMap):
1dc99f
 
1dc99f
     def sanitize_item(self, item):
1dc99f
         host = item.split('.')
1dc99f
-        if len(host) > 1 and all([h.isupper() for h in host]):
1dc99f
-            # by convention we have just a domain
1dc99f
-            _host = [h.lower() for h in host]
1dc99f
-            return self.sanitize_domain(_host).upper()
1dc99f
         if len(host) == 1:
1dc99f
             # we have a shortname for a host
1dc99f
             return self.sanitize_short_name(host[0].lower())
1dc99f
         if len(host) == 2:
1dc99f
             # we have just a domain name, e.g. example.com
1dc99f
-            return self.sanitize_domain(host)
1dc99f
+            dname = self.sanitize_domain(host)
1dc99f
+            if all([h.isupper() for h in host]):
1dc99f
+                dname = dname.upper()
1dc99f
+            return dname
1dc99f
         if len(host) > 2:
1dc99f
             # we have an FQDN, e.g. foo.example.com
1dc99f
             hostname = host[0]
1dc99f
@@ -194,7 +193,10 @@ class SoSHostnameMap(SoSMap):
1dc99f
                 ob_hostname = 'unknown'
1dc99f
             ob_domain = self.sanitize_domain(domain)
1dc99f
             self.dataset[item] = ob_domain
1dc99f
-            return '.'.join([ob_hostname, ob_domain])
1dc99f
+            _fqdn = '.'.join([ob_hostname, ob_domain])
1dc99f
+            if all([h.isupper() for h in host]):
1dc99f
+                _fqdn = _fqdn.upper()
1dc99f
+            return _fqdn
1dc99f
 
1dc99f
     def sanitize_short_name(self, hostname):
1dc99f
         """Obfuscate the short name of the host with an incremented counter
1dc99f
-- 
1dc99f
2.31.1
1dc99f
1dc99f
1dc99f
From 189586728de22dd55122c1f7e06b19590f9a788f Mon Sep 17 00:00:00 2001
1dc99f
From: Jake Hunsaker <jhunsake@redhat.com>
1dc99f
Date: Tue, 30 Nov 2021 12:47:58 -0500
1dc99f
Subject: [PATCH 2/2] [username] Improve username sourcing and remove case
1dc99f
 sensitivity
1dc99f
1dc99f
First, don't skip the first line of `last` output, and instead add the
1dc99f
header from lastlog to the skip list. Additionally, add
1dc99f
`/etc/cron.allow` and `/etc/cron.deny` as sources for usernames that
1dc99f
might not appear in other locations in certain environments.
1dc99f
1dc99f
Also, make matching and replacement case insensitive.
1dc99f
1dc99f
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
1dc99f
---
1dc99f
 sos/cleaner/archives/sos.py            |  4 +++-
1dc99f
 sos/cleaner/mappings/username_map.py   |  2 +-
1dc99f
 sos/cleaner/parsers/username_parser.py | 14 +++++++++-----
1dc99f
 3 files changed, 13 insertions(+), 7 deletions(-)
1dc99f
1dc99f
diff --git a/sos/cleaner/archives/sos.py b/sos/cleaner/archives/sos.py
1dc99f
index f8720c88..12766496 100644
1dc99f
--- a/sos/cleaner/archives/sos.py
1dc99f
+++ b/sos/cleaner/archives/sos.py
1dc99f
@@ -35,7 +35,9 @@ class SoSReportArchive(SoSObfuscationArchive):
1dc99f
             'sos_commands/login/lastlog_-u_65537-4294967295',
1dc99f
             # AD users will be reported here, but favor the lastlog files since
1dc99f
             # those will include local users who have not logged in
1dc99f
-            'sos_commands/login/last'
1dc99f
+            'sos_commands/login/last',
1dc99f
+            'etc/cron.allow',
1dc99f
+            'etc/cron.deny'
1dc99f
         ]
1dc99f
     }
1dc99f
 
1dc99f
diff --git a/sos/cleaner/mappings/username_map.py b/sos/cleaner/mappings/username_map.py
1dc99f
index cdbf36fe..7ecccd7b 100644
1dc99f
--- a/sos/cleaner/mappings/username_map.py
1dc99f
+++ b/sos/cleaner/mappings/username_map.py
1dc99f
@@ -33,5 +33,5 @@ class SoSUsernameMap(SoSMap):
1dc99f
         ob_name = "obfuscateduser%s" % self.name_count
1dc99f
         self.name_count += 1
1dc99f
         if ob_name in self.dataset.values():
1dc99f
-            return self.sanitize_item(username)
1dc99f
+            return self.sanitize_item(username.lower())
1dc99f
         return ob_name
1dc99f
diff --git a/sos/cleaner/parsers/username_parser.py b/sos/cleaner/parsers/username_parser.py
1dc99f
index 35377a31..229c7de4 100644
1dc99f
--- a/sos/cleaner/parsers/username_parser.py
1dc99f
+++ b/sos/cleaner/parsers/username_parser.py
1dc99f
@@ -8,6 +8,7 @@
1dc99f
 #
1dc99f
 # See the LICENSE file in the source distribution for further information.
1dc99f
 
1dc99f
+import re
1dc99f
 
1dc99f
 from sos.cleaner.parsers import SoSCleanerParser
1dc99f
 from sos.cleaner.mappings.username_map import SoSUsernameMap
1dc99f
@@ -34,6 +35,7 @@ class SoSUsernameParser(SoSCleanerParser):
1dc99f
         'reboot',
1dc99f
         'root',
1dc99f
         'ubuntu',
1dc99f
+        'username',
1dc99f
         'wtmp'
1dc99f
     ]
1dc99f
 
1dc99f
@@ -47,12 +49,12 @@ class SoSUsernameParser(SoSCleanerParser):
1dc99f
         this parser, we need to override the initial parser prepping here.
1dc99f
         """
1dc99f
         users = set()
1dc99f
-        for line in content.splitlines()[1:]:
1dc99f
+        for line in content.splitlines():
1dc99f
             try:
1dc99f
                 user = line.split()[0]
1dc99f
             except Exception:
1dc99f
                 continue
1dc99f
-            if user in self.skip_list:
1dc99f
+            if user.lower() in self.skip_list:
1dc99f
                 continue
1dc99f
             users.add(user)
1dc99f
         for each in users:
1dc99f
@@ -61,7 +63,9 @@ class SoSUsernameParser(SoSCleanerParser):
1dc99f
     def parse_line(self, line):
1dc99f
         count = 0
1dc99f
         for username in sorted(self.mapping.dataset.keys(), reverse=True):
1dc99f
-            if username in line:
1dc99f
-                count = line.count(username)
1dc99f
-                line = line.replace(username, self.mapping.get(username))
1dc99f
+            _reg = re.compile(username, re.I)
1dc99f
+            if _reg.search(line):
1dc99f
+                line, count = _reg.subn(
1dc99f
+                    self.mapping.get(username.lower()), line
1dc99f
+                )
1dc99f
         return line, count
1dc99f
-- 
1dc99f
2.31.1
1dc99f
1dc99f
From cafd0f3a52436a3966576e7db21e5dd17c06f0cc Mon Sep 17 00:00:00 2001
1dc99f
From: Jake Hunsaker <jhunsake@redhat.com>
1dc99f
Date: Sun, 12 Dec 2021 11:10:46 -0500
1dc99f
Subject: [PATCH] [hostname] Fix edge case for new hosts in a known subdomain
1dc99f
1dc99f
Fixes an edge case where we would initially fail to recognize that a
1dc99f
given hostname string is a new host in a known subdomain, but then
1dc99f
properly recognize it as such on the obfuscation attempt, resulting in
1dc99f
an incomplete obfuscation.
1dc99f
1dc99f
This was mostly triggered by specific patterns for build hosts within
1dc99f
`sos_commands/rpm/package-data`. With this refined check, these types of
1dc99f
matches are properly obfuscated.
1dc99f
1dc99f
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
1dc99f
---
1dc99f
 sos/cleaner/mappings/hostname_map.py | 9 +++++----
1dc99f
 1 file changed, 5 insertions(+), 4 deletions(-)
1dc99f
1dc99f
diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
1dc99f
index 5cd8e9857..33b0e6c80 100644
1dc99f
--- a/sos/cleaner/mappings/hostname_map.py
1dc99f
+++ b/sos/cleaner/mappings/hostname_map.py
1dc99f
@@ -129,7 +129,7 @@ def get(self, item):
1dc99f
             item = item[0:-1]
1dc99f
         if not self.domain_name_in_loaded_domains(item.lower()):
1dc99f
             return item
1dc99f
-        if item.endswith(('.yaml', '.yml', '.crt', '.key', '.pem')):
1dc99f
+        if item.endswith(('.yaml', '.yml', '.crt', '.key', '.pem', '.log')):
1dc99f
             ext = '.' + item.split('.')[-1]
1dc99f
             item = item.replace(ext, '')
1dc99f
             suffix += ext
1dc99f
@@ -148,7 +148,8 @@ def get(self, item):
1dc99f
                 if len(_test) == 1 or not _test[0]:
1dc99f
                     # does not match existing obfuscation
1dc99f
                     continue
1dc99f
-                elif _test[0].endswith('.') and not _host_substr:
1dc99f
+                elif not _host_substr and (_test[0].endswith('.') or
1dc99f
+                                           item.endswith(_existing)):
1dc99f
                     # new hostname in known domain
1dc99f
                     final = super(SoSHostnameMap, self).get(item)
1dc99f
                     break
1dc99f
@@ -219,8 +220,8 @@ def sanitize_domain(self, domain):
1dc99f
             # don't obfuscate vendor domains
1dc99f
             if re.match(_skip, '.'.join(domain)):
1dc99f
                 return '.'.join(domain)
1dc99f
-        top_domain = domain[-1]
1dc99f
-        dname = '.'.join(domain[0:-1])
1dc99f
+        top_domain = domain[-1].lower()
1dc99f
+        dname = '.'.join(domain[0:-1]).lower()
1dc99f
         ob_domain = self._new_obfuscated_domain(dname)
1dc99f
         ob_domain = '.'.join([ob_domain, top_domain])
1dc99f
         self.dataset['.'.join(domain)] = ob_domain
9bf60c
From f5e1298162a9393ea2d9f5c4df40dfece50f5f88 Mon Sep 17 00:00:00 2001
9bf60c
From: Jake Hunsaker <jhunsake@redhat.com>
9bf60c
Date: Thu, 6 Jan 2022 13:15:15 -0500
9bf60c
Subject: [PATCH 1/3] [hostname] Fix loading and detection of long base domains
9bf60c
9bf60c
Our domain matching has up to now assumed that users would be providing
9bf60c
'base' domains such as 'example.com' whereby something like
9bf60c
'foo.bar.example.com' is a subdomain (or host) within that base domain.
9bf60c
9bf60c
However, the use case exists to provide 'foo.bar.example.com' as the
9bf60c
base domain, without wanting to obfuscate 'example.com' directly.
9bf60c
9bf60c
This commit fixes our handling of both loading these longer domains and
9bf60c
doing the 'domain is part of a domain we want to obfuscate' check.
9bf60c
9bf60c
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
9bf60c
---
9bf60c
 sos/cleaner/mappings/hostname_map.py | 9 ++++++++-
9bf60c
 1 file changed, 8 insertions(+), 1 deletion(-)
9bf60c
9bf60c
diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
9bf60c
index 33b0e6c8..7a7cf6b8 100644
9bf60c
--- a/sos/cleaner/mappings/hostname_map.py
9bf60c
+++ b/sos/cleaner/mappings/hostname_map.py
9bf60c
@@ -50,10 +50,14 @@ class SoSHostnameMap(SoSMap):
9bf60c
         in this parser, we need to re-inject entries from the map_file into
9bf60c
         these dicts and not just the underlying 'dataset' dict
9bf60c
         """
9bf60c
-        for domain in self.dataset:
9bf60c
+        for domain, ob_pair in self.dataset.items():
9bf60c
             if len(domain.split('.')) == 1:
9bf60c
                 self.hosts[domain.split('.')[0]] = self.dataset[domain]
9bf60c
             else:
9bf60c
+                if ob_pair.startswith('obfuscateddomain'):
9bf60c
+                    # exact domain match; load it directly
9bf60c
+                    self._domains[domain] = ob_pair.split('.')[0]
9bf60c
+                    continue
9bf60c
                 # strip the host name and trailing top-level domain so that
9bf60c
                 # we in inject the domain properly for later string matching
9bf60c
 
9bf60c
@@ -102,9 +106,12 @@ class SoSHostnameMap(SoSMap):
9bf60c
         and should be obfuscated
9bf60c
         """
9bf60c
         host = domain.split('.')
9bf60c
+        no_tld = '.'.join(domain.split('.')[0:-1])
9bf60c
         if len(host) == 1:
9bf60c
             # don't block on host's shortname
9bf60c
             return host[0] in self.hosts.keys()
9bf60c
+        elif any([no_tld.endswith(_d) for _d in self._domains]):
9bf60c
+            return True
9bf60c
         else:
9bf60c
             domain = host[0:-1]
9bf60c
             for known_domain in self._domains:
9bf60c
-- 
9bf60c
2.31.1
9bf60c
9bf60c
9bf60c
From e241cf33a14ecd4e848a5fd857c5d3d7d07fbd71 Mon Sep 17 00:00:00 2001
9bf60c
From: Jake Hunsaker <jhunsake@redhat.com>
9bf60c
Date: Thu, 6 Jan 2022 13:18:44 -0500
9bf60c
Subject: [PATCH 2/3] [cleaner] Improve parser-specific file skipping
9bf60c
9bf60c
This commit improves our handling of skipping files on a per-parser
9bf60c
basis, by first filtering the list of parsers that `obfuscate_line()`
9bf60c
will iterate over by the parser's `skip_file` class attr, rather than
9bf60c
relying on higher-level checks.
9bf60c
9bf60c
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
9bf60c
---
9bf60c
 sos/cleaner/__init__.py | 17 ++++++++++++++---
9bf60c
 1 file changed, 14 insertions(+), 3 deletions(-)
9bf60c
9bf60c
diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
9bf60c
index 3f530d44..5686e213 100644
9bf60c
--- a/sos/cleaner/__init__.py
9bf60c
+++ b/sos/cleaner/__init__.py
9bf60c
@@ -12,6 +12,7 @@ import hashlib
9bf60c
 import json
9bf60c
 import logging
9bf60c
 import os
9bf60c
+import re
9bf60c
 import shutil
9bf60c
 import tempfile
9bf60c
 
9bf60c
@@ -640,10 +641,16 @@ third party.
9bf60c
             self.log_debug("Obfuscating %s" % short_name or filename,
9bf60c
                            caller=arc_name)
9bf60c
             tfile = tempfile.NamedTemporaryFile(mode='w', dir=self.tmpdir)
9bf60c
+            _parsers = [
9bf60c
+                _p for _p in self.parsers if not
9bf60c
+                any([
9bf60c
+                    re.match(p, short_name) for p in _p.skip_files
9bf60c
+                ])
9bf60c
+            ]
9bf60c
             with open(filename, 'r') as fname:
9bf60c
                 for line in fname:
9bf60c
                     try:
9bf60c
-                        line, count = self.obfuscate_line(line)
9bf60c
+                        line, count = self.obfuscate_line(line, _parsers)
9bf60c
                         subs += count
9bf60c
                         tfile.write(line)
9bf60c
                     except Exception as err:
9bf60c
@@ -713,7 +720,7 @@ third party.
9bf60c
                 pass
9bf60c
         return string_data
9bf60c
 
9bf60c
-    def obfuscate_line(self, line):
9bf60c
+    def obfuscate_line(self, line, parsers=None):
9bf60c
         """Run a line through each of the obfuscation parsers, keeping a
9bf60c
         cumulative total of substitutions done on that particular line.
9bf60c
 
9bf60c
@@ -721,6 +728,8 @@ third party.
9bf60c
 
9bf60c
             :param line str:        The raw line as read from the file being
9bf60c
                                     processed
9bf60c
+            :param parsers:         A list of parser objects to obfuscate
9bf60c
+                                    with. If None, use all.
9bf60c
 
9bf60c
         Returns the fully obfuscated line and the number of substitutions made
9bf60c
         """
9bf60c
@@ -729,7 +738,9 @@ third party.
9bf60c
         count = 0
9bf60c
         if not line.strip():
9bf60c
             return line, count
9bf60c
-        for parser in self.parsers:
9bf60c
+        if parsers is None:
9bf60c
+            parsers = self.parsers
9bf60c
+        for parser in parsers:
9bf60c
             try:
9bf60c
                 line, _count = parser.parse_line(line)
9bf60c
                 count += _count
9bf60c
-- 
9bf60c
2.31.1
9bf60c
9bf60c
9bf60c
From 96c9a833e77639a853b7d3d6f1df68bbbbe5e9cb Mon Sep 17 00:00:00 2001
9bf60c
From: Jake Hunsaker <jhunsake@redhat.com>
9bf60c
Date: Thu, 6 Jan 2022 13:20:32 -0500
9bf60c
Subject: [PATCH 3/3] [cleaner] Add skips for known files and usernames
9bf60c
9bf60c
Adds skips for `/proc/kallsyms` which should never be obfuscated, as
9bf60c
well as any packaging-related log file for the IP parser. Further, do
9bf60c
not obfuscate the `stack` user, as that is a well-known user for many
9bf60c
configurations that, if obfuscated, could result in undesired string
9bf60c
substitutions in normal logging.
9bf60c
9bf60c
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
9bf60c
---
9bf60c
 sos/cleaner/archives/__init__.py       | 2 ++
9bf60c
 sos/cleaner/parsers/ip_parser.py       | 3 ++-
9bf60c
 sos/cleaner/parsers/username_parser.py | 1 +
9bf60c
 3 files changed, 5 insertions(+), 1 deletion(-)
9bf60c
9bf60c
diff --git a/sos/cleaner/archives/__init__.py b/sos/cleaner/archives/__init__.py
9bf60c
index 795c5a78..cbf1f809 100644
9bf60c
--- a/sos/cleaner/archives/__init__.py
9bf60c
+++ b/sos/cleaner/archives/__init__.py
9bf60c
@@ -43,6 +43,7 @@ class SoSObfuscationArchive():
9bf60c
     type_name = 'undetermined'
9bf60c
     description = 'undetermined'
9bf60c
     is_nested = False
9bf60c
+    skip_files = []
9bf60c
     prep_files = {}
9bf60c
 
9bf60c
     def __init__(self, archive_path, tmpdir):
9bf60c
@@ -111,6 +112,7 @@ class SoSObfuscationArchive():
9bf60c
         Returns: list of files and file regexes
9bf60c
         """
9bf60c
         return [
9bf60c
+            'proc/kallsyms',
9bf60c
             'sosreport-',
9bf60c
             'sys/firmware',
9bf60c
             'sys/fs',
9bf60c
diff --git a/sos/cleaner/parsers/ip_parser.py b/sos/cleaner/parsers/ip_parser.py
9bf60c
index 71d38be8..b007368c 100644
9bf60c
--- a/sos/cleaner/parsers/ip_parser.py
9bf60c
+++ b/sos/cleaner/parsers/ip_parser.py
9bf60c
@@ -37,7 +37,8 @@ class SoSIPParser(SoSCleanerParser):
9bf60c
         'sos_commands/snappy/snap_list_--all',
9bf60c
         'sos_commands/snappy/snap_--version',
9bf60c
         'sos_commands/vulkan/vulkaninfo',
9bf60c
-        'var/log/.*dnf.*'
9bf60c
+        'var/log/.*dnf.*',
9bf60c
+        'var/log/.*packag.*'  # get 'packages' and 'packaging' logs
9bf60c
     ]
9bf60c
 
9bf60c
     map_file_key = 'ip_map'
9bf60c
diff --git a/sos/cleaner/parsers/username_parser.py b/sos/cleaner/parsers/username_parser.py
9bf60c
index 229c7de4..3208a655 100644
9bf60c
--- a/sos/cleaner/parsers/username_parser.py
9bf60c
+++ b/sos/cleaner/parsers/username_parser.py
9bf60c
@@ -32,6 +32,7 @@ class SoSUsernameParser(SoSCleanerParser):
9bf60c
         'nobody',
9bf60c
         'nfsnobody',
9bf60c
         'shutdown',
9bf60c
+        'stack',
9bf60c
         'reboot',
9bf60c
         'root',
9bf60c
         'ubuntu',
9bf60c
-- 
9bf60c
2.31.1
9bf60c