Blame SOURCES/sos-bz2023867-cleaner-hostnames-improvements.patch

9a3f62
From decd39b7799a0579ea085b0da0728b6eabd49b38 Mon Sep 17 00:00:00 2001
9a3f62
From: Jake Hunsaker <jhunsake@redhat.com>
9a3f62
Date: Wed, 1 Sep 2021 00:28:58 -0400
9a3f62
Subject: [PATCH] [clean] Provide archive abstractions to obfuscate more than
9a3f62
 sos archives
9a3f62
9a3f62
This commit removes the restriction imposed on `sos clean` since its
9a3f62
introduction in sos-4.0 to only work against known sos report archives
9a3f62
or build directories. This is because there has been interest in using
9a3f62
the obfuscation bits of sos in other data-collector projects.
9a3f62
9a3f62
The `SoSObfuscationArchive()` class has been revamped to now be an
9a3f62
abstraction for different types of archives, and the cleaner logic has
9a3f62
been updated to leverage this new abstraction rather than assuming we're
9a3f62
working on an sos archive.
9a3f62
9a3f62
Abstractions are added for our own native use cases - that being `sos
9a3f62
report` and `sos collect` for at-runtime obfuscation, as well as
9a3f62
standalone archives previously generated. Further generic abstractions
9a3f62
are available for plain directories and tarballs however these will not
9a3f62
provide the same level of coverage as fully supported archive types, as
9a3f62
is noted in the manpage for sos-clean.
9a3f62
9a3f62
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
9a3f62
---
9a3f62
 man/en/sos-clean.1                            |  25 ++
9a3f62
 sos/cleaner/__init__.py                       | 308 +++++++++---------
9a3f62
 .../__init__.py}                              |  80 ++++-
9a3f62
 sos/cleaner/archives/generic.py               |  52 +++
9a3f62
 sos/cleaner/archives/sos.py                   | 106 ++++++
9a3f62
 sos/cleaner/parsers/__init__.py               |   6 -
9a3f62
 sos/cleaner/parsers/hostname_parser.py        |   1 -
9a3f62
 sos/cleaner/parsers/ip_parser.py              |   1 -
9a3f62
 sos/cleaner/parsers/keyword_parser.py         |   1 -
9a3f62
 sos/cleaner/parsers/mac_parser.py             |   1 -
9a3f62
 sos/cleaner/parsers/username_parser.py        |   8 -
9a3f62
 tests/cleaner_tests/existing_archive.py       |   7 +
9a3f62
 tests/cleaner_tests/full_report_run.py        |   3 +
9a3f62
 tests/cleaner_tests/report_with_mask.py       |   3 +
9a3f62
 14 files changed, 423 insertions(+), 179 deletions(-)
9a3f62
 rename sos/cleaner/{obfuscation_archive.py => archives/__init__.py} (81%)
9a3f62
 create mode 100644 sos/cleaner/archives/generic.py
9a3f62
 create mode 100644 sos/cleaner/archives/sos.py
9a3f62
9a3f62
diff --git a/man/en/sos-clean.1 b/man/en/sos-clean.1
9a3f62
index b77bc63c..54026713 100644
9a3f62
--- a/man/en/sos-clean.1
9a3f62
+++ b/man/en/sos-clean.1
9a3f62
@@ -10,6 +10,7 @@ sos clean - Obfuscate sensitive data from one or more sosreports
9a3f62
     [\-\-jobs]
9a3f62
     [\-\-no-update]
9a3f62
     [\-\-keep-binary-files]
9a3f62
+    [\-\-archive-type]
9a3f62
 
9a3f62
 .SH DESCRIPTION
9a3f62
 \fBsos clean\fR or \fBsos mask\fR is an sos subcommand used to obfuscate sensitive information from
9a3f62
@@ -88,6 +89,30 @@ Users should review any archive that keeps binary files in place before sending
9a3f62
 a third party.
9a3f62
 
9a3f62
 Default: False (remove encountered binary files)
9a3f62
+.TP
9a3f62
+.B \-\-archive-type TYPE
9a3f62
+Specify the type of archive that TARGET was generated as.
9a3f62
+When sos inspects a TARGET archive, it tries to identify what type of archive it is.
9a3f62
+For example, it may be a report generated by \fBsos report\fR, or a collection of those
9a3f62
+reports generated by \fBsos collect\fR, which require separate approaches.
9a3f62
+
9a3f62
+This option may be useful if a given TARGET archive is known to be of a specific type
9a3f62
+but, due to unknown reasons or some malformed/missing information in the archive itself,
9a3f62
+is not properly identified by sos.
9a3f62
+
9a3f62
+The following are accepted values for this option:
9a3f62
+
9a3f62
+    \fBauto\fR          Automatically detect the archive type
9a3f62
+    \fBreport\fR        An archive generated by \fBsos report\fR
9a3f62
+    \fBcollect\fR       An archive generated by \fBsos collect\fR
9a3f62
+
9a3f62
+The following may also be used; however, note that these do not attempt to pre-load
9a3f62
+any information from the archives into the parsers. This means that, among other limitations,
9a3f62
+items like host and domain names may not be obfuscated unless an obfuscation mapping already exists
9a3f62
+on the system from a previous execution.
9a3f62
+
9a3f62
+    \fBdata-dir\fR      A plain directory on the filesystem.
9a3f62
+    \fBtarball\fR       A generic tar archive not associated with any known tool
9a3f62
 
9a3f62
 .SH SEE ALSO
9a3f62
 .BR sos (1)
9a3f62
diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
9a3f62
index 6aadfe79..6d2eb483 100644
9a3f62
--- a/sos/cleaner/__init__.py
9a3f62
+++ b/sos/cleaner/__init__.py
9a3f62
@@ -12,9 +12,7 @@ import hashlib
9a3f62
 import json
9a3f62
 import logging
9a3f62
 import os
9a3f62
-import re
9a3f62
 import shutil
9a3f62
-import tarfile
9a3f62
 import tempfile
9a3f62
 
9a3f62
 from concurrent.futures import ThreadPoolExecutor
9a3f62
@@ -27,7 +25,10 @@ from sos.cleaner.parsers.mac_parser import SoSMacParser
9a3f62
 from sos.cleaner.parsers.hostname_parser import SoSHostnameParser
9a3f62
 from sos.cleaner.parsers.keyword_parser import SoSKeywordParser
9a3f62
 from sos.cleaner.parsers.username_parser import SoSUsernameParser
9a3f62
-from sos.cleaner.obfuscation_archive import SoSObfuscationArchive
9a3f62
+from sos.cleaner.archives.sos import (SoSReportArchive, SoSReportDirectory,
9a3f62
+                                      SoSCollectorArchive,
9a3f62
+                                      SoSCollectorDirectory)
9a3f62
+from sos.cleaner.archives.generic import DataDirArchive, TarballArchive
9a3f62
 from sos.utilities import get_human_readable
9a3f62
 from textwrap import fill
9a3f62
 
9a3f62
@@ -41,6 +42,7 @@ class SoSCleaner(SoSComponent):
9a3f62
     desc = "Obfuscate sensitive networking information in a report"
9a3f62
 
9a3f62
     arg_defaults = {
9a3f62
+        'archive_type': 'auto',
9a3f62
         'domains': [],
9a3f62
         'jobs': 4,
9a3f62
         'keywords': [],
9a3f62
@@ -70,6 +72,7 @@ class SoSCleaner(SoSComponent):
9a3f62
             self.from_cmdline = False
9a3f62
             if not hasattr(self.opts, 'jobs'):
9a3f62
                 self.opts.jobs = 4
9a3f62
+            self.opts.archive_type = 'auto'
9a3f62
             self.soslog = logging.getLogger('sos')
9a3f62
             self.ui_log = logging.getLogger('sos_ui')
9a3f62
             # create the tmp subdir here to avoid a potential race condition
9a3f62
@@ -92,6 +95,17 @@ class SoSCleaner(SoSComponent):
9a3f62
             SoSUsernameParser(self.cleaner_mapping, self.opts.usernames)
9a3f62
         ]
9a3f62
 
9a3f62
+        self.archive_types = [
9a3f62
+            SoSReportDirectory,
9a3f62
+            SoSReportArchive,
9a3f62
+            SoSCollectorDirectory,
9a3f62
+            SoSCollectorArchive,
9a3f62
+            # make sure these two are always last as they are fallbacks
9a3f62
+            DataDirArchive,
9a3f62
+            TarballArchive
9a3f62
+        ]
9a3f62
+        self.nested_archive = None
9a3f62
+
9a3f62
         self.log_info("Cleaner initialized. From cmdline: %s"
9a3f62
                       % self.from_cmdline)
9a3f62
 
9a3f62
@@ -178,6 +192,11 @@ third party.
9a3f62
         )
9a3f62
         clean_grp.add_argument('target', metavar='TARGET',
9a3f62
                                help='The directory or archive to obfuscate')
9a3f62
+        clean_grp.add_argument('--archive-type', default='auto',
9a3f62
+                               choices=['auto', 'report', 'collect',
9a3f62
+                                        'data-dir', 'tarball'],
9a3f62
+                               help=('Specify what kind of archive the target '
9a3f62
+                                     'was generated as'))
9a3f62
         clean_grp.add_argument('--domains', action='extend', default=[],
9a3f62
                                help='List of domain names to obfuscate')
9a3f62
         clean_grp.add_argument('-j', '--jobs', default=4, type=int,
9a3f62
@@ -218,59 +237,28 @@ third party.
9a3f62
 
9a3f62
         In the event the target path is not an archive, abort.
9a3f62
         """
9a3f62
-        if not tarfile.is_tarfile(self.opts.target):
9a3f62
-            self.ui_log.error(
9a3f62
-                "Invalid target: must be directory or tar archive"
9a3f62
-            )
9a3f62
-            self._exit(1)
9a3f62
-
9a3f62
-        archive = tarfile.open(self.opts.target)
9a3f62
-        self.arc_name = self.opts.target.split('/')[-1].split('.')[:-2][0]
9a3f62
-
9a3f62
-        try:
9a3f62
-            archive.getmember(os.path.join(self.arc_name, 'sos_logs'))
9a3f62
-        except Exception:
9a3f62
-            # this is not an sos archive
9a3f62
-            self.ui_log.error("Invalid target: not an sos archive")
9a3f62
-            self._exit(1)
9a3f62
-
9a3f62
-        # see if there are archives within this archive
9a3f62
-        nested_archives = []
9a3f62
-        for _file in archive.getmembers():
9a3f62
-            if (re.match('sosreport-.*.tar', _file.name.split('/')[-1]) and not
9a3f62
-                    (_file.name.endswith(('.md5', '.sha256')))):
9a3f62
-                nested_archives.append(_file.name.split('/')[-1])
9a3f62
-
9a3f62
-        if nested_archives:
9a3f62
-            self.log_info("Found nested archive(s), extracting top level")
9a3f62
-            nested_path = self.extract_archive(archive)
9a3f62
-            for arc_file in os.listdir(nested_path):
9a3f62
-                if re.match('sosreport.*.tar.*', arc_file):
9a3f62
-                    if arc_file.endswith(('.md5', '.sha256')):
9a3f62
-                        continue
9a3f62
-                    self.report_paths.append(os.path.join(nested_path,
9a3f62
-                                                          arc_file))
9a3f62
-            # add the toplevel extracted archive
9a3f62
-            self.report_paths.append(nested_path)
9a3f62
+        _arc = None
9a3f62
+        if self.opts.archive_type != 'auto':
9a3f62
+            check_type = self.opts.archive_type.replace('-', '_')
9a3f62
+            for archive in self.archive_types:
9a3f62
+                if archive.type_name == check_type:
9a3f62
+                    _arc = archive(self.opts.target, self.tmpdir)
9a3f62
         else:
9a3f62
-            self.report_paths.append(self.opts.target)
9a3f62
-
9a3f62
-        archive.close()
9a3f62
-
9a3f62
-    def extract_archive(self, archive):
9a3f62
-        """Extract an archive into our tmpdir so that we may inspect it or
9a3f62
-        iterate through its contents for obfuscation
9a3f62
-
9a3f62
-        Positional arguments:
9a3f62
-
9a3f62
-            :param archive:     An open TarFile object for the archive
9a3f62
-
9a3f62
-        """
9a3f62
-        if not isinstance(archive, tarfile.TarFile):
9a3f62
-            archive = tarfile.open(archive)
9a3f62
-        path = os.path.join(self.tmpdir, 'cleaner')
9a3f62
-        archive.extractall(path)
9a3f62
-        return os.path.join(path, archive.name.split('/')[-1].split('.tar')[0])
9a3f62
+            for arc in self.archive_types:
9a3f62
+                if arc.check_is_type(self.opts.target):
9a3f62
+                    _arc = arc(self.opts.target, self.tmpdir)
9a3f62
+                    break
9a3f62
+        if not _arc:
9a3f62
+            return
9a3f62
+        self.report_paths.append(_arc)
9a3f62
+        if _arc.is_nested:
9a3f62
+            self.report_paths.extend(_arc.get_nested_archives())
9a3f62
+            # We need to preserve the top level archive until all
9a3f62
+            # nested archives are processed
9a3f62
+            self.report_paths.remove(_arc)
9a3f62
+            self.nested_archive = _arc
9a3f62
+        if self.nested_archive:
9a3f62
+            self.nested_archive.ui_name = self.nested_archive.description
9a3f62
 
9a3f62
     def execute(self):
9a3f62
         """SoSCleaner will begin by inspecting the TARGET option to determine
9a3f62
@@ -283,6 +271,7 @@ third party.
9a3f62
         be unpacked, cleaned, and repacked and the final top-level archive will
9a3f62
         then be repacked as well.
9a3f62
         """
9a3f62
+        self.arc_name = self.opts.target.split('/')[-1].split('.tar')[0]
9a3f62
         if self.from_cmdline:
9a3f62
             self.print_disclaimer()
9a3f62
         self.report_paths = []
9a3f62
@@ -290,23 +279,11 @@ third party.
9a3f62
             self.ui_log.error("Invalid target: no such file or directory %s"
9a3f62
                               % self.opts.target)
9a3f62
             self._exit(1)
9a3f62
-        if os.path.isdir(self.opts.target):
9a3f62
-            self.arc_name = self.opts.target.split('/')[-1]
9a3f62
-            for _file in os.listdir(self.opts.target):
9a3f62
-                if _file == 'sos_logs':
9a3f62
-                    self.report_paths.append(self.opts.target)
9a3f62
-                if (_file.startswith('sosreport') and
9a3f62
-                   (_file.endswith(".tar.gz") or _file.endswith(".tar.xz"))):
9a3f62
-                    self.report_paths.append(os.path.join(self.opts.target,
9a3f62
-                                                          _file))
9a3f62
-            if not self.report_paths:
9a3f62
-                self.ui_log.error("Invalid target: not an sos directory")
9a3f62
-                self._exit(1)
9a3f62
-        else:
9a3f62
-            self.inspect_target_archive()
9a3f62
+
9a3f62
+        self.inspect_target_archive()
9a3f62
 
9a3f62
         if not self.report_paths:
9a3f62
-            self.ui_log.error("No valid sos archives or directories found\n")
9a3f62
+            self.ui_log.error("No valid archives or directories found\n")
9a3f62
             self._exit(1)
9a3f62
 
9a3f62
         # we have at least one valid target to obfuscate
9a3f62
@@ -334,33 +311,7 @@ third party.
9a3f62
 
9a3f62
         final_path = None
9a3f62
         if len(self.completed_reports) > 1:
9a3f62
-            # we have an archive of archives, so repack the obfuscated tarball
9a3f62
-            arc_name = self.arc_name + '-obfuscated'
9a3f62
-            self.setup_archive(name=arc_name)
9a3f62
-            for arc in self.completed_reports:
9a3f62
-                if arc.is_tarfile:
9a3f62
-                    arc_dest = self.obfuscate_string(
9a3f62
-                        arc.final_archive_path.split('/')[-1]
9a3f62
-                    )
9a3f62
-                    self.archive.add_file(arc.final_archive_path,
9a3f62
-                                          dest=arc_dest)
9a3f62
-                    checksum = self.get_new_checksum(arc.final_archive_path)
9a3f62
-                    if checksum is not None:
9a3f62
-                        dname = self.obfuscate_string(
9a3f62
-                            "checksums/%s.%s" % (arc_dest, self.hash_name)
9a3f62
-                        )
9a3f62
-                        self.archive.add_string(checksum, dest=dname)
9a3f62
-                else:
9a3f62
-                    for dirname, dirs, files in os.walk(arc.archive_path):
9a3f62
-                        for filename in files:
9a3f62
-                            if filename.startswith('sosreport'):
9a3f62
-                                continue
9a3f62
-                            fname = os.path.join(dirname, filename)
9a3f62
-                            dnm = self.obfuscate_string(
9a3f62
-                                fname.split(arc.archive_name)[-1].lstrip('/')
9a3f62
-                            )
9a3f62
-                            self.archive.add_file(fname, dest=dnm)
9a3f62
-            arc_path = self.archive.finalize(self.opts.compression_type)
9a3f62
+            arc_path = self.rebuild_nested_archive()
9a3f62
         else:
9a3f62
             arc = self.completed_reports[0]
9a3f62
             arc_path = arc.final_archive_path
9a3f62
@@ -371,8 +322,7 @@ third party.
9a3f62
                 )
9a3f62
                 with open(os.path.join(self.sys_tmp, chksum_name), 'w') as cf:
9a3f62
                     cf.write(checksum)
9a3f62
-
9a3f62
-        self.write_cleaner_log()
9a3f62
+            self.write_cleaner_log()
9a3f62
 
9a3f62
         final_path = self.obfuscate_string(
9a3f62
             os.path.join(self.sys_tmp, arc_path.split('/')[-1])
9a3f62
@@ -393,6 +343,30 @@ third party.
9a3f62
 
9a3f62
         self.cleanup()
9a3f62
 
9a3f62
+    def rebuild_nested_archive(self):
9a3f62
+        """Handles repacking the nested tarball, now containing only obfuscated
9a3f62
+        copies of the reports, log files, manifest, etc...
9a3f62
+        """
9a3f62
+        # we have an archive of archives, so repack the obfuscated tarball
9a3f62
+        arc_name = self.arc_name + '-obfuscated'
9a3f62
+        self.setup_archive(name=arc_name)
9a3f62
+        for archive in self.completed_reports:
9a3f62
+            arc_dest = archive.final_archive_path.split('/')[-1]
9a3f62
+            checksum = self.get_new_checksum(archive.final_archive_path)
9a3f62
+            if checksum is not None:
9a3f62
+                dname = "checksums/%s.%s" % (arc_dest, self.hash_name)
9a3f62
+                self.archive.add_string(checksum, dest=dname)
9a3f62
+        for dirn, dirs, files in os.walk(self.nested_archive.extracted_path):
9a3f62
+            for filename in files:
9a3f62
+                fname = os.path.join(dirn, filename)
9a3f62
+                dname = fname.split(self.nested_archive.extracted_path)[-1]
9a3f62
+                dname = dname.lstrip('/')
9a3f62
+                self.archive.add_file(fname, dest=dname)
9a3f62
+                # remove it now so we don't balloon our fs space needs
9a3f62
+                os.remove(fname)
9a3f62
+        self.write_cleaner_log(archive=True)
9a3f62
+        return self.archive.finalize(self.opts.compression_type)
9a3f62
+
9a3f62
     def compile_mapping_dict(self):
9a3f62
         """Build a dict that contains each parser's map as a key, with the
9a3f62
         contents as that key's value. This will then be written to disk in the
9a3f62
@@ -441,7 +415,7 @@ third party.
9a3f62
                 self.log_error("Could not update mapping config file: %s"
9a3f62
                                % err)
9a3f62
 
9a3f62
-    def write_cleaner_log(self):
9a3f62
+    def write_cleaner_log(self, archive=False):
9a3f62
         """When invoked via the command line, the logging from SoSCleaner will
9a3f62
         not be added to the archive(s) it processes, so we need to write it
9a3f62
         separately to disk
9a3f62
@@ -454,6 +428,10 @@ third party.
9a3f62
             for line in self.sos_log_file.readlines():
9a3f62
                 logfile.write(line)
9a3f62
 
9a3f62
+        if archive:
9a3f62
+            self.obfuscate_file(log_name)
9a3f62
+            self.archive.add_file(log_name, dest="sos_logs/cleaner.log")
9a3f62
+
9a3f62
     def get_new_checksum(self, archive_path):
9a3f62
         """Calculate a new checksum for the obfuscated archive, as the previous
9a3f62
         checksum will no longer be valid
9a3f62
@@ -481,11 +459,11 @@ third party.
9a3f62
         be obfuscated concurrently.
9a3f62
         """
9a3f62
         try:
9a3f62
-            if len(self.report_paths) > 1:
9a3f62
-                msg = ("Found %s total reports to obfuscate, processing up to "
9a3f62
-                       "%s concurrently\n"
9a3f62
-                       % (len(self.report_paths), self.opts.jobs))
9a3f62
-                self.ui_log.info(msg)
9a3f62
+            msg = (
9a3f62
+                "Found %s total reports to obfuscate, processing up to %s "
9a3f62
+                "concurrently\n" % (len(self.report_paths), self.opts.jobs)
9a3f62
+            )
9a3f62
+            self.ui_log.info(msg)
9a3f62
             if self.opts.keep_binary_files:
9a3f62
                 self.ui_log.warning(
9a3f62
                     "WARNING: binary files that potentially contain sensitive "
9a3f62
@@ -494,53 +472,67 @@ third party.
9a3f62
             pool = ThreadPoolExecutor(self.opts.jobs)
9a3f62
             pool.map(self.obfuscate_report, self.report_paths, chunksize=1)
9a3f62
             pool.shutdown(wait=True)
9a3f62
+            # finally, obfuscate the nested archive if one exists
9a3f62
+            if self.nested_archive:
9a3f62
+                self._replace_obfuscated_archives()
9a3f62
+                self.obfuscate_report(self.nested_archive)
9a3f62
         except KeyboardInterrupt:
9a3f62
             self.ui_log.info("Exiting on user cancel")
9a3f62
             os._exit(130)
9a3f62
 
9a3f62
+    def _replace_obfuscated_archives(self):
9a3f62
+        """When we have a nested archive, we need to rebuild the original
9a3f62
+        archive, which entails replacing the existing archives with their
9a3f62
+        obfuscated counterparts
9a3f62
+        """
9a3f62
+        for archive in self.completed_reports:
9a3f62
+            os.remove(archive.archive_path)
9a3f62
+            dest = self.nested_archive.extracted_path
9a3f62
+            tarball = archive.final_archive_path.split('/')[-1]
9a3f62
+            dest_name = os.path.join(dest, tarball)
9a3f62
+            shutil.move(archive.final_archive_path, dest)
9a3f62
+            archive.final_archive_path = dest_name
9a3f62
+
9a3f62
     def preload_all_archives_into_maps(self):
9a3f62
         """Before doing the actual obfuscation, if we have multiple archives
9a3f62
         to obfuscate then we need to preload each of them into the mappings
9a3f62
         to ensure that node1 is obfuscated in node2 as well as node2 being
9a3f62
         obfuscated in node1's archive.
9a3f62
         """
9a3f62
-        self.log_info("Pre-loading multiple archives into obfuscation maps")
9a3f62
+        self.log_info("Pre-loading all archives into obfuscation maps")
9a3f62
         for _arc in self.report_paths:
9a3f62
-            is_dir = os.path.isdir(_arc)
9a3f62
-            if is_dir:
9a3f62
-                _arc_name = _arc
9a3f62
-            else:
9a3f62
-                archive = tarfile.open(_arc)
9a3f62
-                _arc_name = _arc.split('/')[-1].split('.tar')[0]
9a3f62
-            # for each parser, load the map_prep_file into memory, and then
9a3f62
-            # send that for obfuscation. We don't actually obfuscate the file
9a3f62
-            # here, do that in the normal archive loop
9a3f62
             for _parser in self.parsers:
9a3f62
-                if not _parser.prep_map_file:
9a3f62
+                try:
9a3f62
+                    pfile = _arc.prep_files[_parser.name.lower().split()[0]]
9a3f62
+                    if not pfile:
9a3f62
+                        continue
9a3f62
+                except (IndexError, KeyError):
9a3f62
                     continue
9a3f62
-                if isinstance(_parser.prep_map_file, str):
9a3f62
-                    _parser.prep_map_file = [_parser.prep_map_file]
9a3f62
-                for parse_file in _parser.prep_map_file:
9a3f62
-                    _arc_path = os.path.join(_arc_name, parse_file)
9a3f62
+                if isinstance(pfile, str):
9a3f62
+                    pfile = [pfile]
9a3f62
+                for parse_file in pfile:
9a3f62
+                    self.log_debug("Attempting to load %s" % parse_file)
9a3f62
                     try:
9a3f62
-                        if is_dir:
9a3f62
-                            _pfile = open(_arc_path, 'r')
9a3f62
-                            content = _pfile.read()
9a3f62
-                        else:
9a3f62
-                            _pfile = archive.extractfile(_arc_path)
9a3f62
-                            content = _pfile.read().decode('utf-8')
9a3f62
-                        _pfile.close()
9a3f62
+                        content = _arc.get_file_content(parse_file)
9a3f62
+                        if not content:
9a3f62
+                            continue
9a3f62
                         if isinstance(_parser, SoSUsernameParser):
9a3f62
                             _parser.load_usernames_into_map(content)
9a3f62
-                        for line in content.splitlines():
9a3f62
-                            if isinstance(_parser, SoSHostnameParser):
9a3f62
-                                _parser.load_hostname_into_map(line)
9a3f62
-                            self.obfuscate_line(line)
9a3f62
+                        elif isinstance(_parser, SoSHostnameParser):
9a3f62
+                            _parser.load_hostname_into_map(
9a3f62
+                                content.splitlines()[0]
9a3f62
+                            )
9a3f62
+                        else:
9a3f62
+                            for line in content.splitlines():
9a3f62
+                                self.obfuscate_line(line)
9a3f62
                     except Exception as err:
9a3f62
-                        self.log_debug("Could not prep %s: %s"
9a3f62
-                                       % (_arc_path, err))
9a3f62
+                        self.log_info(
9a3f62
+                            "Could not prepare %s from %s (archive: %s): %s"
9a3f62
+                            % (_parser.name, parse_file, _arc.archive_name,
9a3f62
+                               err)
9a3f62
+                        )
9a3f62
 
9a3f62
-    def obfuscate_report(self, report):
9a3f62
+    def obfuscate_report(self, archive):
9a3f62
         """Individually handle each archive or directory we've discovered by
9a3f62
         running through each file therein.
9a3f62
 
9a3f62
@@ -549,17 +541,12 @@ third party.
9a3f62
             :param report str:      Filepath to the directory or archive
9a3f62
         """
9a3f62
         try:
9a3f62
-            if not os.access(report, os.W_OK):
9a3f62
-                msg = "Insufficient permissions on %s" % report
9a3f62
-                self.log_info(msg)
9a3f62
-                self.ui_log.error(msg)
9a3f62
-                return
9a3f62
-
9a3f62
-            archive = SoSObfuscationArchive(report, self.tmpdir)
9a3f62
             arc_md = self.cleaner_md.add_section(archive.archive_name)
9a3f62
             start_time = datetime.now()
9a3f62
             arc_md.add_field('start_time', start_time)
9a3f62
-            archive.extract()
9a3f62
+            # don't double extract nested archives
9a3f62
+            if not archive.is_extracted:
9a3f62
+                archive.extract()
9a3f62
             archive.report_msg("Beginning obfuscation...")
9a3f62
 
9a3f62
             file_list = archive.get_file_list()
9a3f62
@@ -586,27 +573,28 @@ third party.
9a3f62
                               caller=archive.archive_name)
9a3f62
 
9a3f62
             # if the archive was already a tarball, repack it
9a3f62
-            method = archive.get_compression()
9a3f62
-            if method:
9a3f62
-                archive.report_msg("Re-compressing...")
9a3f62
-                try:
9a3f62
-                    archive.rename_top_dir(
9a3f62
-                        self.obfuscate_string(archive.archive_name)
9a3f62
-                    )
9a3f62
-                    archive.compress(method)
9a3f62
-                except Exception as err:
9a3f62
-                    self.log_debug("Archive %s failed to compress: %s"
9a3f62
-                                   % (archive.archive_name, err))
9a3f62
-                    archive.report_msg("Failed to re-compress archive: %s"
9a3f62
-                                       % err)
9a3f62
-                    return
9a3f62
+            if not archive.is_nested:
9a3f62
+                method = archive.get_compression()
9a3f62
+                if method:
9a3f62
+                    archive.report_msg("Re-compressing...")
9a3f62
+                    try:
9a3f62
+                        archive.rename_top_dir(
9a3f62
+                            self.obfuscate_string(archive.archive_name)
9a3f62
+                        )
9a3f62
+                        archive.compress(method)
9a3f62
+                    except Exception as err:
9a3f62
+                        self.log_debug("Archive %s failed to compress: %s"
9a3f62
+                                       % (archive.archive_name, err))
9a3f62
+                        archive.report_msg("Failed to re-compress archive: %s"
9a3f62
+                                           % err)
9a3f62
+                        return
9a3f62
+                self.completed_reports.append(archive)
9a3f62
 
9a3f62
             end_time = datetime.now()
9a3f62
             arc_md.add_field('end_time', end_time)
9a3f62
             arc_md.add_field('run_time', end_time - start_time)
9a3f62
             arc_md.add_field('files_obfuscated', len(archive.file_sub_list))
9a3f62
             arc_md.add_field('total_substitutions', archive.total_sub_count)
9a3f62
-            self.completed_reports.append(archive)
9a3f62
             rmsg = ''
9a3f62
             if archive.removed_file_count:
9a3f62
                 rmsg = " [removed %s unprocessable files]"
9a3f62
@@ -615,7 +603,7 @@ third party.
9a3f62
 
9a3f62
         except Exception as err:
9a3f62
             self.ui_log.info("Exception while processing %s: %s"
9a3f62
-                             % (report, err))
9a3f62
+                             % (archive.archive_name, err))
9a3f62
 
9a3f62
     def obfuscate_file(self, filename, short_name=None, arc_name=None):
9a3f62
         """Obfuscate and individual file, line by line.
9a3f62
@@ -635,6 +623,8 @@ third party.
9a3f62
             # the requested file doesn't exist in the archive
9a3f62
             return
9a3f62
         subs = 0
9a3f62
+        if not short_name:
9a3f62
+            short_name = filename.split('/')[-1]
9a3f62
         if not os.path.islink(filename):
9a3f62
             # don't run the obfuscation on the link, but on the actual file
9a3f62
             # at some other point.
9a3f62
@@ -745,3 +735,5 @@ third party.
9a3f62
         for parser in self.parsers:
9a3f62
             _sec = parse_sec.add_section(parser.name.replace(' ', '_').lower())
9a3f62
             _sec.add_field('entries', len(parser.mapping.dataset.keys()))
9a3f62
+
9a3f62
+# vim: set et ts=4 sw=4 :
9a3f62
diff --git a/sos/cleaner/obfuscation_archive.py b/sos/cleaner/archives/__init__.py
9a3f62
similarity index 81%
9a3f62
rename from sos/cleaner/obfuscation_archive.py
9a3f62
rename to sos/cleaner/archives/__init__.py
9a3f62
index ea0b7012..795c5a78 100644
9a3f62
--- a/sos/cleaner/obfuscation_archive.py
9a3f62
+++ b/sos/cleaner/archives/__init__.py
9a3f62
@@ -40,6 +40,10 @@ class SoSObfuscationArchive():
9a3f62
     file_sub_list = []
9a3f62
     total_sub_count = 0
9a3f62
     removed_file_count = 0
9a3f62
+    type_name = 'undetermined'
9a3f62
+    description = 'undetermined'
9a3f62
+    is_nested = False
9a3f62
+    prep_files = {}
9a3f62
 
9a3f62
     def __init__(self, archive_path, tmpdir):
9a3f62
         self.archive_path = archive_path
9a3f62
@@ -50,7 +54,43 @@ class SoSObfuscationArchive():
9a3f62
         self.soslog = logging.getLogger('sos')
9a3f62
         self.ui_log = logging.getLogger('sos_ui')
9a3f62
         self.skip_list = self._load_skip_list()
9a3f62
-        self.log_info("Loaded %s as an archive" % self.archive_path)
9a3f62
+        self.is_extracted = False
9a3f62
+        self._load_self()
9a3f62
+        self.archive_root = ''
9a3f62
+        self.log_info(
9a3f62
+            "Loaded %s as type %s"
9a3f62
+            % (self.archive_path, self.description)
9a3f62
+        )
9a3f62
+
9a3f62
+    @classmethod
9a3f62
+    def check_is_type(cls, arc_path):
9a3f62
+        """Check if the archive is a well-known type we directly support"""
9a3f62
+        return False
9a3f62
+
9a3f62
+    def _load_self(self):
9a3f62
+        if self.is_tarfile:
9a3f62
+            self.tarobj = tarfile.open(self.archive_path)
9a3f62
+
9a3f62
+    def get_nested_archives(self):
9a3f62
+        """Return a list of ObfuscationArchives that represent additional
9a3f62
+        archives found within the target archive. For example, an archive from
9a3f62
+        `sos collect` will return a list of ``SoSReportArchive`` objects.
9a3f62
+
9a3f62
+        This should be overridden by individual types of ObfuscationArchive's
9a3f62
+        """
9a3f62
+        return []
9a3f62
+
9a3f62
+    def get_archive_root(self):
9a3f62
+        """Set the root path for the archive that should be prepended to any
9a3f62
+        filenames given to methods in this class.
9a3f62
+        """
9a3f62
+        if self.is_tarfile:
9a3f62
+            toplevel = self.tarobj.firstmember
9a3f62
+            if toplevel.isdir():
9a3f62
+                return toplevel.name
9a3f62
+            else:
9a3f62
+                return os.sep
9a3f62
+        return os.path.abspath(self.archive_path)
9a3f62
 
9a3f62
     def report_msg(self, msg):
9a3f62
         """Helper to easily format ui messages on a per-report basis"""
9a3f62
@@ -96,10 +136,42 @@ class SoSObfuscationArchive():
9a3f62
             os.remove(full_fname)
9a3f62
             self.removed_file_count += 1
9a3f62
 
9a3f62
-    def extract(self):
9a3f62
+    def format_file_name(self, fname):
9a3f62
+        """Based on the type of archive we're dealing with, do whatever that
9a3f62
+        archive requires to a provided **relative** filepath to be able to
9a3f62
+        access it within the archive
9a3f62
+        """
9a3f62
+        if not self.is_extracted:
9a3f62
+            if not self.archive_root:
9a3f62
+                self.archive_root = self.get_archive_root()
9a3f62
+            return os.path.join(self.archive_root, fname)
9a3f62
+        else:
9a3f62
+            return os.path.join(self.extracted_path, fname)
9a3f62
+
9a3f62
+    def get_file_content(self, fname):
9a3f62
+        """Return the content from the specified fname. Particularly useful for
9a3f62
+        tarball-type archives so we can retrieve prep file contents prior to
9a3f62
+        extracting the entire archive
9a3f62
+        """
9a3f62
+        if self.is_extracted is False and self.is_tarfile:
9a3f62
+            filename = self.format_file_name(fname)
9a3f62
+            try:
9a3f62
+                return self.tarobj.extractfile(filename).read().decode('utf-8')
9a3f62
+            except KeyError:
9a3f62
+                self.log_debug(
9a3f62
+                    "Unable to retrieve %s: no such file in archive" % fname
9a3f62
+                )
9a3f62
+                return ''
9a3f62
+        else:
9a3f62
+            with open(self.format_file_name(fname), 'r') as to_read:
9a3f62
+                return to_read.read()
9a3f62
+
9a3f62
+    def extract(self, quiet=False):
9a3f62
         if self.is_tarfile:
9a3f62
-            self.report_msg("Extracting...")
9a3f62
+            if not quiet:
9a3f62
+                self.report_msg("Extracting...")
9a3f62
             self.extracted_path = self.extract_self()
9a3f62
+            self.is_extracted = True
9a3f62
         else:
9a3f62
             self.extracted_path = self.archive_path
9a3f62
         # if we're running as non-root (e.g. collector), then we can have a
9a3f62
@@ -317,3 +389,5 @@ class SoSObfuscationArchive():
9a3f62
                 return False
9a3f62
             except UnicodeDecodeError:
9a3f62
                 return True
9a3f62
+
9a3f62
+# vim: set et ts=4 sw=4 :
9a3f62
diff --git a/sos/cleaner/archives/generic.py b/sos/cleaner/archives/generic.py
9a3f62
new file mode 100644
9a3f62
index 00000000..2ce6f09b
9a3f62
--- /dev/null
9a3f62
+++ b/sos/cleaner/archives/generic.py
9a3f62
@@ -0,0 +1,52 @@
9a3f62
+# Copyright 2020 Red Hat, Inc. Jake Hunsaker <jhunsake@redhat.com>
9a3f62
+
9a3f62
+# This file is part of the sos project: https://github.com/sosreport/sos
9a3f62
+#
9a3f62
+# This copyrighted material is made available to anyone wishing to use,
9a3f62
+# modify, copy, or redistribute it subject to the terms and conditions of
9a3f62
+# version 2 of the GNU General Public License.
9a3f62
+#
9a3f62
+# See the LICENSE file in the source distribution for further information.
9a3f62
+
9a3f62
+
9a3f62
+from sos.cleaner.archives import SoSObfuscationArchive
9a3f62
+
9a3f62
+import os
9a3f62
+import tarfile
9a3f62
+
9a3f62
+
9a3f62
+class DataDirArchive(SoSObfuscationArchive):
9a3f62
+    """A plain directory on the filesystem that is not directly associated with
9a3f62
+    any known or supported collection utility
9a3f62
+    """
9a3f62
+
9a3f62
+    type_name = 'data_dir'
9a3f62
+    description = 'unassociated directory'
9a3f62
+
9a3f62
+    @classmethod
9a3f62
+    def check_is_type(cls, arc_path):
9a3f62
+        return os.path.isdir(arc_path)
9a3f62
+
9a3f62
+    def set_archive_root(self):
9a3f62
+        return os.path.abspath(self.archive_path)
9a3f62
+
9a3f62
+
9a3f62
+class TarballArchive(SoSObfuscationArchive):
9a3f62
+    """A generic tar archive that is not associated with any known or supported
9a3f62
+    collection utility
9a3f62
+    """
9a3f62
+
9a3f62
+    type_name = 'tarball'
9a3f62
+    description = 'unassociated tarball'
9a3f62
+
9a3f62
+    @classmethod
9a3f62
+    def check_is_type(cls, arc_path):
9a3f62
+        try:
9a3f62
+            return tarfile.is_tarfile(arc_path)
9a3f62
+        except Exception:
9a3f62
+            return False
9a3f62
+
9a3f62
+    def set_archive_root(self):
9a3f62
+        if self.tarobj.firstmember.isdir():
9a3f62
+            return self.tarobj.firstmember.name
9a3f62
+        return ''
9a3f62
diff --git a/sos/cleaner/archives/sos.py b/sos/cleaner/archives/sos.py
9a3f62
new file mode 100644
9a3f62
index 00000000..4401d710
9a3f62
--- /dev/null
9a3f62
+++ b/sos/cleaner/archives/sos.py
9a3f62
@@ -0,0 +1,106 @@
9a3f62
+# Copyright 2021 Red Hat, Inc. Jake Hunsaker <jhunsake@redhat.com>
9a3f62
+
9a3f62
+# This file is part of the sos project: https://github.com/sosreport/sos
9a3f62
+#
9a3f62
+# This copyrighted material is made available to anyone wishing to use,
9a3f62
+# modify, copy, or redistribute it subject to the terms and conditions of
9a3f62
+# version 2 of the GNU General Public License.
9a3f62
+#
9a3f62
+# See the LICENSE file in the source distribution for further information.
9a3f62
+
9a3f62
+
9a3f62
+from sos.cleaner.archives import SoSObfuscationArchive
9a3f62
+
9a3f62
+import os
9a3f62
+import tarfile
9a3f62
+
9a3f62
+
9a3f62
+class SoSReportArchive(SoSObfuscationArchive):
9a3f62
+    """This is the class representing an sos report, or in other words the
9a3f62
+    type the archive the SoS project natively generates
9a3f62
+    """
9a3f62
+
9a3f62
+    type_name = 'report'
9a3f62
+    description = 'sos report archive'
9a3f62
+    prep_files = {
9a3f62
+        'hostname': 'sos_commands/host/hostname',
9a3f62
+        'ip': 'sos_commands/networking/ip_-o_addr',
9a3f62
+        'mac': 'sos_commands/networking/ip_-d_address',
9a3f62
+        'username': [
9a3f62
+            'sos_commands/login/lastlog_-u_1000-60000',
9a3f62
+            'sos_commands/login/lastlog_-u_60001-65536',
9a3f62
+            'sos_commands/login/lastlog_-u_65537-4294967295',
9a3f62
+            # AD users will be reported here, but favor the lastlog files since
9a3f62
+            # those will include local users who have not logged in
9a3f62
+            'sos_commands/login/last'
9a3f62
+        ]
9a3f62
+    }
9a3f62
+
9a3f62
+    @classmethod
9a3f62
+    def check_is_type(cls, arc_path):
9a3f62
+        try:
9a3f62
+            return tarfile.is_tarfile(arc_path) and 'sosreport-' in arc_path
9a3f62
+        except Exception:
9a3f62
+            return False
9a3f62
+
9a3f62
+
9a3f62
+class SoSReportDirectory(SoSReportArchive):
9a3f62
+    """This is the archive class representing a build directory, or in other
9a3f62
+    words what `sos report --clean` will end up using for in-line obfuscation
9a3f62
+    """
9a3f62
+
9a3f62
+    type_name = 'report_dir'
9a3f62
+    description = 'sos report directory'
9a3f62
+
9a3f62
+    @classmethod
9a3f62
+    def check_is_type(cls, arc_path):
9a3f62
+        if os.path.isdir(arc_path):
9a3f62
+            return 'sos_logs' in os.listdir(arc_path)
9a3f62
+        return False
9a3f62
+
9a3f62
+
9a3f62
+class SoSCollectorArchive(SoSObfuscationArchive):
9a3f62
+    """Archive class representing the tarball created by ``sos collect``. It
9a3f62
+    will not provide prep files on its own, however it will provide a list
9a3f62
+    of SoSReportArchive's which will then be used to prep the parsers
9a3f62
+    """
9a3f62
+
9a3f62
+    type_name = 'collect'
9a3f62
+    description = 'sos collect tarball'
9a3f62
+    is_nested = True
9a3f62
+
9a3f62
+    @classmethod
9a3f62
+    def check_is_type(cls, arc_path):
9a3f62
+        try:
9a3f62
+            return (tarfile.is_tarfile(arc_path) and 'sos-collect' in arc_path)
9a3f62
+        except Exception:
9a3f62
+            return False
9a3f62
+
9a3f62
+    def get_nested_archives(self):
9a3f62
+        self.extract(quiet=True)
9a3f62
+        _path = self.extracted_path
9a3f62
+        archives = []
9a3f62
+        for fname in os.listdir(_path):
9a3f62
+            arc_name = os.path.join(_path, fname)
9a3f62
+            if 'sosreport-' in fname and tarfile.is_tarfile(arc_name):
9a3f62
+                archives.append(SoSReportArchive(arc_name, self.tmpdir))
9a3f62
+        return archives
9a3f62
+
9a3f62
+
9a3f62
+class SoSCollectorDirectory(SoSCollectorArchive):
9a3f62
+    """The archive class representing the temp directory used by ``sos
9a3f62
+    collect`` when ``--clean`` is used during runtime.
9a3f62
+    """
9a3f62
+
9a3f62
+    type_name = 'collect_dir'
9a3f62
+    description = 'sos collect directory'
9a3f62
+
9a3f62
+    @classmethod
9a3f62
+    def check_is_type(cls, arc_path):
9a3f62
+        if os.path.isdir(arc_path):
9a3f62
+            for fname in os.listdir(arc_path):
9a3f62
+                if 'sos-collector-' in fname:
9a3f62
+                    return True
9a3f62
+        return False
9a3f62
+
9a3f62
+# vim: set et ts=4 sw=4 :
9a3f62
diff --git a/sos/cleaner/parsers/__init__.py b/sos/cleaner/parsers/__init__.py
9a3f62
index af6e375e..e62fd938 100644
9a3f62
--- a/sos/cleaner/parsers/__init__.py
9a3f62
+++ b/sos/cleaner/parsers/__init__.py
9a3f62
@@ -37,11 +37,6 @@ class SoSCleanerParser():
9a3f62
     :cvar map_file_key: The key in the ``map_file`` to read when loading
9a3f62
                         previous obfuscation matches
9a3f62
     :vartype map_file_key: ``str``
9a3f62
-
9a3f62
-
9a3f62
-    :cvar prep_map_file: File to read from an archive to pre-seed the map with
9a3f62
-                         matches. E.G. ip_addr for loading IP addresses
9a3f62
-    :vartype prep_map_fie: ``str``
9a3f62
     """
9a3f62
 
9a3f62
     name = 'Undefined Parser'
9a3f62
@@ -49,7 +44,6 @@ class SoSCleanerParser():
9a3f62
     skip_line_patterns = []
9a3f62
     skip_files = []
9a3f62
     map_file_key = 'unset'
9a3f62
-    prep_map_file = []
9a3f62
 
9a3f62
     def __init__(self, config={}):
9a3f62
         if self.map_file_key in config:
9a3f62
diff --git a/sos/cleaner/parsers/hostname_parser.py b/sos/cleaner/parsers/hostname_parser.py
9a3f62
index 71e13d3f..daa76a62 100644
9a3f62
--- a/sos/cleaner/parsers/hostname_parser.py
9a3f62
+++ b/sos/cleaner/parsers/hostname_parser.py
9a3f62
@@ -16,7 +16,6 @@ class SoSHostnameParser(SoSCleanerParser):
9a3f62
 
9a3f62
     name = 'Hostname Parser'
9a3f62
     map_file_key = 'hostname_map'
9a3f62
-    prep_map_file = 'sos_commands/host/hostname'
9a3f62
     regex_patterns = [
9a3f62
         r'(((\b|_)[a-zA-Z0-9-\.]{1,200}\.[a-zA-Z]{1,63}(\b|_)))'
9a3f62
     ]
9a3f62
diff --git a/sos/cleaner/parsers/ip_parser.py b/sos/cleaner/parsers/ip_parser.py
9a3f62
index 525139e8..71d38be8 100644
9a3f62
--- a/sos/cleaner/parsers/ip_parser.py
9a3f62
+++ b/sos/cleaner/parsers/ip_parser.py
9a3f62
@@ -41,7 +41,6 @@ class SoSIPParser(SoSCleanerParser):
9a3f62
     ]
9a3f62
 
9a3f62
     map_file_key = 'ip_map'
9a3f62
-    prep_map_file = 'sos_commands/networking/ip_-o_addr'
9a3f62
 
9a3f62
     def __init__(self, config):
9a3f62
         self.mapping = SoSIPMap()
9a3f62
diff --git a/sos/cleaner/parsers/keyword_parser.py b/sos/cleaner/parsers/keyword_parser.py
9a3f62
index 68de3727..694c6073 100644
9a3f62
--- a/sos/cleaner/parsers/keyword_parser.py
9a3f62
+++ b/sos/cleaner/parsers/keyword_parser.py
9a3f62
@@ -20,7 +20,6 @@ class SoSKeywordParser(SoSCleanerParser):
9a3f62
 
9a3f62
     name = 'Keyword Parser'
9a3f62
     map_file_key = 'keyword_map'
9a3f62
-    prep_map_file = ''
9a3f62
 
9a3f62
     def __init__(self, config, keywords=None, keyword_file=None):
9a3f62
         self.mapping = SoSKeywordMap()
9a3f62
diff --git a/sos/cleaner/parsers/mac_parser.py b/sos/cleaner/parsers/mac_parser.py
9a3f62
index 7ca80b8d..c74288cf 100644
9a3f62
--- a/sos/cleaner/parsers/mac_parser.py
9a3f62
+++ b/sos/cleaner/parsers/mac_parser.py
9a3f62
@@ -30,7 +30,6 @@ class SoSMacParser(SoSCleanerParser):
9a3f62
         '534f:53'
9a3f62
     )
9a3f62
     map_file_key = 'mac_map'
9a3f62
-    prep_map_file = 'sos_commands/networking/ip_-d_address'
9a3f62
 
9a3f62
     def __init__(self, config):
9a3f62
         self.mapping = SoSMacMap()
9a3f62
diff --git a/sos/cleaner/parsers/username_parser.py b/sos/cleaner/parsers/username_parser.py
9a3f62
index b142e371..35377a31 100644
9a3f62
--- a/sos/cleaner/parsers/username_parser.py
9a3f62
+++ b/sos/cleaner/parsers/username_parser.py
9a3f62
@@ -25,14 +25,6 @@ class SoSUsernameParser(SoSCleanerParser):
9a3f62
 
9a3f62
     name = 'Username Parser'
9a3f62
     map_file_key = 'username_map'
9a3f62
-    prep_map_file = [
9a3f62
-        'sos_commands/login/lastlog_-u_1000-60000',
9a3f62
-        'sos_commands/login/lastlog_-u_60001-65536',
9a3f62
-        'sos_commands/login/lastlog_-u_65537-4294967295',
9a3f62
-        # AD users will be reported here, but favor the lastlog files since
9a3f62
-        # those will include local users who have not logged in
9a3f62
-        'sos_commands/login/last'
9a3f62
-    ]
9a3f62
     regex_patterns = []
9a3f62
     skip_list = [
9a3f62
         'core',
9a3f62
diff --git a/tests/cleaner_tests/existing_archive.py b/tests/cleaner_tests/existing_archive.py
9a3f62
index 0eaf6c8d..e13d1cae 100644
9a3f62
--- a/tests/cleaner_tests/existing_archive.py
9a3f62
+++ b/tests/cleaner_tests/existing_archive.py
9a3f62
@@ -28,6 +28,13 @@ class ExistingArchiveCleanTest(StageTwoReportTest):
9a3f62
     def test_obfuscation_log_created(self):
9a3f62
         self.assertFileExists(os.path.join(self.tmpdir, '%s-obfuscation.log' % ARCHIVE))
9a3f62
 
9a3f62
+    def test_archive_type_correct(self):
9a3f62
+        with open(os.path.join(self.tmpdir, '%s-obfuscation.log' % ARCHIVE), 'r') as log:
9a3f62
+            for line in log:
9a3f62
+                if "Loaded %s" % ARCHIVE in line:
9a3f62
+                    assert 'as type sos report archive' in line, "Incorrect archive type detected: %s" % line
9a3f62
+                    break
9a3f62
+
9a3f62
     def test_from_cmdline_logged(self):
9a3f62
         with open(os.path.join(self.tmpdir, '%s-obfuscation.log' % ARCHIVE), 'r') as log:
9a3f62
             for line in log:
9a3f62
diff --git a/tests/cleaner_tests/full_report_run.py b/tests/cleaner_tests/full_report_run.py
9a3f62
index 3b28e7a2..2de54946 100644
9a3f62
--- a/tests/cleaner_tests/full_report_run.py
9a3f62
+++ b/tests/cleaner_tests/full_report_run.py
9a3f62
@@ -35,6 +35,9 @@ class FullCleanTest(StageTwoReportTest):
9a3f62
     def test_tarball_named_obfuscated(self):
9a3f62
         self.assertTrue('obfuscated' in self.archive)
9a3f62
 
9a3f62
+    def test_archive_type_correct(self):
9a3f62
+        self.assertSosLogContains('Loaded .* as type sos report directory')
9a3f62
+
9a3f62
     def test_hostname_not_in_any_file(self):
9a3f62
         host = self.sysinfo['pre']['networking']['hostname']
9a3f62
         # much faster to just use grep here
9a3f62
diff --git a/tests/cleaner_tests/report_with_mask.py b/tests/cleaner_tests/report_with_mask.py
9a3f62
index 4f94ba33..08e873d4 100644
9a3f62
--- a/tests/cleaner_tests/report_with_mask.py
9a3f62
+++ b/tests/cleaner_tests/report_with_mask.py
9a3f62
@@ -31,6 +31,9 @@ class ReportWithMask(StageOneReportTest):
9a3f62
     def test_tarball_named_obfuscated(self):
9a3f62
         self.assertTrue('obfuscated' in self.archive)
9a3f62
 
9a3f62
+    def test_archive_type_correct(self):
9a3f62
+        self.assertSosLogContains('Loaded .* as type sos report directory')
9a3f62
+
9a3f62
     def test_localhost_was_obfuscated(self):
9a3f62
         self.assertFileHasContent('/etc/hostname', 'host0')
9a3f62
 
9a3f62
-- 
9a3f62
2.31.1
9a3f62
9a3f62
From 9b119f860eaec089f7ef884ff39c42589a662994 Mon Sep 17 00:00:00 2001
9a3f62
From: Jake Hunsaker <jhunsake@redhat.com>
9a3f62
Date: Wed, 1 Sep 2021 00:34:04 -0400
9a3f62
Subject: [PATCH] [hostname_map] Add a catch for single-character hostnames
9a3f62
9a3f62
If a log file was truncated at a specific boundary in a string of the
9a3f62
FQDN of the host such that we only get a couple characters before the
9a3f62
rest of the domain, we would previously boldly replace all instances of
9a3f62
that character with the obfuscated short name; not very helpful.
9a3f62
9a3f62
In that case, don't sanitize the short name and instead
9a3f62
obfuscate the whole FQDN as 'unknown.example.com'.
9a3f62
9a3f62
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
9a3f62
---
9a3f62
 sos/cleaner/mappings/hostname_map.py | 9 ++++++++-
9a3f62
 1 file changed, 8 insertions(+), 1 deletion(-)
9a3f62
9a3f62
diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
9a3f62
index d4b2c88e..e70a5530 100644
9a3f62
--- a/sos/cleaner/mappings/hostname_map.py
9a3f62
+++ b/sos/cleaner/mappings/hostname_map.py
9a3f62
@@ -184,7 +184,14 @@ class SoSHostnameMap(SoSMap):
9a3f62
             hostname = host[0]
9a3f62
             domain = host[1:]
9a3f62
             # obfuscate the short name
9a3f62
-            ob_hostname = self.sanitize_short_name(hostname)
9a3f62
+            if len(hostname) > 2:
9a3f62
+                ob_hostname = self.sanitize_short_name(hostname)
9a3f62
+            else:
9a3f62
+                # by best practice it appears the host part of the fqdn was cut
9a3f62
+                # off due to some form of truncating, as such don't obfuscate
9a3f62
+                # short strings that are likely to throw off obfuscation of
9a3f62
+                # unrelated bits and paths
9a3f62
+                ob_hostname = 'unknown'
9a3f62
             ob_domain = self.sanitize_domain(domain)
9a3f62
             self.dataset[item] = ob_domain
9a3f62
             return '.'.join([ob_hostname, ob_domain])
9a3f62
-- 
9a3f62
2.31.1
9a3f62
9a3f62
From f3f3e763d7c31b7b7cafdf8dd4dab87056fb7696 Mon Sep 17 00:00:00 2001
9a3f62
From: Jake Hunsaker <jhunsake@redhat.com>
9a3f62
Date: Wed, 1 Sep 2021 15:54:55 -0400
9a3f62
Subject: [PATCH] [cleaner] Add support for Insights client archives
9a3f62
9a3f62
Adds a new type of `SoSObfuscationArchive` to add support for
9a3f62
obfuscating archives generated by the Insights project.
9a3f62
9a3f62
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
9a3f62
---
9a3f62
 man/en/sos-clean.1               |  1 +
9a3f62
 sos/cleaner/__init__.py          |  4 ++-
9a3f62
 sos/cleaner/archives/insights.py | 42 ++++++++++++++++++++++++++++++++
9a3f62
 3 files changed, 46 insertions(+), 1 deletion(-)
9a3f62
 create mode 100644 sos/cleaner/archives/insights.py
9a3f62
9a3f62
diff --git a/man/en/sos-clean.1 b/man/en/sos-clean.1
9a3f62
index 54026713..358ec0cb 100644
9a3f62
--- a/man/en/sos-clean.1
9a3f62
+++ b/man/en/sos-clean.1
9a3f62
@@ -105,6 +105,7 @@ The following are accepted values for this option:
9a3f62
     \fBauto\fR          Automatically detect the archive type
9a3f62
     \fBreport\fR        An archive generated by \fBsos report\fR
9a3f62
     \fBcollect\fR       An archive generated by \fBsos collect\fR
9a3f62
+    \fBinsights\fR      An archive generated by the \fBinsights-client\fR package
9a3f62
 
9a3f62
 The following may also be used, however note that these do not attempt to pre-load
9a3f62
 any information from the archives into the parsers. This means that, among other limitations,
9a3f62
diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
9a3f62
index 6d2eb483..3e08aa28 100644
9a3f62
--- a/sos/cleaner/__init__.py
9a3f62
+++ b/sos/cleaner/__init__.py
9a3f62
@@ -29,6 +29,7 @@ from sos.cleaner.archives.sos import (SoSReportArchive, SoSReportDirectory,
9a3f62
                                       SoSCollectorArchive,
9a3f62
                                       SoSCollectorDirectory)
9a3f62
 from sos.cleaner.archives.generic import DataDirArchive, TarballArchive
9a3f62
+from sos.cleaner.archives.insights import InsightsArchive
9a3f62
 from sos.utilities import get_human_readable
9a3f62
 from textwrap import fill
9a3f62
 
9a3f62
@@ -100,6 +101,7 @@ class SoSCleaner(SoSComponent):
9a3f62
             SoSReportArchive,
9a3f62
             SoSCollectorDirectory,
9a3f62
             SoSCollectorArchive,
9a3f62
+            InsightsArchive,
9a3f62
             # make sure these two are always last as they are fallbacks
9a3f62
             DataDirArchive,
9a3f62
             TarballArchive
9a3f62
@@ -194,7 +196,7 @@ third party.
9a3f62
                                help='The directory or archive to obfuscate')
9a3f62
         clean_grp.add_argument('--archive-type', default='auto',
9a3f62
                                choices=['auto', 'report', 'collect',
9a3f62
-                                        'data-dir', 'tarball'],
9a3f62
+                                        'insights', 'data-dir', 'tarball'],
9a3f62
                                help=('Specify what kind of archive the target '
9a3f62
                                      'was generated as'))
9a3f62
         clean_grp.add_argument('--domains', action='extend', default=[],
9a3f62
diff --git a/sos/cleaner/archives/insights.py b/sos/cleaner/archives/insights.py
9a3f62
new file mode 100644
9a3f62
index 00000000..dab48b16
9a3f62
--- /dev/null
9a3f62
+++ b/sos/cleaner/archives/insights.py
9a3f62
@@ -0,0 +1,42 @@
9a3f62
+# Copyright 2021 Red Hat, Inc. Jake Hunsaker <jhunsake@redhat.com>
9a3f62
+
9a3f62
+# This file is part of the sos project: https://github.com/sosreport/sos
9a3f62
+#
9a3f62
+# This copyrighted material is made available to anyone wishing to use,
9a3f62
+# modify, copy, or redistribute it subject to the terms and conditions of
9a3f62
+# version 2 of the GNU General Public License.
9a3f62
+#
9a3f62
+# See the LICENSE file in the source distribution for further information.
9a3f62
+
9a3f62
+
9a3f62
+from sos.cleaner.archives import SoSObfuscationArchive
9a3f62
+
9a3f62
+import tarfile
9a3f62
+
9a3f62
+
9a3f62
+class InsightsArchive(SoSObfuscationArchive):
9a3f62
+    """This class represents archives generated by the insights-client utility
9a3f62
+    for RHEL systems.
9a3f62
+    """
9a3f62
+
9a3f62
+    type_name = 'insights'
9a3f62
+    description = 'insights-client archive'
9a3f62
+
9a3f62
+    prep_files = {
9a3f62
+        'hostname': 'data/insights_commands/hostname_-f',
9a3f62
+        'ip': 'data/insights_commands/ip_addr',
9a3f62
+        'mac': 'data/insights_commands/ip_addr'
9a3f62
+    }
9a3f62
+
9a3f62
+    @classmethod
9a3f62
+    def check_is_type(cls, arc_path):
9a3f62
+        try:
9a3f62
+            return tarfile.is_tarfile(arc_path) and 'insights-' in arc_path
9a3f62
+        except Exception:
9a3f62
+            return False
9a3f62
+
9a3f62
+    def get_archive_root(self):
9a3f62
+        top = self.archive_path.split('/')[-1].split('.tar')[0]
9a3f62
+        if self.tarobj.firstmember.name == '.':
9a3f62
+            top = './' + top
9a3f62
+        return top
9a3f62
-- 
9a3f62
2.31.1
9a3f62
9a3f62
From 9639dc3d240076b55f2a1d04b43ea42bebd09215 Mon Sep 17 00:00:00 2001
9a3f62
From: Jake Hunsaker <jhunsake@redhat.com>
9a3f62
Date: Tue, 16 Nov 2021 17:50:42 -0500
9a3f62
Subject: [PATCH] [clean,hostname_parser] Source /etc/hosts for obfuscation
9a3f62
9a3f62
Up until now, our sourcing of hostnames/domains for obfuscation has been
9a3f62
dependent upon the output of the `hostname` command. However, some
9a3f62
scenarios have come up where sourcing `/etc/hosts` is advantageous for
9a3f62
several reasons:
9a3f62
9a3f62
First, if `hostname` output is unavailable, this provides a fallback
9a3f62
measure.
9a3f62
9a3f62
Second, `/etc/hosts` is a common place to have short names defined which
9a3f62
would otherwise not be detected (or at the very least would result in a
9a3f62
race condition based on where/if the short name was elsewhere able to be
9a3f62
gleaned from an FQDN), thus leaving the potential for unobfuscated data
9a3f62
in an archive.
9a3f62
9a3f62
Due to both the nature of hostname obfuscation and the malleable syntax
9a3f62
of `/etc/hosts`, the parsing of this file needs special handling not
9a3f62
covered by our more generic parsing and obfuscation methods.
9a3f62
9a3f62
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
9a3f62
---
9a3f62
 sos/cleaner/__init__.py                | 11 ++++++++---
9a3f62
 sos/cleaner/archives/sos.py            |  5 ++++-
9a3f62
 sos/cleaner/parsers/hostname_parser.py | 19 +++++++++++++++++++
9a3f62
 3 files changed, 31 insertions(+), 4 deletions(-)
9a3f62
9a3f62
diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
9a3f62
index ed461a8f..3f530d44 100644
9a3f62
--- a/sos/cleaner/__init__.py
9a3f62
+++ b/sos/cleaner/__init__.py
9a3f62
@@ -523,9 +523,14 @@ third party.
9a3f62
                         if isinstance(_parser, SoSUsernameParser):
9a3f62
                             _parser.load_usernames_into_map(content)
9a3f62
                         elif isinstance(_parser, SoSHostnameParser):
9a3f62
-                            _parser.load_hostname_into_map(
9a3f62
-                                content.splitlines()[0]
9a3f62
-                            )
9a3f62
+                            if 'hostname' in parse_file:
9a3f62
+                                _parser.load_hostname_into_map(
9a3f62
+                                    content.splitlines()[0]
9a3f62
+                                )
9a3f62
+                            elif 'etc/hosts' in parse_file:
9a3f62
+                                _parser.load_hostname_from_etc_hosts(
9a3f62
+                                    content
9a3f62
+                                )
9a3f62
                         else:
9a3f62
                             for line in content.splitlines():
9a3f62
                                 self.obfuscate_line(line)
9a3f62
diff --git a/sos/cleaner/archives/sos.py b/sos/cleaner/archives/sos.py
9a3f62
index 4401d710..f8720c88 100644
9a3f62
--- a/sos/cleaner/archives/sos.py
9a3f62
+++ b/sos/cleaner/archives/sos.py
9a3f62
@@ -23,7 +23,10 @@ class SoSReportArchive(SoSObfuscationArchive):
9a3f62
     type_name = 'report'
9a3f62
     description = 'sos report archive'
9a3f62
     prep_files = {
9a3f62
-        'hostname': 'sos_commands/host/hostname',
9a3f62
+        'hostname': [
9a3f62
+            'sos_commands/host/hostname',
9a3f62
+            'etc/hosts'
9a3f62
+        ],
9a3f62
         'ip': 'sos_commands/networking/ip_-o_addr',
9a3f62
         'mac': 'sos_commands/networking/ip_-d_address',
9a3f62
         'username': [
9a3f62
diff --git a/sos/cleaner/parsers/hostname_parser.py b/sos/cleaner/parsers/hostname_parser.py
9a3f62
index daa76a62..0a733bee 100644
9a3f62
--- a/sos/cleaner/parsers/hostname_parser.py
9a3f62
+++ b/sos/cleaner/parsers/hostname_parser.py
9a3f62
@@ -61,6 +61,25 @@ class SoSHostnameParser(SoSCleanerParser):
9a3f62
             self.mapping.add(high_domain)
9a3f62
         self.mapping.add(hostname_string)
9a3f62
 
9a3f62
+    def load_hostname_from_etc_hosts(self, content):
9a3f62
+        """Parse an archive's copy of /etc/hosts, which requires handling that
9a3f62
+        is separate from the output of the `hostname` command. Just like
9a3f62
+        load_hostname_into_map(), this has to be done explicitly and we
9a3f62
+        cannot rely upon the more generic methods to do this reliably.
9a3f62
+        """
9a3f62
+        lines = content.splitlines()
9a3f62
+        for line in lines:
9a3f62
+            if line.startswith('#') or 'localhost' in line:
9a3f62
+                continue
9a3f62
+            hostln = line.split()[1:]
9a3f62
+            for host in hostln:
9a3f62
+                if len(host.split('.')) == 1:
9a3f62
+                    # only generate a mapping for fqdns but still record the
9a3f62
+                    # short name here for later obfuscation with parse_line()
9a3f62
+                    self.short_names.append(host)
9a3f62
+                else:
9a3f62
+                    self.mapping.add(host)
9a3f62
+
9a3f62
     def parse_line(self, line):
9a3f62
         """Override the default parse_line() method to also check for the
9a3f62
         shortname of the host derived from the hostname.
9a3f62
-- 
9a3f62
2.31.1
9a3f62
9a3f62
From c1680226b53452b18f27f2e76c3e0e03e521f935 Mon Sep 17 00:00:00 2001
9a3f62
From: Jake Hunsaker <jhunsake@redhat.com>
9a3f62
Date: Wed, 17 Nov 2021 13:11:33 -0500
9a3f62
Subject: [PATCH] [clean, hostname] Fix unintentionally case sensitive
9a3f62
 shortname handling
9a3f62
9a3f62
It was discovered that our extra handling for shortnames was
9a3f62
unintentionally case sensitive. Fix this to ensure that shortnames are
9a3f62
obfuscated regardless of case in all collected text.
9a3f62
9a3f62
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
9a3f62
---
9a3f62
 sos/cleaner/mappings/hostname_map.py   |  6 +++---
9a3f62
 sos/cleaner/parsers/hostname_parser.py |  8 +++++---
9a3f62
 tests/cleaner_tests/full_report_run.py | 21 ++++++++++++++++++++-
9a3f62
 3 files changed, 28 insertions(+), 7 deletions(-)
9a3f62
9a3f62
diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
9a3f62
index e70a5530..0fe78fb1 100644
9a3f62
--- a/sos/cleaner/mappings/hostname_map.py
9a3f62
+++ b/sos/cleaner/mappings/hostname_map.py
9a3f62
@@ -169,13 +169,13 @@ class SoSHostnameMap(SoSMap):
9a3f62
 
9a3f62
     def sanitize_item(self, item):
9a3f62
         host = item.split('.')
9a3f62
-        if all([h.isupper() for h in host]):
9a3f62
+        if len(host) > 1 and all([h.isupper() for h in host]):
9a3f62
             # by convention we have just a domain
9a3f62
             _host = [h.lower() for h in host]
9a3f62
             return self.sanitize_domain(_host).upper()
9a3f62
         if len(host) == 1:
9a3f62
             # we have a shortname for a host
9a3f62
-            return self.sanitize_short_name(host[0])
9a3f62
+            return self.sanitize_short_name(host[0].lower())
9a3f62
         if len(host) == 2:
9a3f62
             # we have just a domain name, e.g. example.com
9a3f62
             return self.sanitize_domain(host)
9a3f62
@@ -185,7 +185,7 @@ class SoSHostnameMap(SoSMap):
9a3f62
             domain = host[1:]
9a3f62
             # obfuscate the short name
9a3f62
             if len(hostname) > 2:
9a3f62
-                ob_hostname = self.sanitize_short_name(hostname)
9a3f62
+                ob_hostname = self.sanitize_short_name(hostname.lower())
9a3f62
             else:
9a3f62
                 # by best practice it appears the host part of the fqdn was cut
9a3f62
                 # off due to some form of truncating, as such don't obfuscate
9a3f62
diff --git a/sos/cleaner/parsers/hostname_parser.py b/sos/cleaner/parsers/hostname_parser.py
9a3f62
index 0a733bee..7fd0e698 100644
9a3f62
--- a/sos/cleaner/parsers/hostname_parser.py
9a3f62
+++ b/sos/cleaner/parsers/hostname_parser.py
9a3f62
@@ -8,6 +8,8 @@
9a3f62
 #
9a3f62
 # See the LICENSE file in the source distribution for further information.
9a3f62
 
9a3f62
+import re
9a3f62
+
9a3f62
 from sos.cleaner.parsers import SoSCleanerParser
9a3f62
 from sos.cleaner.mappings.hostname_map import SoSHostnameMap
9a3f62
 
9a3f62
@@ -91,9 +93,9 @@ class SoSHostnameParser(SoSCleanerParser):
9a3f62
             """
9a3f62
             if search in self.mapping.skip_keys:
9a3f62
                 return ln, count
9a3f62
-            if search in ln:
9a3f62
-                count += ln.count(search)
9a3f62
-                ln = ln.replace(search, self.mapping.get(repl or search))
9a3f62
+            _reg = re.compile(search, re.I)
9a3f62
+            if _reg.search(ln):
9a3f62
+                return _reg.subn(self.mapping.get(repl or search), ln)
9a3f62
             return ln, count
9a3f62
 
9a3f62
         count = 0
9a3f62
diff --git a/tests/cleaner_tests/full_report_run.py b/tests/cleaner_tests/full_report_run.py
9a3f62
index 2de54946..0b23acaf 100644
9a3f62
--- a/tests/cleaner_tests/full_report_run.py
9a3f62
+++ b/tests/cleaner_tests/full_report_run.py
9a3f62
@@ -26,6 +26,24 @@ class FullCleanTest(StageTwoReportTest):
9a3f62
     # replace with an empty placeholder, make sure that this test case is not
9a3f62
     # influenced by previous clean runs
9a3f62
     files = ['/etc/sos/cleaner/default_mapping']
9a3f62
+    packages = {
9a3f62
+        'rhel': ['python3-systemd'],
9a3f62
+        'ubuntu': ['python3-systemd']
9a3f62
+    }
9a3f62
+
9a3f62
+    def pre_sos_setup(self):
9a3f62
+        # ensure that case-insensitive matching of FQDNs and shortnames work
9a3f62
+        from systemd import journal
9a3f62
+        from socket import gethostname
9a3f62
+        host = gethostname()
9a3f62
+        short = host.split('.')[0]
9a3f62
+        sosfd = journal.stream('sos-testing')
9a3f62
+        sosfd.write(
9a3f62
+            "This is a test line from sos clean testing. The hostname %s "
9a3f62
+            "should not appear, nor should %s in an obfuscated archive. The "
9a3f62
+            "shortnames of %s and %s should also not appear."
9a3f62
+            % (host.lower(), host.upper(), short.lower(), short.upper())
9a3f62
+        )
9a3f62
 
9a3f62
     def test_private_map_was_generated(self):
9a3f62
         self.assertOutputContains('A mapping of obfuscated elements is available at')
9a3f62
@@ -40,8 +58,9 @@ class FullCleanTest(StageTwoReportTest):
9a3f62
 
9a3f62
     def test_hostname_not_in_any_file(self):
9a3f62
         host = self.sysinfo['pre']['networking']['hostname']
9a3f62
+        short = host.split('.')[0]
9a3f62
         # much faster to just use grep here
9a3f62
-        content = self.grep_for_content(host)
9a3f62
+        content = self.grep_for_content(host) + self.grep_for_content(short)
9a3f62
         if not content:
9a3f62
             assert True
9a3f62
         else:
9a3f62
-- 
9a3f62
2.31.1
9a3f62
9a3f62
From aaeb8cb57ed55598ab744b96d4f127aedebcb292 Mon Sep 17 00:00:00 2001
9a3f62
From: Jake Hunsaker <jhunsake@redhat.com>
9a3f62
Date: Tue, 21 Sep 2021 15:23:20 -0400
9a3f62
Subject: [PATCH] [build] Add archives to setup.py packages
9a3f62
9a3f62
Adds the newly abstracted `sos.cleaner.archives` package to `setup.py`
9a3f62
so that manual builds will properly include it.
9a3f62
9a3f62
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
9a3f62
---
9a3f62
 setup.py | 2 +-
9a3f62
 1 file changed, 1 insertion(+), 1 deletion(-)
9a3f62
9a3f62
diff --git a/setup.py b/setup.py
9a3f62
index 1e8d8e2dc5..7653b59de3 100644
9a3f62
--- a/setup.py
9a3f62
+++ b/setup.py
9a3f62
@@ -102,7 +102,7 @@ def copy_file (self, filename, dirname):
9a3f62
         'sos.policies.package_managers', 'sos.policies.init_systems',
9a3f62
         'sos.report', 'sos.report.plugins', 'sos.collector',
9a3f62
         'sos.collector.clusters', 'sos.cleaner', 'sos.cleaner.mappings',
9a3f62
-        'sos.cleaner.parsers'
9a3f62
+        'sos.cleaner.parsers', 'sos.cleaner.archives'
9a3f62
     ],
9a3f62
     cmdclass=cmdclass,
9a3f62
     command_options=command_options,
a18305
-- 
a18305
2.31.1
a18305
a18305
From ba3528230256429a4394f155a9ca1fdb91cf3560 Mon Sep 17 00:00:00 2001
a18305
From: Jake Hunsaker <jhunsake@redhat.com>
a18305
Date: Tue, 30 Nov 2021 12:46:34 -0500
a18305
Subject: [PATCH 1/2] [hostname] Simplify case matching for domains
a18305
a18305
Instead of special handling all uppercase domain conventions, use our
a18305
normal flow for obfuscation and just match the casing at the end of the
a18305
sanitization routine.
a18305
a18305
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
a18305
---
a18305
 sos/cleaner/mappings/hostname_map.py | 14 ++++++++------
a18305
 1 file changed, 8 insertions(+), 6 deletions(-)
a18305
a18305
diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
a18305
index 0fe78fb1..5cd8e985 100644
a18305
--- a/sos/cleaner/mappings/hostname_map.py
a18305
+++ b/sos/cleaner/mappings/hostname_map.py
a18305
@@ -169,16 +169,15 @@ class SoSHostnameMap(SoSMap):
a18305
 
a18305
     def sanitize_item(self, item):
a18305
         host = item.split('.')
a18305
-        if len(host) > 1 and all([h.isupper() for h in host]):
a18305
-            # by convention we have just a domain
a18305
-            _host = [h.lower() for h in host]
a18305
-            return self.sanitize_domain(_host).upper()
a18305
         if len(host) == 1:
a18305
             # we have a shortname for a host
a18305
             return self.sanitize_short_name(host[0].lower())
a18305
         if len(host) == 2:
a18305
             # we have just a domain name, e.g. example.com
a18305
-            return self.sanitize_domain(host)
a18305
+            dname = self.sanitize_domain(host)
a18305
+            if all([h.isupper() for h in host]):
a18305
+                dname = dname.upper()
a18305
+            return dname
a18305
         if len(host) > 2:
a18305
             # we have an FQDN, e.g. foo.example.com
a18305
             hostname = host[0]
a18305
@@ -194,7 +193,10 @@ class SoSHostnameMap(SoSMap):
a18305
                 ob_hostname = 'unknown'
a18305
             ob_domain = self.sanitize_domain(domain)
a18305
             self.dataset[item] = ob_domain
a18305
-            return '.'.join([ob_hostname, ob_domain])
a18305
+            _fqdn = '.'.join([ob_hostname, ob_domain])
a18305
+            if all([h.isupper() for h in host]):
a18305
+                _fqdn = _fqdn.upper()
a18305
+            return _fqdn
a18305
 
a18305
     def sanitize_short_name(self, hostname):
a18305
         """Obfuscate the short name of the host with an incremented counter
a18305
-- 
a18305
2.31.1
a18305
a18305
a18305
From 189586728de22dd55122c1f7e06b19590f9a788f Mon Sep 17 00:00:00 2001
a18305
From: Jake Hunsaker <jhunsake@redhat.com>
a18305
Date: Tue, 30 Nov 2021 12:47:58 -0500
a18305
Subject: [PATCH 2/2] [username] Improve username sourcing and remove case
a18305
 sensitivity
a18305
a18305
First, don't skip the first line of `last` output, and instead add the
a18305
header from lastlog to the skip list. Additionally, add
a18305
`/etc/cron.allow` and `/etc/cron.deny` as sources for usernames that
a18305
might not appear in other locations in certain environments.
a18305
a18305
Also, make matching and replacement case insensitive.
a18305
a18305
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
a18305
---
a18305
 sos/cleaner/archives/sos.py            |  4 +++-
a18305
 sos/cleaner/mappings/username_map.py   |  2 +-
a18305
 sos/cleaner/parsers/username_parser.py | 14 +++++++++-----
a18305
 3 files changed, 13 insertions(+), 7 deletions(-)
a18305
a18305
diff --git a/sos/cleaner/archives/sos.py b/sos/cleaner/archives/sos.py
a18305
index f8720c88..12766496 100644
a18305
--- a/sos/cleaner/archives/sos.py
a18305
+++ b/sos/cleaner/archives/sos.py
a18305
@@ -35,7 +35,9 @@ class SoSReportArchive(SoSObfuscationArchive):
a18305
             'sos_commands/login/lastlog_-u_65537-4294967295',
a18305
             # AD users will be reported here, but favor the lastlog files since
a18305
             # those will include local users who have not logged in
a18305
-            'sos_commands/login/last'
a18305
+            'sos_commands/login/last',
a18305
+            'etc/cron.allow',
a18305
+            'etc/cron.deny'
a18305
         ]
a18305
     }
a18305
 
a18305
diff --git a/sos/cleaner/mappings/username_map.py b/sos/cleaner/mappings/username_map.py
a18305
index cdbf36fe..7ecccd7b 100644
a18305
--- a/sos/cleaner/mappings/username_map.py
a18305
+++ b/sos/cleaner/mappings/username_map.py
a18305
@@ -33,5 +33,5 @@ class SoSUsernameMap(SoSMap):
a18305
         ob_name = "obfuscateduser%s" % self.name_count
a18305
         self.name_count += 1
a18305
         if ob_name in self.dataset.values():
a18305
-            return self.sanitize_item(username)
a18305
+            return self.sanitize_item(username.lower())
a18305
         return ob_name
a18305
diff --git a/sos/cleaner/parsers/username_parser.py b/sos/cleaner/parsers/username_parser.py
a18305
index 35377a31..229c7de4 100644
a18305
--- a/sos/cleaner/parsers/username_parser.py
a18305
+++ b/sos/cleaner/parsers/username_parser.py
a18305
@@ -8,6 +8,7 @@
a18305
 #
a18305
 # See the LICENSE file in the source distribution for further information.
a18305
 
a18305
+import re
a18305
 
a18305
 from sos.cleaner.parsers import SoSCleanerParser
a18305
 from sos.cleaner.mappings.username_map import SoSUsernameMap
a18305
@@ -34,6 +35,7 @@ class SoSUsernameParser(SoSCleanerParser):
a18305
         'reboot',
a18305
         'root',
a18305
         'ubuntu',
a18305
+        'username',
a18305
         'wtmp'
a18305
     ]
a18305
 
a18305
@@ -47,12 +49,12 @@ class SoSUsernameParser(SoSCleanerParser):
a18305
         this parser, we need to override the initial parser prepping here.
a18305
         """
a18305
         users = set()
a18305
-        for line in content.splitlines()[1:]:
a18305
+        for line in content.splitlines():
a18305
             try:
a18305
                 user = line.split()[0]
a18305
             except Exception:
a18305
                 continue
a18305
-            if user in self.skip_list:
a18305
+            if user.lower() in self.skip_list:
a18305
                 continue
a18305
             users.add(user)
a18305
         for each in users:
a18305
@@ -61,7 +63,9 @@ class SoSUsernameParser(SoSCleanerParser):
a18305
     def parse_line(self, line):
a18305
         count = 0
a18305
         for username in sorted(self.mapping.dataset.keys(), reverse=True):
a18305
-            if username in line:
a18305
-                count = line.count(username)
a18305
-                line = line.replace(username, self.mapping.get(username))
a18305
+            _reg = re.compile(username, re.I)
a18305
+            if _reg.search(line):
a18305
+                line, count = _reg.subn(
a18305
+                    self.mapping.get(username.lower()), line
a18305
+                )
a18305
         return line, count
a18305
-- 
a18305
2.31.1
a18305
d7517c
From cafd0f3a52436a3966576e7db21e5dd17c06f0cc Mon Sep 17 00:00:00 2001
d7517c
From: Jake Hunsaker <jhunsake@redhat.com>
d7517c
Date: Sun, 12 Dec 2021 11:10:46 -0500
d7517c
Subject: [PATCH] [hostname] Fix edge case for new hosts in a known subdomain
d7517c
d7517c
Fixes an edge case in which we would at first fail to recognize that a
d7517c
given hostname string is a new host in a known subdomain, but then, on
d7517c
the obfuscation attempt, properly recognize it as such, resulting in an
d7517c
incomplete obfuscation.
d7517c
d7517c
This was mostly triggered by specific patterns for build hosts within
d7517c
`sos_commands/rpm/package-data`. With this refined check, these types of
d7517c
matches are properly obfuscated.
d7517c
d7517c
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
d7517c
---
d7517c
 sos/cleaner/mappings/hostname_map.py | 9 +++++----
d7517c
 1 file changed, 5 insertions(+), 4 deletions(-)
d7517c
d7517c
diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
d7517c
index 5cd8e9857..33b0e6c80 100644
d7517c
--- a/sos/cleaner/mappings/hostname_map.py
d7517c
+++ b/sos/cleaner/mappings/hostname_map.py
d7517c
@@ -129,7 +129,7 @@ def get(self, item):
d7517c
             item = item[0:-1]
d7517c
         if not self.domain_name_in_loaded_domains(item.lower()):
d7517c
             return item
d7517c
-        if item.endswith(('.yaml', '.yml', '.crt', '.key', '.pem')):
d7517c
+        if item.endswith(('.yaml', '.yml', '.crt', '.key', '.pem', '.log')):
d7517c
             ext = '.' + item.split('.')[-1]
d7517c
             item = item.replace(ext, '')
d7517c
             suffix += ext
d7517c
@@ -148,7 +148,8 @@ def get(self, item):
d7517c
                 if len(_test) == 1 or not _test[0]:
d7517c
                     # does not match existing obfuscation
d7517c
                     continue
d7517c
-                elif _test[0].endswith('.') and not _host_substr:
d7517c
+                elif not _host_substr and (_test[0].endswith('.') or
d7517c
+                                           item.endswith(_existing)):
d7517c
                     # new hostname in known domain
d7517c
                     final = super(SoSHostnameMap, self).get(item)
d7517c
                     break
d7517c
@@ -219,8 +220,8 @@ def sanitize_domain(self, domain):
d7517c
             # don't obfuscate vendor domains
d7517c
             if re.match(_skip, '.'.join(domain)):
d7517c
                 return '.'.join(domain)
d7517c
-        top_domain = domain[-1]
d7517c
-        dname = '.'.join(domain[0:-1])
d7517c
+        top_domain = domain[-1].lower()
d7517c
+        dname = '.'.join(domain[0:-1]).lower()
d7517c
         ob_domain = self._new_obfuscated_domain(dname)
d7517c
         ob_domain = '.'.join([ob_domain, top_domain])
d7517c
         self.dataset['.'.join(domain)] = ob_domain
15bbc2
From f5e1298162a9393ea2d9f5c4df40dfece50f5f88 Mon Sep 17 00:00:00 2001
15bbc2
From: Jake Hunsaker <jhunsake@redhat.com>
15bbc2
Date: Thu, 6 Jan 2022 13:15:15 -0500
15bbc2
Subject: [PATCH 1/3] [hostname] Fix loading and detection of long base domains
15bbc2
15bbc2
Our domain matching has up to now assumed that users would be providing
15bbc2
'base' domains such as 'example.com' whereby something like
15bbc2
'foo.bar.example.com' is a subdomain (or host) within that base domain.
15bbc2
15bbc2
However, the use case exists to provide 'foo.bar.example.com' as the
15bbc2
base domain, without wanting to obfuscate 'example.com' directly.
15bbc2
15bbc2
This commit fixes our handling of both loading these longer domains and
15bbc2
doing the 'domain is part of a domain we want to obfuscate' check.
15bbc2
15bbc2
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
15bbc2
---
15bbc2
 sos/cleaner/mappings/hostname_map.py | 9 ++++++++-
15bbc2
 1 file changed, 8 insertions(+), 1 deletion(-)
15bbc2
15bbc2
diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py
15bbc2
index 33b0e6c8..7a7cf6b8 100644
15bbc2
--- a/sos/cleaner/mappings/hostname_map.py
15bbc2
+++ b/sos/cleaner/mappings/hostname_map.py
15bbc2
@@ -50,10 +50,14 @@ class SoSHostnameMap(SoSMap):
15bbc2
         in this parser, we need to re-inject entries from the map_file into
15bbc2
         these dicts and not just the underlying 'dataset' dict
15bbc2
         """
15bbc2
-        for domain in self.dataset:
15bbc2
+        for domain, ob_pair in self.dataset.items():
15bbc2
             if len(domain.split('.')) == 1:
15bbc2
                 self.hosts[domain.split('.')[0]] = self.dataset[domain]
15bbc2
             else:
15bbc2
+                if ob_pair.startswith('obfuscateddomain'):
15bbc2
+                    # directly exact domain matches
15bbc2
+                    self._domains[domain] = ob_pair.split('.')[0]
15bbc2
+                    continue
15bbc2
                 # strip the host name and trailing top-level domain so that
15bbc2
                 # we in inject the domain properly for later string matching
15bbc2
 
15bbc2
@@ -102,9 +106,12 @@ class SoSHostnameMap(SoSMap):
15bbc2
         and should be obfuscated
15bbc2
         """
15bbc2
         host = domain.split('.')
15bbc2
+        no_tld = '.'.join(domain.split('.')[0:-1])
15bbc2
         if len(host) == 1:
15bbc2
             # don't block on host's shortname
15bbc2
             return host[0] in self.hosts.keys()
15bbc2
+        elif any([no_tld.endswith(_d) for _d in self._domains]):
15bbc2
+            return True
15bbc2
         else:
15bbc2
             domain = host[0:-1]
15bbc2
             for known_domain in self._domains:
15bbc2
-- 
15bbc2
2.31.1
15bbc2
15bbc2
15bbc2
From e241cf33a14ecd4e848a5fd857c5d3d7d07fbd71 Mon Sep 17 00:00:00 2001
15bbc2
From: Jake Hunsaker <jhunsake@redhat.com>
15bbc2
Date: Thu, 6 Jan 2022 13:18:44 -0500
15bbc2
Subject: [PATCH 2/3] [cleaner] Improve parser-specific file skipping
15bbc2
15bbc2
This commit improves our handling of skipping files on a per-parser
15bbc2
basis, by first filtering the list of parsers that `obfuscate_line()`
15bbc2
will iterate over by the parser's `skip_file` class attr, rather than
15bbc2
relying on higher-level checks.
15bbc2
15bbc2
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
15bbc2
---
15bbc2
 sos/cleaner/__init__.py | 17 ++++++++++++++---
15bbc2
 1 file changed, 14 insertions(+), 3 deletions(-)
15bbc2
15bbc2
diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
15bbc2
index 3f530d44..5686e213 100644
15bbc2
--- a/sos/cleaner/__init__.py
15bbc2
+++ b/sos/cleaner/__init__.py
15bbc2
@@ -12,6 +12,7 @@ import hashlib
15bbc2
 import json
15bbc2
 import logging
15bbc2
 import os
15bbc2
+import re
15bbc2
 import shutil
15bbc2
 import tempfile
15bbc2
 
15bbc2
@@ -640,10 +641,16 @@ third party.
15bbc2
             self.log_debug("Obfuscating %s" % short_name or filename,
15bbc2
                            caller=arc_name)
15bbc2
             tfile = tempfile.NamedTemporaryFile(mode='w', dir=self.tmpdir)
15bbc2
+            _parsers = [
15bbc2
+                _p for _p in self.parsers if not
15bbc2
+                any([
15bbc2
+                    re.match(p, short_name) for p in _p.skip_files
15bbc2
+                ])
15bbc2
+            ]
15bbc2
             with open(filename, 'r') as fname:
15bbc2
                 for line in fname:
15bbc2
                     try:
15bbc2
-                        line, count = self.obfuscate_line(line)
15bbc2
+                        line, count = self.obfuscate_line(line, _parsers)
15bbc2
                         subs += count
15bbc2
                         tfile.write(line)
15bbc2
                     except Exception as err:
15bbc2
@@ -713,7 +720,7 @@ third party.
15bbc2
                 pass
15bbc2
         return string_data
15bbc2
 
15bbc2
-    def obfuscate_line(self, line):
15bbc2
+    def obfuscate_line(self, line, parsers=None):
15bbc2
         """Run a line through each of the obfuscation parsers, keeping a
15bbc2
         cumulative total of substitutions done on that particular line.
15bbc2
 
15bbc2
@@ -721,6 +728,8 @@ third party.
15bbc2
 
15bbc2
             :param line str:        The raw line as read from the file being
15bbc2
                                     processed
15bbc2
+            :param parsers:         A list of parser objects to obfuscate
15bbc2
+                                    with. If None, use all.
15bbc2
 
15bbc2
         Returns the fully obfuscated line and the number of substitutions made
15bbc2
         """
15bbc2
@@ -729,7 +738,9 @@ third party.
15bbc2
         count = 0
15bbc2
         if not line.strip():
15bbc2
             return line, count
15bbc2
-        for parser in self.parsers:
15bbc2
+        if parsers is None:
15bbc2
+            parsers = self.parsers
15bbc2
+        for parser in parsers:
15bbc2
             try:
15bbc2
                 line, _count = parser.parse_line(line)
15bbc2
                 count += _count
15bbc2
-- 
15bbc2
2.31.1
15bbc2
15bbc2
15bbc2
From 96c9a833e77639a853b7d3d6f1df68bbbbe5e9cb Mon Sep 17 00:00:00 2001
15bbc2
From: Jake Hunsaker <jhunsake@redhat.com>
15bbc2
Date: Thu, 6 Jan 2022 13:20:32 -0500
15bbc2
Subject: [PATCH 3/3] [cleaner] Add skips for known files and usernames
15bbc2
15bbc2
Adds skips for `/proc/kallsyms` which should never be obfuscated, as
15bbc2
well as any packaging-related log file for the IP parser. Further, do
15bbc2
not obfuscate the `stack` user, as that is a well-known user for many
15bbc2
configurations that, if obfuscated, could result in undesired string
15bbc2
substitutions in normal logging.
15bbc2
15bbc2
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
15bbc2
---
15bbc2
 sos/cleaner/archives/__init__.py       | 2 ++
15bbc2
 sos/cleaner/parsers/ip_parser.py       | 3 ++-
15bbc2
 sos/cleaner/parsers/username_parser.py | 1 +
15bbc2
 3 files changed, 5 insertions(+), 1 deletion(-)
15bbc2
15bbc2
diff --git a/sos/cleaner/archives/__init__.py b/sos/cleaner/archives/__init__.py
15bbc2
index 795c5a78..cbf1f809 100644
15bbc2
--- a/sos/cleaner/archives/__init__.py
15bbc2
+++ b/sos/cleaner/archives/__init__.py
15bbc2
@@ -43,6 +43,7 @@ class SoSObfuscationArchive():
15bbc2
     type_name = 'undetermined'
15bbc2
     description = 'undetermined'
15bbc2
     is_nested = False
15bbc2
+    skip_files = []
15bbc2
     prep_files = {}
15bbc2
 
15bbc2
     def __init__(self, archive_path, tmpdir):
15bbc2
@@ -111,6 +112,7 @@ class SoSObfuscationArchive():
15bbc2
         Returns: list of files and file regexes
15bbc2
         """
15bbc2
         return [
15bbc2
+            'proc/kallsyms',
15bbc2
             'sosreport-',
15bbc2
             'sys/firmware',
15bbc2
             'sys/fs',
15bbc2
diff --git a/sos/cleaner/parsers/ip_parser.py b/sos/cleaner/parsers/ip_parser.py
15bbc2
index 71d38be8..b007368c 100644
15bbc2
--- a/sos/cleaner/parsers/ip_parser.py
15bbc2
+++ b/sos/cleaner/parsers/ip_parser.py
15bbc2
@@ -37,7 +37,8 @@ class SoSIPParser(SoSCleanerParser):
15bbc2
         'sos_commands/snappy/snap_list_--all',
15bbc2
         'sos_commands/snappy/snap_--version',
15bbc2
         'sos_commands/vulkan/vulkaninfo',
15bbc2
-        'var/log/.*dnf.*'
15bbc2
+        'var/log/.*dnf.*',
15bbc2
+        'var/log/.*packag.*'  # get 'packages' and 'packaging' logs
15bbc2
     ]
15bbc2
 
15bbc2
     map_file_key = 'ip_map'
15bbc2
diff --git a/sos/cleaner/parsers/username_parser.py b/sos/cleaner/parsers/username_parser.py
15bbc2
index 229c7de4..3208a655 100644
15bbc2
--- a/sos/cleaner/parsers/username_parser.py
15bbc2
+++ b/sos/cleaner/parsers/username_parser.py
15bbc2
@@ -32,6 +32,7 @@ class SoSUsernameParser(SoSCleanerParser):
15bbc2
         'nobody',
15bbc2
         'nfsnobody',
15bbc2
         'shutdown',
15bbc2
+        'stack',
15bbc2
         'reboot',
15bbc2
         'root',
15bbc2
         'ubuntu',
15bbc2
-- 
15bbc2
2.31.1
15bbc2