From 9a3f62743eb9efecf573d631fccb4ce87324e450 Mon Sep 17 00:00:00 2001 From: CentOS Sources Date: Dec 04 2021 06:57:41 +0000 Subject: import sos-4.2-6.el8 --- diff --git a/SOURCES/sos-bz2023867-cleaner-hostnames-improvements.patch b/SOURCES/sos-bz2023867-cleaner-hostnames-improvements.patch new file mode 100644 index 0000000..148adb6 --- /dev/null +++ b/SOURCES/sos-bz2023867-cleaner-hostnames-improvements.patch @@ -0,0 +1,1389 @@ +From decd39b7799a0579ea085b0da0728b6eabd49b38 Mon Sep 17 00:00:00 2001 +From: Jake Hunsaker +Date: Wed, 1 Sep 2021 00:28:58 -0400 +Subject: [PATCH] [clean] Provide archive abstractions to obfuscate more than + sos archives + +This commit removes the restriction imposed on `sos clean` since its +introduction in sos-4.0 to only work against known sos report archives +or build directories. This is because there has been interest in using +the obfuscation bits of sos in other data-collector projects. + +The `SoSObfuscationArchive()` class has been revamped to now be an +abstraction for different types of archives, and the cleaner logic has +been updated to leverage this new abstraction rather than assuming we're +working on an sos archive. + +Abstractions are added for our own native use cases - that being `sos +report` and `sos collect` for at-runtime obfuscation, as well as +standalone archives previously generated. Further generic abstractions +are available for plain directories and tarballs however these will not +provide the same level of coverage as fully supported archive types, as +is noted in the manpage for sos-clean. 
+ +Signed-off-by: Jake Hunsaker +--- + man/en/sos-clean.1 | 25 ++ + sos/cleaner/__init__.py | 308 +++++++++--------- + .../__init__.py} | 80 ++++- + sos/cleaner/archives/generic.py | 52 +++ + sos/cleaner/archives/sos.py | 106 ++++++ + sos/cleaner/parsers/__init__.py | 6 - + sos/cleaner/parsers/hostname_parser.py | 1 - + sos/cleaner/parsers/ip_parser.py | 1 - + sos/cleaner/parsers/keyword_parser.py | 1 - + sos/cleaner/parsers/mac_parser.py | 1 - + sos/cleaner/parsers/username_parser.py | 8 - + tests/cleaner_tests/existing_archive.py | 7 + + tests/cleaner_tests/full_report_run.py | 3 + + tests/cleaner_tests/report_with_mask.py | 3 + + 14 files changed, 423 insertions(+), 179 deletions(-) + rename sos/cleaner/{obfuscation_archive.py => archives/__init__.py} (81%) + create mode 100644 sos/cleaner/archives/generic.py + create mode 100644 sos/cleaner/archives/sos.py + +diff --git a/man/en/sos-clean.1 b/man/en/sos-clean.1 +index b77bc63c..54026713 100644 +--- a/man/en/sos-clean.1 ++++ b/man/en/sos-clean.1 +@@ -10,6 +10,7 @@ sos clean - Obfuscate sensitive data from one or more sosreports + [\-\-jobs] + [\-\-no-update] + [\-\-keep-binary-files] ++ [\-\-archive-type] + + .SH DESCRIPTION + \fBsos clean\fR or \fBsos mask\fR is an sos subcommand used to obfuscate sensitive information from +@@ -88,6 +89,30 @@ Users should review any archive that keeps binary files in place before sending + a third party. + + Default: False (remove encountered binary files) ++.TP ++.B \-\-archive-type TYPE ++Specify the type of archive that TARGET was generated as. ++When sos inspects a TARGET archive, it tries to identify what type of archive it is. ++For example, it may be a report generated by \fBsos report\fR, or a collection of those ++reports generated by \fBsos collect\fR, which require separate approaches. 
++ ++This option may be useful if a given TARGET archive is known to be of a specific type, ++but due to unknown reasons or some malformed/missing information in the archive directly, ++that is not properly identified by sos. ++ ++The following are accepted values for this option: ++ ++ \fBauto\fR Automatically detect the archive type ++ \fBreport\fR An archive generated by \fBsos report\fR ++ \fBcollect\fR An archive generated by \fBsos collect\fR ++ ++The following may also be used, however note that these do not attempt to pre-load ++any information from the archives into the parsers. This means that, among other limitations, ++items like host and domain names may not be obfuscated unless an obfuscated mapping already exists ++on the system from a previous execution. ++ ++ \fBdata-dir\fR A plain directory on the filesystem. ++ \fBtarball\fR A generic tar archive not associated with any known tool + + .SH SEE ALSO + .BR sos (1) +diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py +index 6aadfe79..6d2eb483 100644 +--- a/sos/cleaner/__init__.py ++++ b/sos/cleaner/__init__.py +@@ -12,9 +12,7 @@ import hashlib + import json + import logging + import os +-import re + import shutil +-import tarfile + import tempfile + + from concurrent.futures import ThreadPoolExecutor +@@ -27,7 +25,10 @@ from sos.cleaner.parsers.mac_parser import SoSMacParser + from sos.cleaner.parsers.hostname_parser import SoSHostnameParser + from sos.cleaner.parsers.keyword_parser import SoSKeywordParser + from sos.cleaner.parsers.username_parser import SoSUsernameParser +-from sos.cleaner.obfuscation_archive import SoSObfuscationArchive ++from sos.cleaner.archives.sos import (SoSReportArchive, SoSReportDirectory, ++ SoSCollectorArchive, ++ SoSCollectorDirectory) ++from sos.cleaner.archives.generic import DataDirArchive, TarballArchive + from sos.utilities import get_human_readable + from textwrap import fill + +@@ -41,6 +42,7 @@ class SoSCleaner(SoSComponent): + desc = "Obfuscate 
sensitive networking information in a report" + + arg_defaults = { ++ 'archive_type': 'auto', + 'domains': [], + 'jobs': 4, + 'keywords': [], +@@ -70,6 +72,7 @@ class SoSCleaner(SoSComponent): + self.from_cmdline = False + if not hasattr(self.opts, 'jobs'): + self.opts.jobs = 4 ++ self.opts.archive_type = 'auto' + self.soslog = logging.getLogger('sos') + self.ui_log = logging.getLogger('sos_ui') + # create the tmp subdir here to avoid a potential race condition +@@ -92,6 +95,17 @@ class SoSCleaner(SoSComponent): + SoSUsernameParser(self.cleaner_mapping, self.opts.usernames) + ] + ++ self.archive_types = [ ++ SoSReportDirectory, ++ SoSReportArchive, ++ SoSCollectorDirectory, ++ SoSCollectorArchive, ++ # make sure these two are always last as they are fallbacks ++ DataDirArchive, ++ TarballArchive ++ ] ++ self.nested_archive = None ++ + self.log_info("Cleaner initialized. From cmdline: %s" + % self.from_cmdline) + +@@ -178,6 +192,11 @@ third party. + ) + clean_grp.add_argument('target', metavar='TARGET', + help='The directory or archive to obfuscate') ++ clean_grp.add_argument('--archive-type', default='auto', ++ choices=['auto', 'report', 'collect', ++ 'data-dir', 'tarball'], ++ help=('Specify what kind of archive the target ' ++ 'was generated as')) + clean_grp.add_argument('--domains', action='extend', default=[], + help='List of domain names to obfuscate') + clean_grp.add_argument('-j', '--jobs', default=4, type=int, +@@ -218,59 +237,28 @@ third party. + + In the event the target path is not an archive, abort. 
+ """ +- if not tarfile.is_tarfile(self.opts.target): +- self.ui_log.error( +- "Invalid target: must be directory or tar archive" +- ) +- self._exit(1) +- +- archive = tarfile.open(self.opts.target) +- self.arc_name = self.opts.target.split('/')[-1].split('.')[:-2][0] +- +- try: +- archive.getmember(os.path.join(self.arc_name, 'sos_logs')) +- except Exception: +- # this is not an sos archive +- self.ui_log.error("Invalid target: not an sos archive") +- self._exit(1) +- +- # see if there are archives within this archive +- nested_archives = [] +- for _file in archive.getmembers(): +- if (re.match('sosreport-.*.tar', _file.name.split('/')[-1]) and not +- (_file.name.endswith(('.md5', '.sha256')))): +- nested_archives.append(_file.name.split('/')[-1]) +- +- if nested_archives: +- self.log_info("Found nested archive(s), extracting top level") +- nested_path = self.extract_archive(archive) +- for arc_file in os.listdir(nested_path): +- if re.match('sosreport.*.tar.*', arc_file): +- if arc_file.endswith(('.md5', '.sha256')): +- continue +- self.report_paths.append(os.path.join(nested_path, +- arc_file)) +- # add the toplevel extracted archive +- self.report_paths.append(nested_path) ++ _arc = None ++ if self.opts.archive_type != 'auto': ++ check_type = self.opts.archive_type.replace('-', '_') ++ for archive in self.archive_types: ++ if archive.type_name == check_type: ++ _arc = archive(self.opts.target, self.tmpdir) + else: +- self.report_paths.append(self.opts.target) +- +- archive.close() +- +- def extract_archive(self, archive): +- """Extract an archive into our tmpdir so that we may inspect it or +- iterate through its contents for obfuscation +- +- Positional arguments: +- +- :param archive: An open TarFile object for the archive +- +- """ +- if not isinstance(archive, tarfile.TarFile): +- archive = tarfile.open(archive) +- path = os.path.join(self.tmpdir, 'cleaner') +- archive.extractall(path) +- return os.path.join(path, 
archive.name.split('/')[-1].split('.tar')[0]) ++ for arc in self.archive_types: ++ if arc.check_is_type(self.opts.target): ++ _arc = arc(self.opts.target, self.tmpdir) ++ break ++ if not _arc: ++ return ++ self.report_paths.append(_arc) ++ if _arc.is_nested: ++ self.report_paths.extend(_arc.get_nested_archives()) ++ # We need to preserve the top level archive until all ++ # nested archives are processed ++ self.report_paths.remove(_arc) ++ self.nested_archive = _arc ++ if self.nested_archive: ++ self.nested_archive.ui_name = self.nested_archive.description + + def execute(self): + """SoSCleaner will begin by inspecting the TARGET option to determine +@@ -283,6 +271,7 @@ third party. + be unpacked, cleaned, and repacked and the final top-level archive will + then be repacked as well. + """ ++ self.arc_name = self.opts.target.split('/')[-1].split('.tar')[0] + if self.from_cmdline: + self.print_disclaimer() + self.report_paths = [] +@@ -290,23 +279,11 @@ third party. + self.ui_log.error("Invalid target: no such file or directory %s" + % self.opts.target) + self._exit(1) +- if os.path.isdir(self.opts.target): +- self.arc_name = self.opts.target.split('/')[-1] +- for _file in os.listdir(self.opts.target): +- if _file == 'sos_logs': +- self.report_paths.append(self.opts.target) +- if (_file.startswith('sosreport') and +- (_file.endswith(".tar.gz") or _file.endswith(".tar.xz"))): +- self.report_paths.append(os.path.join(self.opts.target, +- _file)) +- if not self.report_paths: +- self.ui_log.error("Invalid target: not an sos directory") +- self._exit(1) +- else: +- self.inspect_target_archive() ++ ++ self.inspect_target_archive() + + if not self.report_paths: +- self.ui_log.error("No valid sos archives or directories found\n") ++ self.ui_log.error("No valid archives or directories found\n") + self._exit(1) + + # we have at least one valid target to obfuscate +@@ -334,33 +311,7 @@ third party. 
+ + final_path = None + if len(self.completed_reports) > 1: +- # we have an archive of archives, so repack the obfuscated tarball +- arc_name = self.arc_name + '-obfuscated' +- self.setup_archive(name=arc_name) +- for arc in self.completed_reports: +- if arc.is_tarfile: +- arc_dest = self.obfuscate_string( +- arc.final_archive_path.split('/')[-1] +- ) +- self.archive.add_file(arc.final_archive_path, +- dest=arc_dest) +- checksum = self.get_new_checksum(arc.final_archive_path) +- if checksum is not None: +- dname = self.obfuscate_string( +- "checksums/%s.%s" % (arc_dest, self.hash_name) +- ) +- self.archive.add_string(checksum, dest=dname) +- else: +- for dirname, dirs, files in os.walk(arc.archive_path): +- for filename in files: +- if filename.startswith('sosreport'): +- continue +- fname = os.path.join(dirname, filename) +- dnm = self.obfuscate_string( +- fname.split(arc.archive_name)[-1].lstrip('/') +- ) +- self.archive.add_file(fname, dest=dnm) +- arc_path = self.archive.finalize(self.opts.compression_type) ++ arc_path = self.rebuild_nested_archive() + else: + arc = self.completed_reports[0] + arc_path = arc.final_archive_path +@@ -371,8 +322,7 @@ third party. + ) + with open(os.path.join(self.sys_tmp, chksum_name), 'w') as cf: + cf.write(checksum) +- +- self.write_cleaner_log() ++ self.write_cleaner_log() + + final_path = self.obfuscate_string( + os.path.join(self.sys_tmp, arc_path.split('/')[-1]) +@@ -393,6 +343,30 @@ third party. + + self.cleanup() + ++ def rebuild_nested_archive(self): ++ """Handles repacking the nested tarball, now containing only obfuscated ++ copies of the reports, log files, manifest, etc... 
++ """ ++ # we have an archive of archives, so repack the obfuscated tarball ++ arc_name = self.arc_name + '-obfuscated' ++ self.setup_archive(name=arc_name) ++ for archive in self.completed_reports: ++ arc_dest = archive.final_archive_path.split('/')[-1] ++ checksum = self.get_new_checksum(archive.final_archive_path) ++ if checksum is not None: ++ dname = "checksums/%s.%s" % (arc_dest, self.hash_name) ++ self.archive.add_string(checksum, dest=dname) ++ for dirn, dirs, files in os.walk(self.nested_archive.extracted_path): ++ for filename in files: ++ fname = os.path.join(dirn, filename) ++ dname = fname.split(self.nested_archive.extracted_path)[-1] ++ dname = dname.lstrip('/') ++ self.archive.add_file(fname, dest=dname) ++ # remove it now so we don't balloon our fs space needs ++ os.remove(fname) ++ self.write_cleaner_log(archive=True) ++ return self.archive.finalize(self.opts.compression_type) ++ + def compile_mapping_dict(self): + """Build a dict that contains each parser's map as a key, with the + contents as that key's value. This will then be written to disk in the +@@ -441,7 +415,7 @@ third party. + self.log_error("Could not update mapping config file: %s" + % err) + +- def write_cleaner_log(self): ++ def write_cleaner_log(self, archive=False): + """When invoked via the command line, the logging from SoSCleaner will + not be added to the archive(s) it processes, so we need to write it + separately to disk +@@ -454,6 +428,10 @@ third party. + for line in self.sos_log_file.readlines(): + logfile.write(line) + ++ if archive: ++ self.obfuscate_file(log_name) ++ self.archive.add_file(log_name, dest="sos_logs/cleaner.log") ++ + def get_new_checksum(self, archive_path): + """Calculate a new checksum for the obfuscated archive, as the previous + checksum will no longer be valid +@@ -481,11 +459,11 @@ third party. + be obfuscated concurrently. 
+ """ + try: +- if len(self.report_paths) > 1: +- msg = ("Found %s total reports to obfuscate, processing up to " +- "%s concurrently\n" +- % (len(self.report_paths), self.opts.jobs)) +- self.ui_log.info(msg) ++ msg = ( ++ "Found %s total reports to obfuscate, processing up to %s " ++ "concurrently\n" % (len(self.report_paths), self.opts.jobs) ++ ) ++ self.ui_log.info(msg) + if self.opts.keep_binary_files: + self.ui_log.warning( + "WARNING: binary files that potentially contain sensitive " +@@ -494,53 +472,67 @@ third party. + pool = ThreadPoolExecutor(self.opts.jobs) + pool.map(self.obfuscate_report, self.report_paths, chunksize=1) + pool.shutdown(wait=True) ++ # finally, obfuscate the nested archive if one exists ++ if self.nested_archive: ++ self._replace_obfuscated_archives() ++ self.obfuscate_report(self.nested_archive) + except KeyboardInterrupt: + self.ui_log.info("Exiting on user cancel") + os._exit(130) + ++ def _replace_obfuscated_archives(self): ++ """When we have a nested archive, we need to rebuild the original ++ archive, which entails replacing the existing archives with their ++ obfuscated counterparts ++ """ ++ for archive in self.completed_reports: ++ os.remove(archive.archive_path) ++ dest = self.nested_archive.extracted_path ++ tarball = archive.final_archive_path.split('/')[-1] ++ dest_name = os.path.join(dest, tarball) ++ shutil.move(archive.final_archive_path, dest) ++ archive.final_archive_path = dest_name ++ + def preload_all_archives_into_maps(self): + """Before doing the actual obfuscation, if we have multiple archives + to obfuscate then we need to preload each of them into the mappings + to ensure that node1 is obfuscated in node2 as well as node2 being + obfuscated in node1's archive. 
+ """ +- self.log_info("Pre-loading multiple archives into obfuscation maps") ++ self.log_info("Pre-loading all archives into obfuscation maps") + for _arc in self.report_paths: +- is_dir = os.path.isdir(_arc) +- if is_dir: +- _arc_name = _arc +- else: +- archive = tarfile.open(_arc) +- _arc_name = _arc.split('/')[-1].split('.tar')[0] +- # for each parser, load the map_prep_file into memory, and then +- # send that for obfuscation. We don't actually obfuscate the file +- # here, do that in the normal archive loop + for _parser in self.parsers: +- if not _parser.prep_map_file: ++ try: ++ pfile = _arc.prep_files[_parser.name.lower().split()[0]] ++ if not pfile: ++ continue ++ except (IndexError, KeyError): + continue +- if isinstance(_parser.prep_map_file, str): +- _parser.prep_map_file = [_parser.prep_map_file] +- for parse_file in _parser.prep_map_file: +- _arc_path = os.path.join(_arc_name, parse_file) ++ if isinstance(pfile, str): ++ pfile = [pfile] ++ for parse_file in pfile: ++ self.log_debug("Attempting to load %s" % parse_file) + try: +- if is_dir: +- _pfile = open(_arc_path, 'r') +- content = _pfile.read() +- else: +- _pfile = archive.extractfile(_arc_path) +- content = _pfile.read().decode('utf-8') +- _pfile.close() ++ content = _arc.get_file_content(parse_file) ++ if not content: ++ continue + if isinstance(_parser, SoSUsernameParser): + _parser.load_usernames_into_map(content) +- for line in content.splitlines(): +- if isinstance(_parser, SoSHostnameParser): +- _parser.load_hostname_into_map(line) +- self.obfuscate_line(line) ++ elif isinstance(_parser, SoSHostnameParser): ++ _parser.load_hostname_into_map( ++ content.splitlines()[0] ++ ) ++ else: ++ for line in content.splitlines(): ++ self.obfuscate_line(line) + except Exception as err: +- self.log_debug("Could not prep %s: %s" +- % (_arc_path, err)) ++ self.log_info( ++ "Could not prepare %s from %s (archive: %s): %s" ++ % (_parser.name, parse_file, _arc.archive_name, ++ err) ++ ) + +- def 
obfuscate_report(self, report): ++ def obfuscate_report(self, archive): + """Individually handle each archive or directory we've discovered by + running through each file therein. + +@@ -549,17 +541,12 @@ third party. + :param report str: Filepath to the directory or archive + """ + try: +- if not os.access(report, os.W_OK): +- msg = "Insufficient permissions on %s" % report +- self.log_info(msg) +- self.ui_log.error(msg) +- return +- +- archive = SoSObfuscationArchive(report, self.tmpdir) + arc_md = self.cleaner_md.add_section(archive.archive_name) + start_time = datetime.now() + arc_md.add_field('start_time', start_time) +- archive.extract() ++ # don't double extract nested archives ++ if not archive.is_extracted: ++ archive.extract() + archive.report_msg("Beginning obfuscation...") + + file_list = archive.get_file_list() +@@ -586,27 +573,28 @@ third party. + caller=archive.archive_name) + + # if the archive was already a tarball, repack it +- method = archive.get_compression() +- if method: +- archive.report_msg("Re-compressing...") +- try: +- archive.rename_top_dir( +- self.obfuscate_string(archive.archive_name) +- ) +- archive.compress(method) +- except Exception as err: +- self.log_debug("Archive %s failed to compress: %s" +- % (archive.archive_name, err)) +- archive.report_msg("Failed to re-compress archive: %s" +- % err) +- return ++ if not archive.is_nested: ++ method = archive.get_compression() ++ if method: ++ archive.report_msg("Re-compressing...") ++ try: ++ archive.rename_top_dir( ++ self.obfuscate_string(archive.archive_name) ++ ) ++ archive.compress(method) ++ except Exception as err: ++ self.log_debug("Archive %s failed to compress: %s" ++ % (archive.archive_name, err)) ++ archive.report_msg("Failed to re-compress archive: %s" ++ % err) ++ return ++ self.completed_reports.append(archive) + + end_time = datetime.now() + arc_md.add_field('end_time', end_time) + arc_md.add_field('run_time', end_time - start_time) + arc_md.add_field('files_obfuscated', 
len(archive.file_sub_list)) + arc_md.add_field('total_substitutions', archive.total_sub_count) +- self.completed_reports.append(archive) + rmsg = '' + if archive.removed_file_count: + rmsg = " [removed %s unprocessable files]" +@@ -615,7 +603,7 @@ third party. + + except Exception as err: + self.ui_log.info("Exception while processing %s: %s" +- % (report, err)) ++ % (archive.archive_name, err)) + + def obfuscate_file(self, filename, short_name=None, arc_name=None): + """Obfuscate and individual file, line by line. +@@ -635,6 +623,8 @@ third party. + # the requested file doesn't exist in the archive + return + subs = 0 ++ if not short_name: ++ short_name = filename.split('/')[-1] + if not os.path.islink(filename): + # don't run the obfuscation on the link, but on the actual file + # at some other point. +@@ -745,3 +735,5 @@ third party. + for parser in self.parsers: + _sec = parse_sec.add_section(parser.name.replace(' ', '_').lower()) + _sec.add_field('entries', len(parser.mapping.dataset.keys())) ++ ++# vim: set et ts=4 sw=4 : +diff --git a/sos/cleaner/obfuscation_archive.py b/sos/cleaner/archives/__init__.py +similarity index 81% +rename from sos/cleaner/obfuscation_archive.py +rename to sos/cleaner/archives/__init__.py +index ea0b7012..795c5a78 100644 +--- a/sos/cleaner/obfuscation_archive.py ++++ b/sos/cleaner/archives/__init__.py +@@ -40,6 +40,10 @@ class SoSObfuscationArchive(): + file_sub_list = [] + total_sub_count = 0 + removed_file_count = 0 ++ type_name = 'undetermined' ++ description = 'undetermined' ++ is_nested = False ++ prep_files = {} + + def __init__(self, archive_path, tmpdir): + self.archive_path = archive_path +@@ -50,7 +54,43 @@ class SoSObfuscationArchive(): + self.soslog = logging.getLogger('sos') + self.ui_log = logging.getLogger('sos_ui') + self.skip_list = self._load_skip_list() +- self.log_info("Loaded %s as an archive" % self.archive_path) ++ self.is_extracted = False ++ self._load_self() ++ self.archive_root = '' ++ self.log_info( ++ 
"Loaded %s as type %s" ++ % (self.archive_path, self.description) ++ ) ++ ++ @classmethod ++ def check_is_type(cls, arc_path): ++ """Check if the archive is a well-known type we directly support""" ++ return False ++ ++ def _load_self(self): ++ if self.is_tarfile: ++ self.tarobj = tarfile.open(self.archive_path) ++ ++ def get_nested_archives(self): ++ """Return a list of ObfuscationArchives that represent additional ++ archives found within the target archive. For example, an archive from ++ `sos collect` will return a list of ``SoSReportArchive`` objects. ++ ++ This should be overridden by individual types of ObfuscationArchive's ++ """ ++ return [] ++ ++ def get_archive_root(self): ++ """Set the root path for the archive that should be prepended to any ++ filenames given to methods in this class. ++ """ ++ if self.is_tarfile: ++ toplevel = self.tarobj.firstmember ++ if toplevel.isdir(): ++ return toplevel.name ++ else: ++ return os.sep ++ return os.path.abspath(self.archive_path) + + def report_msg(self, msg): + """Helper to easily format ui messages on a per-report basis""" +@@ -96,10 +136,42 @@ class SoSObfuscationArchive(): + os.remove(full_fname) + self.removed_file_count += 1 + +- def extract(self): ++ def format_file_name(self, fname): ++ """Based on the type of archive we're dealing with, do whatever that ++ archive requires to a provided **relative** filepath to be able to ++ access it within the archive ++ """ ++ if not self.is_extracted: ++ if not self.archive_root: ++ self.archive_root = self.get_archive_root() ++ return os.path.join(self.archive_root, fname) ++ else: ++ return os.path.join(self.extracted_path, fname) ++ ++ def get_file_content(self, fname): ++ """Return the content from the specified fname. 
Particularly useful for ++ tarball-type archives so we can retrieve prep file contents prior to ++ extracting the entire archive ++ """ ++ if self.is_extracted is False and self.is_tarfile: ++ filename = self.format_file_name(fname) ++ try: ++ return self.tarobj.extractfile(filename).read().decode('utf-8') ++ except KeyError: ++ self.log_debug( ++ "Unable to retrieve %s: no such file in archive" % fname ++ ) ++ return '' ++ else: ++ with open(self.format_file_name(fname), 'r') as to_read: ++ return to_read.read() ++ ++ def extract(self, quiet=False): + if self.is_tarfile: +- self.report_msg("Extracting...") ++ if not quiet: ++ self.report_msg("Extracting...") + self.extracted_path = self.extract_self() ++ self.is_extracted = True + else: + self.extracted_path = self.archive_path + # if we're running as non-root (e.g. collector), then we can have a +@@ -317,3 +389,5 @@ class SoSObfuscationArchive(): + return False + except UnicodeDecodeError: + return True ++ ++# vim: set et ts=4 sw=4 : +diff --git a/sos/cleaner/archives/generic.py b/sos/cleaner/archives/generic.py +new file mode 100644 +index 00000000..2ce6f09b +--- /dev/null ++++ b/sos/cleaner/archives/generic.py +@@ -0,0 +1,52 @@ ++# Copyright 2020 Red Hat, Inc. Jake Hunsaker ++ ++# This file is part of the sos project: https://github.com/sosreport/sos ++# ++# This copyrighted material is made available to anyone wishing to use, ++# modify, copy, or redistribute it subject to the terms and conditions of ++# version 2 of the GNU General Public License. ++# ++# See the LICENSE file in the source distribution for further information. 
++ ++ ++from sos.cleaner.archives import SoSObfuscationArchive ++ ++import os ++import tarfile ++ ++ ++class DataDirArchive(SoSObfuscationArchive): ++ """A plain directory on the filesystem that is not directly associated with ++ any known or supported collection utility ++ """ ++ ++ type_name = 'data_dir' ++ description = 'unassociated directory' ++ ++ @classmethod ++ def check_is_type(cls, arc_path): ++ return os.path.isdir(arc_path) ++ ++ def set_archive_root(self): ++ return os.path.abspath(self.archive_path) ++ ++ ++class TarballArchive(SoSObfuscationArchive): ++ """A generic tar archive that is not associated with any known or supported ++ collection utility ++ """ ++ ++ type_name = 'tarball' ++ description = 'unassociated tarball' ++ ++ @classmethod ++ def check_is_type(cls, arc_path): ++ try: ++ return tarfile.is_tarfile(arc_path) ++ except Exception: ++ return False ++ ++ def set_archive_root(self): ++ if self.tarobj.firstmember.isdir(): ++ return self.tarobj.firstmember.name ++ return '' +diff --git a/sos/cleaner/archives/sos.py b/sos/cleaner/archives/sos.py +new file mode 100644 +index 00000000..4401d710 +--- /dev/null ++++ b/sos/cleaner/archives/sos.py +@@ -0,0 +1,106 @@ ++# Copyright 2021 Red Hat, Inc. Jake Hunsaker ++ ++# This file is part of the sos project: https://github.com/sosreport/sos ++# ++# This copyrighted material is made available to anyone wishing to use, ++# modify, copy, or redistribute it subject to the terms and conditions of ++# version 2 of the GNU General Public License. ++# ++# See the LICENSE file in the source distribution for further information. 
++ ++ ++from sos.cleaner.archives import SoSObfuscationArchive ++ ++import os ++import tarfile ++ ++ ++class SoSReportArchive(SoSObfuscationArchive): ++ """This is the class representing an sos report, or in other words the ++ type the archive the SoS project natively generates ++ """ ++ ++ type_name = 'report' ++ description = 'sos report archive' ++ prep_files = { ++ 'hostname': 'sos_commands/host/hostname', ++ 'ip': 'sos_commands/networking/ip_-o_addr', ++ 'mac': 'sos_commands/networking/ip_-d_address', ++ 'username': [ ++ 'sos_commands/login/lastlog_-u_1000-60000', ++ 'sos_commands/login/lastlog_-u_60001-65536', ++ 'sos_commands/login/lastlog_-u_65537-4294967295', ++ # AD users will be reported here, but favor the lastlog files since ++ # those will include local users who have not logged in ++ 'sos_commands/login/last' ++ ] ++ } ++ ++ @classmethod ++ def check_is_type(cls, arc_path): ++ try: ++ return tarfile.is_tarfile(arc_path) and 'sosreport-' in arc_path ++ except Exception: ++ return False ++ ++ ++class SoSReportDirectory(SoSReportArchive): ++ """This is the archive class representing a build directory, or in other ++ words what `sos report --clean` will end up using for in-line obfuscation ++ """ ++ ++ type_name = 'report_dir' ++ description = 'sos report directory' ++ ++ @classmethod ++ def check_is_type(cls, arc_path): ++ if os.path.isdir(arc_path): ++ return 'sos_logs' in os.listdir(arc_path) ++ return False ++ ++ ++class SoSCollectorArchive(SoSObfuscationArchive): ++ """Archive class representing the tarball created by ``sos collect``. 
It ++ will not provide prep files on its own, however it will provide a list ++ of SoSReportArchive's which will then be used to prep the parsers ++ """ ++ ++ type_name = 'collect' ++ description = 'sos collect tarball' ++ is_nested = True ++ ++ @classmethod ++ def check_is_type(cls, arc_path): ++ try: ++ return (tarfile.is_tarfile(arc_path) and 'sos-collect' in arc_path) ++ except Exception: ++ return False ++ ++ def get_nested_archives(self): ++ self.extract(quiet=True) ++ _path = self.extracted_path ++ archives = [] ++ for fname in os.listdir(_path): ++ arc_name = os.path.join(_path, fname) ++ if 'sosreport-' in fname and tarfile.is_tarfile(arc_name): ++ archives.append(SoSReportArchive(arc_name, self.tmpdir)) ++ return archives ++ ++ ++class SoSCollectorDirectory(SoSCollectorArchive): ++ """The archive class representing the temp directory used by ``sos ++ collect`` when ``--clean`` is used during runtime. ++ """ ++ ++ type_name = 'collect_dir' ++ description = 'sos collect directory' ++ ++ @classmethod ++ def check_is_type(cls, arc_path): ++ if os.path.isdir(arc_path): ++ for fname in os.listdir(arc_path): ++ if 'sos-collector-' in fname: ++ return True ++ return False ++ ++# vim: set et ts=4 sw=4 : +diff --git a/sos/cleaner/parsers/__init__.py b/sos/cleaner/parsers/__init__.py +index af6e375e..e62fd938 100644 +--- a/sos/cleaner/parsers/__init__.py ++++ b/sos/cleaner/parsers/__init__.py +@@ -37,11 +37,6 @@ class SoSCleanerParser(): + :cvar map_file_key: The key in the ``map_file`` to read when loading + previous obfuscation matches + :vartype map_file_key: ``str`` +- +- +- :cvar prep_map_file: File to read from an archive to pre-seed the map with +- matches. E.G. 
ip_addr for loading IP addresses +- :vartype prep_map_fie: ``str`` + """ + + name = 'Undefined Parser' +@@ -49,7 +44,6 @@ class SoSCleanerParser(): + skip_line_patterns = [] + skip_files = [] + map_file_key = 'unset' +- prep_map_file = [] + + def __init__(self, config={}): + if self.map_file_key in config: +diff --git a/sos/cleaner/parsers/hostname_parser.py b/sos/cleaner/parsers/hostname_parser.py +index 71e13d3f..daa76a62 100644 +--- a/sos/cleaner/parsers/hostname_parser.py ++++ b/sos/cleaner/parsers/hostname_parser.py +@@ -16,7 +16,6 @@ class SoSHostnameParser(SoSCleanerParser): + + name = 'Hostname Parser' + map_file_key = 'hostname_map' +- prep_map_file = 'sos_commands/host/hostname' + regex_patterns = [ + r'(((\b|_)[a-zA-Z0-9-\.]{1,200}\.[a-zA-Z]{1,63}(\b|_)))' + ] +diff --git a/sos/cleaner/parsers/ip_parser.py b/sos/cleaner/parsers/ip_parser.py +index 525139e8..71d38be8 100644 +--- a/sos/cleaner/parsers/ip_parser.py ++++ b/sos/cleaner/parsers/ip_parser.py +@@ -41,7 +41,6 @@ class SoSIPParser(SoSCleanerParser): + ] + + map_file_key = 'ip_map' +- prep_map_file = 'sos_commands/networking/ip_-o_addr' + + def __init__(self, config): + self.mapping = SoSIPMap() +diff --git a/sos/cleaner/parsers/keyword_parser.py b/sos/cleaner/parsers/keyword_parser.py +index 68de3727..694c6073 100644 +--- a/sos/cleaner/parsers/keyword_parser.py ++++ b/sos/cleaner/parsers/keyword_parser.py +@@ -20,7 +20,6 @@ class SoSKeywordParser(SoSCleanerParser): + + name = 'Keyword Parser' + map_file_key = 'keyword_map' +- prep_map_file = '' + + def __init__(self, config, keywords=None, keyword_file=None): + self.mapping = SoSKeywordMap() +diff --git a/sos/cleaner/parsers/mac_parser.py b/sos/cleaner/parsers/mac_parser.py +index 7ca80b8d..c74288cf 100644 +--- a/sos/cleaner/parsers/mac_parser.py ++++ b/sos/cleaner/parsers/mac_parser.py +@@ -30,7 +30,6 @@ class SoSMacParser(SoSCleanerParser): + '534f:53' + ) + map_file_key = 'mac_map' +- prep_map_file = 'sos_commands/networking/ip_-d_address' + + 
def __init__(self, config): + self.mapping = SoSMacMap() +diff --git a/sos/cleaner/parsers/username_parser.py b/sos/cleaner/parsers/username_parser.py +index b142e371..35377a31 100644 +--- a/sos/cleaner/parsers/username_parser.py ++++ b/sos/cleaner/parsers/username_parser.py +@@ -25,14 +25,6 @@ class SoSUsernameParser(SoSCleanerParser): + + name = 'Username Parser' + map_file_key = 'username_map' +- prep_map_file = [ +- 'sos_commands/login/lastlog_-u_1000-60000', +- 'sos_commands/login/lastlog_-u_60001-65536', +- 'sos_commands/login/lastlog_-u_65537-4294967295', +- # AD users will be reported here, but favor the lastlog files since +- # those will include local users who have not logged in +- 'sos_commands/login/last' +- ] + regex_patterns = [] + skip_list = [ + 'core', +diff --git a/tests/cleaner_tests/existing_archive.py b/tests/cleaner_tests/existing_archive.py +index 0eaf6c8d..e13d1cae 100644 +--- a/tests/cleaner_tests/existing_archive.py ++++ b/tests/cleaner_tests/existing_archive.py +@@ -28,6 +28,13 @@ class ExistingArchiveCleanTest(StageTwoReportTest): + def test_obfuscation_log_created(self): + self.assertFileExists(os.path.join(self.tmpdir, '%s-obfuscation.log' % ARCHIVE)) + ++ def test_archive_type_correct(self): ++ with open(os.path.join(self.tmpdir, '%s-obfuscation.log' % ARCHIVE), 'r') as log: ++ for line in log: ++ if "Loaded %s" % ARCHIVE in line: ++ assert 'as type sos report archive' in line, "Incorrect archive type detected: %s" % line ++ break ++ + def test_from_cmdline_logged(self): + with open(os.path.join(self.tmpdir, '%s-obfuscation.log' % ARCHIVE), 'r') as log: + for line in log: +diff --git a/tests/cleaner_tests/full_report_run.py b/tests/cleaner_tests/full_report_run.py +index 3b28e7a2..2de54946 100644 +--- a/tests/cleaner_tests/full_report_run.py ++++ b/tests/cleaner_tests/full_report_run.py +@@ -35,6 +35,9 @@ class FullCleanTest(StageTwoReportTest): + def test_tarball_named_obfuscated(self): + self.assertTrue('obfuscated' in 
self.archive) + ++ def test_archive_type_correct(self): ++ self.assertSosLogContains('Loaded .* as type sos report directory') ++ + def test_hostname_not_in_any_file(self): + host = self.sysinfo['pre']['networking']['hostname'] + # much faster to just use grep here +diff --git a/tests/cleaner_tests/report_with_mask.py b/tests/cleaner_tests/report_with_mask.py +index 4f94ba33..08e873d4 100644 +--- a/tests/cleaner_tests/report_with_mask.py ++++ b/tests/cleaner_tests/report_with_mask.py +@@ -31,6 +31,9 @@ class ReportWithMask(StageOneReportTest): + def test_tarball_named_obfuscated(self): + self.assertTrue('obfuscated' in self.archive) + ++ def test_archive_type_correct(self): ++ self.assertSosLogContains('Loaded .* as type sos report directory') ++ + def test_localhost_was_obfuscated(self): + self.assertFileHasContent('/etc/hostname', 'host0') + +-- +2.31.1 + +From 9b119f860eaec089f7ef884ff39c42589a662994 Mon Sep 17 00:00:00 2001 +From: Jake Hunsaker +Date: Wed, 1 Sep 2021 00:34:04 -0400 +Subject: [PATCH] [hostname_map] Add a catch for single-character hostnames + +If a log file was truncated at a specific boundary in a string of the +FQDN of the host such that we only get a couple characters before the +rest of the domain, we would previously boldly replace all instances of +that character with the obfuscated short name; not very helpful. + +Instead, don't sanitize the short name if this happens and instead +obfuscate the whole FQDN as 'unknown.example.com'. 
+ +Signed-off-by: Jake Hunsaker +--- + sos/cleaner/mappings/hostname_map.py | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py +index d4b2c88e..e70a5530 100644 +--- a/sos/cleaner/mappings/hostname_map.py ++++ b/sos/cleaner/mappings/hostname_map.py +@@ -184,7 +184,14 @@ class SoSHostnameMap(SoSMap): + hostname = host[0] + domain = host[1:] + # obfuscate the short name +- ob_hostname = self.sanitize_short_name(hostname) ++ if len(hostname) > 2: ++ ob_hostname = self.sanitize_short_name(hostname) ++ else: ++ # by best practice it appears the host part of the fqdn was cut ++ # off due to some form of truncating, as such don't obfuscate ++ # short strings that are likely to throw off obfuscation of ++ # unrelated bits and paths ++ ob_hostname = 'unknown' + ob_domain = self.sanitize_domain(domain) + self.dataset[item] = ob_domain + return '.'.join([ob_hostname, ob_domain]) +-- +2.31.1 + +From f3f3e763d7c31b7b7cafdf8dd4dab87056fb7696 Mon Sep 17 00:00:00 2001 +From: Jake Hunsaker +Date: Wed, 1 Sep 2021 15:54:55 -0400 +Subject: [PATCH] [cleaner] Add support for Insights client archives + +Adds a new type of `SoSObfuscationArchive` to add support for +obfuscating archives generated by the Insights project. 
+ +Signed-off-by: Jake Hunsaker +--- + man/en/sos-clean.1 | 1 + + sos/cleaner/__init__.py | 4 ++- + sos/cleaner/archives/insights.py | 42 ++++++++++++++++++++++++++++++++ + 3 files changed, 46 insertions(+), 1 deletion(-) + create mode 100644 sos/cleaner/archives/insights.py + +diff --git a/man/en/sos-clean.1 b/man/en/sos-clean.1 +index 54026713..358ec0cb 100644 +--- a/man/en/sos-clean.1 ++++ b/man/en/sos-clean.1 +@@ -105,6 +105,7 @@ The following are accepted values for this option: + \fBauto\fR Automatically detect the archive type + \fBreport\fR An archive generated by \fBsos report\fR + \fBcollect\fR An archive generated by \fBsos collect\fR ++ \fBinsights\fR An archive generated by the \fBinsights-client\fR package + + The following may also be used, however note that these do not attempt to pre-load + any information from the archives into the parsers. This means that, among other limitations, +diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py +index 6d2eb483..3e08aa28 100644 +--- a/sos/cleaner/__init__.py ++++ b/sos/cleaner/__init__.py +@@ -29,6 +29,7 @@ from sos.cleaner.archives.sos import (SoSReportArchive, SoSReportDirectory, + SoSCollectorArchive, + SoSCollectorDirectory) + from sos.cleaner.archives.generic import DataDirArchive, TarballArchive ++from sos.cleaner.archives.insights import InsightsArchive + from sos.utilities import get_human_readable + from textwrap import fill + +@@ -100,6 +101,7 @@ class SoSCleaner(SoSComponent): + SoSReportArchive, + SoSCollectorDirectory, + SoSCollectorArchive, ++ InsightsArchive, + # make sure these two are always last as they are fallbacks + DataDirArchive, + TarballArchive +@@ -194,7 +196,7 @@ third party. 
+ help='The directory or archive to obfuscate') + clean_grp.add_argument('--archive-type', default='auto', + choices=['auto', 'report', 'collect', +- 'data-dir', 'tarball'], ++ 'insights', 'data-dir', 'tarball'], + help=('Specify what kind of archive the target ' + 'was generated as')) + clean_grp.add_argument('--domains', action='extend', default=[], +diff --git a/sos/cleaner/archives/insights.py b/sos/cleaner/archives/insights.py +new file mode 100644 +index 00000000..dab48b16 +--- /dev/null ++++ b/sos/cleaner/archives/insights.py +@@ -0,0 +1,42 @@ ++# Copyright 2021 Red Hat, Inc. Jake Hunsaker ++ ++# This file is part of the sos project: https://github.com/sosreport/sos ++# ++# This copyrighted material is made available to anyone wishing to use, ++# modify, copy, or redistribute it subject to the terms and conditions of ++# version 2 of the GNU General Public License. ++# ++# See the LICENSE file in the source distribution for further information. ++ ++ ++from sos.cleaner.archives import SoSObfuscationArchive ++ ++import tarfile ++ ++ ++class InsightsArchive(SoSObfuscationArchive): ++ """This class represents archives generated by the insights-client utility ++ for RHEL systems. 
++ """ ++ ++ type_name = 'insights' ++ description = 'insights-client archive' ++ ++ prep_files = { ++ 'hostname': 'data/insights_commands/hostname_-f', ++ 'ip': 'data/insights_commands/ip_addr', ++ 'mac': 'data/insights_commands/ip_addr' ++ } ++ ++ @classmethod ++ def check_is_type(cls, arc_path): ++ try: ++ return tarfile.is_tarfile(arc_path) and 'insights-' in arc_path ++ except Exception: ++ return False ++ ++ def get_archive_root(self): ++ top = self.archive_path.split('/')[-1].split('.tar')[0] ++ if self.tarobj.firstmember.name == '.': ++ top = './' + top ++ return top +-- +2.31.1 + +From 9639dc3d240076b55f2a1d04b43ea42bebd09215 Mon Sep 17 00:00:00 2001 +From: Jake Hunsaker +Date: Tue, 16 Nov 2021 17:50:42 -0500 +Subject: [PATCH] [clean,hostname_parser] Source /etc/hosts for obfuscation + +Up until now, our sourcing of hostnames/domains for obfuscation has been +dependent upon the output of the `hostname` command. However, some +scenarios have come up where sourcing `/etc/hosts` is advantageous for +several reasons: + +First, if `hostname` output is unavailable, this provides a fallback +measure. + +Second, `/etc/hosts` is a common place to have short names defined which +would otherwise not be detected (or at the very least would result in a +race condition based on where/if the short name was elsewhere able to be +gleaned from an FQDN), thus leaving the potential for unobfuscated data +in an archive. + +Due to both the nature of hostname obfuscation and the malleable syntax +of `/etc/hosts`, the parsing of this file needs special handling not +covered by our more generic parsing and obfuscation methods. 
+ +Signed-off-by: Jake Hunsaker +--- + sos/cleaner/__init__.py | 11 ++++++++--- + sos/cleaner/archives/sos.py | 5 ++++- + sos/cleaner/parsers/hostname_parser.py | 19 +++++++++++++++++++ + 3 files changed, 31 insertions(+), 4 deletions(-) + +diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py +index ed461a8f..3f530d44 100644 +--- a/sos/cleaner/__init__.py ++++ b/sos/cleaner/__init__.py +@@ -523,9 +523,14 @@ third party. + if isinstance(_parser, SoSUsernameParser): + _parser.load_usernames_into_map(content) + elif isinstance(_parser, SoSHostnameParser): +- _parser.load_hostname_into_map( +- content.splitlines()[0] +- ) ++ if 'hostname' in parse_file: ++ _parser.load_hostname_into_map( ++ content.splitlines()[0] ++ ) ++ elif 'etc/hosts' in parse_file: ++ _parser.load_hostname_from_etc_hosts( ++ content ++ ) + else: + for line in content.splitlines(): + self.obfuscate_line(line) +diff --git a/sos/cleaner/archives/sos.py b/sos/cleaner/archives/sos.py +index 4401d710..f8720c88 100644 +--- a/sos/cleaner/archives/sos.py ++++ b/sos/cleaner/archives/sos.py +@@ -23,7 +23,10 @@ class SoSReportArchive(SoSObfuscationArchive): + type_name = 'report' + description = 'sos report archive' + prep_files = { +- 'hostname': 'sos_commands/host/hostname', ++ 'hostname': [ ++ 'sos_commands/host/hostname', ++ 'etc/hosts' ++ ], + 'ip': 'sos_commands/networking/ip_-o_addr', + 'mac': 'sos_commands/networking/ip_-d_address', + 'username': [ +diff --git a/sos/cleaner/parsers/hostname_parser.py b/sos/cleaner/parsers/hostname_parser.py +index daa76a62..0a733bee 100644 +--- a/sos/cleaner/parsers/hostname_parser.py ++++ b/sos/cleaner/parsers/hostname_parser.py +@@ -61,6 +61,25 @@ class SoSHostnameParser(SoSCleanerParser): + self.mapping.add(high_domain) + self.mapping.add(hostname_string) + ++ def load_hostname_from_etc_hosts(self, content): ++ """Parse an archive's copy of /etc/hosts, which requires handling that ++ is separate from the output of the `hostname` command. 
Just like ++ load_hostname_into_map(), this has to be done explicitly and we ++ cannot rely upon the more generic methods to do this reliably. ++ """ ++ lines = content.splitlines() ++ for line in lines: ++ if line.startswith('#') or 'localhost' in line: ++ continue ++ hostln = line.split()[1:] ++ for host in hostln: ++ if len(host.split('.')) == 1: ++ # only generate a mapping for fqdns but still record the ++ # short name here for later obfuscation with parse_line() ++ self.short_names.append(host) ++ else: ++ self.mapping.add(host) ++ + def parse_line(self, line): + """Override the default parse_line() method to also check for the + shortname of the host derived from the hostname. +-- +2.31.1 + +From c1680226b53452b18f27f2e76c3e0e03e521f935 Mon Sep 17 00:00:00 2001 +From: Jake Hunsaker +Date: Wed, 17 Nov 2021 13:11:33 -0500 +Subject: [PATCH] [clean, hostname] Fix unintentionally case sensitive + shortname handling + +It was discovered that our extra handling for shortnames was +unintentionally case sensitive. Fix this to ensure that shortnames are +obfuscated regardless of case in all collected text. 
+ +Signed-off-by: Jake Hunsaker +--- + sos/cleaner/mappings/hostname_map.py | 6 +++--- + sos/cleaner/parsers/hostname_parser.py | 8 +++++--- + tests/cleaner_tests/full_report_run.py | 21 ++++++++++++++++++++- + 3 files changed, 28 insertions(+), 7 deletions(-) + +diff --git a/sos/cleaner/mappings/hostname_map.py b/sos/cleaner/mappings/hostname_map.py +index e70a5530..0fe78fb1 100644 +--- a/sos/cleaner/mappings/hostname_map.py ++++ b/sos/cleaner/mappings/hostname_map.py +@@ -169,13 +169,13 @@ class SoSHostnameMap(SoSMap): + + def sanitize_item(self, item): + host = item.split('.') +- if all([h.isupper() for h in host]): ++ if len(host) > 1 and all([h.isupper() for h in host]): + # by convention we have just a domain + _host = [h.lower() for h in host] + return self.sanitize_domain(_host).upper() + if len(host) == 1: + # we have a shortname for a host +- return self.sanitize_short_name(host[0]) ++ return self.sanitize_short_name(host[0].lower()) + if len(host) == 2: + # we have just a domain name, e.g. example.com + return self.sanitize_domain(host) +@@ -185,7 +185,7 @@ class SoSHostnameMap(SoSMap): + domain = host[1:] + # obfuscate the short name + if len(hostname) > 2: +- ob_hostname = self.sanitize_short_name(hostname) ++ ob_hostname = self.sanitize_short_name(hostname.lower()) + else: + # by best practice it appears the host part of the fqdn was cut + # off due to some form of truncating, as such don't obfuscate +diff --git a/sos/cleaner/parsers/hostname_parser.py b/sos/cleaner/parsers/hostname_parser.py +index 0a733bee..7fd0e698 100644 +--- a/sos/cleaner/parsers/hostname_parser.py ++++ b/sos/cleaner/parsers/hostname_parser.py +@@ -8,6 +8,8 @@ + # + # See the LICENSE file in the source distribution for further information. 
+ ++import re ++ + from sos.cleaner.parsers import SoSCleanerParser + from sos.cleaner.mappings.hostname_map import SoSHostnameMap + +@@ -91,9 +93,9 @@ class SoSHostnameParser(SoSCleanerParser): + """ + if search in self.mapping.skip_keys: + return ln, count +- if search in ln: +- count += ln.count(search) +- ln = ln.replace(search, self.mapping.get(repl or search)) ++ _reg = re.compile(search, re.I) ++ if _reg.search(ln): ++ return _reg.subn(self.mapping.get(repl or search), ln) + return ln, count + + count = 0 +diff --git a/tests/cleaner_tests/full_report_run.py b/tests/cleaner_tests/full_report_run.py +index 2de54946..0b23acaf 100644 +--- a/tests/cleaner_tests/full_report_run.py ++++ b/tests/cleaner_tests/full_report_run.py +@@ -26,6 +26,24 @@ class FullCleanTest(StageTwoReportTest): + # replace with an empty placeholder, make sure that this test case is not + # influenced by previous clean runs + files = ['/etc/sos/cleaner/default_mapping'] ++ packages = { ++ 'rhel': ['python3-systemd'], ++ 'ubuntu': ['python3-systemd'] ++ } ++ ++ def pre_sos_setup(self): ++ # ensure that case-insensitive matching of FQDNs and shortnames work ++ from systemd import journal ++ from socket import gethostname ++ host = gethostname() ++ short = host.split('.')[0] ++ sosfd = journal.stream('sos-testing') ++ sosfd.write( ++ "This is a test line from sos clean testing. The hostname %s " ++ "should not appear, nor should %s in an obfuscated archive. The " ++ "shortnames of %s and %s should also not appear." 
++ % (host.lower(), host.upper(), short.lower(), short.upper()) ++ ) + + def test_private_map_was_generated(self): + self.assertOutputContains('A mapping of obfuscated elements is available at') +@@ -40,8 +58,9 @@ class FullCleanTest(StageTwoReportTest): + + def test_hostname_not_in_any_file(self): + host = self.sysinfo['pre']['networking']['hostname'] ++ short = host.split('.')[0] + # much faster to just use grep here +- content = self.grep_for_content(host) ++ content = self.grep_for_content(host) + self.grep_for_content(short) + if not content: + assert True + else: +-- +2.31.1 + +From aaeb8cb57ed55598ab744b96d4f127aedebcb292 Mon Sep 17 00:00:00 2001 +From: Jake Hunsaker +Date: Tue, 21 Sep 2021 15:23:20 -0400 +Subject: [PATCH] [build] Add archives to setup.py packages + +Adds the newly abstracted `sos.cleaner.archives` package to `setup.py` +so that manual builds will properly include it. + +Signed-off-by: Jake Hunsaker +--- + setup.py | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/setup.py b/setup.py +index 1e8d8e2dc5..7653b59de3 100644 +--- a/setup.py ++++ b/setup.py +@@ -102,7 +102,7 @@ def copy_file (self, filename, dirname): + 'sos.policies.package_managers', 'sos.policies.init_systems', + 'sos.report', 'sos.report.plugins', 'sos.collector', + 'sos.collector.clusters', 'sos.cleaner', 'sos.cleaner.mappings', +- 'sos.cleaner.parsers' ++ 'sos.cleaner.parsers', 'sos.cleaner.archives' + ], + cmdclass=cmdclass, + command_options=command_options, diff --git a/SOURCES/sos-bz2025610-RHTS-api-change.patch b/SOURCES/sos-bz2025610-RHTS-api-change.patch new file mode 100644 index 0000000..580117f --- /dev/null +++ b/SOURCES/sos-bz2025610-RHTS-api-change.patch @@ -0,0 +1,224 @@ +From 2e8b5e2d4f30854cce93d149fc7d24b9d9cfd02c Mon Sep 17 00:00:00 2001 +From: Pavel Moravec +Date: Fri, 19 Nov 2021 16:16:07 +0100 +Subject: [PATCH 1/3] [policies] strip path from SFTP upload filename + +When case_id is not supplied, we ask SFTP server to store the 
uploaded +file under name /var/tmp/, which is confusing. + +Let's remove the path from it also when case_id is not supplied. + +Related to: #2764 + +Signed-off-by: Pavel Moravec +--- + sos/policies/distros/redhat.py | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/sos/policies/distros/redhat.py b/sos/policies/distros/redhat.py +index 3476e21fb..8817fc785 100644 +--- a/sos/policies/distros/redhat.py ++++ b/sos/policies/distros/redhat.py +@@ -269,10 +269,10 @@ def _get_sftp_upload_name(self): + """The RH SFTP server will only automatically connect file uploads to + cases if the filename _starts_ with the case number + """ ++ fname = self.upload_archive_name.split('/')[-1] + if self.case_id: +- return "%s_%s" % (self.case_id, +- self.upload_archive_name.split('/')[-1]) +- return self.upload_archive_name ++ return "%s_%s" % (self.case_id, fname) ++ return fname + + def upload_sftp(self): + """Override the base upload_sftp to allow for setting an on-demand + +From 61023b29a656dd7afaa4a0643368b0a53f1a3779 Mon Sep 17 00:00:00 2001 +From: Pavel Moravec +Date: Fri, 19 Nov 2021 17:31:31 +0100 +Subject: [PATCH 2/3] [redhat] update SFTP API version to v2 + +Change API version from v1 to v2, which includes: +- change of URL +- different URI +- POST method for token generation instead of GET + +Resolves: #2764 + +Signed-off-by: Pavel Moravec +--- + sos/policies/distros/redhat.py | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/sos/policies/distros/redhat.py b/sos/policies/distros/redhat.py +index 8817fc785..e4e2b8835 100644 +--- a/sos/policies/distros/redhat.py ++++ b/sos/policies/distros/redhat.py +@@ -175,7 +175,7 @@ def get_tmp_dir(self, opt_tmp_dir): + No changes will be made to system configuration. 
+ """ + +-RH_API_HOST = "https://access.redhat.com" ++RH_API_HOST = "https://api.access.redhat.com" + RH_SFTP_HOST = "sftp://sftp.access.redhat.com" + + +@@ -287,12 +287,12 @@ def upload_sftp(self): + " for obtaining SFTP auth token.") + _token = None + _user = None ++ url = RH_API_HOST + '/support/v2/sftp/token' + # we have a username and password, but we need to reset the password + # to be the token returned from the auth endpoint + if self.get_upload_user() and self.get_upload_password(): +- url = RH_API_HOST + '/hydra/rest/v1/sftp/token' + auth = self.get_upload_https_auth() +- ret = requests.get(url, auth=auth, timeout=10) ++ ret = requests.post(url, auth=auth, timeout=10) + if ret.status_code == 200: + # credentials are valid + _user = self.get_upload_user() +@@ -302,8 +302,8 @@ def upload_sftp(self): + "credentials. Will try anonymous.") + # we either do not have a username or password/token, or both + if not _token: +- aurl = RH_API_HOST + '/hydra/rest/v1/sftp/token?isAnonymous=true' +- anon = requests.get(aurl, timeout=10) ++ adata = {"isAnonymous": True} ++ anon = requests.post(url, data=json.dumps(adata), timeout=10) + if anon.status_code == 200: + resp = json.loads(anon.text) + _user = resp['username'] + +From 267da2156ec61f526dd28e760ff6528408a76c3f Mon Sep 17 00:00:00 2001 +From: Pavel Moravec +Date: Mon, 22 Nov 2021 15:22:32 +0100 +Subject: [PATCH 3/3] [policies] Deal 200 return code as success + +Return code 200 of POST method request must be dealt as success. + +Newly required due to the SFTP API change using POST. 
+ +Related to: #2764 + +Signed-off-by: Pavel Moravec +--- + sos/policies/distros/__init__.py | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/sos/policies/distros/__init__.py b/sos/policies/distros/__init__.py +index 0906fa779..6f257fdce 100644 +--- a/sos/policies/distros/__init__.py ++++ b/sos/policies/distros/__init__.py +@@ -551,7 +551,7 @@ def upload_https(self): + r = self._upload_https_put(arc, verify) + else: + r = self._upload_https_post(arc, verify) +- if r.status_code != 201: ++ if r.status_code != 200 and r.status_code != 201: + if r.status_code == 401: + raise Exception( + "Authentication failed: invalid user credentials" +From 8da1b14246226792c160dd04e5c7c75dd4e8d44b Mon Sep 17 00:00:00 2001 +From: Pavel Moravec +Date: Mon, 22 Nov 2021 10:44:09 +0100 +Subject: [PATCH] [collect] fix moved get_upload_url under Policy class + +SoSCollector does not further declare get_upload_url method +as that was moved under Policy class(es). + +Resolves: #2766 + +Signed-off-by: Pavel Moravec +--- + sos/collector/__init__.py | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/sos/collector/__init__.py b/sos/collector/__init__.py +index 50183e873..42a7731d6 100644 +--- a/sos/collector/__init__.py ++++ b/sos/collector/__init__.py +@@ -1219,7 +1219,7 @@ this utility or remote systems that it c + msg = 'No sosreports were collected, nothing to archive...' + self.exit(msg, 1) + +- if self.opts.upload and self.get_upload_url(): ++ if self.opts.upload and self.policy.get_upload_url(): + try: + self.policy.upload_archive(arc_name) + self.ui_log.info("Uploaded archive successfully") +From abb2fc65bd14760021c61699ad3113cab3bd4c64 Mon Sep 17 00:00:00 2001 +From: Pavel Moravec +Date: Tue, 30 Nov 2021 11:37:02 +0100 +Subject: [PATCH 1/2] [redhat] Fix broken URI to upload to customer portal + +Revert back the unwanted change in URI of uploading tarball to the +Red Hat Customer portal. 
+ +Related: #2772 + +Signed-off-by: Pavel Moravec +--- + sos/policies/distros/redhat.py | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/sos/policies/distros/redhat.py b/sos/policies/distros/redhat.py +index e4e2b883..eb442407 100644 +--- a/sos/policies/distros/redhat.py ++++ b/sos/policies/distros/redhat.py +@@ -250,7 +250,7 @@ support representative. + elif self.commons['cmdlineopts'].upload_protocol == 'sftp': + return RH_SFTP_HOST + else: +- rh_case_api = "/hydra/rest/cases/%s/attachments" ++ rh_case_api = "/support/v1/cases/%s/attachments" + return RH_API_HOST + rh_case_api % self.case_id + + def _get_upload_headers(self): +-- +2.31.1 + + +From ea4f9e88a412c80a4791396e1bb78ac1e24ece14 Mon Sep 17 00:00:00 2001 +From: Pavel Moravec +Date: Tue, 30 Nov 2021 13:00:26 +0100 +Subject: [PATCH 2/2] [policy] Add error message when FTP upload write failure + +When (S)FTP upload fails to write the destination file, +our "expect" code should detect it sooner than after timeout happens +and write appropriate error message. 
+ +Resolves: #2772 + +Signed-off-by: Pavel Moravec +--- + sos/policies/distros/__init__.py | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/sos/policies/distros/__init__.py b/sos/policies/distros/__init__.py +index 6f257fdc..7bdc81b8 100644 +--- a/sos/policies/distros/__init__.py ++++ b/sos/policies/distros/__init__.py +@@ -473,7 +473,8 @@ class LinuxPolicy(Policy): + put_expects = [ + u'100%', + pexpect.TIMEOUT, +- pexpect.EOF ++ pexpect.EOF, ++ u'No such file or directory' + ] + + put_success = ret.expect(put_expects, timeout=180) +@@ -485,6 +486,8 @@ class LinuxPolicy(Policy): + raise Exception("Timeout expired while uploading") + elif put_success == 2: + raise Exception("Unknown error during upload: %s" % ret.before) ++ elif put_success == 3: ++ raise Exception("Unable to write archive to destination") + else: + raise Exception("Unexpected response from server: %s" % ret.before) + +-- +2.31.1 + diff --git a/SPECS/sos.spec b/SPECS/sos.spec index 8498505..a23b105 100644 --- a/SPECS/sos.spec +++ b/SPECS/sos.spec @@ -5,13 +5,13 @@ Summary: A set of tools to gather troubleshooting information from a system Name: sos Version: 4.2 -Release: 4%{?dist} +Release: 6%{?dist} Group: Applications/System Source0: https://github.com/sosreport/sos/archive/%{version}/sos-%{version}.tar.gz Source1: sos-audit-%{auditversion}.tgz License: GPLv2+ BuildArch: noarch -Url: http://github.com/sosreport/sos +Url: https://github.com/sosreport/sos BuildRequires: python3-devel BuildRequires: gettext Requires: libxml2-python3 @@ -34,6 +34,8 @@ Patch10: sos-bz2004929-openvswitch-offline-analysis.patch Patch11: sos-bz2012857-plugin-timeout-unhandled-exception.patch Patch12: sos-bz2018033-plugin-timeouts-proper-handling.patch Patch13: sos-bz2020777-filter-namespace-per-pattern.patch +Patch14: sos-bz2023867-cleaner-hostnames-improvements.patch +Patch15: sos-bz2025610-RHTS-api-change.patch %description @@ -58,6 +60,8 @@ support technicians and developers. 
%patch11 -p1 %patch12 -p1 %patch13 -p1 +%patch14 -p1 +%patch15 -p1 %build %py3_build @@ -124,6 +128,18 @@ of the system. Currently storage and filesystem commands are audited. %ghost /etc/audit/rules.d/40-sos-storage.rules %changelog +* Tue Nov 30 2021 Pavel Moravec = 4.2-6 +- [redhat] Fix broken URI to upload to customer portal + Resolves: bz2025610 + +* Mon Nov 22 2021 Pavel Moravec = 4.2-5 +- [clean,hostname_parser] Source /etc/hosts for obfuscation + Resolves: bz2023867 +- [clean, hostname] Fix unintentionally case sensitive + Resolves: bz2023863 +- [redhat] update SFTP API version to v2 + Resolves: bz2025610 + * Tue Nov 16 2021 Pavel Moravec = 4.2-4 - [report] Calculate sizes of dirs, symlinks and manifest in Resolves: bz1873185