Blame SOURCES/sos-bz1930181-collect-cleaning-consistency.patch

24a42c
From fc0218638f3e865c4315823e72aef2f46d012d07 Mon Sep 17 00:00:00 2001
24a42c
From: Jake Hunsaker <jhunsake@redhat.com>
24a42c
Date: Wed, 14 Apr 2021 11:55:03 -0400
24a42c
Subject: [PATCH 1/2] [clean] Load maps from all archives before obfuscation
24a42c
 loop
24a42c
24a42c
Previously, maps were being prepped via archives after extraction. This
24a42c
reduced the amount of file IO being done, but made it so that necessary
24a42c
obfuscations from later archives in a series would not be applied in
24a42c
the archives obfuscated before those later archives were extracted.
24a42c
24a42c
Fix this by extracting the map prep files into memory for each archive
24a42c
to prep the maps before we enter the obfuscation loop entirely.
24a42c
24a42c
Closes: #2490
24a42c
Related: RHBZ#1930181
24a42c
Resolves: #2492
24a42c
24a42c
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
24a42c
---
24a42c
 sos/cleaner/__init__.py                | 69 +++++++++++++++-----------
24a42c
 sos/cleaner/parsers/username_parser.py | 13 +++--
24a42c
 2 files changed, 45 insertions(+), 37 deletions(-)
24a42c
24a42c
diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
24a42c
index b9eb61ef..d10cdc55 100644
24a42c
--- a/sos/cleaner/__init__.py
24a42c
+++ b/sos/cleaner/__init__.py
24a42c
@@ -292,6 +292,7 @@ third party.
24a42c
 
24a42c
         # we have at least one valid target to obfuscate
24a42c
         self.completed_reports = []
24a42c
+        self.preload_all_archives_into_maps()
24a42c
         self.obfuscate_report_paths()
24a42c
 
24a42c
         if not self.completed_reports:
24a42c
@@ -473,6 +474,44 @@ third party.
24a42c
             self.ui_log.info("Exiting on user cancel")
24a42c
             os._exit(130)
24a42c
 
24a42c
+    def preload_all_archives_into_maps(self):
24a42c
+        """Before doing the actual obfuscation, if we have multiple archives
24a42c
+        to obfuscate then we need to preload each of them into the mappings
24a42c
+        to ensure that node1 is obfuscated in node2 as well as node2 being
24a42c
+        obfuscated in node1's archive.
24a42c
+        """
24a42c
+        self.log_info("Pre-loading multiple archives into obfuscation maps")
24a42c
+        for _arc in self.report_paths:
24a42c
+            is_dir = os.path.isdir(_arc)
24a42c
+            if is_dir:
24a42c
+                _arc_name = _arc
24a42c
+            else:
24a42c
+                archive = tarfile.open(_arc)
24a42c
+                _arc_name = _arc.split('/')[-1].split('.tar')[0]
24a42c
+            # for each parser, load the map_prep_file into memory, and then
24a42c
+            # send that for obfuscation. We don't actually obfuscate the file
24a42c
+            # here, do that in the normal archive loop
24a42c
+            for _parser in self.parsers:
24a42c
+                if not _parser.prep_map_file:
24a42c
+                    continue
24a42c
+                _arc_path = os.path.join(_arc_name, _parser.prep_map_file)
24a42c
+                try:
24a42c
+                    if is_dir:
24a42c
+                        _pfile = open(_arc_path, 'r')
24a42c
+                        content = _pfile.read()
24a42c
+                    else:
24a42c
+                        _pfile = archive.extractfile(_arc_path)
24a42c
+                        content = _pfile.read().decode('utf-8')
24a42c
+                    _pfile.close()
24a42c
+                    if isinstance(_parser, SoSUsernameParser):
24a42c
+                        _parser.load_usernames_into_map(content)
24a42c
+                    for line in content.splitlines():
24a42c
+                        if isinstance(_parser, SoSHostnameParser):
24a42c
+                            _parser.load_hostname_into_map(line)
24a42c
+                        self.obfuscate_line(line, _parser.prep_map_file)
24a42c
+                except Exception as err:
24a42c
+                    self.log_debug("Could not prep %s: %s" % (_arc_path, err))
24a42c
+
24a42c
     def obfuscate_report(self, report):
24a42c
         """Individually handle each archive or directory we've discovered by
24a42c
         running through each file therein.
24a42c
@@ -493,7 +532,6 @@ third party.
24a42c
             start_time = datetime.now()
24a42c
             arc_md.add_field('start_time', start_time)
24a42c
             archive.extract()
24a42c
-            self.prep_maps_from_archive(archive)
24a42c
             archive.report_msg("Beginning obfuscation...")
24a42c
 
24a42c
             file_list = archive.get_file_list()
24a42c
@@ -542,35 +580,6 @@ third party.
24a42c
             self.ui_log.info("Exception while processing %s: %s"
24a42c
                              % (report, err))
24a42c
 
24a42c
-    def prep_maps_from_archive(self, archive):
24a42c
-        """Open specific files from an archive and try to load those values
24a42c
-        into our mappings before iterating through the entire archive.
24a42c
-
24a42c
-        Positional arguments:
24a42c
-
24a42c
-            :param archive SoSObfuscationArchive:   An open archive object
24a42c
-        """
24a42c
-        for parser in self.parsers:
24a42c
-            if not parser.prep_map_file:
24a42c
-                continue
24a42c
-            prep_file = archive.get_file_path(parser.prep_map_file)
24a42c
-            if not prep_file:
24a42c
-                self.log_debug("Could not prepare %s: %s does not exist"
24a42c
-                               % (parser.name, parser.prep_map_file),
24a42c
-                               caller=archive.archive_name)
24a42c
-                continue
24a42c
-            # this is a bit clunky, but we need to load this particular
24a42c
-            # parser in a different way due to how hostnames are validated for
24a42c
-            # obfuscation
24a42c
-            if isinstance(parser, SoSHostnameParser):
24a42c
-                with open(prep_file, 'r') as host_file:
24a42c
-                    hostname = host_file.readline().strip()
24a42c
-                    parser.load_hostname_into_map(hostname)
24a42c
-            if isinstance(parser, SoSUsernameParser):
24a42c
-                parser.load_usernames_into_map(prep_file)
24a42c
-            self.obfuscate_file(prep_file, parser.prep_map_file,
24a42c
-                                archive.archive_name)
24a42c
-
24a42c
     def obfuscate_file(self, filename, short_name=None, arc_name=None):
24a42c
         """Obfuscate and individual file, line by line.
24a42c
 
24a42c
diff --git a/sos/cleaner/parsers/username_parser.py b/sos/cleaner/parsers/username_parser.py
24a42c
index 5223c018..2bb6c7f3 100644
24a42c
--- a/sos/cleaner/parsers/username_parser.py
24a42c
+++ b/sos/cleaner/parsers/username_parser.py
24a42c
@@ -39,16 +39,15 @@ class SoSUsernameParser(SoSCleanerParser):
24a42c
         super(SoSUsernameParser, self).__init__(conf_file)
24a42c
         self.mapping.load_names_from_options(opt_names)
24a42c
 
24a42c
-    def load_usernames_into_map(self, fname):
24a42c
+    def load_usernames_into_map(self, content):
24a42c
         """Since we don't get the list of usernames from a straight regex for
24a42c
         this parser, we need to override the initial parser prepping here.
24a42c
         """
24a42c
-        with open(fname, 'r') as lastfile:
24a42c
-            for line in lastfile.read().splitlines()[1:]:
24a42c
-                user = line.split()[0]
24a42c
-                if user in self.skip_list:
24a42c
-                    continue
24a42c
-                self.mapping.get(user)
24a42c
+        for line in content.splitlines()[1:]:
24a42c
+            user = line.split()[0]
24a42c
+            if user in self.skip_list:
24a42c
+                continue
24a42c
+            self.mapping.get(user)
24a42c
 
24a42c
     def parse_line(self, line):
24a42c
         count = 0
24a42c
-- 
24a42c
2.26.3
24a42c
24a42c
24a42c
From b713f458bfa92427147de754ea36054bfde53d71 Mon Sep 17 00:00:00 2001
24a42c
From: Jake Hunsaker <jhunsake@redhat.com>
24a42c
Date: Wed, 14 Apr 2021 12:22:28 -0400
24a42c
Subject: [PATCH 2/2] [clean] Remove duplicate file skipping within
24a42c
 obfuscate_line()
24a42c
24a42c
A redundant file skipping check was being executed within
24a42c
`obfuscate_line()` that would cause subsequent archives being obfuscated
24a42c
to skip line obfuscation within a file, despite iterating through the
24a42c
entire file.
24a42c
24a42c
Remove this redundant check, thus allowing proper obfuscation.
24a42c
24a42c
Closes: #2490
24a42c
Related: RHBZ#1930181
24a42c
Resolves: #2492
24a42c
24a42c
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
24a42c
---
24a42c
 sos/cleaner/__init__.py            | 11 +++--------
24a42c
 sos/cleaner/obfuscation_archive.py |  2 --
24a42c
 2 files changed, 3 insertions(+), 10 deletions(-)
24a42c
24a42c
diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
24a42c
index d10cdc55..bdd24f95 100644
24a42c
--- a/sos/cleaner/__init__.py
24a42c
+++ b/sos/cleaner/__init__.py
24a42c
@@ -508,7 +508,7 @@ third party.
24a42c
                     for line in content.splitlines():
24a42c
                         if isinstance(_parser, SoSHostnameParser):
24a42c
                             _parser.load_hostname_into_map(line)
24a42c
-                        self.obfuscate_line(line, _parser.prep_map_file)
24a42c
+                        self.obfuscate_line(line)
24a42c
                 except Exception as err:
24a42c
                     self.log_debug("Could not prep %s: %s" % (_arc_path, err))
24a42c
 
24a42c
@@ -606,7 +606,7 @@ third party.
24a42c
                 if not line.strip():
24a42c
                     continue
24a42c
                 try:
24a42c
-                    line, count = self.obfuscate_line(line, short_name)
24a42c
+                    line, count = self.obfuscate_line(line)
24a42c
                     subs += count
24a42c
                     tfile.write(line)
24a42c
                 except Exception as err:
24a42c
@@ -631,7 +631,7 @@ third party.
24a42c
                 pass
24a42c
         return string_data
24a42c
 
24a42c
-    def obfuscate_line(self, line, filename):
24a42c
+    def obfuscate_line(self, line):
24a42c
         """Run a line through each of the obfuscation parsers, keeping a
24a42c
         cumulative total of substitutions done on that particular line.
24a42c
 
24a42c
@@ -639,16 +639,11 @@ third party.
24a42c
 
24a42c
             :param line str:        The raw line as read from the file being
24a42c
                                     processed
24a42c
-            :param filename str:    Filename the line was read from
24a42c
 
24a42c
         Returns the fully obfuscated line and the number of substitutions made
24a42c
         """
24a42c
         count = 0
24a42c
         for parser in self.parsers:
24a42c
-            if filename and any([
24a42c
-                re.match(_s, filename) for _s in parser.skip_files
24a42c
-            ]):
24a42c
-                continue
24a42c
             try:
24a42c
                 line, _count = parser.parse_line(line)
24a42c
                 count += _count
24a42c
diff --git a/sos/cleaner/obfuscation_archive.py b/sos/cleaner/obfuscation_archive.py
24a42c
index 84ca30cd..c64ab13b 100644
24a42c
--- a/sos/cleaner/obfuscation_archive.py
24a42c
+++ b/sos/cleaner/obfuscation_archive.py
24a42c
@@ -219,8 +219,6 @@ class SoSObfuscationArchive():
24a42c
             :param filename str:        Filename relative to the extracted
24a42c
                                         archive root
24a42c
         """
24a42c
-        if filename in self.file_sub_list:
24a42c
-            return True
24a42c
 
24a42c
         if not os.path.isfile(self.get_file_path(filename)):
24a42c
             return True
24a42c
-- 
24a42c
2.26.3
24a42c