Blame SOURCES/sos-bz1930181-collect-cleaning-consistency.patch

ba407d
From fc0218638f3e865c4315823e72aef2f46d012d07 Mon Sep 17 00:00:00 2001
ba407d
From: Jake Hunsaker <jhunsake@redhat.com>
ba407d
Date: Wed, 14 Apr 2021 11:55:03 -0400
ba407d
Subject: [PATCH 1/2] [clean] Load maps from all archives before obfuscation
ba407d
 loop
ba407d
ba407d
Previously, maps were prepped from each archive only after that archive
ba407d
was extracted. This reduced the amount of file IO, but it meant that values
ba407d
first discovered in later archives of a series were not obfuscated in
ba407d
the archives that had already been processed before them.
ba407d
ba407d
Fix this by extracting the map prep files into memory for each archive
ba407d
to prep the maps before we enter the obfuscation loop entirely.
ba407d
ba407d
Closes: #2490
ba407d
Related: RHBZ#1930181
ba407d
Resolves: #2492
ba407d
ba407d
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
ba407d
---
ba407d
 sos/cleaner/__init__.py                | 69 +++++++++++++++-----------
ba407d
 sos/cleaner/parsers/username_parser.py | 13 +++--
ba407d
 2 files changed, 45 insertions(+), 37 deletions(-)
ba407d
ba407d
diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
ba407d
index b9eb61ef..d10cdc55 100644
ba407d
--- a/sos/cleaner/__init__.py
ba407d
+++ b/sos/cleaner/__init__.py
ba407d
@@ -292,6 +292,7 @@ third party.
ba407d
 
ba407d
         # we have at least one valid target to obfuscate
ba407d
         self.completed_reports = []
ba407d
+        self.preload_all_archives_into_maps()
ba407d
         self.obfuscate_report_paths()
ba407d
 
ba407d
         if not self.completed_reports:
ba407d
@@ -473,6 +474,44 @@ third party.
ba407d
             self.ui_log.info("Exiting on user cancel")
ba407d
             os._exit(130)
ba407d
 
ba407d
+    def preload_all_archives_into_maps(self):
ba407d
+        """Before doing the actual obfuscation, if we have multiple archives
ba407d
+        to obfuscate then we need to preload each of them into the mappings
ba407d
+        to ensure that node1 is obfuscated in node2 as well as node2 being
ba407d
+        obfuscated in node1's archive.
ba407d
+        """
ba407d
+        self.log_info("Pre-loading multiple archives into obfuscation maps")
ba407d
+        for _arc in self.report_paths:
ba407d
+            is_dir = os.path.isdir(_arc)
ba407d
+            if is_dir:
ba407d
+                _arc_name = _arc
ba407d
+            else:
ba407d
+                archive = tarfile.open(_arc)
ba407d
+                _arc_name = _arc.split('/')[-1].split('.tar')[0]
ba407d
+            # for each parser, load the map_prep_file into memory, and then
ba407d
+            # send that for obfuscation. We don't actually obfuscate the file
ba407d
+            # here, do that in the normal archive loop
ba407d
+            for _parser in self.parsers:
ba407d
+                if not _parser.prep_map_file:
ba407d
+                    continue
ba407d
+                _arc_path = os.path.join(_arc_name, _parser.prep_map_file)
ba407d
+                try:
ba407d
+                    if is_dir:
ba407d
+                        _pfile = open(_arc_path, 'r')
ba407d
+                        content = _pfile.read()
ba407d
+                    else:
ba407d
+                        _pfile = archive.extractfile(_arc_path)
ba407d
+                        content = _pfile.read().decode('utf-8')
ba407d
+                    _pfile.close()
ba407d
+                    if isinstance(_parser, SoSUsernameParser):
ba407d
+                        _parser.load_usernames_into_map(content)
ba407d
+                    for line in content.splitlines():
ba407d
+                        if isinstance(_parser, SoSHostnameParser):
ba407d
+                            _parser.load_hostname_into_map(line)
ba407d
+                        self.obfuscate_line(line, _parser.prep_map_file)
ba407d
+                except Exception as err:
ba407d
+                    self.log_debug("Could not prep %s: %s" % (_arc_path, err))
ba407d
+
ba407d
     def obfuscate_report(self, report):
ba407d
         """Individually handle each archive or directory we've discovered by
ba407d
         running through each file therein.
ba407d
@@ -493,7 +532,6 @@ third party.
ba407d
             start_time = datetime.now()
ba407d
             arc_md.add_field('start_time', start_time)
ba407d
             archive.extract()
ba407d
-            self.prep_maps_from_archive(archive)
ba407d
             archive.report_msg("Beginning obfuscation...")
ba407d
 
ba407d
             file_list = archive.get_file_list()
ba407d
@@ -542,35 +580,6 @@ third party.
ba407d
             self.ui_log.info("Exception while processing %s: %s"
ba407d
                              % (report, err))
ba407d
 
ba407d
-    def prep_maps_from_archive(self, archive):
ba407d
-        """Open specific files from an archive and try to load those values
ba407d
-        into our mappings before iterating through the entire archive.
ba407d
-
ba407d
-        Positional arguments:
ba407d
-
ba407d
-            :param archive SoSObfuscationArchive:   An open archive object
ba407d
-        """
ba407d
-        for parser in self.parsers:
ba407d
-            if not parser.prep_map_file:
ba407d
-                continue
ba407d
-            prep_file = archive.get_file_path(parser.prep_map_file)
ba407d
-            if not prep_file:
ba407d
-                self.log_debug("Could not prepare %s: %s does not exist"
ba407d
-                               % (parser.name, parser.prep_map_file),
ba407d
-                               caller=archive.archive_name)
ba407d
-                continue
ba407d
-            # this is a bit clunky, but we need to load this particular
ba407d
-            # parser in a different way due to how hostnames are validated for
ba407d
-            # obfuscation
ba407d
-            if isinstance(parser, SoSHostnameParser):
ba407d
-                with open(prep_file, 'r') as host_file:
ba407d
-                    hostname = host_file.readline().strip()
ba407d
-                    parser.load_hostname_into_map(hostname)
ba407d
-            if isinstance(parser, SoSUsernameParser):
ba407d
-                parser.load_usernames_into_map(prep_file)
ba407d
-            self.obfuscate_file(prep_file, parser.prep_map_file,
ba407d
-                                archive.archive_name)
ba407d
-
ba407d
     def obfuscate_file(self, filename, short_name=None, arc_name=None):
ba407d
         """Obfuscate and individual file, line by line.
ba407d
 
ba407d
diff --git a/sos/cleaner/parsers/username_parser.py b/sos/cleaner/parsers/username_parser.py
ba407d
index 5223c018..2bb6c7f3 100644
ba407d
--- a/sos/cleaner/parsers/username_parser.py
ba407d
+++ b/sos/cleaner/parsers/username_parser.py
ba407d
@@ -39,16 +39,15 @@ class SoSUsernameParser(SoSCleanerParser):
ba407d
         super(SoSUsernameParser, self).__init__(conf_file)
ba407d
         self.mapping.load_names_from_options(opt_names)
ba407d
 
ba407d
-    def load_usernames_into_map(self, fname):
ba407d
+    def load_usernames_into_map(self, content):
ba407d
         """Since we don't get the list of usernames from a straight regex for
ba407d
         this parser, we need to override the initial parser prepping here.
ba407d
         """
ba407d
-        with open(fname, 'r') as lastfile:
ba407d
-            for line in lastfile.read().splitlines()[1:]:
ba407d
-                user = line.split()[0]
ba407d
-                if user in self.skip_list:
ba407d
-                    continue
ba407d
-                self.mapping.get(user)
ba407d
+        for line in content.splitlines()[1:]:
ba407d
+            user = line.split()[0]
ba407d
+            if user in self.skip_list:
ba407d
+                continue
ba407d
+            self.mapping.get(user)
ba407d
 
ba407d
     def parse_line(self, line):
ba407d
         count = 0
ba407d
-- 
ba407d
2.26.3
ba407d
ba407d
ba407d
From b713f458bfa92427147de754ea36054bfde53d71 Mon Sep 17 00:00:00 2001
ba407d
From: Jake Hunsaker <jhunsake@redhat.com>
ba407d
Date: Wed, 14 Apr 2021 12:22:28 -0400
ba407d
Subject: [PATCH 2/2] [clean] Remove duplicate file skipping within
ba407d
 obfuscate_line()
ba407d
ba407d
A redundant file-skipping check was being executed within
ba407d
`obfuscate_line()`. For subsequent archives it caused line obfuscation to
ba407d
be skipped within a file, even though the entire file was still being
ba407d
iterated through.
ba407d
ba407d
Remove this redundant check, thus allowing proper obfuscation.
ba407d
ba407d
Closes: #2490
ba407d
Related: RHBZ#1930181
ba407d
Resolves: #2492
ba407d
ba407d
Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
ba407d
---
ba407d
 sos/cleaner/__init__.py            | 11 +++--------
ba407d
 sos/cleaner/obfuscation_archive.py |  2 --
ba407d
 2 files changed, 3 insertions(+), 10 deletions(-)
ba407d
ba407d
diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
ba407d
index d10cdc55..bdd24f95 100644
ba407d
--- a/sos/cleaner/__init__.py
ba407d
+++ b/sos/cleaner/__init__.py
ba407d
@@ -508,7 +508,7 @@ third party.
ba407d
                     for line in content.splitlines():
ba407d
                         if isinstance(_parser, SoSHostnameParser):
ba407d
                             _parser.load_hostname_into_map(line)
ba407d
-                        self.obfuscate_line(line, _parser.prep_map_file)
ba407d
+                        self.obfuscate_line(line)
ba407d
                 except Exception as err:
ba407d
                     self.log_debug("Could not prep %s: %s" % (_arc_path, err))
ba407d
 
ba407d
@@ -606,7 +606,7 @@ third party.
ba407d
                 if not line.strip():
ba407d
                     continue
ba407d
                 try:
ba407d
-                    line, count = self.obfuscate_line(line, short_name)
ba407d
+                    line, count = self.obfuscate_line(line)
ba407d
                     subs += count
ba407d
                     tfile.write(line)
ba407d
                 except Exception as err:
ba407d
@@ -631,7 +631,7 @@ third party.
ba407d
                 pass
ba407d
         return string_data
ba407d
 
ba407d
-    def obfuscate_line(self, line, filename):
ba407d
+    def obfuscate_line(self, line):
ba407d
         """Run a line through each of the obfuscation parsers, keeping a
ba407d
         cumulative total of substitutions done on that particular line.
ba407d
 
ba407d
@@ -639,16 +639,11 @@ third party.
ba407d
 
ba407d
             :param line str:        The raw line as read from the file being
ba407d
                                     processed
ba407d
-            :param filename str:    Filename the line was read from
ba407d
 
ba407d
         Returns the fully obfuscated line and the number of substitutions made
ba407d
         """
ba407d
         count = 0
ba407d
         for parser in self.parsers:
ba407d
-            if filename and any([
ba407d
-                re.match(_s, filename) for _s in parser.skip_files
ba407d
-            ]):
ba407d
-                continue
ba407d
             try:
ba407d
                 line, _count = parser.parse_line(line)
ba407d
                 count += _count
ba407d
diff --git a/sos/cleaner/obfuscation_archive.py b/sos/cleaner/obfuscation_archive.py
ba407d
index 84ca30cd..c64ab13b 100644
ba407d
--- a/sos/cleaner/obfuscation_archive.py
ba407d
+++ b/sos/cleaner/obfuscation_archive.py
ba407d
@@ -219,8 +219,6 @@ class SoSObfuscationArchive():
ba407d
             :param filename str:        Filename relative to the extracted
ba407d
                                         archive root
ba407d
         """
ba407d
-        if filename in self.file_sub_list:
ba407d
-            return True
ba407d
 
ba407d
         if not os.path.isfile(self.get_file_path(filename)):
ba407d
             return True
ba407d
-- 
ba407d
2.26.3
ba407d