Blame SOURCES/perl-5.27.0-perl-131211-fixup-File-Glob-degenerate-matching.patch

243a19
From 0db967b2e6a4093a6a5f649190159767e5d005e0 Mon Sep 17 00:00:00 2001
243a19
From: Yves Orton <demerphq@gmail.com>
243a19
Date: Tue, 25 Apr 2017 15:17:06 +0200
243a19
Subject: [PATCH] [perl #131211] fixup File::Glob degenerate matching
243a19
MIME-Version: 1.0
243a19
Content-Type: text/plain; charset=UTF-8
243a19
Content-Transfer-Encoding: 8bit
243a19
243a19
The old code would go quadratic with recursion and backtracking
243a19
when doing patterns like "a*a*a*a*a*a*a*x" on a file like
243a19
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa".
243a19
243a19
This patch changes the code to not recurse, and to not backtrack,
243a19
as per this article from Russ Cox: https://research.swtch.com/glob
243a19
243a19
It also adds a micro-optimisation for M_ONE and M_SET under the new code.
243a19
243a19
Thanks to Avar and Russ Cox for helping with this patch, along with
243a19
Jilles Tjoelker and the rest of the FreeBSD community.
243a19
243a19
Signed-off-by: Petr Písař <ppisar@redhat.com>
243a19
---
243a19
 MANIFEST                   |  1 +
243a19
 ext/File-Glob/bsd_glob.c   | 64 +++++++++++++++++++++++--------
243a19
 ext/File-Glob/t/rt131211.t | 94 ++++++++++++++++++++++++++++++++++++++++++++++
243a19
 3 files changed, 144 insertions(+), 15 deletions(-)
243a19
 create mode 100644 ext/File-Glob/t/rt131211.t
243a19
243a19
diff --git a/MANIFEST b/MANIFEST
243a19
index b7b6e74..af0da6c 100644
243a19
--- a/MANIFEST
243a19
+++ b/MANIFEST
243a19
@@ -3948,6 +3948,7 @@ ext/File-Glob/t/basic.t		See if File::Glob works
243a19
 ext/File-Glob/t/case.t		See if File::Glob works
243a19
 ext/File-Glob/t/global.t	See if File::Glob works
243a19
 ext/File-Glob/t/rt114984.t	See if File::Glob works
243a19
+ext/File-Glob/t/rt131211.t	See if File::Glob works
243a19
 ext/File-Glob/t/taint.t		See if File::Glob works
243a19
 ext/File-Glob/t/threads.t	See if File::Glob + threads works
243a19
 ext/File-Glob/TODO		File::Glob extension todo list
243a19
diff --git a/ext/File-Glob/bsd_glob.c b/ext/File-Glob/bsd_glob.c
243a19
index 821ef20..e96fb73 100644
243a19
--- a/ext/File-Glob/bsd_glob.c
243a19
+++ b/ext/File-Glob/bsd_glob.c
243a19
@@ -563,8 +563,12 @@ glob0(const Char *pattern, glob_t *pglob)
243a19
 			break;
243a19
 		case BG_STAR:
243a19
 			pglob->gl_flags |= GLOB_MAGCHAR;
243a19
-			/* collapse adjacent stars to one,
243a19
-			 * to avoid exponential behavior
243a19
+                        /* Collapse adjacent stars to one.
243a19
+                         * This is required to ensure that a pattern like
243a19
+                         * "a**" matches a name like "a", as without this
243a19
+                         * check when the first star matched everything it would
243a19
+                         * cause the second star to return a match fail.
243a19
+                         * As long as ** is folded here this does not happen.
243a19
 			 */
243a19
 			if (bufnext == patbuf || bufnext[-1] != M_ALL)
243a19
 				*bufnext++ = M_ALL;
243a19
@@ -909,35 +913,56 @@ globextend(const Char *path, glob_t *pglob, size_t *limitp)
243a19
 
243a19
 
243a19
 /*
243a19
- * pattern matching function for filenames.  Each occurrence of the *
243a19
- * pattern causes a recursion level.
243a19
+ * pattern matching function for filenames using state machine to avoid
243a19
+ * recursion. We maintain a "nextp" and "nextn" to allow us to backtrack
243a19
+ * without additional callframes, and to cleanly prune the backtracking
243a19
+ * state when multiple '*' (star) matches are included in the pattern.
243a19
+ *
243a19
+ * Thanks to Russ Cox for the improved state machine logic to avoid quadratic
243a19
+ * matching on failure.
243a19
+ *
243a19
+ * https://research.swtch.com/glob
243a19
+ *
243a19
+ * An example would be a pattern
243a19
+ *  ("a*" x 100) . "y"
243a19
+ * against a file name like
243a19
+ *  ("a" x 100) . "x"
243a19
+ *
243a19
  */
243a19
 static int
243a19
 match(Char *name, Char *pat, Char *patend, int nocase)
243a19
 {
243a19
 	int ok, negate_range;
243a19
 	Char c, k;
243a19
+	Char *nextp = NULL;
243a19
+	Char *nextn = NULL;
243a19
 
243a19
+    loop:
243a19
 	while (pat < patend) {
243a19
 		c = *pat++;
243a19
 		switch (c & M_MASK) {
243a19
 		case M_ALL:
243a19
 			if (pat == patend)
243a19
 				return(1);
243a19
-			do
243a19
-			    if (match(name, pat, patend, nocase))
243a19
-				    return(1);
243a19
-			while (*name++ != BG_EOS)
243a19
-				;
243a19
-			return(0);
243a19
+	                if (*name == BG_EOS)
243a19
+	                        return 0;
243a19
+			nextn = name + 1;
243a19
+	                nextp = pat - 1;
243a19
+			break;
243a19
 		case M_ONE:
243a19
+                        /* since * matches leftmost-shortest first   *
243a19
+                         * if we encounter the EOS then backtracking *
243a19
+                         * will not help, so we can exit early here. */
243a19
 			if (*name++ == BG_EOS)
243a19
-				return(0);
243a19
+                                return 0;
243a19
 			break;
243a19
 		case M_SET:
243a19
 			ok = 0;
243a19
+                        /* since * matches leftmost-shortest first   *
243a19
+                         * if we encounter the EOS then backtracking *
243a19
+                         * will not help, so we can exit early here. */
243a19
 			if ((k = *name++) == BG_EOS)
243a19
-				return(0);
243a19
+                                return 0;
243a19
 			if ((negate_range = ((*pat & M_MASK) == M_NOT)) != BG_EOS)
243a19
 				++pat;
243a19
 			while (((c = *pat++) & M_MASK) != M_END)
243a19
@@ -953,16 +978,25 @@ match(Char *name, Char *pat, Char *patend, int nocase)
243a19
 				} else if (nocase ? (tolower(c) == tolower(k)) : (c == k))
243a19
 					ok = 1;
243a19
 			if (ok == negate_range)
243a19
-				return(0);
243a19
+				goto fail;
243a19
 			break;
243a19
 		default:
243a19
 			k = *name++;
243a19
 			if (nocase ? (tolower(k) != tolower(c)) : (k != c))
243a19
-				return(0);
243a19
+				goto fail;
243a19
 			break;
243a19
 		}
243a19
 	}
243a19
-	return(*name == BG_EOS);
243a19
+	if (*name == BG_EOS)
243a19
+		return 1;
243a19
+
243a19
+    fail:
243a19
+	if (nextn) {
243a19
+		pat = nextp;
243a19
+		name = nextn;
243a19
+		goto loop;
243a19
+	}
243a19
+	return 0;
243a19
 }
243a19
 
243a19
 /* Free allocated data belonging to a glob_t structure. */
243a19
diff --git a/ext/File-Glob/t/rt131211.t b/ext/File-Glob/t/rt131211.t
243a19
new file mode 100644
243a19
index 0000000..c1bcbe0
243a19
--- /dev/null
243a19
+++ b/ext/File-Glob/t/rt131211.t
243a19
@@ -0,0 +1,94 @@
243a19
+use strict;
243a19
+use warnings;
243a19
+use v5.16.0;
243a19
+use File::Temp 'tempdir';
243a19
+use File::Spec::Functions;
243a19
+use Test::More;
243a19
+use Time::HiRes qw(time);
243a19
+
243a19
+plan tests => 13;
243a19
+
243a19
+my $path = tempdir uc cleanup => 1;
243a19
+my @files= (
243a19
+    "x".("a" x 50)."b", # 0
243a19
+    "abbbbbbbbbbbbc",   # 1
243a19
+    "abbbbbbbbbbbbd",   # 2
243a19
+    "aaabaaaabaaaabc",  # 3
243a19
+    "pq",               # 4
243a19
+    "r",                # 5
243a19
+    "rttiiiiiii",       # 6
243a19
+    "wewewewewewe",     # 7
243a19
+    "weeeweeeweee",     # 8
243a19
+    "weewweewweew",     # 9
243a19
+    "wewewewewewewewewewewewewewewewewq", # 10
243a19
+    "wtttttttetttttttwr", # 11
243a19
+);
243a19
+
243a19
+
243a19
+foreach (@files) {
243a19
+    open(my $f, ">", catfile $path, $_);
243a19
+}
243a19
+
243a19
+my $elapsed_fail= 0;
243a19
+my $elapsed_match= 0;
243a19
+my @got_files;
243a19
+my @no_files;
243a19
+my $count = 0;
243a19
+
243a19
+while (++$count < 10) {
243a19
+    $elapsed_match -= time;
243a19
+    @got_files= glob catfile $path, "x".("a*" x $count) . "b";
243a19
+    $elapsed_match += time;
243a19
+
243a19
+    $elapsed_fail -= time;
243a19
+    @no_files= glob catfile $path, "x".("a*" x $count) . "c";
243a19
+    $elapsed_fail += time;
243a19
+    last if $elapsed_fail > $elapsed_match * 100;
243a19
+}
243a19
+
243a19
+is $count,10,
243a19
+    "tried all the patterns without bailing out";
243a19
+
243a19
+cmp_ok $elapsed_fail/$elapsed_match,"<",2,
243a19
+    "time to fail less than twice the time to match";
243a19
+is "@got_files", catfile($path, $files[0]),
243a19
+    "only got the expected file for xa*..b";
243a19
+is "@no_files", "", "shouldnt have files for xa*..c";
243a19
+
243a19
+
243a19
+@got_files= glob catfile $path, "a*b*b*b*bc";
243a19
+is "@got_files", catfile($path, $files[1]),
243a19
+    "only got the expected file for a*b*b*b*bc";
243a19
+
243a19
+@got_files= sort glob catfile $path, "a*b*b*bc";
243a19
+is "@got_files", catfile($path, $files[3])." ".catfile($path,$files[1]),
243a19
+    "got the expected two files for a*b*b*bc";
243a19
+
243a19
+@got_files= sort glob catfile $path, "p*";
243a19
+is "@got_files", catfile($path, $files[4]),
243a19
+    "p* matches pq";
243a19
+
243a19
+@got_files= sort glob catfile $path, "r*???????";
243a19
+is "@got_files", catfile($path, $files[6]),
243a19
+    "r*??????? works as expected";
243a19
+
243a19
+@got_files= sort glob catfile $path, "w*e*w??e";
243a19
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (7,8)),
243a19
+    "w*e*w??e works as expected";
243a19
+
243a19
+@got_files= sort glob catfile $path, "w*e*we??";
243a19
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (7,8,9,10)),
243a19
+    "w*e*we?? works as expected";
243a19
+
243a19
+@got_files= sort glob catfile $path, "w**e**w";
243a19
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (9)),
243a19
+    "w**e**w works as expected";
243a19
+
243a19
+@got_files= sort glob catfile $path, "*wee*";
243a19
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (8,9)),
243a19
+    "*wee* works as expected";
243a19
+
243a19
+@got_files= sort glob catfile $path, "we*";
243a19
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (7,8,9,10)),
243a19
+    "we* works as expected";
243a19
+
243a19
-- 
243a19
2.9.4
243a19