f6ea51
From 0db967b2e6a4093a6a5f649190159767e5d005e0 Mon Sep 17 00:00:00 2001
f6ea51
From: Yves Orton <demerphq@gmail.com>
f6ea51
Date: Tue, 25 Apr 2017 15:17:06 +0200
f6ea51
Subject: [PATCH] [perl #131211] fixup File::Glob degenerate matching
f6ea51
MIME-Version: 1.0
f6ea51
Content-Type: text/plain; charset=UTF-8
f6ea51
Content-Transfer-Encoding: 8bit
f6ea51
f6ea51
The old code would go quadratic with recursion and backtracking
f6ea51
when doing patterns like "a*a*a*a*a*a*a*x" on a file like
f6ea51
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa".
f6ea51
f6ea51
This patch changes the code to not recurse, and to not backtrack,
f6ea51
as per this article from Russ Cox: https://research.swtch.com/glob
f6ea51
f6ea51
It also adds a micro-optimisation for M_ONE and M_SET under the new code.
f6ea51
f6ea51
Thanks to Avar and Russ Cox for helping with this patch, along with
f6ea51
Jilles Tjoelker and the rest of the FreeBSD community.
f6ea51
f6ea51
Signed-off-by: Petr Písař <ppisar@redhat.com>
f6ea51
---
f6ea51
 MANIFEST                   |  1 +
f6ea51
 ext/File-Glob/bsd_glob.c   | 64 +++++++++++++++++++++++--------
f6ea51
 ext/File-Glob/t/rt131211.t | 94 ++++++++++++++++++++++++++++++++++++++++++++++
f6ea51
 3 files changed, 144 insertions(+), 15 deletions(-)
f6ea51
 create mode 100644 ext/File-Glob/t/rt131211.t
f6ea51
f6ea51
diff --git a/MANIFEST b/MANIFEST
f6ea51
index b7b6e74..af0da6c 100644
f6ea51
--- a/MANIFEST
f6ea51
+++ b/MANIFEST
f6ea51
@@ -3948,6 +3948,7 @@ ext/File-Glob/t/basic.t		See if File::Glob works
f6ea51
 ext/File-Glob/t/case.t		See if File::Glob works
f6ea51
 ext/File-Glob/t/global.t	See if File::Glob works
f6ea51
 ext/File-Glob/t/rt114984.t	See if File::Glob works
f6ea51
+ext/File-Glob/t/rt131211.t	See if File::Glob works
f6ea51
 ext/File-Glob/t/taint.t		See if File::Glob works
f6ea51
 ext/File-Glob/t/threads.t	See if File::Glob + threads works
f6ea51
 ext/File-Glob/TODO		File::Glob extension todo list
f6ea51
diff --git a/ext/File-Glob/bsd_glob.c b/ext/File-Glob/bsd_glob.c
f6ea51
index 821ef20..e96fb73 100644
f6ea51
--- a/ext/File-Glob/bsd_glob.c
f6ea51
+++ b/ext/File-Glob/bsd_glob.c
f6ea51
@@ -563,8 +563,12 @@ glob0(const Char *pattern, glob_t *pglob)
f6ea51
 			break;
f6ea51
 		case BG_STAR:
f6ea51
 			pglob->gl_flags |= GLOB_MAGCHAR;
f6ea51
-			/* collapse adjacent stars to one,
f6ea51
-			 * to avoid exponential behavior
f6ea51
+                        /* Collapse adjacent stars to one.
f6ea51
+                         * This is required to ensure that a pattern like
f6ea51
+                         * "a**" matches a name like "a", as without this
f6ea51
+                         * check when the first star matched everything it would
f6ea51
+                         * cause the second star to return a match fail.
f6ea51
+                         * As long as ** is folded here this does not happen.
f6ea51
 			 */
f6ea51
 			if (bufnext == patbuf || bufnext[-1] != M_ALL)
f6ea51
 				*bufnext++ = M_ALL;
f6ea51
@@ -909,35 +913,56 @@ globextend(const Char *path, glob_t *pglob, size_t *limitp)
f6ea51
 
f6ea51
 
f6ea51
 /*
f6ea51
- * pattern matching function for filenames.  Each occurrence of the *
f6ea51
- * pattern causes a recursion level.
f6ea51
+ * pattern matching function for filenames using state machine to avoid
f6ea51
+ * recursion. We maintain a "nextp" and "nextn" to allow us to backtrack
f6ea51
+ * without additional callframes, and to cleanly prune the backtracking
f6ea51
+ * state when multiple '*' (star) matches are included in the pattern.
f6ea51
+ *
f6ea51
+ * Thanks to Russ Cox for the improved state machine logic to avoid quadratic
f6ea51
+ * matching on failure.
f6ea51
+ *
f6ea51
+ * https://research.swtch.com/glob
f6ea51
+ *
f6ea51
+ * An example would be a pattern
f6ea51
+ *  ("a*" x 100) . "y"
f6ea51
+ * against a file name like
f6ea51
+ *  ("a" x 100) . "x"
f6ea51
+ *
f6ea51
  */
f6ea51
 static int
f6ea51
 match(Char *name, Char *pat, Char *patend, int nocase)
f6ea51
 {
f6ea51
 	int ok, negate_range;
f6ea51
 	Char c, k;
f6ea51
+	Char *nextp = NULL;
f6ea51
+	Char *nextn = NULL;
f6ea51
 
f6ea51
+    loop:
f6ea51
 	while (pat < patend) {
f6ea51
 		c = *pat++;
f6ea51
 		switch (c & M_MASK) {
f6ea51
 		case M_ALL:
f6ea51
 			if (pat == patend)
f6ea51
 				return(1);
f6ea51
-			do
f6ea51
-			    if (match(name, pat, patend, nocase))
f6ea51
-				    return(1);
f6ea51
-			while (*name++ != BG_EOS)
f6ea51
-				;
f6ea51
-			return(0);
f6ea51
+	                if (*name == BG_EOS)
f6ea51
+	                        return 0;
f6ea51
+			nextn = name + 1;
f6ea51
+	                nextp = pat - 1;
f6ea51
+			break;
f6ea51
 		case M_ONE:
f6ea51
+                        /* since * matches leftmost-shortest first   *
f6ea51
+                         * if we encounter the EOS then backtracking *
f6ea51
+                         * will not help, so we can exit early here. */
f6ea51
 			if (*name++ == BG_EOS)
f6ea51
-				return(0);
f6ea51
+                                return 0;
f6ea51
 			break;
f6ea51
 		case M_SET:
f6ea51
 			ok = 0;
f6ea51
+                        /* since * matches leftmost-shortest first   *
f6ea51
+                         * if we encounter the EOS then backtracking *
f6ea51
+                         * will not help, so we can exit early here. */
f6ea51
 			if ((k = *name++) == BG_EOS)
f6ea51
-				return(0);
f6ea51
+                                return 0;
f6ea51
 			if ((negate_range = ((*pat & M_MASK) == M_NOT)) != BG_EOS)
f6ea51
 				++pat;
f6ea51
 			while (((c = *pat++) & M_MASK) != M_END)
f6ea51
@@ -953,16 +978,25 @@ match(Char *name, Char *pat, Char *patend, int nocase)
f6ea51
 				} else if (nocase ? (tolower(c) == tolower(k)) : (c == k))
f6ea51
 					ok = 1;
f6ea51
 			if (ok == negate_range)
f6ea51
-				return(0);
f6ea51
+				goto fail;
f6ea51
 			break;
f6ea51
 		default:
f6ea51
 			k = *name++;
f6ea51
 			if (nocase ? (tolower(k) != tolower(c)) : (k != c))
f6ea51
-				return(0);
f6ea51
+				goto fail;
f6ea51
 			break;
f6ea51
 		}
f6ea51
 	}
f6ea51
-	return(*name == BG_EOS);
f6ea51
+	if (*name == BG_EOS)
f6ea51
+		return 1;
f6ea51
+
f6ea51
+    fail:
f6ea51
+	if (nextn) {
f6ea51
+		pat = nextp;
f6ea51
+		name = nextn;
f6ea51
+		goto loop;
f6ea51
+	}
f6ea51
+	return 0;
f6ea51
 }
f6ea51
 
f6ea51
 /* Free allocated data belonging to a glob_t structure. */
f6ea51
diff --git a/ext/File-Glob/t/rt131211.t b/ext/File-Glob/t/rt131211.t
f6ea51
new file mode 100644
f6ea51
index 0000000..c1bcbe0
f6ea51
--- /dev/null
f6ea51
+++ b/ext/File-Glob/t/rt131211.t
f6ea51
@@ -0,0 +1,94 @@
f6ea51
+use strict;
f6ea51
+use warnings;
f6ea51
+use v5.16.0;
f6ea51
+use File::Temp 'tempdir';
f6ea51
+use File::Spec::Functions;
f6ea51
+use Test::More;
f6ea51
+use Time::HiRes qw(time);
f6ea51
+
f6ea51
+plan tests => 13;
f6ea51
+
f6ea51
+my $path = tempdir uc cleanup => 1;
f6ea51
+my @files= (
f6ea51
+    "x".("a" x 50)."b", # 0
f6ea51
+    "abbbbbbbbbbbbc",   # 1
f6ea51
+    "abbbbbbbbbbbbd",   # 2
f6ea51
+    "aaabaaaabaaaabc",  # 3
f6ea51
+    "pq",               # 4
f6ea51
+    "r",                # 5
f6ea51
+    "rttiiiiiii",       # 6
f6ea51
+    "wewewewewewe",     # 7
f6ea51
+    "weeeweeeweee",     # 8
f6ea51
+    "weewweewweew",     # 9
f6ea51
+    "wewewewewewewewewewewewewewewewewq", # 10
f6ea51
+    "wtttttttetttttttwr", # 11
f6ea51
+);
f6ea51
+
f6ea51
+
f6ea51
+foreach (@files) {
f6ea51
+    open(my $f, ">", catfile $path, $_);
f6ea51
+}
f6ea51
+
f6ea51
+my $elapsed_fail= 0;
f6ea51
+my $elapsed_match= 0;
f6ea51
+my @got_files;
f6ea51
+my @no_files;
f6ea51
+my $count = 0;
f6ea51
+
f6ea51
+while (++$count < 10) {
f6ea51
+    $elapsed_match -= time;
f6ea51
+    @got_files= glob catfile $path, "x".("a*" x $count) . "b";
f6ea51
+    $elapsed_match += time;
f6ea51
+
f6ea51
+    $elapsed_fail -= time;
f6ea51
+    @no_files= glob catfile $path, "x".("a*" x $count) . "c";
f6ea51
+    $elapsed_fail += time;
f6ea51
+    last if $elapsed_fail > $elapsed_match * 100;
f6ea51
+}
f6ea51
+
f6ea51
+is $count,10,
f6ea51
+    "tried all the patterns without bailing out";
f6ea51
+
f6ea51
+cmp_ok $elapsed_fail/$elapsed_match,"<",2,
f6ea51
+    "time to fail less than twice the time to match";
f6ea51
+is "@got_files", catfile($path, $files[0]),
f6ea51
+    "only got the expected file for xa*..b";
f6ea51
+is "@no_files", "", "shouldnt have files for xa*..c";
f6ea51
+
f6ea51
+
f6ea51
+@got_files= glob catfile $path, "a*b*b*b*bc";
f6ea51
+is "@got_files", catfile($path, $files[1]),
f6ea51
+    "only got the expected file for a*b*b*b*bc";
f6ea51
+
f6ea51
+@got_files= sort glob catfile $path, "a*b*b*bc";
f6ea51
+is "@got_files", catfile($path, $files[3])." ".catfile($path,$files[1]),
f6ea51
+    "got the expected two files for a*b*b*bc";
f6ea51
+
f6ea51
+@got_files= sort glob catfile $path, "p*";
f6ea51
+is "@got_files", catfile($path, $files[4]),
f6ea51
+    "p* matches pq";
f6ea51
+
f6ea51
+@got_files= sort glob catfile $path, "r*???????";
f6ea51
+is "@got_files", catfile($path, $files[6]),
f6ea51
+    "r*??????? works as expected";
f6ea51
+
f6ea51
+@got_files= sort glob catfile $path, "w*e*w??e";
f6ea51
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (7,8)),
f6ea51
+    "w*e*w??e works as expected";
f6ea51
+
f6ea51
+@got_files= sort glob catfile $path, "w*e*we??";
f6ea51
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (7,8,9,10)),
f6ea51
+    "w*e*we?? works as expected";
f6ea51
+
f6ea51
+@got_files= sort glob catfile $path, "w**e**w";
f6ea51
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (9)),
f6ea51
+    "w**e**w works as expected";
f6ea51
+
f6ea51
+@got_files= sort glob catfile $path, "*wee*";
f6ea51
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (8,9)),
f6ea51
+    "*wee* works as expected";
f6ea51
+
f6ea51
+@got_files= sort glob catfile $path, "we*";
f6ea51
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (7,8,9,10)),
f6ea51
+    "we* works as expected";
f6ea51
+
f6ea51
-- 
f6ea51
2.9.4
f6ea51