b8c914
From 0db967b2e6a4093a6a5f649190159767e5d005e0 Mon Sep 17 00:00:00 2001
b8c914
From: Yves Orton <demerphq@gmail.com>
b8c914
Date: Tue, 25 Apr 2017 15:17:06 +0200
b8c914
Subject: [PATCH] [perl #131211] fixup File::Glob degenerate matching
b8c914
MIME-Version: 1.0
b8c914
Content-Type: text/plain; charset=UTF-8
b8c914
Content-Transfer-Encoding: 8bit
b8c914
b8c914
The old code would go quadratic with recursion and backtracking
b8c914
when doing patterns like "a*a*a*a*a*a*a*x" on a file like
b8c914
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa".
b8c914
b8c914
This patch changes the code to not recurse, and to not backtrack,
b8c914
as per this article from Russ Cox: https://research.swtch.com/glob
b8c914
b8c914
It also adds a micro-optimisation for M_ONE and M_SET under the new code.
b8c914
b8c914
Thanks to Avar and Russ Cox for helping with this patch, along with
b8c914
Jilles Tjoelker and the rest of the FreeBSD community.
b8c914
b8c914
Signed-off-by: Petr Písař <ppisar@redhat.com>
b8c914
---
b8c914
 MANIFEST                   |  1 +
b8c914
 ext/File-Glob/bsd_glob.c   | 64 +++++++++++++++++++++++--------
b8c914
 ext/File-Glob/t/rt131211.t | 94 ++++++++++++++++++++++++++++++++++++++++++++++
b8c914
 3 files changed, 144 insertions(+), 15 deletions(-)
b8c914
 create mode 100644 ext/File-Glob/t/rt131211.t
b8c914
b8c914
diff --git a/MANIFEST b/MANIFEST
b8c914
index b7b6e74..af0da6c 100644
b8c914
--- a/MANIFEST
b8c914
+++ b/MANIFEST
b8c914
@@ -3948,6 +3948,7 @@ ext/File-Glob/t/basic.t		See if File::Glob works
b8c914
 ext/File-Glob/t/case.t		See if File::Glob works
b8c914
 ext/File-Glob/t/global.t	See if File::Glob works
b8c914
 ext/File-Glob/t/rt114984.t	See if File::Glob works
b8c914
+ext/File-Glob/t/rt131211.t	See if File::Glob works
b8c914
 ext/File-Glob/t/taint.t		See if File::Glob works
b8c914
 ext/File-Glob/t/threads.t	See if File::Glob + threads works
b8c914
 ext/File-Glob/TODO		File::Glob extension todo list
b8c914
diff --git a/ext/File-Glob/bsd_glob.c b/ext/File-Glob/bsd_glob.c
b8c914
index 821ef20..e96fb73 100644
b8c914
--- a/ext/File-Glob/bsd_glob.c
b8c914
+++ b/ext/File-Glob/bsd_glob.c
b8c914
@@ -563,8 +563,12 @@ glob0(const Char *pattern, glob_t *pglob)
b8c914
 			break;
b8c914
 		case BG_STAR:
b8c914
 			pglob->gl_flags |= GLOB_MAGCHAR;
b8c914
-			/* collapse adjacent stars to one,
b8c914
-			 * to avoid exponential behavior
b8c914
+                        /* Collapse adjacent stars to one.
b8c914
+                         * This is required to ensure that a pattern like
b8c914
+                         * "a**" matches a name like "a", as without this
b8c914
+                         * check when the first star matched everything it would
b8c914
+                         * cause the second star to return a match fail.
b8c914
+                         * As long ** is folded here this does not happen.
b8c914
 			 */
b8c914
 			if (bufnext == patbuf || bufnext[-1] != M_ALL)
b8c914
 				*bufnext++ = M_ALL;
b8c914
@@ -909,35 +913,56 @@ globextend(const Char *path, glob_t *pglob, size_t *limitp)
b8c914
 
b8c914
 
b8c914
 /*
b8c914
- * pattern matching function for filenames.  Each occurrence of the *
b8c914
- * pattern causes a recursion level.
b8c914
+ * pattern matching function for filenames using state machine to avoid
b8c914
+ * recursion. We maintain a "nextp" and "nextn" to allow us to backtrack
b8c914
+ * without additional callframes, and to do cleanly prune the backtracking
b8c914
+ * state when multiple '*' (start) matches are included in the patter.
b8c914
+ *
b8c914
+ * Thanks to Russ Cox for the improved state machine logic to avoid quadratic
b8c914
+ * matching on failure.
b8c914
+ *
b8c914
+ * https://research.swtch.com/glob
b8c914
+ *
b8c914
+ * An example would be a pattern
b8c914
+ *  ("a*" x 100) . "y"
b8c914
+ * against a file name like
b8c914
+ *  ("a" x 100) . "x"
b8c914
+ *
b8c914
  */
b8c914
 static int
b8c914
 match(Char *name, Char *pat, Char *patend, int nocase)
b8c914
 {
b8c914
 	int ok, negate_range;
b8c914
 	Char c, k;
b8c914
+	Char *nextp = NULL;
b8c914
+	Char *nextn = NULL;
b8c914
 
b8c914
+    loop:
b8c914
 	while (pat < patend) {
b8c914
 		c = *pat++;
b8c914
 		switch (c & M_MASK) {
b8c914
 		case M_ALL:
b8c914
 			if (pat == patend)
b8c914
 				return(1);
b8c914
-			do
b8c914
-			    if (match(name, pat, patend, nocase))
b8c914
-				    return(1);
b8c914
-			while (*name++ != BG_EOS)
b8c914
-				;
b8c914
-			return(0);
b8c914
+	                if (*name == BG_EOS)
b8c914
+	                        return 0;
b8c914
+			nextn = name + 1;
b8c914
+	                nextp = pat - 1;
b8c914
+			break;
b8c914
 		case M_ONE:
b8c914
+                        /* since * matches leftmost-shortest first   *
b8c914
+                         * if we encounter the EOS then backtracking *
b8c914
+                         * will not help, so we can exit early here. */
b8c914
 			if (*name++ == BG_EOS)
b8c914
-				return(0);
b8c914
+                                return 0;
b8c914
 			break;
b8c914
 		case M_SET:
b8c914
 			ok = 0;
b8c914
+                        /* since * matches leftmost-shortest first   *
b8c914
+                         * if we encounter the EOS then backtracking *
b8c914
+                         * will not help, so we can exit early here. */
b8c914
 			if ((k = *name++) == BG_EOS)
b8c914
-				return(0);
b8c914
+                                return 0;
b8c914
 			if ((negate_range = ((*pat & M_MASK) == M_NOT)) != BG_EOS)
b8c914
 				++pat;
b8c914
 			while (((c = *pat++) & M_MASK) != M_END)
b8c914
@@ -953,16 +978,25 @@ match(Char *name, Char *pat, Char *patend, int nocase)
b8c914
 				} else if (nocase ? (tolower(c) == tolower(k)) : (c == k))
b8c914
 					ok = 1;
b8c914
 			if (ok == negate_range)
b8c914
-				return(0);
b8c914
+				goto fail;
b8c914
 			break;
b8c914
 		default:
b8c914
 			k = *name++;
b8c914
 			if (nocase ? (tolower(k) != tolower(c)) : (k != c))
b8c914
-				return(0);
b8c914
+				goto fail;
b8c914
 			break;
b8c914
 		}
b8c914
 	}
b8c914
-	return(*name == BG_EOS);
b8c914
+	if (*name == BG_EOS)
b8c914
+		return 1;
b8c914
+
b8c914
+    fail:
b8c914
+	if (nextn) {
b8c914
+		pat = nextp;
b8c914
+		name = nextn;
b8c914
+		goto loop;
b8c914
+	}
b8c914
+	return 0;
b8c914
 }
b8c914
 
b8c914
 /* Free allocated data belonging to a glob_t structure. */
b8c914
diff --git a/ext/File-Glob/t/rt131211.t b/ext/File-Glob/t/rt131211.t
b8c914
new file mode 100644
b8c914
index 0000000..c1bcbe0
b8c914
--- /dev/null
b8c914
+++ b/ext/File-Glob/t/rt131211.t
b8c914
@@ -0,0 +1,94 @@
b8c914
+use strict;
b8c914
+use warnings;
b8c914
+use v5.16.0;
b8c914
+use File::Temp 'tempdir';
b8c914
+use File::Spec::Functions;
b8c914
+use Test::More;
b8c914
+use Time::HiRes qw(time);
b8c914
+
b8c914
+plan tests => 13;
b8c914
+
b8c914
+my $path = tempdir uc cleanup => 1;
b8c914
+my @files= (
b8c914
+    "x".("a" x 50)."b", # 0
b8c914
+    "abbbbbbbbbbbbc",   # 1
b8c914
+    "abbbbbbbbbbbbd",   # 2
b8c914
+    "aaabaaaabaaaabc",  # 3
b8c914
+    "pq",               # 4
b8c914
+    "r",                # 5
b8c914
+    "rttiiiiiii",       # 6
b8c914
+    "wewewewewewe",     # 7
b8c914
+    "weeeweeeweee",     # 8
b8c914
+    "weewweewweew",     # 9
b8c914
+    "wewewewewewewewewewewewewewewewewq", # 10
b8c914
+    "wtttttttetttttttwr", # 11
b8c914
+);
b8c914
+
b8c914
+
b8c914
+foreach (@files) {
b8c914
+    open(my $f, ">", catfile $path, $_);
b8c914
+}
b8c914
+
b8c914
+my $elapsed_fail= 0;
b8c914
+my $elapsed_match= 0;
b8c914
+my @got_files;
b8c914
+my @no_files;
b8c914
+my $count = 0;
b8c914
+
b8c914
+while (++$count < 10) {
b8c914
+    $elapsed_match -= time;
b8c914
+    @got_files= glob catfile $path, "x".("a*" x $count) . "b";
b8c914
+    $elapsed_match += time;
b8c914
+
b8c914
+    $elapsed_fail -= time;
b8c914
+    @no_files= glob catfile $path, "x".("a*" x $count) . "c";
b8c914
+    $elapsed_fail += time;
b8c914
+    last if $elapsed_fail > $elapsed_match * 100;
b8c914
+}
b8c914
+
b8c914
+is $count,10,
b8c914
+    "tried all the patterns without bailing out";
b8c914
+
b8c914
+cmp_ok $elapsed_fail/$elapsed_match,"<",2,
b8c914
+    "time to fail less than twice the time to match";
b8c914
+is "@got_files", catfile($path, $files[0]),
b8c914
+    "only got the expected file for xa*..b";
b8c914
+is "@no_files", "", "shouldnt have files for xa*..c";
b8c914
+
b8c914
+
b8c914
+@got_files= glob catfile $path, "a*b*b*b*bc";
b8c914
+is "@got_files", catfile($path, $files[1]),
b8c914
+    "only got the expected file for a*b*b*b*bc";
b8c914
+
b8c914
+@got_files= sort glob catfile $path, "a*b*b*bc";
b8c914
+is "@got_files", catfile($path, $files[3])." ".catfile($path,$files[1]),
b8c914
+    "got the expected two files for a*b*b*bc";
b8c914
+
b8c914
+@got_files= sort glob catfile $path, "p*";
b8c914
+is "@got_files", catfile($path, $files[4]),
b8c914
+    "p* matches pq";
b8c914
+
b8c914
+@got_files= sort glob catfile $path, "r*???????";
b8c914
+is "@got_files", catfile($path, $files[6]),
b8c914
+    "r*??????? works as expected";
b8c914
+
b8c914
+@got_files= sort glob catfile $path, "w*e*w??e";
b8c914
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (7,8)),
b8c914
+    "w*e*w??e works as expected";
b8c914
+
b8c914
+@got_files= sort glob catfile $path, "w*e*we??";
b8c914
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (7,8,9,10)),
b8c914
+    "w*e*we?? works as expected";
b8c914
+
b8c914
+@got_files= sort glob catfile $path, "w**e**w";
b8c914
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (9)),
b8c914
+    "w**e**w works as expected";
b8c914
+
b8c914
+@got_files= sort glob catfile $path, "*wee*";
b8c914
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (8,9)),
b8c914
+    "*wee* works as expected";
b8c914
+
b8c914
+@got_files= sort glob catfile $path, "we*";
b8c914
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (7,8,9,10)),
b8c914
+    "we* works as expected";
b8c914
+
b8c914
-- 
b8c914
2.9.4
b8c914