34484a
From 0db967b2e6a4093a6a5f649190159767e5d005e0 Mon Sep 17 00:00:00 2001
34484a
From: Yves Orton <demerphq@gmail.com>
34484a
Date: Tue, 25 Apr 2017 15:17:06 +0200
34484a
Subject: [PATCH] [perl #131211] fixup File::Glob degenerate matching
34484a
MIME-Version: 1.0
34484a
Content-Type: text/plain; charset=UTF-8
34484a
Content-Transfer-Encoding: 8bit
34484a
34484a
The old code would go quadratic with recursion and backtracking
34484a
when doing patterns like "a*a*a*a*a*a*a*x" on a file like
34484a
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa".
34484a
34484a
This patch changes the code to not recurse, and to not backtrack,
34484a
as per this article from Russ Cox: https://research.swtch.com/glob
34484a
34484a
It also adds a micro-optimisation for M_ONE and M_SET under the new code.
34484a
34484a
Thanks to Avar and Russ Cox for helping with this patch, along with
34484a
Jilles Tjoelker and the rest of the FreeBSD community.
34484a
34484a
Signed-off-by: Petr Písař <ppisar@redhat.com>
34484a
---
34484a
 MANIFEST                   |  1 +
34484a
 ext/File-Glob/bsd_glob.c   | 64 +++++++++++++++++++++++--------
34484a
 ext/File-Glob/t/rt131211.t | 94 ++++++++++++++++++++++++++++++++++++++++++++++
34484a
 3 files changed, 144 insertions(+), 15 deletions(-)
34484a
 create mode 100644 ext/File-Glob/t/rt131211.t
34484a
34484a
diff --git a/MANIFEST b/MANIFEST
34484a
index b7b6e74..af0da6c 100644
34484a
--- a/MANIFEST
34484a
+++ b/MANIFEST
34484a
@@ -3948,6 +3948,7 @@ ext/File-Glob/t/basic.t		See if File::Glob works
34484a
 ext/File-Glob/t/case.t		See if File::Glob works
34484a
 ext/File-Glob/t/global.t	See if File::Glob works
34484a
 ext/File-Glob/t/rt114984.t	See if File::Glob works
34484a
+ext/File-Glob/t/rt131211.t	See if File::Glob works
34484a
 ext/File-Glob/t/taint.t		See if File::Glob works
34484a
 ext/File-Glob/t/threads.t	See if File::Glob + threads works
34484a
 ext/File-Glob/TODO		File::Glob extension todo list
34484a
diff --git a/ext/File-Glob/bsd_glob.c b/ext/File-Glob/bsd_glob.c
34484a
index 821ef20..e96fb73 100644
34484a
--- a/ext/File-Glob/bsd_glob.c
34484a
+++ b/ext/File-Glob/bsd_glob.c
34484a
@@ -563,8 +563,12 @@ glob0(const Char *pattern, glob_t *pglob)
34484a
 			break;
34484a
 		case BG_STAR:
34484a
 			pglob->gl_flags |= GLOB_MAGCHAR;
34484a
-			/* collapse adjacent stars to one,
34484a
-			 * to avoid exponential behavior
34484a
+                        /* Collapse adjacent stars to one.
34484a
+                         * This is required to ensure that a pattern like
34484a
+                         * "a**" matches a name like "a", as without this
34484a
+                         * check when the first star matched everything it would
34484a
+                         * cause the second star to return a match fail.
34484a
+                         * As long ** is folded here this does not happen.
34484a
 			 */
34484a
 			if (bufnext == patbuf || bufnext[-1] != M_ALL)
34484a
 				*bufnext++ = M_ALL;
34484a
@@ -909,35 +913,56 @@ globextend(const Char *path, glob_t *pglob, size_t *limitp)
34484a
 
34484a
 
34484a
 /*
34484a
- * pattern matching function for filenames.  Each occurrence of the *
34484a
- * pattern causes a recursion level.
34484a
+ * pattern matching function for filenames using state machine to avoid
34484a
+ * recursion. We maintain a "nextp" and "nextn" to allow us to backtrack
34484a
+ * without additional callframes, and to do cleanly prune the backtracking
34484a
+ * state when multiple '*' (start) matches are included in the patter.
34484a
+ *
34484a
+ * Thanks to Russ Cox for the improved state machine logic to avoid quadratic
34484a
+ * matching on failure.
34484a
+ *
34484a
+ * https://research.swtch.com/glob
34484a
+ *
34484a
+ * An example would be a pattern
34484a
+ *  ("a*" x 100) . "y"
34484a
+ * against a file name like
34484a
+ *  ("a" x 100) . "x"
34484a
+ *
34484a
  */
34484a
 static int
34484a
 match(Char *name, Char *pat, Char *patend, int nocase)
34484a
 {
34484a
 	int ok, negate_range;
34484a
 	Char c, k;
34484a
+	Char *nextp = NULL;
34484a
+	Char *nextn = NULL;
34484a
 
34484a
+    loop:
34484a
 	while (pat < patend) {
34484a
 		c = *pat++;
34484a
 		switch (c & M_MASK) {
34484a
 		case M_ALL:
34484a
 			if (pat == patend)
34484a
 				return(1);
34484a
-			do
34484a
-			    if (match(name, pat, patend, nocase))
34484a
-				    return(1);
34484a
-			while (*name++ != BG_EOS)
34484a
-				;
34484a
-			return(0);
34484a
+	                if (*name == BG_EOS)
34484a
+	                        return 0;
34484a
+			nextn = name + 1;
34484a
+	                nextp = pat - 1;
34484a
+			break;
34484a
 		case M_ONE:
34484a
+                        /* since * matches leftmost-shortest first   *
34484a
+                         * if we encounter the EOS then backtracking *
34484a
+                         * will not help, so we can exit early here. */
34484a
 			if (*name++ == BG_EOS)
34484a
-				return(0);
34484a
+                                return 0;
34484a
 			break;
34484a
 		case M_SET:
34484a
 			ok = 0;
34484a
+                        /* since * matches leftmost-shortest first   *
34484a
+                         * if we encounter the EOS then backtracking *
34484a
+                         * will not help, so we can exit early here. */
34484a
 			if ((k = *name++) == BG_EOS)
34484a
-				return(0);
34484a
+                                return 0;
34484a
 			if ((negate_range = ((*pat & M_MASK) == M_NOT)) != BG_EOS)
34484a
 				++pat;
34484a
 			while (((c = *pat++) & M_MASK) != M_END)
34484a
@@ -953,16 +978,25 @@ match(Char *name, Char *pat, Char *patend, int nocase)
34484a
 				} else if (nocase ? (tolower(c) == tolower(k)) : (c == k))
34484a
 					ok = 1;
34484a
 			if (ok == negate_range)
34484a
-				return(0);
34484a
+				goto fail;
34484a
 			break;
34484a
 		default:
34484a
 			k = *name++;
34484a
 			if (nocase ? (tolower(k) != tolower(c)) : (k != c))
34484a
-				return(0);
34484a
+				goto fail;
34484a
 			break;
34484a
 		}
34484a
 	}
34484a
-	return(*name == BG_EOS);
34484a
+	if (*name == BG_EOS)
34484a
+		return 1;
34484a
+
34484a
+    fail:
34484a
+	if (nextn) {
34484a
+		pat = nextp;
34484a
+		name = nextn;
34484a
+		goto loop;
34484a
+	}
34484a
+	return 0;
34484a
 }
34484a
 
34484a
 /* Free allocated data belonging to a glob_t structure. */
34484a
diff --git a/ext/File-Glob/t/rt131211.t b/ext/File-Glob/t/rt131211.t
34484a
new file mode 100644
34484a
index 0000000..c1bcbe0
34484a
--- /dev/null
34484a
+++ b/ext/File-Glob/t/rt131211.t
34484a
@@ -0,0 +1,94 @@
34484a
+use strict;
34484a
+use warnings;
34484a
+use v5.16.0;
34484a
+use File::Temp 'tempdir';
34484a
+use File::Spec::Functions;
34484a
+use Test::More;
34484a
+use Time::HiRes qw(time);
34484a
+
34484a
+plan tests => 13;
34484a
+
34484a
+my $path = tempdir uc cleanup => 1;
34484a
+my @files= (
34484a
+    "x".("a" x 50)."b", # 0
34484a
+    "abbbbbbbbbbbbc",   # 1
34484a
+    "abbbbbbbbbbbbd",   # 2
34484a
+    "aaabaaaabaaaabc",  # 3
34484a
+    "pq",               # 4
34484a
+    "r",                # 5
34484a
+    "rttiiiiiii",       # 6
34484a
+    "wewewewewewe",     # 7
34484a
+    "weeeweeeweee",     # 8
34484a
+    "weewweewweew",     # 9
34484a
+    "wewewewewewewewewewewewewewewewewq", # 10
34484a
+    "wtttttttetttttttwr", # 11
34484a
+);
34484a
+
34484a
+
34484a
+foreach (@files) {
34484a
+    open(my $f, ">", catfile $path, $_);
34484a
+}
34484a
+
34484a
+my $elapsed_fail= 0;
34484a
+my $elapsed_match= 0;
34484a
+my @got_files;
34484a
+my @no_files;
34484a
+my $count = 0;
34484a
+
34484a
+while (++$count < 10) {
34484a
+    $elapsed_match -= time;
34484a
+    @got_files= glob catfile $path, "x".("a*" x $count) . "b";
34484a
+    $elapsed_match += time;
34484a
+
34484a
+    $elapsed_fail -= time;
34484a
+    @no_files= glob catfile $path, "x".("a*" x $count) . "c";
34484a
+    $elapsed_fail += time;
34484a
+    last if $elapsed_fail > $elapsed_match * 100;
34484a
+}
34484a
+
34484a
+is $count,10,
34484a
+    "tried all the patterns without bailing out";
34484a
+
34484a
+cmp_ok $elapsed_fail/$elapsed_match,"<",2,
34484a
+    "time to fail less than twice the time to match";
34484a
+is "@got_files", catfile($path, $files[0]),
34484a
+    "only got the expected file for xa*..b";
34484a
+is "@no_files", "", "shouldnt have files for xa*..c";
34484a
+
34484a
+
34484a
+@got_files= glob catfile $path, "a*b*b*b*bc";
34484a
+is "@got_files", catfile($path, $files[1]),
34484a
+    "only got the expected file for a*b*b*b*bc";
34484a
+
34484a
+@got_files= sort glob catfile $path, "a*b*b*bc";
34484a
+is "@got_files", catfile($path, $files[3])." ".catfile($path,$files[1]),
34484a
+    "got the expected two files for a*b*b*bc";
34484a
+
34484a
+@got_files= sort glob catfile $path, "p*";
34484a
+is "@got_files", catfile($path, $files[4]),
34484a
+    "p* matches pq";
34484a
+
34484a
+@got_files= sort glob catfile $path, "r*???????";
34484a
+is "@got_files", catfile($path, $files[6]),
34484a
+    "r*??????? works as expected";
34484a
+
34484a
+@got_files= sort glob catfile $path, "w*e*w??e";
34484a
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (7,8)),
34484a
+    "w*e*w??e works as expected";
34484a
+
34484a
+@got_files= sort glob catfile $path, "w*e*we??";
34484a
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (7,8,9,10)),
34484a
+    "w*e*we?? works as expected";
34484a
+
34484a
+@got_files= sort glob catfile $path, "w**e**w";
34484a
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (9)),
34484a
+    "w**e**w works as expected";
34484a
+
34484a
+@got_files= sort glob catfile $path, "*wee*";
34484a
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (8,9)),
34484a
+    "*wee* works as expected";
34484a
+
34484a
+@got_files= sort glob catfile $path, "we*";
34484a
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (7,8,9,10)),
34484a
+    "we* works as expected";
34484a
+
34484a
-- 
34484a
2.9.4
34484a