b8876f
From 30cba075ecbb662b392b2c6e896dec287ea49aa8 Mon Sep 17 00:00:00 2001
b8876f
From: Yves Orton <demerphq@gmail.com>
b8876f
Date: Tue, 25 Apr 2017 15:17:06 +0200
b8876f
Subject: [PATCH] fixup File::Glob degenerate matching
b8876f
MIME-Version: 1.0
b8876f
Content-Type: text/plain; charset=UTF-8
b8876f
Content-Transfer-Encoding: 8bit
b8876f
b8876f
Ported to 5.24.1:
b8876f
b8876f
commit 0db967b2e6a4093a6a5f649190159767e5d005e0
b8876f
Author: Yves Orton <demerphq@gmail.com>
b8876f
Date:   Tue Apr 25 15:17:06 2017 +0200
b8876f
b8876f
    [perl #131211] fixup File::Glob degenerate matching
b8876f
b8876f
    The old code would go quadratic with recursion and backtracking
b8876f
    when doing patterns like "a*a*a*a*a*a*a*x" on a file like
b8876f
    "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa".
b8876f
b8876f
    This patch changes the code to not recurse, and to not backtrack,
b8876f
    as per this article from Russ Cox: https://research.swtch.com/glob
b8876f
b8876f
    It also adds a micro-optimisation for M_ONE and M_SET under the new code.
b8876f
b8876f
    Thanks to Avar and Russ Cox for helping with this patch, along with
b8876f
    Jilles Tjoelker and the rest of the FreeBSD community.
b8876f
b8876f
Signed-off-by: Petr Písař <ppisar@redhat.com>
b8876f
---
b8876f
 MANIFEST                   |  1 +
b8876f
 ext/File-Glob/bsd_glob.c   | 64 +++++++++++++++++++++++--------
b8876f
 ext/File-Glob/t/rt131211.t | 94 ++++++++++++++++++++++++++++++++++++++++++++++
b8876f
 3 files changed, 144 insertions(+), 15 deletions(-)
b8876f
 create mode 100644 ext/File-Glob/t/rt131211.t
b8876f
b8876f
diff --git a/MANIFEST b/MANIFEST
b8876f
index fe045a7..be2a44f 100644
b8876f
--- a/MANIFEST
b8876f
+++ b/MANIFEST
b8876f
@@ -3678,6 +3678,7 @@ ext/File-Glob/t/case.t		See if File::Glob works
b8876f
 ext/File-Glob/t/global.t	See if File::Glob works
b8876f
 ext/File-Glob/TODO		File::Glob extension todo list
b8876f
 ext/File-Glob/t/rt114984.t	See if File::Glob works
b8876f
+ext/File-Glob/t/rt131211.t	See if File::Glob works
b8876f
 ext/File-Glob/t/taint.t		See if File::Glob works
b8876f
 ext/File-Glob/t/threads.t	See if File::Glob + threads works
b8876f
 ext/GDBM_File/GDBM_File.pm	GDBM extension Perl module
b8876f
diff --git a/ext/File-Glob/bsd_glob.c b/ext/File-Glob/bsd_glob.c
b8876f
index 821ef20..e96fb73 100644
b8876f
--- a/ext/File-Glob/bsd_glob.c
b8876f
+++ b/ext/File-Glob/bsd_glob.c
b8876f
@@ -563,8 +563,12 @@ glob0(const Char *pattern, glob_t *pglob)
b8876f
 			break;
b8876f
 		case BG_STAR:
b8876f
 			pglob->gl_flags |= GLOB_MAGCHAR;
b8876f
-			/* collapse adjacent stars to one,
b8876f
-			 * to avoid exponential behavior
b8876f
+                        /* Collapse adjacent stars to one.
b8876f
+                         * This is required to ensure that a pattern like
b8876f
+                         * "a**" matches a name like "a", as without this
b8876f
+                         * check when the first star matched everything it would
b8876f
+                         * cause the second star to report a match failure.
b8876f
+                         * As long as ** is folded here this does not happen.
b8876f
 			 */
b8876f
 			if (bufnext == patbuf || bufnext[-1] != M_ALL)
b8876f
 				*bufnext++ = M_ALL;
b8876f
@@ -909,35 +913,56 @@ globextend(const Char *path, glob_t *pglob, size_t *limitp)
b8876f
 
b8876f
 
b8876f
 /*
b8876f
- * pattern matching function for filenames.  Each occurrence of the *
b8876f
- * pattern causes a recursion level.
b8876f
+ * pattern matching function for filenames using state machine to avoid
b8876f
+ * recursion. We maintain a "nextp" and "nextn" to allow us to backtrack
b8876f
+ * without additional callframes, and to cleanly prune the backtracking
b8876f
+ * state when multiple '*' (star) matches are included in the pattern.
b8876f
+ *
b8876f
+ * Thanks to Russ Cox for the improved state machine logic to avoid quadratic
b8876f
+ * matching on failure.
b8876f
+ *
b8876f
+ * https://research.swtch.com/glob
b8876f
+ *
b8876f
+ * An example would be a pattern
b8876f
+ *  ("a*" x 100) . "y"
b8876f
+ * against a file name like
b8876f
+ *  ("a" x 100) . "x"
b8876f
+ *
b8876f
  */
b8876f
 static int
b8876f
 match(Char *name, Char *pat, Char *patend, int nocase)
b8876f
 {
b8876f
 	int ok, negate_range;
b8876f
 	Char c, k;
b8876f
+	Char *nextp = NULL;
b8876f
+	Char *nextn = NULL;
b8876f
 
b8876f
+    loop:
b8876f
 	while (pat < patend) {
b8876f
 		c = *pat++;
b8876f
 		switch (c & M_MASK) {
b8876f
 		case M_ALL:
b8876f
 			if (pat == patend)
b8876f
 				return(1);
b8876f
-			do
b8876f
-			    if (match(name, pat, patend, nocase))
b8876f
-				    return(1);
b8876f
-			while (*name++ != BG_EOS)
b8876f
-				;
b8876f
-			return(0);
b8876f
+	                if (*name == BG_EOS)
b8876f
+	                        return 0;
b8876f
+			nextn = name + 1;
b8876f
+	                nextp = pat - 1;
b8876f
+			break;
b8876f
 		case M_ONE:
b8876f
+                        /* since * matches leftmost-shortest first   *
b8876f
+                         * if we encounter the EOS then backtracking *
b8876f
+                         * will not help, so we can exit early here. */
b8876f
 			if (*name++ == BG_EOS)
b8876f
-				return(0);
b8876f
+                                return 0;
b8876f
 			break;
b8876f
 		case M_SET:
b8876f
 			ok = 0;
b8876f
+                        /* since * matches leftmost-shortest first   *
b8876f
+                         * if we encounter the EOS then backtracking *
b8876f
+                         * will not help, so we can exit early here. */
b8876f
 			if ((k = *name++) == BG_EOS)
b8876f
-				return(0);
b8876f
+                                return 0;
b8876f
 			if ((negate_range = ((*pat & M_MASK) == M_NOT)) != BG_EOS)
b8876f
 				++pat;
b8876f
 			while (((c = *pat++) & M_MASK) != M_END)
b8876f
@@ -953,16 +978,25 @@ match(Char *name, Char *pat, Char *patend, int nocase)
b8876f
 				} else if (nocase ? (tolower(c) == tolower(k)) : (c == k))
b8876f
 					ok = 1;
b8876f
 			if (ok == negate_range)
b8876f
-				return(0);
b8876f
+				goto fail;
b8876f
 			break;
b8876f
 		default:
b8876f
 			k = *name++;
b8876f
 			if (nocase ? (tolower(k) != tolower(c)) : (k != c))
b8876f
-				return(0);
b8876f
+				goto fail;
b8876f
 			break;
b8876f
 		}
b8876f
 	}
b8876f
-	return(*name == BG_EOS);
b8876f
+	if (*name == BG_EOS)
b8876f
+		return 1;
b8876f
+
b8876f
+    fail:
b8876f
+	if (nextn) {
b8876f
+		pat = nextp;
b8876f
+		name = nextn;
b8876f
+		goto loop;
b8876f
+	}
b8876f
+	return 0;
b8876f
 }
b8876f
 
b8876f
 /* Free allocated data belonging to a glob_t structure. */
b8876f
diff --git a/ext/File-Glob/t/rt131211.t b/ext/File-Glob/t/rt131211.t
b8876f
new file mode 100644
b8876f
index 0000000..c1bcbe0
b8876f
--- /dev/null
b8876f
+++ b/ext/File-Glob/t/rt131211.t
b8876f
@@ -0,0 +1,94 @@
b8876f
+use strict;
b8876f
+use warnings;
b8876f
+use v5.16.0;
b8876f
+use File::Temp 'tempdir';
b8876f
+use File::Spec::Functions;
b8876f
+use Test::More;
b8876f
+use Time::HiRes qw(time);
b8876f
+
b8876f
+plan tests => 13;
b8876f
+
b8876f
+my $path = tempdir uc cleanup => 1;
b8876f
+my @files= (
b8876f
+    "x".("a" x 50)."b", # 0
b8876f
+    "abbbbbbbbbbbbc",   # 1
b8876f
+    "abbbbbbbbbbbbd",   # 2
b8876f
+    "aaabaaaabaaaabc",  # 3
b8876f
+    "pq",               # 4
b8876f
+    "r",                # 5
b8876f
+    "rttiiiiiii",       # 6
b8876f
+    "wewewewewewe",     # 7
b8876f
+    "weeeweeeweee",     # 8
b8876f
+    "weewweewweew",     # 9
b8876f
+    "wewewewewewewewewewewewewewewewewq", # 10
b8876f
+    "wtttttttetttttttwr", # 11
b8876f
+);
b8876f
+
b8876f
+
b8876f
+foreach (@files) {
b8876f
+    open(my $f, ">", catfile $path, $_);
b8876f
+}
b8876f
+
b8876f
+my $elapsed_fail= 0;
b8876f
+my $elapsed_match= 0;
b8876f
+my @got_files;
b8876f
+my @no_files;
b8876f
+my $count = 0;
b8876f
+
b8876f
+while (++$count < 10) {
b8876f
+    $elapsed_match -= time;
b8876f
+    @got_files= glob catfile $path, "x".("a*" x $count) . "b";
b8876f
+    $elapsed_match += time;
b8876f
+
b8876f
+    $elapsed_fail -= time;
b8876f
+    @no_files= glob catfile $path, "x".("a*" x $count) . "c";
b8876f
+    $elapsed_fail += time;
b8876f
+    last if $elapsed_fail > $elapsed_match * 100;
b8876f
+}
b8876f
+
b8876f
+is $count,10,
b8876f
+    "tried all the patterns without bailing out";
b8876f
+
b8876f
+cmp_ok $elapsed_fail/$elapsed_match,"<",2,
b8876f
+    "time to fail less than twice the time to match";
b8876f
+is "@got_files", catfile($path, $files[0]),
b8876f
+    "only got the expected file for xa*..b";
b8876f
+is "@no_files", "", "shouldnt have files for xa*..c";
b8876f
+
b8876f
+
b8876f
+@got_files= glob catfile $path, "a*b*b*b*bc";
b8876f
+is "@got_files", catfile($path, $files[1]),
b8876f
+    "only got the expected file for a*b*b*b*bc";
b8876f
+
b8876f
+@got_files= sort glob catfile $path, "a*b*b*bc";
b8876f
+is "@got_files", catfile($path, $files[3])." ".catfile($path,$files[1]),
b8876f
+    "got the expected two files for a*b*b*bc";
b8876f
+
b8876f
+@got_files= sort glob catfile $path, "p*";
b8876f
+is "@got_files", catfile($path, $files[4]),
b8876f
+    "p* matches pq";
b8876f
+
b8876f
+@got_files= sort glob catfile $path, "r*???????";
b8876f
+is "@got_files", catfile($path, $files[6]),
b8876f
+    "r*??????? works as expected";
b8876f
+
b8876f
+@got_files= sort glob catfile $path, "w*e*w??e";
b8876f
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (7,8)),
b8876f
+    "w*e*w??e works as expected";
b8876f
+
b8876f
+@got_files= sort glob catfile $path, "w*e*we??";
b8876f
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (7,8,9,10)),
b8876f
+    "w*e*we?? works as expected";
b8876f
+
b8876f
+@got_files= sort glob catfile $path, "w**e**w";
b8876f
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (9)),
b8876f
+    "w**e**w works as expected";
b8876f
+
b8876f
+@got_files= sort glob catfile $path, "*wee*";
b8876f
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (8,9)),
b8876f
+    "*wee* works as expected";
b8876f
+
b8876f
+@got_files= sort glob catfile $path, "we*";
b8876f
+is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (7,8,9,10)),
b8876f
+    "we* works as expected";
b8876f
+
b8876f
-- 
b8876f
2.9.4
b8876f