From 30cba075ecbb662b392b2c6e896dec287ea49aa8 Mon Sep 17 00:00:00 2001 From: Yves Orton Date: Tue, 25 Apr 2017 15:17:06 +0200 Subject: [PATCH] fixup File::Glob degenerate matching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ported to 5.24.1: commit 0db967b2e6a4093a6a5f649190159767e5d005e0 Author: Yves Orton Date: Tue Apr 25 15:17:06 2017 +0200 [perl #131211] fixup File::Glob degenerate matching The old code would go quadratic with recursion and backtracking when doing patterns like "a*a*a*a*a*a*a*x" on a file like "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa". This patch changes the code to not recurse, and to not backtrack, as per this article from Russ Cox: https://research.swtch.com/glob It also adds a micro-optimisation for M_ONE and M_SET under the new code. Thanks to Avar and Russ Cox for helping with this patch, along with Jilles Tjoelker and the rest of the FreeBSD community. Signed-off-by: Petr Písař --- MANIFEST | 1 + ext/File-Glob/bsd_glob.c | 64 +++++++++++++++++++++++-------- ext/File-Glob/t/rt131211.t | 94 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 144 insertions(+), 15 deletions(-) create mode 100644 ext/File-Glob/t/rt131211.t diff --git a/MANIFEST b/MANIFEST index fe045a7..be2a44f 100644 --- a/MANIFEST +++ b/MANIFEST @@ -3678,6 +3678,7 @@ ext/File-Glob/t/case.t See if File::Glob works ext/File-Glob/t/global.t See if File::Glob works ext/File-Glob/TODO File::Glob extension todo list ext/File-Glob/t/rt114984.t See if File::Glob works +ext/File-Glob/t/rt131211.t See if File::Glob works ext/File-Glob/t/taint.t See if File::Glob works ext/File-Glob/t/threads.t See if File::Glob + threads works ext/GDBM_File/GDBM_File.pm GDBM extension Perl module diff --git a/ext/File-Glob/bsd_glob.c b/ext/File-Glob/bsd_glob.c index 821ef20..e96fb73 100644 --- a/ext/File-Glob/bsd_glob.c +++ b/ext/File-Glob/bsd_glob.c @@ -563,8 +563,12 @@ glob0(const Char *pattern, glob_t *pglob) break; case BG_STAR: pglob->gl_flags |= GLOB_MAGCHAR; - /* collapse adjacent stars to one, - * to avoid exponential behavior + /* Collapse adjacent stars to one. + * This is required to ensure that a pattern like + * "a**" matches a name like "a", as without this + * check when the first star matched everything it would + * cause the second star to return a match fail. + * As long ** is folded here this does not happen. */ if (bufnext == patbuf || bufnext[-1] != M_ALL) *bufnext++ = M_ALL; @@ -909,35 +913,56 @@ globextend(const Char *path, glob_t *pglob, size_t *limitp) /* - * pattern matching function for filenames. Each occurrence of the * - * pattern causes a recursion level. + * pattern matching function for filenames using state machine to avoid + * recursion. We maintain a "nextp" and "nextn" to allow us to backtrack + * without additional callframes, and to do cleanly prune the backtracking + * state when multiple '*' (start) matches are included in the patter. + * + * Thanks to Russ Cox for the improved state machine logic to avoid quadratic + * matching on failure. + * + * https://research.swtch.com/glob + * + * An example would be a pattern + * ("a*" x 100) . "y" + * against a file name like + * ("a" x 100) . "x" + * */ static int match(Char *name, Char *pat, Char *patend, int nocase) { int ok, negate_range; Char c, k; + Char *nextp = NULL; + Char *nextn = NULL; + loop: while (pat < patend) { c = *pat++; switch (c & M_MASK) { case M_ALL: if (pat == patend) return(1); - do - if (match(name, pat, patend, nocase)) - return(1); - while (*name++ != BG_EOS) - ; - return(0); + if (*name == BG_EOS) + return 0; + nextn = name + 1; + nextp = pat - 1; + break; case M_ONE: + /* since * matches leftmost-shortest first * + * if we encounter the EOS then backtracking * + * will not help, so we can exit early here. */ if (*name++ == BG_EOS) - return(0); + return 0; break; case M_SET: ok = 0; + /* since * matches leftmost-shortest first * + * if we encounter the EOS then backtracking * + * will not help, so we can exit early here. */ if ((k = *name++) == BG_EOS) - return(0); + return 0; if ((negate_range = ((*pat & M_MASK) == M_NOT)) != BG_EOS) ++pat; while (((c = *pat++) & M_MASK) != M_END) @@ -953,16 +978,25 @@ match(Char *name, Char *pat, Char *patend, int nocase) } else if (nocase ? (tolower(c) == tolower(k)) : (c == k)) ok = 1; if (ok == negate_range) - return(0); + goto fail; break; default: k = *name++; if (nocase ? (tolower(k) != tolower(c)) : (k != c)) - return(0); + goto fail; break; } } - return(*name == BG_EOS); + if (*name == BG_EOS) + return 1; + + fail: + if (nextn) { + pat = nextp; + name = nextn; + goto loop; + } + return 0; } /* Free allocated data belonging to a glob_t structure. */ diff --git a/ext/File-Glob/t/rt131211.t b/ext/File-Glob/t/rt131211.t new file mode 100644 index 0000000..c1bcbe0 --- /dev/null +++ b/ext/File-Glob/t/rt131211.t @@ -0,0 +1,94 @@ +use strict; +use warnings; +use v5.16.0; +use File::Temp 'tempdir'; +use File::Spec::Functions; +use Test::More; +use Time::HiRes qw(time); + +plan tests => 13; + +my $path = tempdir uc cleanup => 1; +my @files= ( + "x".("a" x 50)."b", # 0 + "abbbbbbbbbbbbc", # 1 + "abbbbbbbbbbbbd", # 2 + "aaabaaaabaaaabc", # 3 + "pq", # 4 + "r", # 5 + "rttiiiiiii", # 6 + "wewewewewewe", # 7 + "weeeweeeweee", # 8 + "weewweewweew", # 9 + "wewewewewewewewewewewewewewewewewq", # 10 + "wtttttttetttttttwr", # 11 +); + + +foreach (@files) { + open(my $f, ">", catfile $path, $_); +} + +my $elapsed_fail= 0; +my $elapsed_match= 0; +my @got_files; +my @no_files; +my $count = 0; + +while (++$count < 10) { + $elapsed_match -= time; + @got_files= glob catfile $path, "x".("a*" x $count) . "b"; + $elapsed_match += time; + + $elapsed_fail -= time; + @no_files= glob catfile $path, "x".("a*" x $count) . "c"; + $elapsed_fail += time; + last if $elapsed_fail > $elapsed_match * 100; +} + +is $count,10, + "tried all the patterns without bailing out"; + +cmp_ok $elapsed_fail/$elapsed_match,"<",2, + "time to fail less than twice the time to match"; +is "@got_files", catfile($path, $files[0]), + "only got the expected file for xa*..b"; +is "@no_files", "", "shouldnt have files for xa*..c"; + + +@got_files= glob catfile $path, "a*b*b*b*bc"; +is "@got_files", catfile($path, $files[1]), + "only got the expected file for a*b*b*b*bc"; + +@got_files= sort glob catfile $path, "a*b*b*bc"; +is "@got_files", catfile($path, $files[3])." ".catfile($path,$files[1]), + "got the expected two files for a*b*b*bc"; + +@got_files= sort glob catfile $path, "p*"; +is "@got_files", catfile($path, $files[4]), + "p* matches pq"; + +@got_files= sort glob catfile $path, "r*???????"; +is "@got_files", catfile($path, $files[6]), + "r*??????? works as expected"; + +@got_files= sort glob catfile $path, "w*e*w??e"; +is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (7,8)), + "w*e*w??e works as expected"; + +@got_files= sort glob catfile $path, "w*e*we??"; +is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (7,8,9,10)), + "w*e*we?? works as expected"; + +@got_files= sort glob catfile $path, "w**e**w"; +is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (9)), + "w**e**w works as expected"; + +@got_files= sort glob catfile $path, "*wee*"; +is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (8,9)), + "*wee* works as expected"; + +@got_files= sort glob catfile $path, "we*"; +is "@got_files", join(" ", sort map { catfile($path, $files[$_]) } (7,8,9,10)), + "we* works as expected"; + -- 2.9.4