To: vim_dev@googlegroups.com Subject: Patch 7.3.970 Fcc: outbox From: Bram Moolenaar Mime-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ------------ Patch 7.3.970 Problem: Syntax highlighting can be slow. Solution: Include the NFA regexp engine. Add the 'regexpengine' option to select which one is used. (various authors, including Ken Takata, Andrei Aiordachioaie, Russ Cox, Xiaozhou Liua, Ian Young) Files: src/Make_cyg.mak, src/Make_ming.mak, src/Make_mvc.mak, src/Makefile, src/regexp.c, src/regexp.h, src/regexp_nfa.c, src/structs.h, src/testdir/Makefile, src/testdir/test64.in, src/testdir/test64.ok, Filelist, runtime/doc/pattern.txt, runtime/doc/option.txt, src/option.c, src/option.h, src/testdir/test95.in, src/testdir/test95.ok, src/testdir/Make_amiga.mak, src/testdir/Make_dos.mak, src/testdir/Make_ming.mak, src/testdir/Make_os2.mak, src/testdir/Make_vms.mms *** ../vim-7.3.969/src/Make_cyg.mak 2013-05-06 04:21:35.000000000 +0200 --- src/Make_cyg.mak 2013-05-17 18:54:19.000000000 +0200 *************** *** 672,677 **** --- 672,680 ---- $(OUTDIR)/netbeans.o: netbeans.c $(INCL) $(NBDEBUG_DEP) $(CC) -c $(CFLAGS) netbeans.c -o $(OUTDIR)/netbeans.o + $(OUTDIR)/regexp.o: regexp.c regexp_nfa.c $(INCL) + $(CC) -c $(CFLAGS) regexp.c -o $(OUTDIR)/regexp.o + $(OUTDIR)/if_mzsch.o: if_mzsch.c $(INCL) if_mzsch.h $(MZ_EXTRA_DEP) $(CC) -c $(CFLAGS) if_mzsch.c -o $(OUTDIR)/if_mzsch.o *** ../vim-7.3.969/src/Make_ming.mak 2013-03-07 15:16:16.000000000 +0100 --- src/Make_ming.mak 2013-05-17 18:54:19.000000000 +0200 *************** *** 765,770 **** --- 765,773 ---- $(OUTDIR)/netbeans.o: netbeans.c $(INCL) $(NBDEBUG_INCL) $(NBDEBUG_SRC) $(CC) -c $(CFLAGS) netbeans.c -o $(OUTDIR)/netbeans.o + $(OUTDIR)/regexp.o: regexp.c regexp_nfa.c $(INCL) + $(CC) -c $(CFLAGS) regexp.c -o $(OUTDIR)/regexp.o + $(OUTDIR)/if_mzsch.o: if_mzsch.c $(INCL) if_mzsch.h $(MZ_EXTRA_DEP) $(CC) -c $(CFLAGS) if_mzsch.c -o $(OUTDIR)/if_mzsch.o *** ../vim-7.3.969/src/Make_mvc.mak 2013-05-07 05:11:12.000000000 +0200 --- src/Make_mvc.mak 2013-05-19 16:38:29.000000000 +0200 *************** *** 1166,1172 **** $(OUTDIR)/quickfix.obj: $(OUTDIR) quickfix.c $(INCL) ! $(OUTDIR)/regexp.obj: $(OUTDIR) regexp.c $(INCL) $(OUTDIR)/screen.obj: $(OUTDIR) screen.c $(INCL) --- 1173,1179 ---- $(OUTDIR)/quickfix.obj: $(OUTDIR) quickfix.c $(INCL) ! $(OUTDIR)/regexp.obj: $(OUTDIR) regexp.c regexp_nfa.c $(INCL) $(OUTDIR)/screen.obj: $(OUTDIR) screen.c $(INCL) *** ../vim-7.3.969/src/Makefile 2013-05-06 04:21:35.000000000 +0200 --- src/Makefile 2013-05-18 23:42:30.000000000 +0200 *************** *** 454,460 **** # MULTIBYTE - To edit multi-byte characters. # Uncomment this when you want to edit a multibyte language. ! # It's automatically enabled with big features or IME support. # Note: Compile on a machine where setlocale() actually works, otherwise the # configure tests may fail. #CONF_OPT_MULTIBYTE = --enable-multibyte --- 454,460 ---- # MULTIBYTE - To edit multi-byte characters. # Uncomment this when you want to edit a multibyte language. ! # It's automatically enabled with normal features, GTK or IME support. # Note: Compile on a machine where setlocale() actually works, otherwise the # configure tests may fail. #CONF_OPT_MULTIBYTE = --enable-multibyte *************** *** 2664,2670 **** objects/quickfix.o: quickfix.c $(CCC) -o $@ quickfix.c ! objects/regexp.o: regexp.c $(CCC) -o $@ regexp.c objects/screen.o: screen.c --- 2664,2670 ---- objects/quickfix.o: quickfix.c $(CCC) -o $@ quickfix.c ! objects/regexp.o: regexp.c regexp_nfa.c $(CCC) -o $@ regexp.c objects/screen.o: screen.c *************** *** 2938,2947 **** auto/osdef.h ascii.h keymap.h term.h macros.h option.h structs.h \ regexp.h gui.h gui_beval.h proto/gui_beval.pro ex_cmds.h proto.h \ globals.h farsi.h arabic.h ! objects/regexp.o: regexp.c vim.h auto/config.h feature.h os_unix.h auto/osdef.h \ ! ascii.h keymap.h term.h macros.h option.h structs.h regexp.h gui.h \ ! gui_beval.h proto/gui_beval.pro ex_cmds.h proto.h globals.h farsi.h \ ! arabic.h objects/screen.o: screen.c vim.h auto/config.h feature.h os_unix.h auto/osdef.h \ ascii.h keymap.h term.h macros.h option.h structs.h regexp.h gui.h \ gui_beval.h proto/gui_beval.pro ex_cmds.h proto.h globals.h farsi.h \ --- 2938,2947 ---- auto/osdef.h ascii.h keymap.h term.h macros.h option.h structs.h \ regexp.h gui.h gui_beval.h proto/gui_beval.pro ex_cmds.h proto.h \ globals.h farsi.h arabic.h ! objects/regexp.o: regexp.c regexp_nfa.c vim.h auto/config.h feature.h os_unix.h \ ! auto/osdef.h ascii.h keymap.h term.h macros.h option.h structs.h \ ! regexp.h gui.h gui_beval.h proto/gui_beval.pro ex_cmds.h proto.h \ ! globals.h farsi.h arabic.h objects/screen.o: screen.c vim.h auto/config.h feature.h os_unix.h auto/osdef.h \ ascii.h keymap.h term.h macros.h option.h structs.h regexp.h gui.h \ gui_beval.h proto/gui_beval.pro ex_cmds.h proto.h globals.h farsi.h \ *** ../vim-7.3.969/src/regexp.c 2013-03-19 17:42:10.000000000 +0100 --- src/regexp.c 2013-05-18 19:47:06.000000000 +0200 *************** *** 38,46 **** * Named character class support added by Walter Briscoe (1998 Jul 01) */ #include "vim.h" ! #undef DEBUG /* * The "internal use only" fields in regexp.h are present to pass info from --- 38,57 ---- * Named character class support added by Walter Briscoe (1998 Jul 01) */ + /* Uncomment the first if you do not want to see debugging logs or files + * related to regular expressions, even when compiling with -DDEBUG. + * Uncomment the second to get the regexp debugging. */ + /* #undef DEBUG */ + /* #define DEBUG */ + #include "vim.h" ! #ifdef DEBUG ! /* show/save debugging data when BT engine is used */ ! # define BT_REGEXP_DUMP ! /* save the debugging data to a file instead of displaying it */ ! # define BT_REGEXP_LOG ! #endif /* * The "internal use only" fields in regexp.h are present to pass info from *************** *** 326,334 **** /* Used for an error (down from) vim_regcomp(): give the error message, set * rc_did_emsg and return NULL */ #define EMSG_RET_NULL(m) return (EMSG(m), rc_did_emsg = TRUE, (void *)NULL) - #define EMSG_M_RET_NULL(m, c) return (EMSG2((m), (c) ? "" : "\\"), rc_did_emsg = TRUE, (void *)NULL) #define EMSG_RET_FAIL(m) return (EMSG(m), rc_did_emsg = TRUE, FAIL) ! #define EMSG_ONE_RET_NULL EMSG_M_RET_NULL(_("E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL) #define MAX_LIMIT (32767L << 16L) --- 337,346 ---- /* Used for an error (down from) vim_regcomp(): give the error message, set * rc_did_emsg and return NULL */ #define EMSG_RET_NULL(m) return (EMSG(m), rc_did_emsg = TRUE, (void *)NULL) #define EMSG_RET_FAIL(m) return (EMSG(m), rc_did_emsg = TRUE, FAIL) ! #define EMSG2_RET_NULL(m, c) return (EMSG2((m), (c) ? "" : "\\"), rc_did_emsg = TRUE, (void *)NULL) ! #define EMSG2_RET_FAIL(m, c) return (EMSG2((m), (c) ? "" : "\\"), rc_did_emsg = TRUE, FAIL) ! #define EMSG_ONE_RET_NULL EMSG2_RET_NULL(_("E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL) #define MAX_LIMIT (32767L << 16L) *************** *** 336,346 **** static int cstrncmp __ARGS((char_u *s1, char_u *s2, int *n)); static char_u *cstrchr __ARGS((char_u *, int)); #ifdef DEBUG - static void regdump __ARGS((char_u *, regprog_T *)); static char_u *regprop __ARGS((char_u *)); #endif #define NOT_MULTI 0 #define MULTI_ONE 1 #define MULTI_MULT 2 --- 348,365 ---- static int cstrncmp __ARGS((char_u *s1, char_u *s2, int *n)); static char_u *cstrchr __ARGS((char_u *, int)); + #ifdef BT_REGEXP_DUMP + static void regdump __ARGS((char_u *, bt_regprog_T *)); + #endif #ifdef DEBUG static char_u *regprop __ARGS((char_u *)); #endif + static char_u e_missingbracket[] = N_("E769: Missing ] after %s["); + static char_u e_unmatchedpp[] = N_("E53: Unmatched %s%%("); + static char_u e_unmatchedp[] = N_("E54: Unmatched %s("); + static char_u e_unmatchedpar[] = N_("E55: Unmatched %s)"); + #define NOT_MULTI 0 #define MULTI_ONE 1 #define MULTI_MULT 2 *************** *** 630,636 **** }; #endif ! static int curchr; /* arguments for reg() */ #define REG_NOPAREN 0 /* toplevel reg() */ --- 649,661 ---- }; #endif ! static int curchr; /* currently parsed character */ ! /* Previous character. Note: prevchr is sometimes -1 when we are not at the ! * start, eg in /[ ^I]^ the pattern was never found even if it existed, ! * because ^ was taken to be magic -- webb */ ! static int prevchr; ! static int prevprevchr; /* previous-previous character */ ! static int nextchr; /* used for ungetchr() */ /* arguments for reg() */ #define REG_NOPAREN 0 /* toplevel reg() */ *************** *** 680,685 **** --- 705,713 ---- static void regtail __ARGS((char_u *, char_u *)); static void regoptail __ARGS((char_u *, char_u *)); + static regengine_T bt_regengine; + static regengine_T nfa_regengine; + /* * Return TRUE if compiled regular expression "prog" can match a line break. */ *************** *** 762,767 **** --- 790,796 ---- /* * Produce the bytes for equivalence class "c". * Currently only handles latin1, latin9 and utf-8. + * NOTE: When changing this function, also change nfa_emit_equi_class() */ static void reg_equi_class(c) *************** *** 1239,1246 **** return p; } /* ! * vim_regcomp() - compile a regular expression into internal code * Returns the program in allocated space. Returns NULL for an error. * * We can't allocate space until we know how big the compiled form will be, --- 1268,1278 ---- return p; } + static regprog_T *bt_regcomp __ARGS((char_u *expr, int re_flags)); + /* ! * bt_regcomp() - compile a regular expression into internal code for the ! * traditional back track matcher. * Returns the program in allocated space. Returns NULL for an error. * * We can't allocate space until we know how big the compiled form will be, *************** *** 1259,1270 **** * of the structure of the compiled regexp. * "re_flags": RE_MAGIC and/or RE_STRING. */ ! regprog_T * ! vim_regcomp(expr, re_flags) char_u *expr; int re_flags; { ! regprog_T *r; char_u *scan; char_u *longest; int len; --- 1291,1302 ---- * of the structure of the compiled regexp. * "re_flags": RE_MAGIC and/or RE_STRING. */ ! static regprog_T * ! bt_regcomp(expr, re_flags) char_u *expr; int re_flags; { ! bt_regprog_T *r; char_u *scan; char_u *longest; int len; *************** *** 1291,1297 **** #endif /* Allocate space. */ ! r = (regprog_T *)lalloc(sizeof(regprog_T) + regsize, TRUE); if (r == NULL) return NULL; --- 1323,1329 ---- #endif /* Allocate space. */ ! r = (bt_regprog_T *)lalloc(sizeof(bt_regprog_T) + regsize, TRUE); if (r == NULL) return NULL; *************** *** 1386,1395 **** r->regmlen = len; } } ! #ifdef DEBUG regdump(expr, r); #endif ! return r; } /* --- 1418,1428 ---- r->regmlen = len; } } ! #ifdef BT_REGEXP_DUMP regdump(expr, r); #endif ! r->engine = &bt_regengine; ! return (regprog_T *)r; } /* *************** *** 1436,1442 **** #endif /* ! * reg - regular expression, i.e. main body or parenthesized thing * * Caller must absorb opening parenthesis. * --- 1469,1475 ---- #endif /* ! * Parse regular expression, i.e. main body or parenthesized thing. * * Caller must absorb opening parenthesis. * *************** *** 1473,1479 **** { /* Make a MOPEN node. */ if (regnpar >= NSUBEXP) ! EMSG_M_RET_NULL(_("E51: Too many %s("), reg_magic == MAGIC_ALL); parno = regnpar; ++regnpar; ret = regnode(MOPEN + parno); --- 1506,1512 ---- { /* Make a MOPEN node. */ if (regnpar >= NSUBEXP) ! EMSG2_RET_NULL(_("E51: Too many %s("), reg_magic == MAGIC_ALL); parno = regnpar; ++regnpar; ret = regnode(MOPEN + parno); *************** *** 1534,1547 **** else #endif if (paren == REG_NPAREN) ! EMSG_M_RET_NULL(_("E53: Unmatched %s%%("), reg_magic == MAGIC_ALL); else ! EMSG_M_RET_NULL(_("E54: Unmatched %s("), reg_magic == MAGIC_ALL); } else if (paren == REG_NOPAREN && peekchr() != NUL) { if (curchr == Magic(')')) ! EMSG_M_RET_NULL(_("E55: Unmatched %s)"), reg_magic == MAGIC_ALL); else EMSG_RET_NULL(_(e_trailing)); /* "Can't happen". */ /* NOTREACHED */ --- 1567,1580 ---- else #endif if (paren == REG_NPAREN) ! EMSG2_RET_NULL(_(e_unmatchedpp), reg_magic == MAGIC_ALL); else ! EMSG2_RET_NULL(_(e_unmatchedp), reg_magic == MAGIC_ALL); } else if (paren == REG_NOPAREN && peekchr() != NUL) { if (curchr == Magic(')')) ! EMSG2_RET_NULL(_(e_unmatchedpar), reg_magic == MAGIC_ALL); else EMSG_RET_NULL(_(e_trailing)); /* "Can't happen". */ /* NOTREACHED */ *************** *** 1556,1562 **** } /* ! * Handle one alternative of an | operator. * Implements the & operator. */ static char_u * --- 1589,1595 ---- } /* ! * Parse one alternative of an | operator. * Implements the & operator. */ static char_u * *************** *** 1599,1605 **** } /* ! * Handle one alternative of an | or & operator. * Implements the concatenation operator. */ static char_u * --- 1632,1638 ---- } /* ! * Parse one alternative of an | or & operator. * Implements the concatenation operator. */ static char_u * *************** *** 1679,1685 **** } /* ! * regpiece - something followed by possible [*+=] * * Note that the branching code sequences used for = and the general cases * of * and + are somewhat optimized: they use the same NOTHING node as --- 1712,1718 ---- } /* ! * Parse something followed by possible [*+=]. * * Note that the branching code sequences used for = and the general cases * of * and + are somewhat optimized: they use the same NOTHING node as *************** *** 1759,1765 **** } } if (lop == END) ! EMSG_M_RET_NULL(_("E59: invalid character after %s@"), reg_magic == MAGIC_ALL); /* Look behind must match with behind_pos. */ if (lop == BEHIND || lop == NOBEHIND) --- 1792,1798 ---- } } if (lop == END) ! EMSG2_RET_NULL(_("E59: invalid character after %s@"), reg_magic == MAGIC_ALL); /* Look behind must match with behind_pos. */ if (lop == BEHIND || lop == NOBEHIND) *************** *** 1793,1799 **** else { if (num_complex_braces >= 10) ! EMSG_M_RET_NULL(_("E60: Too many complex %s{...}s"), reg_magic == MAGIC_ALL); reginsert(BRACE_COMPLEX + num_complex_braces, ret); regoptail(ret, regnode(BACK)); --- 1826,1832 ---- else { if (num_complex_braces >= 10) ! EMSG2_RET_NULL(_("E60: Too many complex %s{...}s"), reg_magic == MAGIC_ALL); reginsert(BRACE_COMPLEX + num_complex_braces, ret); regoptail(ret, regnode(BACK)); *************** *** 1820,1827 **** return ret; } /* ! * regatom - the lowest level * * Optimization: gobbles an entire sequence of ordinary characters so that * it can turn them into a single node, which is smaller to store and --- 1853,1872 ---- return ret; } + /* When making changes to classchars also change nfa_classcodes. */ + static char_u *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU"; + static int classcodes[] = { + ANY, IDENT, SIDENT, KWORD, SKWORD, + FNAME, SFNAME, PRINT, SPRINT, + WHITE, NWHITE, DIGIT, NDIGIT, + HEX, NHEX, OCTAL, NOCTAL, + WORD, NWORD, HEAD, NHEAD, + ALPHA, NALPHA, LOWER, NLOWER, + UPPER, NUPPER + }; + /* ! * Parse the lowest level. * * Optimization: gobbles an entire sequence of ordinary characters so that * it can turn them into a single node, which is smaller to store and *************** *** 1836,1850 **** int cpo_lit; /* 'cpoptions' contains 'l' flag */ int cpo_bsl; /* 'cpoptions' contains '\' flag */ int c; - static char_u *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU"; - static int classcodes[] = {ANY, IDENT, SIDENT, KWORD, SKWORD, - FNAME, SFNAME, PRINT, SPRINT, - WHITE, NWHITE, DIGIT, NDIGIT, - HEX, NHEX, OCTAL, NOCTAL, - WORD, NWORD, HEAD, NHEAD, - ALPHA, NALPHA, LOWER, NLOWER, - UPPER, NUPPER - }; char_u *p; int extra = 0; --- 1881,1886 ---- *************** *** 2140,2146 **** while ((c = getchr()) != ']') { if (c == NUL) ! EMSG_M_RET_NULL(_("E69: Missing ] after %s%%["), reg_magic == MAGIC_ALL); br = regnode(BRANCH); if (ret == NULL) --- 2176,2182 ---- while ((c = getchr()) != ']') { if (c == NUL) ! EMSG2_RET_NULL(_("E69: Missing ] after %s%%["), reg_magic == MAGIC_ALL); br = regnode(BRANCH); if (ret == NULL) *************** *** 2156,2162 **** return NULL; } if (ret == NULL) ! EMSG_M_RET_NULL(_("E70: Empty %s%%[]"), reg_magic == MAGIC_ALL); lastbranch = regnode(BRANCH); br = regnode(NOTHING); --- 2192,2198 ---- return NULL; } if (ret == NULL) ! EMSG2_RET_NULL(_("E70: Empty %s%%[]"), reg_magic == MAGIC_ALL); lastbranch = regnode(BRANCH); br = regnode(NOTHING); *************** *** 2200,2206 **** } if (i < 0) ! EMSG_M_RET_NULL( _("E678: Invalid character after %s%%[dxouU]"), reg_magic == MAGIC_ALL); #ifdef FEAT_MBYTE --- 2236,2242 ---- } if (i < 0) ! EMSG2_RET_NULL( _("E678: Invalid character after %s%%[dxouU]"), reg_magic == MAGIC_ALL); #ifdef FEAT_MBYTE *************** *** 2272,2278 **** } } ! EMSG_M_RET_NULL(_("E71: Invalid character after %s%%"), reg_magic == MAGIC_ALL); } } --- 2308,2314 ---- } } ! EMSG2_RET_NULL(_("E71: Invalid character after %s%%"), reg_magic == MAGIC_ALL); } } *************** *** 2567,2574 **** break; } else if (reg_strict) ! EMSG_M_RET_NULL(_("E769: Missing ] after %s["), ! reg_magic > MAGIC_OFF); } /* FALLTHROUGH */ --- 2603,2609 ---- break; } else if (reg_strict) ! EMSG2_RET_NULL(_(e_missingbracket), reg_magic > MAGIC_OFF); } /* FALLTHROUGH */ *************** *** 2659,2665 **** #endif /* ! * emit a node * Return pointer to generated code. */ static char_u * --- 2694,2700 ---- #endif /* ! * Emit a node. * Return pointer to generated code. */ static char_u * *************** *** 2711,2717 **** #endif /* ! * reginsert - insert an operator in front of already-emitted operand * * Means relocating the operand. */ --- 2746,2752 ---- #endif /* ! * Insert an operator in front of already-emitted operand * * Means relocating the operand. */ *************** *** 2742,2748 **** } /* ! * reginsert_limits - insert an operator in front of already-emitted operand. * The operator has the given limit values as operands. Also set next pointer. * * Means relocating the operand. --- 2777,2783 ---- } /* ! * Insert an operator in front of already-emitted operand. * The operator has the given limit values as operands. Also set next pointer. * * Means relocating the operand. *************** *** 2794,2800 **** } /* ! * regtail - set the next-pointer at the end of a node chain */ static void regtail(p, val) --- 2829,2835 ---- } /* ! * Set the next-pointer at the end of a node chain. */ static void regtail(p, val) *************** *** 2835,2841 **** } /* ! * regoptail - regtail on item after a BRANCH; nop if none */ static void regoptail(p, val) --- 2870,2876 ---- } /* ! * Like regtail, on item after a BRANCH; nop if none. */ static void regoptail(p, val) *************** *** 2851,2872 **** } /* ! * getchr() - get the next character from the pattern. We know about ! * magic and such, so therefore we need a lexical analyzer. */ - /* static int curchr; */ - static int prevprevchr; - static int prevchr; - static int nextchr; /* used for ungetchr() */ - /* - * Note: prevchr is sometimes -1 when we are not at the start, - * eg in /[ ^I]^ the pattern was never found even if it existed, because ^ was - * taken to be magic -- webb - */ static int at_start; /* True when on the first character */ static int prev_at_start; /* True when on the second character */ static void initchr(str) char_u *str; --- 2886,2900 ---- } /* ! * Functions for getting characters from the regexp input. */ static int at_start; /* True when on the first character */ static int prev_at_start; /* True when on the second character */ + /* + * Start parsing at "str". + */ static void initchr(str) char_u *str; *************** *** 2878,2883 **** --- 2906,2914 ---- prev_at_start = FALSE; } + /* + * Get the next character without advancing. + */ static int peekchr() { *************** *** 3086,3091 **** --- 3117,3126 ---- prevprevchr = prpr; } + /* + * Get the next character from the pattern. We know about magic and such, so + * therefore we need a lexical analyzer. + */ static int getchr() { *************** *** 3340,3347 **** } regbehind_T; static char_u *reg_getline __ARGS((linenr_T lnum)); ! static long vim_regexec_both __ARGS((char_u *line, colnr_T col, proftime_T *tm)); ! static long regtry __ARGS((regprog_T *prog, colnr_T col)); static void cleanup_subexpr __ARGS((void)); #ifdef FEAT_SYN_HL static void cleanup_zsubexpr __ARGS((void)); --- 3375,3382 ---- } regbehind_T; static char_u *reg_getline __ARGS((linenr_T lnum)); ! static long bt_regexec_both __ARGS((char_u *line, colnr_T col, proftime_T *tm)); ! static long regtry __ARGS((bt_regprog_T *prog, colnr_T col)); static void cleanup_subexpr __ARGS((void)); #ifdef FEAT_SYN_HL static void cleanup_zsubexpr __ARGS((void)); *************** *** 3398,3404 **** /* * Sometimes need to save a copy of a line. Since alloc()/free() is very * slow, we keep one allocated piece of memory and only re-allocate it when ! * it's too small. It's freed in vim_regexec_both() when finished. */ static char_u *reg_tofree = NULL; static unsigned reg_tofreelen; --- 3433,3439 ---- /* * Sometimes need to save a copy of a line. Since alloc()/free() is very * slow, we keep one allocated piece of memory and only re-allocate it when ! * it's too small. It's freed in bt_regexec_both() when finished. */ static char_u *reg_tofree = NULL; static unsigned reg_tofreelen; *************** *** 3556,3561 **** --- 3591,3598 ---- /* TRUE if using multi-line regexp. */ #define REG_MULTI (reg_match == NULL) + static int bt_regexec __ARGS((regmatch_T *rmp, char_u *line, colnr_T col)); + /* * Match a regexp against a string. * "rmp->regprog" is a compiled regexp as returned by vim_regcomp(). *************** *** 3563,3570 **** * * Return TRUE if there is a match, FALSE if not. */ ! int ! vim_regexec(rmp, line, col) regmatch_T *rmp; char_u *line; /* string to match against */ colnr_T col; /* column to start looking for match */ --- 3600,3607 ---- * * Return TRUE if there is a match, FALSE if not. */ ! static int ! bt_regexec(rmp, line, col) regmatch_T *rmp; char_u *line; /* string to match against */ colnr_T col; /* column to start looking for match */ *************** *** 3580,3595 **** ireg_icombine = FALSE; #endif ireg_maxcol = 0; ! return (vim_regexec_both(line, col, NULL) != 0); } #if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) \ || defined(FIND_REPLACE_DIALOG) || defined(PROTO) /* * Like vim_regexec(), but consider a "\n" in "line" to be a line break. */ ! int ! vim_regexec_nl(rmp, line, col) regmatch_T *rmp; char_u *line; /* string to match against */ colnr_T col; /* column to start looking for match */ --- 3617,3635 ---- ireg_icombine = FALSE; #endif ireg_maxcol = 0; ! return (bt_regexec_both(line, col, NULL) != 0); } #if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) \ || defined(FIND_REPLACE_DIALOG) || defined(PROTO) + + static int bt_regexec_nl __ARGS((regmatch_T *rmp, char_u *line, colnr_T col)); + /* * Like vim_regexec(), but consider a "\n" in "line" to be a line break. */ ! static int ! bt_regexec_nl(rmp, line, col) regmatch_T *rmp; char_u *line; /* string to match against */ colnr_T col; /* column to start looking for match */ *************** *** 3605,3614 **** ireg_icombine = FALSE; #endif ireg_maxcol = 0; ! return (vim_regexec_both(line, col, NULL) != 0); } #endif /* * Match a regexp against multiple lines. * "rmp->regprog" is a compiled regexp as returned by vim_regcomp(). --- 3645,3656 ---- ireg_icombine = FALSE; #endif ireg_maxcol = 0; ! return (bt_regexec_both(line, col, NULL) != 0); } #endif + static long bt_regexec_multi __ARGS((regmmatch_T *rmp, win_T *win, buf_T *buf, linenr_T lnum, colnr_T col, proftime_T *tm)); + /* * Match a regexp against multiple lines. * "rmp->regprog" is a compiled regexp as returned by vim_regcomp(). *************** *** 3617,3624 **** * Return zero if there is no match. Return number of lines contained in the * match otherwise. */ ! long ! vim_regexec_multi(rmp, win, buf, lnum, col, tm) regmmatch_T *rmp; win_T *win; /* window in which to search or NULL */ buf_T *buf; /* buffer in which to search */ --- 3659,3666 ---- * Return zero if there is no match. Return number of lines contained in the * match otherwise. */ ! static long ! bt_regexec_multi(rmp, win, buf, lnum, col, tm) regmmatch_T *rmp; win_T *win; /* window in which to search or NULL */ buf_T *buf; /* buffer in which to search */ *************** *** 3641,3647 **** #endif ireg_maxcol = rmp->rmm_maxcol; ! r = vim_regexec_both(NULL, col, tm); return r; } --- 3683,3689 ---- #endif ireg_maxcol = rmp->rmm_maxcol; ! r = bt_regexec_both(NULL, col, tm); return r; } *************** *** 3651,3662 **** * lines ("line" is NULL, use reg_getline()). */ static long ! vim_regexec_both(line, col, tm) char_u *line; colnr_T col; /* column to start looking for match */ proftime_T *tm UNUSED; /* timeout limit or NULL */ { ! regprog_T *prog; char_u *s; long retval = 0L; --- 3693,3704 ---- * lines ("line" is NULL, use reg_getline()). */ static long ! bt_regexec_both(line, col, tm) char_u *line; colnr_T col; /* column to start looking for match */ proftime_T *tm UNUSED; /* timeout limit or NULL */ { ! bt_regprog_T *prog; char_u *s; long retval = 0L; *************** *** 3682,3695 **** if (REG_MULTI) { ! prog = reg_mmatch->regprog; line = reg_getline((linenr_T)0); reg_startpos = reg_mmatch->startpos; reg_endpos = reg_mmatch->endpos; } else { ! prog = reg_match->regprog; reg_startp = reg_match->startp; reg_endp = reg_match->endp; } --- 3724,3737 ---- if (REG_MULTI) { ! prog = (bt_regprog_T *)reg_mmatch->regprog; line = reg_getline((linenr_T)0); reg_startpos = reg_mmatch->startpos; reg_endpos = reg_mmatch->endpos; } else { ! prog = (bt_regprog_T *)reg_match->regprog; reg_startp = reg_match->startp; reg_endp = reg_match->endp; } *************** *** 3931,3937 **** */ static long regtry(prog, col) ! regprog_T *prog; colnr_T col; { reginput = regline + col; --- 3973,3979 ---- */ static long regtry(prog, col) ! bt_regprog_T *prog; colnr_T col; { reginput = regline + col; *************** *** 4063,4069 **** #define RA_NOMATCH 5 /* didn't match */ /* Make "regstack" and "backpos" empty. They are allocated and freed in ! * vim_regexec_both() to reduce malloc()/free() calls. */ regstack.ga_len = 0; backpos.ga_len = 0; --- 4105,4111 ---- #define RA_NOMATCH 5 /* didn't match */ /* Make "regstack" and "backpos" empty. They are allocated and freed in ! * bt_regexec_both() to reduce malloc()/free() calls. */ regstack.ga_len = 0; backpos.ga_len = 0; *************** *** 4072,4085 **** */ for (;;) { ! /* Some patterns my cause a long time to match, even though they are not * illegal. E.g., "\([a-z]\+\)\+Q". Allow breaking them with CTRL-C. */ fast_breakcheck(); #ifdef DEBUG if (scan != NULL && regnarrate) { ! mch_errmsg(regprop(scan)); mch_errmsg("(\n"); } #endif --- 4114,4127 ---- */ for (;;) { ! /* Some patterns may cause a long time to match, even though they are not * illegal. E.g., "\([a-z]\+\)\+Q". Allow breaking them with CTRL-C. */ fast_breakcheck(); #ifdef DEBUG if (scan != NULL && regnarrate) { ! mch_errmsg((char *)regprop(scan)); mch_errmsg("(\n"); } #endif *************** *** 4100,4106 **** #ifdef DEBUG if (regnarrate) { ! mch_errmsg(regprop(scan)); mch_errmsg("...\n"); # ifdef FEAT_SYN_HL if (re_extmatch_in != NULL) --- 4142,4148 ---- #ifdef DEBUG if (regnarrate) { ! mch_errmsg((char *)regprop(scan)); mch_errmsg("...\n"); # ifdef FEAT_SYN_HL if (re_extmatch_in != NULL) *************** *** 4112,4118 **** { mch_errmsg(" \""); if (re_extmatch_in->matches[i] != NULL) ! mch_errmsg(re_extmatch_in->matches[i]); mch_errmsg("\"\n"); } } --- 4154,4160 ---- { mch_errmsg(" \""); if (re_extmatch_in->matches[i] != NULL) ! mch_errmsg((char *)re_extmatch_in->matches[i]); mch_errmsg("\"\n"); } } *************** *** 6091,6099 **** static int prog_magic_wrong() { ! if (UCHARAT(REG_MULTI ! ? reg_mmatch->regprog->program ! : reg_match->regprog->program) != REGMAGIC) { EMSG(_(e_re_corr)); return TRUE; --- 6133,6146 ---- static int prog_magic_wrong() { ! regprog_T *prog; ! ! prog = REG_MULTI ? reg_mmatch->regprog : reg_match->regprog; ! if (prog->engine == &nfa_regengine) ! /* For NFA matcher we don't check the magic */ ! return FALSE; ! ! if (UCHARAT(((bt_regprog_T *)prog)->program) != REGMAGIC) { EMSG(_(e_re_corr)); return TRUE; *************** *** 6318,6324 **** } ! #ifdef DEBUG /* * regdump - dump a regexp onto stdout in vaguely comprehensible form --- 6365,6371 ---- } ! #ifdef BT_REGEXP_DUMP /* * regdump - dump a regexp onto stdout in vaguely comprehensible form *************** *** 6326,6339 **** static void regdump(pattern, r) char_u *pattern; ! regprog_T *r; { char_u *s; int op = EXACTLY; /* Arbitrary non-END op. */ char_u *next; char_u *end = NULL; ! printf("\r\nregcomp(%s):\r\n", pattern); s = r->program + 1; /* --- 6373,6394 ---- static void regdump(pattern, r) char_u *pattern; ! bt_regprog_T *r; { char_u *s; int op = EXACTLY; /* Arbitrary non-END op. */ char_u *next; char_u *end = NULL; + FILE *f; ! #ifdef BT_REGEXP_LOG ! f = fopen("bt_regexp_log.log", "a"); ! #else ! f = stdout; ! #endif ! if (f == NULL) ! return; ! fprintf(f, "-------------------------------------\n\r\nregcomp(%s):\r\n", pattern); s = r->program + 1; /* *************** *** 6343,6360 **** while (op != END || s <= end) { op = OP(s); ! printf("%2d%s", (int)(s - r->program), regprop(s)); /* Where, what. */ next = regnext(s); if (next == NULL) /* Next ptr. */ ! printf("(0)"); else ! printf("(%d)", (int)((s - r->program) + (next - s))); if (end < next) end = next; if (op == BRACE_LIMITS) { /* Two short ints */ ! printf(" minval %ld, maxval %ld", OPERAND_MIN(s), OPERAND_MAX(s)); s += 8; } s += 3; --- 6398,6415 ---- while (op != END || s <= end) { op = OP(s); ! fprintf(f, "%2d%s", (int)(s - r->program), regprop(s)); /* Where, what. */ next = regnext(s); if (next == NULL) /* Next ptr. */ ! fprintf(f, "(0)"); else ! fprintf(f, "(%d)", (int)((s - r->program) + (next - s))); if (end < next) end = next; if (op == BRACE_LIMITS) { /* Two short ints */ ! fprintf(f, " minval %ld, maxval %ld", OPERAND_MIN(s), OPERAND_MAX(s)); s += 8; } s += 3; *************** *** 6363,6387 **** || op == EXACTLY) { /* Literal string, where present. */ while (*s != NUL) ! printf("%c", *s++); s++; } ! printf("\r\n"); } /* Header fields of interest. */ if (r->regstart != NUL) ! printf("start `%s' 0x%x; ", r->regstart < 256 ? (char *)transchar(r->regstart) : "multibyte", r->regstart); if (r->reganch) ! printf("anchored; "); if (r->regmust != NULL) ! printf("must have \"%s\"", r->regmust); ! printf("\r\n"); } /* * regprop - printable representation of opcode */ --- 6418,6450 ---- || op == EXACTLY) { /* Literal string, where present. */ + fprintf(f, "\nxxxxxxxxx\n"); while (*s != NUL) ! fprintf(f, "%c", *s++); ! fprintf(f, "\nxxxxxxxxx\n"); s++; } ! fprintf(f, "\r\n"); } /* Header fields of interest. */ if (r->regstart != NUL) ! fprintf(f, "start `%s' 0x%x; ", r->regstart < 256 ? (char *)transchar(r->regstart) : "multibyte", r->regstart); if (r->reganch) ! fprintf(f, "anchored; "); if (r->regmust != NULL) ! fprintf(f, "must have \"%s\"", r->regmust); ! fprintf(f, "\r\n"); ! ! #ifdef BT_REGEXP_LOG ! fclose(f); ! #endif } + #endif /* BT_REGEXP_DUMP */ + #ifdef DEBUG /* * regprop - printable representation of opcode */ *************** *** 6389,6400 **** regprop(op) char_u *op; { ! char_u *p; ! static char_u buf[50]; ! (void) strcpy(buf, ":"); ! switch (OP(op)) { case BOL: p = "BOL"; --- 6452,6463 ---- regprop(op) char_u *op; { ! char *p; ! static char buf[50]; ! STRCPY(buf, ":"); ! switch ((int) OP(op)) { case BOL: p = "BOL"; *************** *** 6761,6770 **** break; } if (p != NULL) ! (void) strcat(buf, p); ! return buf; } ! #endif #ifdef FEAT_MBYTE static void mb_decompose __ARGS((int c, int *c1, int *c2, int *c3)); --- 6824,6833 ---- break; } if (p != NULL) ! STRCAT(buf, p); ! return (char_u *)buf; } ! #endif /* DEBUG */ #ifdef FEAT_MBYTE static void mb_decompose __ARGS((int c, int *c1, int *c2, int *c3)); *************** *** 7667,7669 **** --- 7730,7916 ---- return retval; } #endif + + static regengine_T bt_regengine = + { + bt_regcomp, + bt_regexec, + #if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) \ + || defined(FIND_REPLACE_DIALOG) || defined(PROTO) + bt_regexec_nl, + #endif + bt_regexec_multi + #ifdef DEBUG + ,(char_u *)"" + #endif + }; + + + #include "regexp_nfa.c" + + static regengine_T nfa_regengine = + { + nfa_regcomp, + nfa_regexec, + #if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) \ + || defined(FIND_REPLACE_DIALOG) || defined(PROTO) + nfa_regexec_nl, + #endif + nfa_regexec_multi + #ifdef DEBUG + ,(char_u *)"" + #endif + }; + + /* Which regexp engine to use? Needed for vim_regcomp(). + * Must match with 'regexpengine'. */ + static int regexp_engine = 0; + #define AUTOMATIC_ENGINE 0 + #define BACKTRACKING_ENGINE 1 + #define NFA_ENGINE 2 + #ifdef DEBUG + static char_u regname[][30] = { + "AUTOMATIC Regexp Engine", + "BACKTACKING Regexp Engine", + "NFA Regexp Engine" + }; + #endif + + /* + * Compile a regular expression into internal code. + * Returns the program in allocated memory. Returns NULL for an error. + */ + regprog_T * + vim_regcomp(expr_arg, re_flags) + char_u *expr_arg; + int re_flags; + { + regprog_T *prog = NULL; + char_u *expr = expr_arg; + + syntax_error = FALSE; + regexp_engine = p_re; + + /* Check for prefix "\%#=", that sets the regexp engine */ + if (STRNCMP(expr, "\\%#=", 4) == 0) + { + int newengine = expr[4] - '0'; + + if (newengine == AUTOMATIC_ENGINE + || newengine == BACKTRACKING_ENGINE + || newengine == NFA_ENGINE) + { + regexp_engine = expr[4] - '0'; + expr += 5; + #ifdef DEBUG + EMSG3("New regexp mode selected (%d): %s", regexp_engine, + regname[newengine]); + #endif + } + else + { + EMSG(_("E864: \\%#= can only be followed by 0, 1, or 2. The automatic engine will be used ")); + regexp_engine = AUTOMATIC_ENGINE; + } + } + #ifdef DEBUG + bt_regengine.expr = expr; + nfa_regengine.expr = expr; + #endif + + /* + * First try the NFA engine, unless backtracking was requested. + */ + if (regexp_engine != BACKTRACKING_ENGINE) + prog = nfa_regengine.regcomp(expr, re_flags); + else + prog = bt_regengine.regcomp(expr, re_flags); + + if (prog == NULL) /* error compiling regexp with initial engine */ + { + #ifdef DEBUG + if (regexp_engine != BACKTRACKING_ENGINE) /* debugging log for NFA */ + { + FILE *f; + f = fopen("debug.log", "a"); + if (f) + { + if (!syntax_error) + fprintf(f, "NFA engine could not handle \"%s\"\n", expr); + else + fprintf(f, "Syntax error in \"%s\"\n", expr); + fclose(f); + } + else + EMSG("(NFA) Could not open \"debug.log\" to write !!!"); + /* + if (syntax_error) + EMSG("NFA Regexp: Syntax Error !"); + */ + } + #endif + /* + * If NFA engine failed, then revert to the backtracking engine. + * Except when there was a syntax error, which was properly handled by + * NFA engine. + */ + if (regexp_engine == AUTOMATIC_ENGINE) + if (!syntax_error) + prog = bt_regengine.regcomp(expr, re_flags); + + } /* endif prog==NULL */ + + + return prog; + } + + /* + * Match a regexp against a string. + * "rmp->regprog" is a compiled regexp as returned by vim_regcomp(). + * Uses curbuf for line count and 'iskeyword'. + * + * Return TRUE if there is a match, FALSE if not. + */ + int + vim_regexec(rmp, line, col) + regmatch_T *rmp; + char_u *line; /* string to match against */ + colnr_T col; /* column to start looking for match */ + { + return rmp->regprog->engine->regexec(rmp, line, col); + } + + #if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) \ + || defined(FIND_REPLACE_DIALOG) || defined(PROTO) + /* + * Like vim_regexec(), but consider a "\n" in "line" to be a line break. + */ + int + vim_regexec_nl(rmp, line, col) + regmatch_T *rmp; + char_u *line; + colnr_T col; + { + return rmp->regprog->engine->regexec_nl(rmp, line, col); + } + #endif + + /* + * Match a regexp against multiple lines. + * "rmp->regprog" is a compiled regexp as returned by vim_regcomp(). + * Uses curbuf for line count and 'iskeyword'. + * + * Return zero if there is no match. Return number of lines contained in the + * match otherwise. + */ + long + vim_regexec_multi(rmp, win, buf, lnum, col, tm) + regmmatch_T *rmp; + win_T *win; /* window in which to search or NULL */ + buf_T *buf; /* buffer in which to search */ + linenr_T lnum; /* nr of line to start looking for match */ + colnr_T col; /* column to start looking for match */ + proftime_T *tm; /* timeout limit or NULL */ + { + return rmp->regprog->engine->regexec_multi(rmp, win, buf, lnum, col, tm); + } *** ../vim-7.3.969/src/regexp.h 2010-08-15 21:57:27.000000000 +0200 --- src/regexp.h 2013-05-18 17:24:42.000000000 +0200 *************** *** 22,41 **** #define NSUBEXP 10 /* * Structure returned by vim_regcomp() to pass on to vim_regexec(). * These fields are only to be used in regexp.c! ! * See regep.c for an explanation. */ typedef struct { int regstart; char_u reganch; char_u *regmust; int regmlen; - unsigned regflags; char_u reghasz; ! char_u program[1]; /* actually longer.. */ ! } regprog_T; /* * Structure to be used for single-line matching. --- 22,97 ---- #define NSUBEXP 10 /* + * In the NFA engine: how many braces are allowed. + * TODO(RE): Use dynamic memory allocation instead of static, like here + */ + #define NFA_MAX_BRACES 20 + + typedef struct regengine regengine_T; + + typedef struct thread thread_T; + + /* * Structure returned by vim_regcomp() to pass on to vim_regexec(). + * This is the general structure. For the actual matcher, two specific + * structures are used. See code below. + */ + typedef struct regprog + { + regengine_T *engine; + unsigned regflags; + } regprog_T; + + /* + * Structure used by the back track matcher. * These fields are only to be used in regexp.c! ! * See regexp.c for an explanation. */ typedef struct { + /* These two members implement regprog_T */ + regengine_T *engine; + unsigned regflags; + int regstart; char_u reganch; char_u *regmust; int regmlen; char_u reghasz; ! char_u program[1]; /* actually longer.. */ ! } bt_regprog_T; ! ! /* ! * Structure representing a NFA state. ! * A NFA state may have no outgoing edge, when it is a NFA_MATCH state. ! */ ! typedef struct nfa_state nfa_state_T; ! struct nfa_state ! { ! int c; ! nfa_state_T *out; ! nfa_state_T *out1; ! int id; ! int lastlist; ! int visits; ! thread_T *lastthread; ! int negated; ! }; ! ! /* ! * Structure used by the NFA matcher. ! */ ! typedef struct ! { ! /* These two members implement regprog_T */ ! regengine_T *engine; ! unsigned regflags; ! ! regprog_T regprog; ! nfa_state_T *start; ! int nstate; ! nfa_state_T state[0]; /* actually longer.. */ ! } nfa_regprog_T; /* * Structure to be used for single-line matching. *************** *** 78,81 **** --- 134,151 ---- char_u *matches[NSUBEXP]; } reg_extmatch_T; + struct regengine + { + regprog_T *(*regcomp)(char_u*, int); + int (*regexec)(regmatch_T*, char_u*, colnr_T); + #if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) \ + || defined(FIND_REPLACE_DIALOG) || defined(PROTO) + int (*regexec_nl)(regmatch_T*, char_u*, colnr_T); + #endif + long (*regexec_multi)(regmmatch_T*, win_T*, buf_T*, linenr_T, colnr_T, proftime_T*); + #ifdef DEBUG + char_u *expr; + #endif + }; + #endif /* _REGEXP_H */ *** ../vim-7.3.969/src/regexp_nfa.c 2013-05-19 19:10:10.000000000 +0200 --- src/regexp_nfa.c 2013-05-19 16:08:09.000000000 +0200 *************** *** 0 **** --- 1,3819 ---- + /* vi:set ts=8 sts=4 sw=4: + * + * NFA regular expression implementation. + * + * This file is included in "regexp.c". + */ + + #ifdef DEBUG + /* Comment this out to disable log files. They can get pretty big */ + # define ENABLE_LOG + # define LOG_NAME "log_nfarun.log" + #endif + + /* Upper limit allowed for {m,n} repetitions handled by NFA */ + #define NFA_BRACES_MAXLIMIT 10 + /* For allocating space for the postfix representation */ + #define NFA_POSTFIX_MULTIPLIER (NFA_BRACES_MAXLIMIT + 2)*2 + /* Size of stack, used when converting the postfix regexp into NFA */ + #define NFA_STACK_SIZE 1024 + + enum + { + NFA_SPLIT = -1024, + NFA_MATCH, + NFA_SKIP_CHAR, /* matches a 0-length char */ + NFA_END_NEG_RANGE, /* Used when expanding [^ab] */ + + NFA_CONCAT, + NFA_OR, + NFA_STAR, + NFA_PLUS, + NFA_QUEST, + NFA_QUEST_NONGREEDY, /* Non-greedy version of \? */ + NFA_NOT, /* used for [^ab] negated char ranges */ + + NFA_BOL, /* ^ Begin line */ + NFA_EOL, /* $ End line */ + NFA_BOW, /* \< Begin word */ + NFA_EOW, /* \> End word */ + NFA_BOF, /* \%^ Begin file */ + NFA_EOF, /* \%$ End file */ + NFA_NEWL, + NFA_ZSTART, /* Used for \zs */ + NFA_ZEND, /* Used for \ze */ + NFA_NOPEN, /* Start of subexpression marked with \%( */ + NFA_NCLOSE, /* End of subexpr. marked with \%( ... \) */ + NFA_START_INVISIBLE, + NFA_END_INVISIBLE, + NFA_MULTIBYTE, /* Next nodes in NFA are part of the same + multibyte char */ + NFA_END_MULTIBYTE, /* End of multibyte char in the NFA */ + NFA_COMPOSING, /* Next nodes in NFA are part of the + composing multibyte char */ + NFA_END_COMPOSING, /* End of a composing char in the NFA */ + + /* The following are used only in the postfix form, not in the NFA */ + NFA_PREV_ATOM_NO_WIDTH, /* Used for \@= */ + NFA_PREV_ATOM_NO_WIDTH_NEG, /* Used for \@! */ + NFA_PREV_ATOM_JUST_BEFORE, /* Used for \@<= */ + NFA_PREV_ATOM_JUST_BEFORE_NEG, /* Used for \@ */ + + NFA_MOPEN, + NFA_MCLOSE = NFA_MOPEN + NSUBEXP, + + /* NFA_FIRST_NL */ + NFA_ANY = NFA_MCLOSE + NSUBEXP, /* Match any one character. */ + NFA_ANYOF, /* Match any character in this string. */ + NFA_ANYBUT, /* Match any character not in this string. */ + NFA_IDENT, /* Match identifier char */ + NFA_SIDENT, /* Match identifier char but no digit */ + NFA_KWORD, /* Match keyword char */ + NFA_SKWORD, /* Match word char but no digit */ + NFA_FNAME, /* Match file name char */ + NFA_SFNAME, /* Match file name char but no digit */ + NFA_PRINT, /* Match printable char */ + NFA_SPRINT, /* Match printable char but no digit */ + NFA_WHITE, /* Match whitespace char */ + NFA_NWHITE, /* Match non-whitespace char */ + NFA_DIGIT, /* Match digit char */ + NFA_NDIGIT, /* Match non-digit char */ + NFA_HEX, /* Match hex char */ + NFA_NHEX, /* Match non-hex char */ + NFA_OCTAL, /* Match octal char */ + NFA_NOCTAL, /* Match non-octal char */ + NFA_WORD, /* Match word char */ + NFA_NWORD, /* Match non-word char */ + NFA_HEAD, /* Match head char */ + NFA_NHEAD, /* Match non-head char */ + NFA_ALPHA, /* Match alpha char */ + NFA_NALPHA, /* Match non-alpha char */ + NFA_LOWER, /* Match lowercase char */ + NFA_NLOWER, /* Match non-lowercase char */ + NFA_UPPER, /* Match uppercase char */ + NFA_NUPPER, /* Match non-uppercase char */ + NFA_FIRST_NL = NFA_ANY + ADD_NL, + NFA_LAST_NL = NFA_NUPPER + ADD_NL, + + /* Character classes [:alnum:] etc */ + NFA_CLASS_ALNUM, + NFA_CLASS_ALPHA, + NFA_CLASS_BLANK, + NFA_CLASS_CNTRL, + NFA_CLASS_DIGIT, + NFA_CLASS_GRAPH, + NFA_CLASS_LOWER, + NFA_CLASS_PRINT, + NFA_CLASS_PUNCT, + NFA_CLASS_SPACE, + NFA_CLASS_UPPER, + NFA_CLASS_XDIGIT, + NFA_CLASS_TAB, + NFA_CLASS_RETURN, + NFA_CLASS_BACKSPACE, + NFA_CLASS_ESCAPE + }; + + /* Keep in sync with classchars. */ + static int nfa_classcodes[] = { + NFA_ANY, NFA_IDENT, NFA_SIDENT, NFA_KWORD,NFA_SKWORD, + NFA_FNAME, NFA_SFNAME, NFA_PRINT, NFA_SPRINT, + NFA_WHITE, NFA_NWHITE, NFA_DIGIT, NFA_NDIGIT, + NFA_HEX, NFA_NHEX, NFA_OCTAL, NFA_NOCTAL, + NFA_WORD, NFA_NWORD, NFA_HEAD, NFA_NHEAD, + NFA_ALPHA, NFA_NALPHA, NFA_LOWER, NFA_NLOWER, + NFA_UPPER, NFA_NUPPER + }; + + static char_u e_misplaced[] = N_("E866: (NFA regexp) Misplaced %c"); + + /* + * NFA errors can be of 3 types: + * *** NFA runtime errors, when something unknown goes wrong. The NFA fails + * silently and revert the to backtracking engine. + * syntax_error = FALSE; + * *** Regexp syntax errors, when the input regexp is not syntactically correct. + * The NFA engine displays an error message, and nothing else happens. + * syntax_error = TRUE + * *** Unsupported features, when the input regexp uses an operator that is not + * implemented in the NFA. The NFA engine fails silently, and reverts to the + * old backtracking engine. + * syntax_error = FALSE + * "The NFA fails" means that "compiling the regexp with the NFA fails": + * nfa_regcomp() returns FAIL. + */ + static int syntax_error = FALSE; + + /* NFA regexp \ze operator encountered. */ + static int nfa_has_zend = FALSE; + + static int *post_start; /* holds the postfix form of r.e. */ + static int *post_end; + static int *post_ptr; + + static int nstate; /* Number of states in the NFA. */ + static int istate; /* Index in the state vector, used in new_state() */ + static int nstate_max; /* Upper bound of estimated number of states. */ + + + static int nfa_regcomp_start __ARGS((char_u*expr, int re_flags)); + static int nfa_recognize_char_class __ARGS((char_u *start, char_u *end, int extra_newl)); + static int nfa_emit_equi_class __ARGS((int c, int neg)); + static void nfa_inc __ARGS((char_u **p)); + static void nfa_dec __ARGS((char_u **p)); + static int nfa_regatom __ARGS((void)); + static int nfa_regpiece __ARGS((void)); + static int nfa_regconcat __ARGS((void)); + static int nfa_regbranch __ARGS((void)); + static int nfa_reg __ARGS((int paren)); + #ifdef DEBUG + static void nfa_set_code __ARGS((int c)); + static void nfa_postfix_dump __ARGS((char_u *expr, int retval)); + static void nfa_print_state __ARGS((FILE *debugf, nfa_state_T *state, int ident)); + static void nfa_dump __ARGS((nfa_regprog_T *prog)); + #endif + static int *re2post __ARGS((void)); + static nfa_state_T *new_state __ARGS((int c, nfa_state_T *out, nfa_state_T *out1)); + static nfa_state_T *post2nfa __ARGS((int *postfix, int *end, int nfa_calc_size)); + static int check_char_class __ARGS((int class, int c)); + static void st_error __ARGS((int *postfix, int *end, int *p)); + static void nfa_save_listids __ARGS((nfa_state_T *start, int *list)); + static void nfa_restore_listids __ARGS((nfa_state_T *start, int *list)); + static void nfa_set_null_listids __ARGS((nfa_state_T *start)); + static void nfa_set_neg_listids __ARGS((nfa_state_T *start)); + static long nfa_regtry __ARGS((nfa_state_T *start, colnr_T col)); + static long nfa_regexec_both __ARGS((char_u *line, colnr_T col)); + static regprog_T *nfa_regcomp __ARGS((char_u *expr, int re_flags)); + static int nfa_regexec __ARGS((regmatch_T *rmp, char_u *line, colnr_T col)); + static long nfa_regexec_multi __ARGS((regmmatch_T *rmp, win_T *win, buf_T *buf, linenr_T lnum, colnr_T col, proftime_T *tm)); + + /* helper functions used when doing re2post() ... regatom() parsing */ + #define EMIT(c) do { \ + if (post_ptr >= post_end) \ + return FAIL; \ + *post_ptr++ = c; \ + } while (0) + + #define EMIT_MBYTE(c) \ + len = (*mb_char2bytes)(c, buf); \ + EMIT(buf[0]); \ + for (i = 1; i < len; i++) \ + { \ + EMIT(buf[i]); \ + EMIT(NFA_CONCAT); \ + } \ + EMIT(NFA_MULTIBYTE); + + #define EMIT_COMPOSING_UTF(input) \ + len = utfc_ptr2len(input); \ + EMIT(input[0]); \ + for (i = 1; i < len; i++) \ + { \ + EMIT(input[i]); \ + EMIT(NFA_CONCAT); \ + } \ + EMIT(NFA_COMPOSING); + + /* + * Initialize internal variables before NFA compilation. + * Return OK on success, FAIL otherwise. + */ + static int + nfa_regcomp_start(expr, re_flags) + char_u *expr; + int re_flags; /* see vim_regcomp() */ + { + int postfix_size; + + nstate = 0; + istate = 0; + /* A reasonable estimation for size */ + nstate_max = (STRLEN(expr) + 1) * NFA_POSTFIX_MULTIPLIER; + + /* Size for postfix representation of expr */ + postfix_size = sizeof(*post_start) * nstate_max; + post_start = (int *)lalloc(postfix_size, TRUE); + if (post_start == NULL) + return FAIL; + vim_memset(post_start, 0, postfix_size); + post_ptr = post_start; + post_end = post_start + postfix_size; + nfa_has_zend = FALSE; + + regcomp_start(expr, re_flags); + + return OK; + } + + /* + * Search between "start" and "end" and try to recognize a + * character class in expanded form. For example [0-9]. + * On success, return the id the character class to be emitted. + * On failure, return 0 (=FAIL) + * Start points to the first char of the range, while end should point + * to the closing brace. + */ + static int + nfa_recognize_char_class(start, end, extra_newl) + char_u *start; + char_u *end; + int extra_newl; + { + int i; + /* Each of these variables takes up a char in "config[]", + * in the order they are here. */ + int not = FALSE, af = FALSE, AF = FALSE, az = FALSE, AZ = FALSE, + o7 = FALSE, o9 = FALSE, underscore = FALSE, newl = FALSE; + char_u *p; + #define NCONFIGS 16 + int classid[NCONFIGS] = { + NFA_DIGIT, NFA_NDIGIT, NFA_HEX, NFA_NHEX, + NFA_OCTAL, NFA_NOCTAL, NFA_WORD, NFA_NWORD, + NFA_HEAD, NFA_NHEAD, NFA_ALPHA, NFA_NALPHA, + NFA_LOWER, NFA_NLOWER, NFA_UPPER, NFA_NUPPER + }; + char_u myconfig[9]; + char_u config[NCONFIGS][9] = { + "000000100", /* digit */ + "100000100", /* non digit */ + "011000100", /* hex-digit */ + "111000100", /* non hex-digit */ + "000001000", /* octal-digit */ + "100001000", /* [^0-7] */ + "000110110", /* [0-9A-Za-z_] */ + "100110110", /* [^0-9A-Za-z_] */ + "000110010", /* head of word */ + "100110010", /* not head of word */ + "000110000", /* alphabetic char a-z */ + "100110000", /* non alphabetic char */ + "000100000", /* lowercase letter */ + "100100000", /* non lowercase */ + "000010000", /* uppercase */ + "100010000" /* non uppercase */ + }; + + if (extra_newl == TRUE) + newl = TRUE; + + if (*end != ']') + return FAIL; + p = start; + if (*p == '^') + { + not = TRUE; + p ++; + } + + while (p < end) + { + if (p + 2 < end && *(p + 1) == '-') + { + switch (*p) + { + case '0': + if (*(p + 2) == '9') + { + o9 = TRUE; + break; + } + else + if (*(p + 2) == '7') + { + o7 = TRUE; + break; + } + case 'a': + if (*(p + 2) == 'z') + { + az = TRUE; + break; + } + else + if (*(p + 2) == 'f') + { + af = TRUE; + break; + } + case 'A': + if (*(p + 2) == 'Z') + { + AZ = TRUE; + break; + } + else + if (*(p + 2) == 'F') + { + AF = TRUE; + break; + } + /* FALLTHROUGH */ + default: + return FAIL; + } + p += 3; + } + else if (p + 1 < end && *p == '\\' && *(p + 1) == 'n') + { + newl = TRUE; + p += 2; + } + else if (*p == '_') + { + underscore = TRUE; + p ++; + } + else if (*p == '\n') + { + newl = TRUE; + p ++; + } + else + return FAIL; + } /* while (p < end) */ + + if (p != end) + return FAIL; + + /* build the config that represents the ranges we gathered */ + STRCPY(myconfig, "000000000"); + if (not == TRUE) + myconfig[0] = '1'; + if (af == TRUE) + myconfig[1] = '1'; + if (AF == TRUE) + myconfig[2] = '1'; + if (az == TRUE) + myconfig[3] = '1'; + if (AZ == TRUE) + myconfig[4] = '1'; + if (o7 == TRUE) + myconfig[5] = '1'; + if (o9 == TRUE) + myconfig[6] = '1'; + if (underscore == TRUE) + myconfig[7] = '1'; + if (newl == TRUE) + { + myconfig[8] = '1'; + extra_newl = ADD_NL; + } + /* try to recognize character classes */ + for (i = 0; i < NCONFIGS; i++) + if (STRNCMP(myconfig, config[i],8) == 0) + return classid[i] + extra_newl; + + /* fallthrough => no success so far */ + return FAIL; + + #undef NCONFIGS + } + + /* + * Produce the bytes for equivalence class "c". + * Currently only handles latin1, latin9 and utf-8. + * Emits bytes in postfix notation: 'a,b,NFA_OR,c,NFA_OR' is + * equivalent to 'a OR b OR c' + * + * NOTE! When changing this function, also update reg_equi_class() + */ + static int + nfa_emit_equi_class(c, neg) + int c; + int neg; + { + int first = TRUE; + int glue = neg == TRUE ? NFA_CONCAT : NFA_OR; + #define EMIT2(c) \ + EMIT(c); \ + if (neg == TRUE) { \ + EMIT(NFA_NOT); \ + } \ + if (first == FALSE) \ + EMIT(glue); \ + else \ + first = FALSE; \ + + #ifdef FEAT_MBYTE + if (enc_utf8 || STRCMP(p_enc, "latin1") == 0 + || STRCMP(p_enc, "iso-8859-15") == 0) + #endif + { + switch (c) + { + case 'A': case '\300': case '\301': case '\302': + case '\303': case '\304': case '\305': + EMIT2('A'); EMIT2('\300'); EMIT2('\301'); + EMIT2('\302'); EMIT2('\303'); EMIT2('\304'); + EMIT2('\305'); + return OK; + + case 'C': case '\307': + EMIT2('C'); EMIT2('\307'); + return OK; + + case 'E': case '\310': case '\311': case '\312': case '\313': + EMIT2('E'); EMIT2('\310'); EMIT2('\311'); + EMIT2('\312'); EMIT2('\313'); + return OK; + + case 'I': case '\314': case '\315': case '\316': case '\317': + EMIT2('I'); EMIT2('\314'); EMIT2('\315'); + EMIT2('\316'); EMIT2('\317'); + return OK; + + case 'N': case '\321': + EMIT2('N'); EMIT2('\321'); + return OK; + + case 'O': case '\322': case '\323': case '\324': case '\325': + case '\326': + EMIT2('O'); EMIT2('\322'); EMIT2('\323'); + EMIT2('\324'); EMIT2('\325'); EMIT2('\326'); + return OK; + + case 'U': case '\331': case '\332': case '\333': case '\334': + EMIT2('U'); EMIT2('\331'); EMIT2('\332'); + EMIT2('\333'); EMIT2('\334'); + return OK; + + case 'Y': case '\335': + EMIT2('Y'); EMIT2('\335'); + return OK; + + case 'a': case '\340': case '\341': case '\342': + case '\343': case '\344': case '\345': + EMIT2('a'); EMIT2('\340'); EMIT2('\341'); + EMIT2('\342'); EMIT2('\343'); EMIT2('\344'); + EMIT2('\345'); + return OK; + + case 'c': case '\347': + EMIT2('c'); EMIT2('\347'); + return OK; + + case 'e': case '\350': case '\351': case '\352': case '\353': + EMIT2('e'); EMIT2('\350'); EMIT2('\351'); + EMIT2('\352'); EMIT2('\353'); + return OK; + + case 'i': case '\354': case '\355': case '\356': case '\357': + EMIT2('i'); EMIT2('\354'); EMIT2('\355'); + EMIT2('\356'); EMIT2('\357'); + return OK; + + case 'n': case '\361': + EMIT2('n'); EMIT2('\361'); + return OK; + + case 'o': case '\362': case '\363': case '\364': case '\365': + case '\366': + EMIT2('o'); EMIT2('\362'); EMIT2('\363'); + EMIT2('\364'); EMIT2('\365'); EMIT2('\366'); + return OK; + + case 'u': case '\371': case '\372': case '\373': case '\374': + EMIT2('u'); EMIT2('\371'); EMIT2('\372'); + EMIT2('\373'); EMIT2('\374'); + return OK; + + case 'y': case '\375': case '\377': + EMIT2('y'); EMIT2('\375'); EMIT2('\377'); + return OK; + + default: + return FAIL; + } + } + + EMIT(c); + return OK; + #undef EMIT2 + } + + /* + * Code to parse regular expression. + * + * We try to reuse parsing functions in regexp.c to + * minimize surprise and keep the syntax consistent. + */ + + /* + * Increments the pointer "p" by one (multi-byte) character. + */ + static void + nfa_inc(p) + char_u **p; + { + #ifdef FEAT_MBYTE + if (has_mbyte) + mb_ptr2char_adv(p); + else + #endif + *p = *p + 1; + } + + /* + * Decrements the pointer "p" by one (multi-byte) character. + */ + static void + nfa_dec(p) + char_u **p; + { + #ifdef FEAT_MBYTE + char_u *p2, *oldp; + + if (has_mbyte) + { + oldp = *p; + /* Try to find the multibyte char that advances to the current + * position. */ + do + { + *p = *p - 1; + p2 = *p; + mb_ptr2char_adv(&p2); + } while (p2 != oldp); + } + #else + *p = *p - 1; + #endif + } + + /* + * Parse the lowest level. + * + * An atom can be one of a long list of items. Many atoms match one character + * in the text. It is often an ordinary character or a character class. + * Braces can be used to make a pattern into an atom. The "\z(\)" construct + * is only for syntax highlighting. + * + * atom ::= ordinary-atom + * or \( pattern \) + * or \%( pattern \) + * or \z( pattern \) + */ + static int + nfa_regatom() + { + int c; + int charclass; + int equiclass; + int collclass; + int got_coll_char; + char_u *p; + char_u *endp; + #ifdef FEAT_MBYTE + char_u *old_regparse = regparse; + int clen; + int len; + static char_u buf[30]; + int i; + #endif + int extra = 0; + int first; + int emit_range; + int negated; + int result; + int startc = -1; + int endc = -1; + int oldstartc = -1; + int cpo_lit; /* 'cpoptions' contains 'l' flag */ + int cpo_bsl; /* 'cpoptions' contains '\' flag */ + int glue; /* ID that will "glue" nodes together */ + + cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL; + cpo_bsl = vim_strchr(p_cpo, CPO_BACKSL) != NULL; + + c = getchr(); + + #ifdef FEAT_MBYTE + /* clen has the length of the current char, without composing chars */ + clen = (*mb_char2len)(c); + if (has_mbyte && clen > 1) + goto nfa_do_multibyte; + #endif + switch (c) + { + case Magic('^'): + EMIT(NFA_BOL); + break; + + case Magic('$'): + EMIT(NFA_EOL); + #if defined(FEAT_SYN_HL) || defined(PROTO) + had_eol = TRUE; + #endif + break; + + case Magic('<'): + EMIT(NFA_BOW); + break; + + case Magic('>'): + EMIT(NFA_EOW); + break; + + case Magic('_'): + c = no_Magic(getchr()); + if (c == '^') /* "\_^" is start-of-line */ + { + EMIT(NFA_BOL); + break; + } + if (c == '$') /* "\_$" is end-of-line */ + { + EMIT(NFA_EOL); + #if defined(FEAT_SYN_HL) || defined(PROTO) + had_eol = TRUE; + #endif + break; + } + + extra = ADD_NL; + + /* "\_[" is collection plus newline */ + if (c == '[') + /* TODO: make this work + * goto collection; */ + return FAIL; + + /* "\_x" is character class plus newline */ + /*FALLTHROUGH*/ + + /* + * Character classes. + */ + case Magic('.'): + case Magic('i'): + case Magic('I'): + case Magic('k'): + case Magic('K'): + case Magic('f'): + case Magic('F'): + case Magic('p'): + case Magic('P'): + case Magic('s'): + case Magic('S'): + case Magic('d'): + case Magic('D'): + case Magic('x'): + case Magic('X'): + case Magic('o'): + case Magic('O'): + case Magic('w'): + case Magic('W'): + case Magic('h'): + case Magic('H'): + case Magic('a'): + case Magic('A'): + case Magic('l'): + case Magic('L'): + case Magic('u'): + case Magic('U'): + p = vim_strchr(classchars, no_Magic(c)); + if (p == NULL) + { + return FAIL; /* runtime error */ + } + #ifdef FEAT_MBYTE + /* When '.' is followed by a composing char ignore the dot, so that + * the composing char is matched here. */ + if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr())) + { + c = getchr(); + goto nfa_do_multibyte; + } + #endif + EMIT(nfa_classcodes[p - classchars]); + if (extra == ADD_NL) + { + EMIT(NFA_NEWL); + EMIT(NFA_OR); + regflags |= RF_HASNL; + } + break; + + case Magic('n'): + if (reg_string) + /* In a string "\n" matches a newline character. */ + EMIT(NL); + else + { + /* In buffer text "\n" matches the end of a line. */ + EMIT(NFA_NEWL); + regflags |= RF_HASNL; + } + break; + + case Magic('('): + if (nfa_reg(REG_PAREN) == FAIL) + return FAIL; /* cascaded error */ + break; + + case NUL: + syntax_error = TRUE; + EMSG_RET_FAIL(_("E865: (NFA) Regexp end encountered prematurely")); + + case Magic('|'): + case Magic('&'): + case Magic(')'): + syntax_error = TRUE; + EMSG2(_(e_misplaced), no_Magic(c)); + return FAIL; + + case Magic('='): + case Magic('?'): + case Magic('+'): + case Magic('@'): + case Magic('*'): + case Magic('{'): + /* these should follow an atom, not form an atom */ + syntax_error = TRUE; + EMSG2(_(e_misplaced), no_Magic(c)); + return FAIL; + + case Magic('~'): /* previous substitute pattern */ + /* Not supported yet */ + return FAIL; + + case Magic('1'): + case Magic('2'): + case Magic('3'): + case Magic('4'): + case Magic('5'): + case Magic('6'): + case Magic('7'): + case Magic('8'): + case Magic('9'): + /* not supported yet */ + return FAIL; + + case Magic('z'): + c = no_Magic(getchr()); + switch (c) + { + case 's': + EMIT(NFA_ZSTART); + break; + case 'e': + EMIT(NFA_ZEND); + nfa_has_zend = TRUE; + /* TODO: Currently \ze does not work properly. */ + return FAIL; + /* break; */ + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '(': + /* \z1...\z9 and \z( not yet supported */ + return FAIL; + default: + syntax_error = TRUE; + EMSG2(_("E867: (NFA) Unknown operator '\\z%c'"), + no_Magic(c)); + return FAIL; + } + break; + + case Magic('%'): + c = no_Magic(getchr()); + switch (c) + { + /* () without a back reference */ + case '(': + if (nfa_reg(REG_NPAREN) == FAIL) + return FAIL; + EMIT(NFA_NOPEN); + break; + + case 'd': /* %d123 decimal */ + case 'o': /* %o123 octal */ + case 'x': /* %xab hex 2 */ + case 'u': /* %uabcd hex 4 */ + case 'U': /* %U1234abcd hex 8 */ + /* Not yet supported */ + return FAIL; + + c = coll_get_char(); + #ifdef FEAT_MBYTE + if ((*mb_char2len)(c) > 1) + { + EMIT_MBYTE(c); + } + else + #endif + EMIT(c); + break; + + /* Catch \%^ and \%$ regardless of where they appear in the + * pattern -- regardless of whether or not it makes sense. */ + case '^': + EMIT(NFA_BOF); + /* Not yet supported */ + return FAIL; + break; + + case '$': + EMIT(NFA_EOF); + /* Not yet supported */ + return FAIL; + break; + + case '#': + /* not supported yet */ + return FAIL; + break; + + case 'V': + /* not supported yet */ + return FAIL; + break; + + case '[': + /* \%[abc] not supported yet */ + return FAIL; + + default: + /* not supported yet */ + return FAIL; + } + break; + + /* collection: */ + case Magic('['): + /* + * Glue is emitted between several atoms from the []. + * It is either NFA_OR, or NFA_CONCAT. + * + * [abc] expands to 'a b NFA_OR c NFA_OR' (in postfix notation) + * [^abc] expands to 'a NFA_NOT b NFA_NOT NFA_CONCAT c NFA_NOT + * NFA_CONCAT NFA_END_NEG_RANGE NFA_CONCAT' (in postfix + * notation) + * + */ + + + /* Emit negation atoms, if needed. + * The CONCAT below merges the NOT with the previous node. */ + #define TRY_NEG() \ + if (negated == TRUE) \ + { \ + EMIT(NFA_NOT); \ + } + + /* Emit glue between important nodes : CONCAT or OR. */ + #define EMIT_GLUE() \ + if (first == FALSE) \ + EMIT(glue); \ + else \ + first = FALSE; + + p = regparse; + endp = skip_anyof(p); + if (*endp == ']') + { + /* + * Try to reverse engineer character classes. For example, + * recognize that [0-9] stands for \d and [A-Za-z_] with \h, + * and perform the necessary substitutions in the NFA. + */ + result = nfa_recognize_char_class(regparse, endp, + extra == ADD_NL); + if (result != FAIL) + { + if (result >= NFA_DIGIT && result <= NFA_NUPPER) + EMIT(result); + else /* must be char class + newline */ + { + EMIT(result - ADD_NL); + EMIT(NFA_NEWL); + EMIT(NFA_OR); + } + regparse = endp; + nfa_inc(®parse); + return OK; + } + /* + * Failed to recognize a character class. Use the simple + * version that turns [abc] into 'a' OR 'b' OR 'c' + */ + startc = endc = oldstartc = -1; + first = TRUE; /* Emitting first atom in this sequence? */ + negated = FALSE; + glue = NFA_OR; + if (*regparse == '^') /* negated range */ + { + negated = TRUE; + glue = NFA_CONCAT; + nfa_inc(®parse); + } + if (*regparse == '-') + { + startc = '-'; + EMIT(startc); + TRY_NEG(); + EMIT_GLUE(); + nfa_inc(®parse); + } + /* Emit the OR branches for each character in the [] */ + emit_range = FALSE; + while (regparse < endp) + { + oldstartc = startc; + startc = -1; + got_coll_char = FALSE; + if (*regparse == '[') + { + /* Check for [: :], [= =], [. .] */ + equiclass = collclass = 0; + charclass = get_char_class(®parse); + if (charclass == CLASS_NONE) + { + equiclass = get_equi_class(®parse); + if (equiclass == 0) + collclass = get_coll_element(®parse); + } + + /* Character class like [:alpha:] */ + if (charclass != CLASS_NONE) + { + switch (charclass) + { + case CLASS_ALNUM: + EMIT(NFA_CLASS_ALNUM); + break; + case CLASS_ALPHA: + EMIT(NFA_CLASS_ALPHA); + break; + case CLASS_BLANK: + EMIT(NFA_CLASS_BLANK); + break; + case CLASS_CNTRL: + EMIT(NFA_CLASS_CNTRL); + break; + case CLASS_DIGIT: + EMIT(NFA_CLASS_DIGIT); + break; + case CLASS_GRAPH: + EMIT(NFA_CLASS_GRAPH); + break; + case CLASS_LOWER: + EMIT(NFA_CLASS_LOWER); + break; + case CLASS_PRINT: + EMIT(NFA_CLASS_PRINT); + break; + case CLASS_PUNCT: + EMIT(NFA_CLASS_PUNCT); + break; + case CLASS_SPACE: + EMIT(NFA_CLASS_SPACE); + break; + case CLASS_UPPER: + EMIT(NFA_CLASS_UPPER); + break; + case CLASS_XDIGIT: + EMIT(NFA_CLASS_XDIGIT); + break; + case CLASS_TAB: + EMIT(NFA_CLASS_TAB); + break; + case CLASS_RETURN: + EMIT(NFA_CLASS_RETURN); + break; + case CLASS_BACKSPACE: + EMIT(NFA_CLASS_BACKSPACE); + break; + case CLASS_ESCAPE: + EMIT(NFA_CLASS_ESCAPE); + break; + } + TRY_NEG(); + EMIT_GLUE(); + continue; + } + /* Try equivalence class [=a=] and the like */ + if (equiclass != 0) + { + result = nfa_emit_equi_class(equiclass, negated); + if (result == FAIL) + { + /* should never happen */ + EMSG_RET_FAIL(_("E868: Error building NFA with equivalence class!")); + } + EMIT_GLUE(); + continue; + } + /* Try collating class like [. .] */ + if (collclass != 0) + { + startc = collclass; /* allow [.a.]-x as a range */ + /* Will emit the proper atom at the end of the + * while loop. */ + } + } + /* Try a range like 'a-x' or '\t-z' */ + if (*regparse == '-') + { + emit_range = TRUE; + startc = oldstartc; + nfa_inc(®parse); + continue; /* reading the end of the range */ + } + + /* Now handle simple and escaped characters. + * Only "\]", "\^", "\]" and "\\" are special in Vi. Vim + * accepts "\t", "\e", etc., but only when the 'l' flag in + * 'cpoptions' is not included. + * Posix doesn't recognize backslash at all. + */ + if (*regparse == '\\' + && !cpo_bsl + && regparse + 1 <= endp + && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL + || (!cpo_lit + && vim_strchr(REGEXP_ABBR, regparse[1]) + != NULL) + ) + ) + { + nfa_inc(®parse); + + if (*regparse == 'n' || *regparse == 'n') + startc = reg_string ? NL : NFA_NEWL; + else + if (*regparse == 'd' + || *regparse == 'o' + || *regparse == 'x' + || *regparse == 'u' + || *regparse == 'U' + ) + { + /* TODO(RE) This needs more testing */ + startc = coll_get_char(); + got_coll_char = TRUE; + nfa_dec(®parse); + } + else + { + /* \r,\t,\e,\b */ + startc = backslash_trans(*regparse); + } + } + + /* Normal printable char */ + if (startc == -1) + #ifdef FEAT_MBYTE + startc = (*mb_ptr2char)(regparse); + #else + startc = *regparse; + #endif + + /* Previous char was '-', so this char is end of range. */ + if (emit_range) + { + endc = startc; startc = oldstartc; + if (startc > endc) + EMSG_RET_FAIL(_(e_invrange)); + #ifdef FEAT_MBYTE + if (has_mbyte && ((*mb_char2len)(startc) > 1 + || (*mb_char2len)(endc) > 1)) + { + if (endc > startc + 256) + EMSG_RET_FAIL(_(e_invrange)); + /* Emit the range. "startc" was already emitted, so + * skip it. */ + for (c = startc + 1; c <= endc; c++) + { + if ((*mb_char2len)(c) > 1) + { + EMIT_MBYTE(c); + } + else + EMIT(c); + TRY_NEG(); + EMIT_GLUE(); + } + emit_range = FALSE; + } + else + #endif + { + #ifdef EBCDIC + int alpha_only = FALSE; + + /* for alphabetical range skip the gaps + * 'i'-'j', 'r'-'s', 'I'-'J' and 'R'-'S'. */ + if (isalpha(startc) && isalpha(endc)) + alpha_only = TRUE; + #endif + /* Emit the range. "startc" was already emitted, so + * skip it. */ + for (c = startc + 1; c <= endc; c++) + #ifdef EBCDIC + if (!alpha_only || isalpha(startc)) + #endif + { + EMIT(c); + TRY_NEG(); + EMIT_GLUE(); + } + emit_range = FALSE; + } + } + else + { + /* + * This char (startc) is not part of a range. Just + * emit it. + * + * Normally, simply emit startc. But if we get char + * code=0 from a collating char, then replace it with + * 0x0a. + * + * This is needed to completely mimic the behaviour of + * the backtracking engine. + */ + if (got_coll_char == TRUE && startc == 0) + EMIT(0x0a); + else + #ifdef FEAT_MBYTE + if ((*mb_char2len)(startc) > 1) + { + EMIT_MBYTE(startc); + } + else + #endif + EMIT(startc); + TRY_NEG(); + EMIT_GLUE(); + } + + nfa_inc(®parse); + } /* while (p < endp) */ + + nfa_dec(®parse); + if (*regparse == '-') /* if last, '-' is just a char */ + { + EMIT('-'); + TRY_NEG(); + EMIT_GLUE(); + } + nfa_inc(®parse); + + if (extra == ADD_NL) /* \_[] also matches \n */ + { + EMIT(reg_string ? NL : NFA_NEWL); + TRY_NEG(); + EMIT_GLUE(); + } + + /* skip the trailing ] */ + regparse = endp; + nfa_inc(®parse); + if (negated == TRUE) + { + /* Mark end of negated char range */ + EMIT(NFA_END_NEG_RANGE); + EMIT(NFA_CONCAT); + } + return OK; + } /* if exists closing ] */ + else if (reg_strict) + { + syntax_error = TRUE; + EMSG_RET_FAIL(_(e_missingbracket)); + } + + /* FALLTHROUGH */ + default: + { + #ifdef FEAT_MBYTE + int plen; + + nfa_do_multibyte: + /* length of current char, with composing chars, + * from pointer */ + plen = (*mb_ptr2len)(old_regparse); + if (enc_utf8 && clen != plen) + { + /* A composing character is always handled as a + * separate atom, surrounded by NFA_COMPOSING and + * NFA_END_COMPOSING. Note that right now we are + * building the postfix form, not the NFA itself; + * a composing char could be: a, b, c, NFA_COMPOSING + * where 'a', 'b', 'c' are chars with codes > 256. + */ + EMIT_COMPOSING_UTF(old_regparse); + regparse = old_regparse + plen; + } + else + /* A multi-byte character is always handled as a + * separate atom, surrounded by NFA_MULTIBYTE and + * NFA_END_MULTIBYTE */ + if (plen > 1) + { + EMIT_MBYTE(c); + } + else + #endif + { + c = no_Magic(c); + EMIT(c); + } + return OK; + } + } + + #undef TRY_NEG + #undef EMIT_GLUE + + return OK; + } + + /* + * Parse something followed by possible [*+=]. + * + * A piece is an atom, possibly followed by a multi, an indication of how many + * times the atom can be matched. Example: "a*" matches any sequence of "a" + * characters: "", "a", "aa", etc. + * + * piece ::= atom + * or atom multi + */ + static int + nfa_regpiece() + { + int i; + int op; + int ret; + long minval, maxval; + int greedy = TRUE; /* Braces are prefixed with '-' ? */ + char_u *old_regparse, *new_regparse; + int c2; + int *old_post_ptr, *my_post_start; + int old_regnpar; + int quest; + + /* Save the current position in the regexp, so that we can use it if + * {m,n} is next. */ + old_regparse = regparse; + /* Save current number of open parenthesis, so we can use it if + * {m,n} is next */ + old_regnpar = regnpar; + /* store current pos in the postfix form, for \{m,n} involving 0s */ + my_post_start = post_ptr; + + ret = nfa_regatom(); + if (ret == FAIL) + return FAIL; /* cascaded error */ + + op = peekchr(); + if (re_multi_type(op) == NOT_MULTI) + return OK; + + skipchr(); + switch (op) + { + case Magic('*'): + EMIT(NFA_STAR); + break; + + case Magic('+'): + /* + * Trick: Normally, (a*)\+ would match the whole input "aaa". The + * first and only submatch would be "aaa". But the backtracking + * engine interprets the plus as "try matching one more time", and + * a* matches a second time at the end of the input, the empty + * string. + * The submatch will the empty string. + * + * In order to be consistent with the old engine, we disable + * NFA_PLUS, and replace + with * + */ + /* EMIT(NFA_PLUS); */ + regnpar = old_regnpar; + regparse = old_regparse; + curchr = -1; + if (nfa_regatom() == FAIL) + return FAIL; + EMIT(NFA_STAR); + EMIT(NFA_CONCAT); + skipchr(); /* skip the \+ */ + break; + + case Magic('@'): + op = no_Magic(getchr()); + switch(op) + { + case '=': + EMIT(NFA_PREV_ATOM_NO_WIDTH); + break; + case '!': + case '<': + case '>': + /* Not supported yet */ + return FAIL; + default: + syntax_error = TRUE; + EMSG2(_("E869: (NFA) Unknown operator '\\@%c'"), op); + return FAIL; + } + break; + + case Magic('?'): + case Magic('='): + EMIT(NFA_QUEST); + break; + + case Magic('{'): + /* a{2,5} will expand to 'aaa?a?a?' + * a{-1,3} will expand to 'aa??a??', where ?? is the nongreedy + * version of '?' + * \v(ab){2,3} will expand to '(ab)(ab)(ab)?', where all the + * parenthesis have the same id + */ + + greedy = TRUE; + c2 = peekchr(); + if (c2 == '-' || c2 == Magic('-')) + { + skipchr(); + greedy = FALSE; + } + if (!read_limits(&minval, &maxval)) + { + syntax_error = TRUE; + EMSG_RET_FAIL(_("E870: (NFA regexp) Error reading repetition limits")); + } + /* {0,inf}, {0,} and {} are equivalent to + * * */ + if (minval == 0 && maxval == MAX_LIMIT && greedy) + { + EMIT(NFA_STAR); + break; + } + + if (maxval > NFA_BRACES_MAXLIMIT) + { + /* This would yield a huge automaton and use too much memory. + * Revert to old engine */ + return FAIL; + } + + /* Special case: x{0} or x{-0} */ + if (maxval == 0) + { + /* Ignore result of previous call to nfa_regatom() */ + post_ptr = my_post_start; + /* NFA_SKIP_CHAR has 0-length and works everywhere */ + EMIT(NFA_SKIP_CHAR); + return OK; + } + + /* Ignore previous call to nfa_regatom() */ + post_ptr = my_post_start; + /* Save pos after the repeated atom and the \{} */ + new_regparse = regparse; + + new_regparse = regparse; + quest = (greedy == TRUE? NFA_QUEST : NFA_QUEST_NONGREEDY); + for (i = 0; i < maxval; i++) + { + /* Goto beginning of the repeated atom */ + regparse = old_regparse; + curchr = -1; + /* Restore count of parenthesis */ + regnpar = old_regnpar; + old_post_ptr = post_ptr; + if (nfa_regatom() == FAIL) + return FAIL; + /* after "minval" times, atoms are optional */ + if (i + 1 > minval) + EMIT(quest); + if (old_post_ptr != my_post_start) + EMIT(NFA_CONCAT); + } + + /* Go to just after the repeated atom and the \{} */ + regparse = new_regparse; + curchr = -1; + + break; + + + default: + break; + } /* end switch */ + + if (re_multi_type(peekchr()) != NOT_MULTI) + { + /* Can't have a multi follow a multi. */ + syntax_error = TRUE; + EMSG_RET_FAIL(_("E871: (NFA regexp) Can't have a multi follow a multi !")); + } + + return OK; + } + + /* + * Parse one or more pieces, concatenated. It matches a match for the + * first piece, followed by a match for the second piece, etc. Example: + * "f[0-9]b", first matches "f", then a digit and then "b". + * + * concat ::= piece + * or piece piece + * or piece piece piece + * etc. + */ + static int + nfa_regconcat() + { + int cont = TRUE; + int first = TRUE; + + while (cont) + { + switch (peekchr()) + { + case NUL: + case Magic('|'): + case Magic('&'): + case Magic(')'): + cont = FALSE; + break; + + case Magic('Z'): + #ifdef FEAT_MBYTE + regflags |= RF_ICOMBINE; + #endif + skipchr_keepstart(); + break; + case Magic('c'): + regflags |= RF_ICASE; + skipchr_keepstart(); + break; + case Magic('C'): + regflags |= RF_NOICASE; + skipchr_keepstart(); + break; + case Magic('v'): + reg_magic = MAGIC_ALL; + skipchr_keepstart(); + curchr = -1; + break; + case Magic('m'): + reg_magic = MAGIC_ON; + skipchr_keepstart(); + curchr = -1; + break; + case Magic('M'): + reg_magic = MAGIC_OFF; + skipchr_keepstart(); + curchr = -1; + break; + case Magic('V'): + reg_magic = MAGIC_NONE; + skipchr_keepstart(); + curchr = -1; + break; + + default: + if (nfa_regpiece() == FAIL) + return FAIL; + if (first == FALSE) + EMIT(NFA_CONCAT); + else + first = FALSE; + break; + } + } + + return OK; + } + + /* + * Parse a branch, one or more concats, separated by "\&". It matches the + * last concat, but only if all the preceding concats also match at the same + * position. Examples: + * "foobeep\&..." matches "foo" in "foobeep". + * ".*Peter\&.*Bob" matches in a line containing both "Peter" and "Bob" + * + * branch ::= concat + * or concat \& concat + * or concat \& concat \& concat + * etc. + */ + static int + nfa_regbranch() + { + int ch; + int *old_post_ptr; + + old_post_ptr = post_ptr; + + /* First branch, possibly the only one */ + if (nfa_regconcat() == FAIL) + return FAIL; + + ch = peekchr(); + /* Try next concats */ + while (ch == Magic('&')) + { + skipchr(); + EMIT(NFA_NOPEN); + EMIT(NFA_PREV_ATOM_NO_WIDTH); + old_post_ptr = post_ptr; + if (nfa_regconcat() == FAIL) + return FAIL; + /* if concat is empty, skip a input char. But do emit a node */ + if (old_post_ptr == post_ptr) + EMIT(NFA_SKIP_CHAR); + EMIT(NFA_CONCAT); + ch = peekchr(); + } + + /* Even if a branch is empty, emit one node for it */ + if (old_post_ptr == post_ptr) + EMIT(NFA_SKIP_CHAR); + + return OK; + } + + /* + * Parse a pattern, one or more branches, separated by "\|". It matches + * anything that matches one of the branches. Example: "foo\|beep" matches + * "foo" and matches "beep". If more than one branch matches, the first one + * is used. + * + * pattern ::= branch + * or branch \| branch + * or branch \| branch \| branch + * etc. + */ + static int + nfa_reg(paren) + int paren; /* REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN */ + { + int parno = 0; + + #ifdef FEAT_SYN_HL + #endif + if (paren == REG_PAREN) + { + if (regnpar >= NSUBEXP) /* Too many `(' */ + { + syntax_error = TRUE; + EMSG_RET_FAIL(_("E872: (NFA regexp) Too many '('")); + } + parno = regnpar++; + } + + if (nfa_regbranch() == FAIL) + return FAIL; /* cascaded error */ + + while (peekchr() == Magic('|')) + { + skipchr(); + if (nfa_regbranch() == FAIL) + return FAIL; /* cascaded error */ + EMIT(NFA_OR); + } + + /* Check for proper termination. */ + if (paren != REG_NOPAREN && getchr() != Magic(')')) + { + syntax_error = TRUE; + if (paren == REG_NPAREN) + EMSG2_RET_FAIL(_(e_unmatchedpp), reg_magic == MAGIC_ALL); + else + EMSG2_RET_FAIL(_(e_unmatchedp), reg_magic == MAGIC_ALL); + } + else if (paren == REG_NOPAREN && peekchr() != NUL) + { + syntax_error = TRUE; + if (peekchr() == Magic(')')) + EMSG2_RET_FAIL(_(e_unmatchedpar), reg_magic == MAGIC_ALL); + else + EMSG_RET_FAIL(_("E873: (NFA regexp) proper termination error")); + } + /* + * Here we set the flag allowing back references to this set of + * parentheses. + */ + if (paren == REG_PAREN) + { + had_endbrace[parno] = TRUE; /* have seen the close paren */ + EMIT(NFA_MOPEN + parno); + } + + return OK; + } + + typedef struct + { + char_u *start[NSUBEXP]; + char_u *end[NSUBEXP]; + lpos_T startpos[NSUBEXP]; + lpos_T endpos[NSUBEXP]; + } regsub_T; + + static int nfa_regmatch __ARGS((nfa_state_T *start, regsub_T *submatch, regsub_T *m)); + + #ifdef DEBUG + static char_u code[50]; + + static void + nfa_set_code(c) + int c; + { + int addnl = FALSE; + + if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL) + { + addnl = TRUE; + c -= ADD_NL; + } + + STRCPY(code, ""); + switch (c) + { + case NFA_MATCH: STRCPY(code, "NFA_MATCH "); break; + case NFA_SPLIT: STRCPY(code, "NFA_SPLIT "); break; + case NFA_CONCAT: STRCPY(code, "NFA_CONCAT "); break; + case NFA_NEWL: STRCPY(code, "NFA_NEWL "); break; + case NFA_ZSTART: STRCPY(code, "NFA_ZSTART"); break; + case NFA_ZEND: STRCPY(code, "NFA_ZEND"); break; + + case NFA_PREV_ATOM_NO_WIDTH: + STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break; + case NFA_NOPEN: STRCPY(code, "NFA_MOPEN_INVISIBLE"); break; + case NFA_NCLOSE: STRCPY(code, "NFA_MCLOSE_INVISIBLE"); break; + case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break; + case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break; + + case NFA_MULTIBYTE: STRCPY(code, "NFA_MULTIBYTE"); break; + case NFA_END_MULTIBYTE: STRCPY(code, "NFA_END_MULTIBYTE"); break; + + case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break; + case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break; + + case NFA_MOPEN + 0: + case NFA_MOPEN + 1: + case NFA_MOPEN + 2: + case NFA_MOPEN + 3: + case NFA_MOPEN + 4: + case NFA_MOPEN + 5: + case NFA_MOPEN + 6: + case NFA_MOPEN + 7: + case NFA_MOPEN + 8: + case NFA_MOPEN + 9: + STRCPY(code, "NFA_MOPEN(x)"); + code[10] = c - NFA_MOPEN + '0'; + break; + case NFA_MCLOSE + 0: + case NFA_MCLOSE + 1: + case NFA_MCLOSE + 2: + case NFA_MCLOSE + 3: + case NFA_MCLOSE + 4: + case NFA_MCLOSE + 5: + case NFA_MCLOSE + 6: + case NFA_MCLOSE + 7: + case NFA_MCLOSE + 8: + case NFA_MCLOSE + 9: + STRCPY(code, "NFA_MCLOSE(x)"); + code[11] = c - NFA_MCLOSE + '0'; + break; + case NFA_EOL: STRCPY(code, "NFA_EOL "); break; + case NFA_BOL: STRCPY(code, "NFA_BOL "); break; + case NFA_EOW: STRCPY(code, "NFA_EOW "); break; + case NFA_BOW: STRCPY(code, "NFA_BOW "); break; + case NFA_STAR: STRCPY(code, "NFA_STAR "); break; + case NFA_PLUS: STRCPY(code, "NFA_PLUS "); break; + case NFA_NOT: STRCPY(code, "NFA_NOT "); break; + case NFA_SKIP_CHAR: STRCPY(code, "NFA_SKIP_CHAR"); break; + case NFA_OR: STRCPY(code, "NFA_OR"); break; + case NFA_QUEST: STRCPY(code, "NFA_QUEST"); break; + case NFA_QUEST_NONGREEDY: STRCPY(code, "NFA_QUEST_NON_GREEDY"); break; + case NFA_END_NEG_RANGE: STRCPY(code, "NFA_END_NEG_RANGE"); break; + case NFA_CLASS_ALNUM: STRCPY(code, "NFA_CLASS_ALNUM"); break; + case NFA_CLASS_ALPHA: STRCPY(code, "NFA_CLASS_ALPHA"); break; + case NFA_CLASS_BLANK: STRCPY(code, "NFA_CLASS_BLANK"); break; + case NFA_CLASS_CNTRL: STRCPY(code, "NFA_CLASS_CNTRL"); break; + case NFA_CLASS_DIGIT: STRCPY(code, "NFA_CLASS_DIGIT"); break; + case NFA_CLASS_GRAPH: STRCPY(code, "NFA_CLASS_GRAPH"); break; + case NFA_CLASS_LOWER: STRCPY(code, "NFA_CLASS_LOWER"); break; + case NFA_CLASS_PRINT: STRCPY(code, "NFA_CLASS_PRINT"); break; + case NFA_CLASS_PUNCT: STRCPY(code, "NFA_CLASS_PUNCT"); break; + case NFA_CLASS_SPACE: STRCPY(code, "NFA_CLASS_SPACE"); break; + case NFA_CLASS_UPPER: STRCPY(code, "NFA_CLASS_UPPER"); break; + case NFA_CLASS_XDIGIT: STRCPY(code, "NFA_CLASS_XDIGIT"); break; + case NFA_CLASS_TAB: STRCPY(code, "NFA_CLASS_TAB"); break; + case NFA_CLASS_RETURN: STRCPY(code, "NFA_CLASS_RETURN"); break; + case NFA_CLASS_BACKSPACE: STRCPY(code, "NFA_CLASS_BACKSPACE"); break; + case NFA_CLASS_ESCAPE: STRCPY(code, "NFA_CLASS_ESCAPE"); break; + + case NFA_ANY: STRCPY(code, "NFA_ANY"); break; + case NFA_IDENT: STRCPY(code, "NFA_IDENT"); break; + case NFA_SIDENT:STRCPY(code, "NFA_SIDENT"); break; + case NFA_KWORD: STRCPY(code, "NFA_KWORD"); break; + case NFA_SKWORD:STRCPY(code, "NFA_SKWORD"); break; + case NFA_FNAME: STRCPY(code, "NFA_FNAME"); break; + case NFA_SFNAME:STRCPY(code, "NFA_SFNAME"); break; + case NFA_PRINT: STRCPY(code, "NFA_PRINT"); break; + case NFA_SPRINT:STRCPY(code, "NFA_SPRINT"); break; + case NFA_WHITE: STRCPY(code, "NFA_WHITE"); break; + case NFA_NWHITE:STRCPY(code, "NFA_NWHITE"); break; + case NFA_DIGIT: STRCPY(code, "NFA_DIGIT"); break; + case NFA_NDIGIT:STRCPY(code, "NFA_NDIGIT"); break; + case NFA_HEX: STRCPY(code, "NFA_HEX"); break; + case NFA_NHEX: STRCPY(code, "NFA_NHEX"); break; + case NFA_OCTAL: STRCPY(code, "NFA_OCTAL"); break; + case NFA_NOCTAL:STRCPY(code, "NFA_NOCTAL"); break; + case NFA_WORD: STRCPY(code, "NFA_WORD"); break; + case NFA_NWORD: STRCPY(code, "NFA_NWORD"); break; + case NFA_HEAD: STRCPY(code, "NFA_HEAD"); break; + case NFA_NHEAD: STRCPY(code, "NFA_NHEAD"); break; + case NFA_ALPHA: STRCPY(code, "NFA_ALPHA"); break; + case NFA_NALPHA:STRCPY(code, "NFA_NALPHA"); break; + case NFA_LOWER: STRCPY(code, "NFA_LOWER"); break; + case NFA_NLOWER:STRCPY(code, "NFA_NLOWER"); break; + case NFA_UPPER: STRCPY(code, "NFA_UPPER"); break; + case NFA_NUPPER:STRCPY(code, "NFA_NUPPER"); break; + + default: + STRCPY(code, "CHAR(x)"); + code[5] = c; + } + + if (addnl == TRUE) + STRCAT(code, " + NEWLINE "); + + } + + #ifdef ENABLE_LOG + static FILE *log_fd; + + /* + * Print the postfix notation of the current regexp. + */ + static void + nfa_postfix_dump(expr, retval) + char_u *expr; + int retval; + { + int *p; + FILE *f; + + f = fopen("LOG.log", "a"); + if (f != NULL) + { + fprintf(f, "\n-------------------------\n"); + if (retval == FAIL) + fprintf(f, ">>> NFA engine failed ... \n"); + else if (retval == OK) + fprintf(f, ">>> NFA engine succeeded !\n"); + fprintf(f, "Regexp: \"%s\"\nPostfix notation (char): \"", expr); + for (p=post_start; *p; p++) + { + nfa_set_code(*p); + fprintf(f, "%s, ", code); + } + fprintf(f, "\"\nPostfix notation (int): "); + for (p=post_start; *p; p++) + fprintf(f, "%d ", *p); + fprintf(f, "\n\n"); + fclose(f); + } + } + + /* + * Print the NFA starting with a root node "state". + */ + static void + nfa_print_state(debugf, state, ident) + FILE *debugf; + nfa_state_T *state; + int ident; + { + int i; + + if (state == NULL) + return; + + fprintf(debugf, "(%2d)", abs(state->id)); + for (i = 0; i < ident; i++) + fprintf(debugf, "%c", ' '); + + nfa_set_code(state->c); + fprintf(debugf, "%s %s (%d) (id=%d)\n", + state->negated ? "NOT" : "", code, state->c, abs(state->id)); + if (state->id < 0) + return; + + state->id = abs(state->id) * -1; + nfa_print_state(debugf, state->out, ident + 4); + nfa_print_state(debugf, state->out1, ident + 4); + } + + /* + * Print the NFA state machine. + */ + static void + nfa_dump(prog) + nfa_regprog_T *prog; + { + FILE *debugf = fopen("LOG.log", "a"); + + if (debugf != NULL) + { + nfa_print_state(debugf, prog->start, 0); + fclose(debugf); + } + } + #endif /* ENABLE_LOG */ + #endif /* DEBUG */ + + /* + * Parse r.e. @expr and convert it into postfix form. + * Return the postfix string on success, NULL otherwise. + */ + static int * + re2post() + { + if (nfa_reg(REG_NOPAREN) == FAIL) + return NULL; + EMIT(NFA_MOPEN); + return post_start; + } + + /* NB. Some of the code below is inspired by Russ's. */ + + /* + * Represents an NFA state plus zero or one or two arrows exiting. + * if c == MATCH, no arrows out; matching state. + * If c == SPLIT, unlabeled arrows to out and out1 (if != NULL). + * If c < 256, labeled arrow with character c to out. + */ + + static nfa_state_T *state_ptr; /* points to nfa_prog->state */ + + /* + * Allocate and initialize nfa_state_T. + */ + static nfa_state_T * + new_state(c, out, out1) + int c; + nfa_state_T *out; + nfa_state_T *out1; + { + nfa_state_T *s; + + if (istate >= nstate) + return NULL; + + s = &state_ptr[istate++]; + + s->c = c; + s->out = out; + s->out1 = out1; + + s->id = istate; + s->lastlist = 0; + s->lastthread = NULL; + s->visits = 0; + s->negated = FALSE; + + return s; + } + + /* + * A partially built NFA without the matching state filled in. + * Frag_T.start points at the start state. + * Frag_T.out is a list of places that need to be set to the + * next state for this fragment. + */ + typedef union Ptrlist Ptrlist; + struct Frag + { + nfa_state_T *start; + Ptrlist *out; + }; + typedef struct Frag Frag_T; + + static Frag_T frag __ARGS((nfa_state_T *start, Ptrlist *out)); + static Ptrlist *list1 __ARGS((nfa_state_T **outp)); + static void patch __ARGS((Ptrlist *l, nfa_state_T *s)); + static Ptrlist *append __ARGS((Ptrlist *l1, Ptrlist *l2)); + static void st_push __ARGS((Frag_T s, Frag_T **p, Frag_T *stack_end)); + static Frag_T st_pop __ARGS((Frag_T **p, Frag_T *stack)); + + /* + * Initialize Frag_T struct. + */ + static Frag_T + frag(start, out) + nfa_state_T *start; + Ptrlist *out; + { + Frag_T n = { start, out }; + return n; + } + + /* + * Since the out pointers in the list are always + * uninitialized, we use the pointers themselves + * as storage for the Ptrlists. + */ + union Ptrlist + { + Ptrlist *next; + nfa_state_T *s; + }; + + /* + * Create singleton list containing just outp. + */ + static Ptrlist * + list1(outp) + nfa_state_T **outp; + { + Ptrlist *l; + + l = (Ptrlist *)outp; + l->next = NULL; + return l; + } + + /* + * Patch the list of states at out to point to start. + */ + static void + patch(l, s) + Ptrlist *l; + nfa_state_T *s; + { + Ptrlist *next; + + for (; l; l = next) + { + next = l->next; + l->s = s; + } + } + + + /* + * Join the two lists l1 and l2, returning the combination. + */ + static Ptrlist * + append(l1, l2) + Ptrlist *l1; + Ptrlist *l2; + { + Ptrlist *oldl1; + + oldl1 = l1; + while (l1->next) + l1 = l1->next; + l1->next = l2; + return oldl1; + } + + /* + * Stack used for transforming postfix form into NFA. + */ + static Frag_T empty; + + static void + st_error(postfix, end, p) + int *postfix; + int *end; + int *p; + { + FILE *df; + int *p2; + + df = fopen("stack.err", "a"); + if (df) + { + fprintf(df, "Error popping the stack!\n"); + #ifdef DEBUG + fprintf(df, "Current regexp is \"%s\"\n", nfa_regengine.expr); + #endif + fprintf(df, "Postfix form is: "); + #ifdef DEBUG + for (p2 = postfix; p2 < end; p2++) + { + nfa_set_code(*p2); + fprintf(df, "%s, ", code); + } + nfa_set_code(*p); + fprintf(df, "\nCurrent position is: "); + for (p2 = postfix; p2 <= p; p2 ++) + { + nfa_set_code(*p2); + fprintf(df, "%s, ", code); + } + #else + for (p2 = postfix; p2 < end; p2++) + { + fprintf(df, "%d, ", *p2); + } + fprintf(df, "\nCurrent position is: "); + for (p2 = postfix; p2 <= p; p2 ++) + { + fprintf(df, "%d, ", *p2); + } + #endif + fprintf(df, "\n--------------------------\n"); + fclose(df); + } + EMSG(_("E874: (NFA) Could not pop the stack !")); + } + + /* + * Push an item onto the stack. + */ + static void + st_push(s, p, stack_end) + Frag_T s; + Frag_T **p; + Frag_T *stack_end; + { + Frag_T *stackp = *p; + + if (stackp >= stack_end) + return; + *stackp = s; + *p = *p + 1; + } + + /* + * Pop an item from the stack. + */ + static Frag_T + st_pop(p, stack) + Frag_T **p; + Frag_T *stack; + { + Frag_T *stackp; + + *p = *p - 1; + stackp = *p; + if (stackp < stack) + return empty; + return **p; + } + + /* + * Convert a postfix form into its equivalent NFA. + * Return the NFA start state on success, NULL otherwise. + */ + static nfa_state_T * + post2nfa(postfix, end, nfa_calc_size) + int *postfix; + int *end; + int nfa_calc_size; + { + int *p; + int mopen; + int mclose; + Frag_T *stack = NULL; + Frag_T *stackp = NULL; + Frag_T *stack_end = NULL; + Frag_T e1; + Frag_T e2; + Frag_T e; + nfa_state_T *s; + nfa_state_T *s1; + nfa_state_T *matchstate; + + if (postfix == NULL) + return NULL; + + #define PUSH(s) st_push ((s), &stackp, stack_end) + #define POP() st_pop(&stackp, stack); \ + if (stackp < stack) \ + { \ + st_error(postfix, end, p); \ + return NULL; \ + } + + if (nfa_calc_size == FALSE) + { + /* Allocate space for the stack. Max states on the stack : nstate */ + stack = (Frag_T *) lalloc((nstate + 1)*sizeof(Frag_T), TRUE); + stackp = stack; + stack_end = stack + NFA_STACK_SIZE; + } + + for (p = postfix; p < end; ++p) + { + switch (*p) + { + case NFA_CONCAT: + /* Catenation. + * Pay attention: this operator does not exist + * in the r.e. itself (it is implicit, really). + * It is added when r.e. is translated to postfix + * form in re2post(). + * + * No new state added here. */ + if (nfa_calc_size == TRUE) + { + nstate += 0; + break; + } + e2 = POP(); + e1 = POP(); + patch(e1.out, e2.start); + PUSH(frag(e1.start, e2.out)); + break; + + case NFA_NOT: + /* Negation of a character */ + if (nfa_calc_size == TRUE) + { + nstate += 0; + break; + } + e1 = POP(); + e1.start->negated = TRUE; + if (e1.start->c == NFA_MULTIBYTE || e1.start->c == NFA_COMPOSING) + e1.start->out1->negated = TRUE; + PUSH(e1); + break; + + case NFA_OR: + /* Alternation */ + if (nfa_calc_size == TRUE) + { + nstate ++; + break; + } + e2 = POP(); + e1 = POP(); + s = new_state(NFA_SPLIT, e1.start, e2.start); + if (s == NULL) + return NULL; + PUSH(frag(s, append(e1.out, e2.out))); + break; + + case NFA_STAR: + /* Zero or more */ + if (nfa_calc_size == TRUE) + { + nstate ++; + break; + } + e = POP(); + s = new_state(NFA_SPLIT, e.start, NULL); + if (s == NULL) + return NULL; + patch(e.out, s); + PUSH(frag(s, list1(&s->out1))); + break; + + case NFA_QUEST: + /* one or zero atoms=> greedy match */ + if (nfa_calc_size == TRUE) + { + nstate ++; + break; + } + e = POP(); + s = new_state(NFA_SPLIT, e.start, NULL); + if (s == NULL) + return NULL; + PUSH(frag(s, append(e.out, list1(&s->out1)))); + break; + + case NFA_QUEST_NONGREEDY: + /* zero or one atoms => non-greedy match */ + if (nfa_calc_size == TRUE) + { + nstate ++; + break; + } + e = POP(); + s = new_state(NFA_SPLIT, NULL, e.start); + if (s == NULL) + return NULL; + PUSH(frag(s, append(e.out, list1(&s->out)))); + break; + + case NFA_PLUS: + /* One or more */ + if (nfa_calc_size == TRUE) + { + nstate ++; + break; + } + e = POP(); + s = new_state(NFA_SPLIT, e.start, NULL); + if (s == NULL) + return NULL; + patch(e.out, s); + PUSH(frag(e.start, list1(&s->out1))); + break; + + case NFA_SKIP_CHAR: + /* Symbol of 0-length, Used in a repetition + * with max/min count of 0 */ + if (nfa_calc_size == TRUE) + { + nstate ++; + break; + } + s = new_state(NFA_SKIP_CHAR, NULL, NULL); + if (s == NULL) + return NULL; + PUSH(frag(s, list1(&s->out))); + break; + + case NFA_PREV_ATOM_NO_WIDTH: + /* The \@= operator: match the preceding atom with 0 width. + * Surrounds the preceding atom with START_INVISIBLE and + * END_INVISIBLE, similarly to MOPEN. + */ + /* TODO: Maybe this drops the speed? */ + return NULL; + + if (nfa_calc_size == TRUE) + { + nstate += 2; + break; + } + e = POP(); + s1 = new_state(NFA_END_INVISIBLE, NULL, NULL); + if (s1 == NULL) + return NULL; + patch(e.out, s1); + + s = new_state(NFA_START_INVISIBLE, e.start, s1); + if (s == NULL) + return NULL; + PUSH(frag(s, list1(&s1->out))); + break; + + case NFA_MOPEN + 0: /* Submatch */ + case NFA_MOPEN + 1: + case NFA_MOPEN + 2: + case NFA_MOPEN + 3: + case NFA_MOPEN + 4: + case NFA_MOPEN + 5: + case NFA_MOPEN + 6: + case NFA_MOPEN + 7: + case NFA_MOPEN + 8: + case NFA_MOPEN + 9: + case NFA_NOPEN: /* \%( "Invisible Submatch" */ + case NFA_MULTIBYTE: /* mbyte char */ + case NFA_COMPOSING: /* composing char */ + if (nfa_calc_size == TRUE) + { + nstate += 2; + break; + } + + mopen = *p; + switch (*p) + { + case NFA_NOPEN: + mclose = NFA_NCLOSE; + break; + case NFA_MULTIBYTE: + mclose = NFA_END_MULTIBYTE; + break; + case NFA_COMPOSING: + mclose = NFA_END_COMPOSING; + break; + default: + /* NFA_MOPEN(0) ... NFA_MOPEN(9) */ + mclose = *p + NSUBEXP; + break; + } + + /* Allow "NFA_MOPEN" as a valid postfix representation for + * the empty regexp "". In this case, the NFA will be + * NFA_MOPEN -> NFA_MCLOSE. Note that this also allows + * empty groups of parenthesis, and empty mbyte chars */ + if (stackp == stack) + { + s = new_state(mopen, NULL, NULL); + if (s == NULL) + return NULL; + s1 = new_state(mclose, NULL, NULL); + if (s1 == NULL) + return NULL; + patch(list1(&s->out), s1); + PUSH(frag(s, list1(&s1->out))); + break; + } + + /* At least one node was emitted before NFA_MOPEN, so + * at least one node will be between NFA_MOPEN and NFA_MCLOSE */ + e = POP(); + s = new_state(mopen, e.start, NULL); /* `(' */ + if (s == NULL) + return NULL; + + s1 = new_state(mclose, NULL, NULL); /* `)' */ + if (s1 == NULL) + return NULL; + patch(e.out, s1); + + if (mopen == NFA_MULTIBYTE || mopen == NFA_COMPOSING) + /* MULTIBYTE->out1 = END_MULTIBYTE + * COMPOSING->out1 = END_COMPOSING */ + patch(list1(&s->out1), s1); + + PUSH(frag(s, list1(&s1->out))); + break; + + case NFA_ZSTART: + case NFA_ZEND: + default: + /* Operands */ + if (nfa_calc_size == TRUE) + { + nstate ++; + break; + } + s = new_state(*p, NULL, NULL); + if (s == NULL) + return NULL; + PUSH(frag(s, list1(&s->out))); + break; + + } /* switch(*p) */ + + } /* for(p = postfix; *p; ++p) */ + + if (nfa_calc_size == TRUE) + { + nstate ++; + return NULL; /* Return value when counting size is ignored anyway */ + } + + e = POP(); + if (stackp != stack) + EMSG_RET_NULL(_("E875: (NFA regexp) (While converting from postfix to NFA), too many states left on stack")); + + if (istate >= nstate) + EMSG_RET_NULL(_("E876: (NFA regexp) Not enough space to store the whole NFA ")); + + vim_free(stack); + + matchstate = &state_ptr[istate++]; /* the match state */ + matchstate->c = NFA_MATCH; + matchstate->out = matchstate->out1 = NULL; + + patch(e.out, matchstate); + return e.start; + + #undef POP1 + #undef PUSH1 + #undef POP2 + #undef PUSH2 + #undef POP + #undef PUSH + } + + /**************************************************************** + * NFA execution code. + ****************************************************************/ + + /* thread_T contains runtime information of a NFA state */ + struct thread + { + nfa_state_T *state; + regsub_T sub; /* submatch info */ + }; + + typedef struct + { + thread_T *t; + int n; + } List; + + static void addstate __ARGS((List *l, nfa_state_T *state, regsub_T *m, int off, int lid, int *match)); + + static void + addstate(l, state, m, off, lid, match) + List *l; /* runtime state list */ + nfa_state_T *state; /* state to update */ + regsub_T *m; /* pointers to subexpressions */ + int off; + int lid; + int *match; /* found match? */ + { + regsub_T save; + int subidx = 0; + + if (l == NULL || state == NULL) + return; + + switch (state->c) + { + case NFA_SPLIT: + case NFA_NOT: + case NFA_NOPEN: + case NFA_NCLOSE: + case NFA_MCLOSE: + case NFA_MCLOSE + 1: + case NFA_MCLOSE + 2: + case NFA_MCLOSE + 3: + case NFA_MCLOSE + 4: + case NFA_MCLOSE + 5: + case NFA_MCLOSE + 6: + case NFA_MCLOSE + 7: + case NFA_MCLOSE + 8: + case NFA_MCLOSE + 9: + /* Do not remember these nodes in list "thislist" or "nextlist" */ + break; + + default: + if (state->lastlist == lid) + { + if (++state->visits > 2) + return; + } + else + { + /* add the state to the list */ + state->lastlist = lid; + state->lastthread = &l->t[l->n++]; + state->lastthread->state = state; + state->lastthread->sub = *m; + } + } + + #ifdef ENABLE_LOG + nfa_set_code(state->c); + fprintf(log_fd, "> Adding state %d to list. Character %s, code %d\n", + abs(state->id), code, state->c); + #endif + switch (state->c) + { + case NFA_MATCH: + *match = TRUE; + break; + + case NFA_SPLIT: + addstate(l, state->out, m, off, lid, match); + addstate(l, state->out1, m, off, lid, match); + break; + + case NFA_SKIP_CHAR: + addstate(l, state->out, m, off, lid, match); + break; + + #if 0 + case NFA_END_NEG_RANGE: + /* Nothing to handle here. nfa_regmatch() will take care of it */ + break; + + case NFA_NOT: + EMSG(_("E999: (NFA regexp internal error) Should not process NOT node !")); + #ifdef ENABLE_LOG + fprintf(f, "\n\n>>> E999: Added state NFA_NOT to a list ... Something went wrong ! Why wasn't it processed already? \n\n"); + #endif + break; + + case NFA_COMPOSING: + /* nfa_regmatch() will match all the bytes of this composing char. */ + break; + + case NFA_MULTIBYTE: + /* nfa_regmatch() will match all the bytes of this multibyte char. */ + break; + #endif + + case NFA_END_MULTIBYTE: + /* Successfully matched this mbyte char */ + addstate(l, state->out, m, off, lid, match); + break; + + case NFA_NOPEN: + case NFA_NCLOSE: + addstate(l, state->out, m, off, lid, match); + break; + + /* If this state is reached, then a recursive call of nfa_regmatch() + * succeeded. the next call saves the found submatches in the + * first state after the "invisible" branch. */ + #if 0 + case NFA_END_INVISIBLE: + break; + #endif + + case NFA_MOPEN + 0: + case NFA_MOPEN + 1: + case NFA_MOPEN + 2: + case NFA_MOPEN + 3: + case NFA_MOPEN + 4: + case NFA_MOPEN + 5: + case NFA_MOPEN + 6: + case NFA_MOPEN + 7: + case NFA_MOPEN + 8: + case NFA_MOPEN + 9: + case NFA_ZSTART: + subidx = state->c - NFA_MOPEN; + if (state->c == NFA_ZSTART) + subidx = 0; + + if (REG_MULTI) + { + save.startpos[subidx] = m->startpos[subidx]; + save.endpos[subidx] = m->endpos[subidx]; + m->startpos[subidx].lnum = reglnum; + m->startpos[subidx].col = reginput - regline + off; + } + else + { + save.start[subidx] = m->start[subidx]; + save.end[subidx] = m->end[subidx]; + m->start[subidx] = reginput + off; + } + + addstate(l, state->out, m, off, lid, match); + + if (REG_MULTI) + { + m->startpos[subidx] = save.startpos[subidx]; + m->endpos[subidx] = save.endpos[subidx]; + } + else + { + m->start[subidx] = save.start[subidx]; + m->end[subidx] = save.end[subidx]; + } + break; + + case NFA_MCLOSE + 0: + if (nfa_has_zend == TRUE) + { + addstate(l, state->out, m, off, lid, match); + break; + } + case NFA_MCLOSE + 1: + case NFA_MCLOSE + 2: + case NFA_MCLOSE + 3: + case NFA_MCLOSE + 4: + case NFA_MCLOSE + 5: + case NFA_MCLOSE + 6: + case NFA_MCLOSE + 7: + case NFA_MCLOSE + 8: + case NFA_MCLOSE + 9: + case NFA_ZEND: + subidx = state->c - NFA_MCLOSE; + if (state->c == NFA_ZEND) + subidx = 0; + + if (REG_MULTI) + { + save.startpos[subidx] = m->startpos[subidx]; + save.endpos[subidx] = m->endpos[subidx]; + m->endpos[subidx].lnum = reglnum; + m->endpos[subidx].col = reginput - regline + off; + } + else + { + save.start[subidx] = m->start[subidx]; + save.end[subidx] = m->end[subidx]; + m->end[subidx] = reginput + off; + } + + addstate(l, state->out, m, off, lid, match); + + if (REG_MULTI) + { + m->startpos[subidx] = save.startpos[subidx]; + m->endpos[subidx] = save.endpos[subidx]; + } + else + { + m->start[subidx] = save.start[subidx]; + m->end[subidx] = save.end[subidx]; + } + break; + } + } + + /* + * Check character class "class" against current character c. + */ + static int + check_char_class(class, c) + int class; + int c; + { + switch (class) + { + case NFA_CLASS_ALNUM: + if (isalnum(c)) + return OK; + break; + case NFA_CLASS_ALPHA: + if (isalpha(c)) + return OK; + break; + case NFA_CLASS_BLANK: + if (c == ' ' || c == '\t') + return OK; + break; + case NFA_CLASS_CNTRL: + if (iscntrl(c)) + return OK; + break; + case NFA_CLASS_DIGIT: + if (VIM_ISDIGIT(c)) + return OK; + break; + case NFA_CLASS_GRAPH: + if (isgraph(c)) + return OK; + break; + case NFA_CLASS_LOWER: + if (MB_ISLOWER(c)) + return OK; + break; + case NFA_CLASS_PRINT: + if (vim_isprintc(c)) + return OK; + break; + case NFA_CLASS_PUNCT: + if (ispunct(c)) + return OK; + break; + case NFA_CLASS_SPACE: + if ((c >=9 && c <= 13) || (c == ' ')) + return OK; + break; + case NFA_CLASS_UPPER: + if (MB_ISUPPER(c)) + return OK; + break; + case NFA_CLASS_XDIGIT: + if (vim_isxdigit(c)) + return OK; + break; + case NFA_CLASS_TAB: + if (c == '\t') + return OK; + break; + case NFA_CLASS_RETURN: + if (c == '\r') + return OK; + break; + case NFA_CLASS_BACKSPACE: + if (c == '\b') + return OK; + break; + case NFA_CLASS_ESCAPE: + if (c == '\033') + return OK; + break; + + default: + /* should not be here :P */ + EMSG_RET_FAIL(_("E877: (NFA regexp) Invalid character class ")); + } + return FAIL; + } + + /* + * Set all NFA nodes' list ID equal to -1. + */ + static void + nfa_set_neg_listids(start) + nfa_state_T *start; + { + if (start == NULL) + return; + if (start->lastlist >= 0) + { + start->lastlist = -1; + nfa_set_neg_listids(start->out); + nfa_set_neg_listids(start->out1); + } + } + + /* + * Set all NFA nodes' list ID equal to 0. + */ + static void + nfa_set_null_listids(start) + nfa_state_T *start; + { + if (start == NULL) + return; + if (start->lastlist == -1) + { + start->lastlist = 0; + nfa_set_null_listids(start->out); + nfa_set_null_listids(start->out1); + } + } + + /* + * Save list IDs for all NFA states in "list". + */ + static void + nfa_save_listids(start, list) + nfa_state_T *start; + int *list; + { + if (start == NULL) + return; + if (start->lastlist != -1) + { + list[abs(start->id)] = start->lastlist; + start->lastlist = -1; + nfa_save_listids(start->out, list); + nfa_save_listids(start->out1, list); + } + } + + /* + * Restore list IDs from "list" to all NFA states. + */ + static void + nfa_restore_listids(start, list) + nfa_state_T *start; + int *list; + { + if (start == NULL) + return; + if (start->lastlist == -1) + { + start->lastlist = list[abs(start->id)]; + nfa_restore_listids(start->out, list); + nfa_restore_listids(start->out1, list); + } + } + + /* + * Main matching routine. + * + * Run NFA to determine whether it matches reginput. + * + * Return TRUE if there is a match, FALSE otherwise. + * Note: Caller must ensure that: start != NULL. + */ + static int + nfa_regmatch(start, submatch, m) + nfa_state_T *start; + regsub_T *submatch; + regsub_T *m; + { + int c = -1; + int n; + int i = 0; + int result; + int size = 0; + int match = FALSE; + int flag = 0; + int old_reglnum = -1; + int reginput_updated = FALSE; + thread_T *t; + char_u *cc; + char_u *old_reginput = NULL; + char_u *old_regline = NULL; + nfa_state_T *sta; + nfa_state_T *end; + List list[3]; + List *listtbl[2][2]; + List *ll; + int listid = 1; + int endnode = 0; + List *thislist; + List *nextlist; + List *neglist; + int *listids = NULL; + int j = 0; + int len = 0; + #ifdef DEBUG + FILE *debug = fopen("list.log", "a"); + + if (debug == NULL) + { + EMSG(_("(NFA) COULD NOT OPEN list.log !")); + return FALSE; + } + #endif + + /* Allocate memory for the lists of nodes */ + size = (nstate + 1) * sizeof(thread_T); + list[0].t = (thread_T *)lalloc(size, TRUE); + list[1].t = (thread_T *)lalloc(size, TRUE); + list[2].t = (thread_T *)lalloc(size, TRUE); + if (list[0].t == NULL || list[1].t == NULL || list[2].t == NULL) + goto theend; + vim_memset(list[0].t, 0, size); + vim_memset(list[1].t, 0, size); + vim_memset(list[2].t, 0, size); + + #ifdef ENABLE_LOG + log_fd = fopen(LOG_NAME, "a"); + if (log_fd != NULL) + { + fprintf(log_fd, "**********************************\n"); + nfa_set_code(start->c); + fprintf(log_fd, " RUNNING nfa_regmatch() starting with state %d, code %s\n", + abs(start->id), code); + fprintf(log_fd, "**********************************\n"); + } + else + { + EMSG(_("Could not open temporary log file for writing, displaying on stderr ... ")); + log_fd = stderr; + } + #endif + + thislist = &list[0]; + thislist->n = 0; + nextlist = &list[1]; + nextlist->n = 0; + neglist = &list[2]; + neglist->n = 0; + #ifdef ENABLE_LOG + fprintf(log_fd, "(---) STARTSTATE\n"); + #endif + addstate(thislist, start, m, 0, listid, &match); + + /* There are two cases when the NFA advances: 1. input char matches the + * NFA node and 2. input char does not match the NFA node, but the next + * node is NFA_NOT. The following macro calls addstate() according to + * these rules. It is used A LOT, so use the "listtbl" table for speed */ + listtbl[0][0] = NULL; + listtbl[0][1] = neglist; + listtbl[1][0] = nextlist; + listtbl[1][1] = NULL; + #define ADD_POS_NEG_STATE(node) \ + ll = listtbl[result ? 1 : 0][node->negated]; \ + if (ll != NULL) \ + addstate(ll, node->out , &t->sub, n, listid + 1, &match); + + + /* + * Run for each character. + */ + do { + again: + #ifdef FEAT_MBYTE + if (has_mbyte) + { + c = (*mb_ptr2char)(reginput); + n = (*mb_ptr2len)(reginput); + } + else + #endif + { + c = *reginput; + n = 1; + } + if (c == NUL) + n = 0; + cc = (char_u *)&c; + + /* swap lists */ + thislist = &list[flag]; + nextlist = &list[flag ^= 1]; + nextlist->n = 0; /* `clear' nextlist */ + listtbl[1][0] = nextlist; + ++listid; + + #ifdef ENABLE_LOG + fprintf(log_fd, "------------------------------------------\n"); + fprintf(log_fd, ">>> Reginput is \"%s\"\n", reginput); + fprintf(log_fd, ">>> Advanced one character ... Current char is %c (code %d) \n", c, (int)c); + fprintf(log_fd, ">>> Thislist has %d states available: ", thislist->n); + for (i = 0; i< thislist->n; i++) + fprintf(log_fd, "%d ", abs(thislist->t[i].state->id)); + fprintf(log_fd, "\n"); + #endif + + #ifdef DEBUG + fprintf(debug, "\n-------------------\n"); + #endif + + /* compute nextlist */ + for (i = 0; i < thislist->n || neglist->n > 0; ++i) + { + if (neglist->n > 0) + { + t = &neglist->t[0]; + neglist->n --; + i--; + } + else + t = &thislist->t[i]; + + #ifdef DEBUG + nfa_set_code(t->state->c); + fprintf(debug, "%s, ", code); + #endif + #ifdef ENABLE_LOG + nfa_set_code(t->state->c); + fprintf(log_fd, "(%d) %s, code %d ... \n", abs(t->state->id), + code, (int)t->state->c); + #endif + + /* + * Handle the possible codes of the current state. + * The most important is NFA_MATCH. + */ + switch (t->state->c) + { + case NFA_MATCH: + match = TRUE; + *submatch = t->sub; + #ifdef ENABLE_LOG + for (j = 0; j < 4; j++) + if (REG_MULTI) + fprintf(log_fd, "\n *** group %d, start: c=%d, l=%d, end: c=%d, l=%d", + j, + t->sub.startpos[j].col, + (int)t->sub.startpos[j].lnum, + t->sub.endpos[j].col, + (int)t->sub.endpos[j].lnum); + else + fprintf(log_fd, "\n *** group %d, start: \"%s\", end: \"%s\"", + j, + (char *)t->sub.start[j], + (char *)t->sub.end[j]); + fprintf(log_fd, "\n"); + #endif + goto nextchar; /* found the left-most longest match */ + + case NFA_END_INVISIBLE: + /* This is only encountered after a NFA_START_INVISIBLE node. + * They surround a zero-width group, used with "\@=" and "\&". + * If we got here, it means that the current "invisible" group + * finished successfully, so return control to the parent + * nfa_regmatch(). Submatches are stored in *m, and used in + * the parent call. */ + if (start->c == NFA_MOPEN + 0) + addstate(thislist, t->state->out, &t->sub, 0, listid, + &match); + else + { + *m = t->sub; + match = TRUE; + } + break; + + case NFA_START_INVISIBLE: + /* Save global variables, and call nfa_regmatch() to check if + * the current concat matches at this position. The concat + * ends with the node NFA_END_INVISIBLE */ + old_reginput = reginput; + old_regline = regline; + old_reglnum = reglnum; + if (listids == NULL) + { + listids = (int *) lalloc(sizeof(int) * nstate, TRUE); + if (listids == NULL) + { + EMSG(_("E878: (NFA) Could not allocate memory for branch traversal!")); + return 0; + } + } + #ifdef ENABLE_LOG + if (log_fd != stderr) + fclose(log_fd); + log_fd = NULL; + #endif + /* Have to clear the listid field of the NFA nodes, so that + * nfa_regmatch() and addstate() can run properly after + * recursion. */ + nfa_save_listids(start, listids); + nfa_set_null_listids(start); + result = nfa_regmatch(t->state->out, submatch, m); + nfa_set_neg_listids(start); + nfa_restore_listids(start, listids); + + #ifdef ENABLE_LOG + log_fd = fopen(LOG_NAME, "a"); + if (log_fd != NULL) + { + fprintf(log_fd, "****************************\n"); + fprintf(log_fd, "FINISHED RUNNING nfa_regmatch() recursively\n"); + fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : "FALSE"); + fprintf(log_fd, "****************************\n"); + } + else + { + EMSG(_("Could not open temporary log file for writing, displaying on stderr ... ")); + log_fd = stderr; + } + #endif + if (result == TRUE) + { + /* Restore position in input text */ + reginput = old_reginput; + regline = old_regline; + reglnum = old_reglnum; + /* Copy submatch info from the recursive call */ + if (REG_MULTI) + for (j = 1; j < NSUBEXP; j++) + { + t->sub.startpos[j] = m->startpos[j]; + t->sub.endpos[j] = m->endpos[j]; + } + else + for (j = 1; j < NSUBEXP; j++) + { + t->sub.start[j] = m->start[j]; + t->sub.end[j] = m->end[j]; + } + /* t->state->out1 is the corresponding END_INVISIBLE node */ + addstate(thislist, t->state->out1->out, &t->sub, 0, listid, + &match); + } + else + { + /* continue with next input char */ + reginput = old_reginput; + } + break; + + case NFA_BOL: + if (reginput == regline) + addstate(thislist, t->state->out, &t->sub, 0, listid, + &match); + break; + + case NFA_EOL: + if (c == NUL) + addstate(thislist, t->state->out, &t->sub, 0, listid, + &match); + break; + + case NFA_BOW: + { + int bow = TRUE; + + if (c == NUL) + bow = FALSE; + #ifdef FEAT_MBYTE + else if (has_mbyte) + { + int this_class; + + /* Get class of current and previous char (if it exists). */ + this_class = mb_get_class(reginput); + if (this_class <= 1) + bow = FALSE; + else if (reg_prev_class() == this_class) + bow = FALSE; + } + #endif + else if (!vim_iswordc(c) + || (reginput > regline && vim_iswordc(reginput[-1]))) + bow = FALSE; + if (bow) + addstate(thislist, t->state->out, &t->sub, 0, listid, + &match); + break; + } + + case NFA_EOW: + { + int eow = TRUE; + + if (reginput == regline) + eow = FALSE; + #ifdef FEAT_MBYTE + else if (has_mbyte) + { + int this_class, prev_class; + + /* Get class of current and previous char (if it exists). */ + this_class = mb_get_class(reginput); + prev_class = reg_prev_class(); + if (this_class == prev_class + || prev_class == 0 || prev_class == 1) + eow = FALSE; + } + #endif + else if (!vim_iswordc(reginput[-1]) + || (reginput[0] != NUL && vim_iswordc(c))) + eow = FALSE; + if (eow) + addstate(thislist, t->state->out, &t->sub, 0, listid, + &match); + break; + } + + case NFA_MULTIBYTE: + case NFA_COMPOSING: + switch (t->state->c) + { + case NFA_MULTIBYTE: endnode = NFA_END_MULTIBYTE; break; + case NFA_COMPOSING: endnode = NFA_END_COMPOSING; break; + default: endnode = 0; + } + + result = OK; + sta = t->state->out; + len = 1; + while (sta->c != endnode && len <= n) + { + if (reginput[len-1] != sta->c) + { + result = OK - 1; + break; + } + len++; + sta = sta->out; + } + + /* if input char length doesn't match regexp char length */ + if (len -1 < n || sta->c != endnode) + result = OK - 1; + end = t->state->out1; /* NFA_END_MULTIBYTE or + NFA_END_COMPOSING */ + /* If \Z was present, then ignore composing characters */ + if (regflags & RF_ICOMBINE) + result = 1 ^ sta->negated; + ADD_POS_NEG_STATE(end); + break; + + case NFA_NEWL: + if (!reg_line_lbr && REG_MULTI + && c == NUL && reglnum <= reg_maxline) + { + if (reginput_updated == FALSE) + { + reg_nextline(); + reginput_updated = TRUE; + } + addstate(nextlist, t->state->out, &t->sub, n, listid + 1, + &match); + } + break; + + case NFA_CLASS_ALNUM: + case NFA_CLASS_ALPHA: + case NFA_CLASS_BLANK: + case NFA_CLASS_CNTRL: + case NFA_CLASS_DIGIT: + case NFA_CLASS_GRAPH: + case NFA_CLASS_LOWER: + case NFA_CLASS_PRINT: + case NFA_CLASS_PUNCT: + case NFA_CLASS_SPACE: + case NFA_CLASS_UPPER: + case NFA_CLASS_XDIGIT: + case NFA_CLASS_TAB: + case NFA_CLASS_RETURN: + case NFA_CLASS_BACKSPACE: + case NFA_CLASS_ESCAPE: + result = check_char_class(t->state->c, c); + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_END_NEG_RANGE: + /* This follows a series of negated nodes, like: + * CHAR(x), NFA_NOT, CHAR(y), NFA_NOT etc. */ + if (c > 0) + addstate(nextlist, t->state->out, &t->sub, n, listid + 1, + &match); + break; + + case NFA_ANY: + /* Any printable char, not just any char. '\0' (end of input) + * must not match */ + if (c > 0) + addstate(nextlist, t->state->out, &t->sub, n, listid + 1, + &match); + break; + + /* + * Character classes like \a for alpha, \d for digit etc. + */ + case NFA_IDENT: /* \i */ + result = vim_isIDc(c); + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_SIDENT: /* \I */ + result = !VIM_ISDIGIT(c) && vim_isIDc(c); + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_KWORD: /* \k */ + result = vim_iswordp(cc); + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_SKWORD: /* \K */ + result = !VIM_ISDIGIT(c) && vim_iswordp(cc); + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_FNAME: /* \f */ + result = vim_isfilec(c); + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_SFNAME: /* \F */ + result = !VIM_ISDIGIT(c) && vim_isfilec(c); + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_PRINT: /* \p */ + result = ptr2cells(cc) == 1; + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_SPRINT: /* \P */ + result = !VIM_ISDIGIT(c) && ptr2cells(cc) == 1; + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_WHITE: /* \s */ + result = vim_iswhite(c); + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_NWHITE: /* \S */ + result = c != NUL && !vim_iswhite(c); + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_DIGIT: /* \d */ + result = ri_digit(c); + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_NDIGIT: /* \D */ + result = c != NUL && !ri_digit(c); + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_HEX: /* \x */ + result = ri_hex(c); + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_NHEX: /* \X */ + result = c != NUL && !ri_hex(c); + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_OCTAL: /* \o */ + result = ri_octal(c); + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_NOCTAL: /* \O */ + result = c != NUL && !ri_octal(c); + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_WORD: /* \w */ + result = ri_word(c); + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_NWORD: /* \W */ + result = c != NUL && !ri_word(c); + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_HEAD: /* \h */ + result = ri_head(c); + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_NHEAD: /* \H */ + result = c != NUL && !ri_head(c); + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_ALPHA: /* \a */ + result = ri_alpha(c); + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_NALPHA: /* \A */ + result = c != NUL && !ri_alpha(c); + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_LOWER: /* \l */ + result = ri_lower(c); + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_NLOWER: /* \L */ + result = c != NUL && !ri_lower(c); + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_UPPER: /* \u */ + result = ri_upper(c); + ADD_POS_NEG_STATE(t->state); + break; + + case NFA_NUPPER: /* \U */ + result = c != NUL && !ri_upper(c); + ADD_POS_NEG_STATE(t->state); + break; + + default: /* regular character */ + result = (no_Magic(t->state->c) == c); + if (!result) + result = ireg_ic == TRUE + && MB_TOLOWER(t->state->c) == MB_TOLOWER(c); + ADD_POS_NEG_STATE(t->state); + break; + } + + } /* for (thislist = thislist; thislist->state; thislist++) */ + + /* The first found match is the leftmost one, but there may be a + * longer one. Keep running the NFA, but don't start from the + * beginning. Also, do not add the start state in recursive calls of + * nfa_regmatch(), because recursive calls should only start in the + * first position. */ + if (match == FALSE && start->c == NFA_MOPEN + 0) + { + #ifdef ENABLE_LOG + fprintf(log_fd, "(---) STARTSTATE\n"); + #endif + addstate(nextlist, start, m, n, listid + 1, &match); + } + + if (reginput_updated) + { + reginput_updated = FALSE; + goto again; + } + + #ifdef ENABLE_LOG + fprintf(log_fd, ">>> Thislist had %d states available: ", thislist->n); + for (i = 0; i< thislist->n; i++) + fprintf(log_fd, "%d ", abs(thislist->t[i].state->id)); + fprintf(log_fd, "\n"); + #endif + + nextchar: + reginput += n; + } while (c || reginput_updated); + + #ifdef ENABLE_LOG + if (log_fd != stderr) + fclose(log_fd); + log_fd = NULL; + #endif + + theend: + /* Free memory */ + vim_free(list[0].t); + vim_free(list[1].t); + vim_free(list[2].t); + list[0].t = list[1].t = list[2].t = NULL; + if (listids != NULL) + vim_free(listids); + #undef ADD_POS_NEG_STATE + #ifdef DEBUG + fclose(debug); + #endif + + return match; + } + + /* + * Try match of "prog" with at regline["col"]. + * Returns 0 for failure, number of lines contained in the match otherwise. + */ + static long + nfa_regtry(start, col) + nfa_state_T *start; + colnr_T col; + { + int i; + regsub_T sub, m; + #ifdef ENABLE_LOG + FILE *f; + #endif + + reginput = regline + col; + need_clear_subexpr = TRUE; + + #ifdef ENABLE_LOG + f = fopen(LOG_NAME, "a"); + if (f != NULL) + { + fprintf(f, "\n\n\n\n\n\n\t\t=======================================================\n"); + fprintf(f, " =======================================================\n"); + #ifdef DEBUG + fprintf(f, "\tRegexp is \"%s\"\n", nfa_regengine.expr); + #endif + fprintf(f, "\tInput text is \"%s\" \n", reginput); + fprintf(f, " =======================================================\n\n\n\n\n\n\n"); + nfa_print_state(f, start, 0); + fprintf(f, "\n\n"); + fclose(f); + } + else + EMSG(_("Could not open temporary log file for writing ")); + #endif + + if (REG_MULTI) + { + /* Use 0xff to set lnum to -1 */ + vim_memset(sub.startpos, 0xff, sizeof(lpos_T) * NSUBEXP); + vim_memset(sub.endpos, 0xff, sizeof(lpos_T) * NSUBEXP); + vim_memset(m.startpos, 0xff, sizeof(lpos_T) * NSUBEXP); + vim_memset(m.endpos, 0xff, sizeof(lpos_T) * NSUBEXP); + } + else + { + vim_memset(sub.start, 0, sizeof(char_u *) * NSUBEXP); + vim_memset(sub.end, 0, sizeof(char_u *) * NSUBEXP); + vim_memset(m.start, 0, sizeof(char_u *) * NSUBEXP); + vim_memset(m.end, 0, sizeof(char_u *) * NSUBEXP); + } + + if (nfa_regmatch(start, &sub, &m) == FALSE) + return 0; + + cleanup_subexpr(); + if (REG_MULTI) + { + for (i = 0; i < NSUBEXP; i++) + { + reg_startpos[i] = sub.startpos[i]; + reg_endpos[i] = sub.endpos[i]; + } + + if (reg_startpos[0].lnum < 0) + { + reg_startpos[0].lnum = 0; + reg_startpos[0].col = col; + } + if (reg_endpos[0].lnum < 0) + { + reg_endpos[0].lnum = reglnum; + reg_endpos[0].col = (int)(reginput - regline); + } + else + /* Use line number of "\ze". */ + reglnum = reg_endpos[0].lnum; + } + else + { + for (i = 0; i < NSUBEXP; i++) + { + reg_startp[i] = sub.start[i]; + reg_endp[i] = sub.end[i]; + } + + if (reg_startp[0] == NULL) + reg_startp[0] = regline + col; + if (reg_endp[0] == NULL) + reg_endp[0] = reginput; + } + + return 1 + reglnum; + } + + /* + * Match a regexp against a string ("line" points to the string) or multiple + * lines ("line" is NULL, use reg_getline()). + * + * Returns 0 for failure, number of lines contained in the match otherwise. + */ + static long + nfa_regexec_both(line, col) + char_u *line; + colnr_T col; /* column to start looking for match */ + { + nfa_regprog_T *prog; + long retval = 0L; + int i; + + if (REG_MULTI) + { + prog = (nfa_regprog_T *)reg_mmatch->regprog; + line = reg_getline((linenr_T)0); /* relative to the cursor */ + reg_startpos = reg_mmatch->startpos; + reg_endpos = reg_mmatch->endpos; + } + else + { + prog = (nfa_regprog_T *)reg_match->regprog; + reg_startp = reg_match->startp; + reg_endp = reg_match->endp; + } + + /* Be paranoid... */ + if (prog == NULL || line == NULL) + { + EMSG(_(e_null)); + goto theend; + } + + /* If the start column is past the maximum column: no need to try. */ + if (ireg_maxcol > 0 && col >= ireg_maxcol) + goto theend; + + /* If pattern contains "\c" or "\C": overrule value of ireg_ic */ + if (prog->regflags & RF_ICASE) + ireg_ic = TRUE; + else if (prog->regflags & RF_NOICASE) + ireg_ic = FALSE; + + #ifdef FEAT_MBYTE + /* If pattern contains "\Z" overrule value of ireg_icombine */ + if (prog->regflags & RF_ICOMBINE) + ireg_icombine = TRUE; + #endif + + regline = line; + reglnum = 0; /* relative to line */ + + nstate = prog->nstate; + + for (i = 0; i < nstate; ++i) + { + prog->state[i].id = i; + prog->state[i].lastlist = 0; + prog->state[i].visits = 0; + prog->state[i].lastthread = NULL; + } + + retval = nfa_regtry(prog->start, col); + + theend: + return retval; + } + + /* + * Compile a regular expression into internal code for the NFA matcher. + * Returns the program in allocated space. Returns NULL for an error. + */ + static regprog_T * + nfa_regcomp(expr, re_flags) + char_u *expr; + int re_flags; + { + nfa_regprog_T *prog; + int prog_size; + int *postfix; + + if (expr == NULL) + return NULL; + + #ifdef DEBUG + nfa_regengine.expr = expr; + #endif + + init_class_tab(); + + if (nfa_regcomp_start(expr, re_flags) == FAIL) + return NULL; + + /* Space for compiled regexp */ + prog_size = sizeof(nfa_regprog_T) + sizeof(nfa_state_T) * nstate_max; + prog = (nfa_regprog_T *)lalloc(prog_size, TRUE); + if (prog == NULL) + goto fail; + vim_memset(prog, 0, prog_size); + + /* Build postfix form of the regexp. Needed to build the NFA + * (and count its size) */ + postfix = re2post(); + if (postfix == NULL) + goto fail; /* Cascaded (syntax?) error */ + + /* + * In order to build the NFA, we parse the input regexp twice: + * 1. first pass to count size (so we can allocate space) + * 2. second to emit code + */ + #ifdef ENABLE_LOG + { + FILE *f = fopen(LOG_NAME, "a"); + + if (f != NULL) + { + fprintf(f, "\n*****************************\n\n\n\n\tCompiling regexp \"%s\" ... hold on !\n", expr); + fclose(f); + } + } + #endif + + /* + * PASS 1 + * Count number of NFA states in "nstate". Do not build the NFA. + */ + post2nfa(postfix, post_ptr, TRUE); + state_ptr = prog->state; + + /* + * PASS 2 + * Build the NFA + */ + prog->start = post2nfa(postfix, post_ptr, FALSE); + if (prog->start == NULL) + goto fail; + + prog->regflags = regflags; + prog->engine = &nfa_regengine; + prog->nstate = nstate; + #ifdef ENABLE_LOG + nfa_postfix_dump(expr, OK); + nfa_dump(prog); + #endif + + out: + vim_free(post_start); + post_start = post_ptr = post_end = NULL; + state_ptr = NULL; + return (regprog_T *)prog; + + fail: + vim_free(prog); + prog = NULL; + #ifdef ENABLE_LOG + nfa_postfix_dump(expr, FAIL); + #endif + #ifdef DEBUG + nfa_regengine.expr = NULL; + #endif + goto out; + } + + + /* + * Match a regexp against a string. + * "rmp->regprog" is a compiled regexp as returned by nfa_regcomp(). + * Uses curbuf for line count and 'iskeyword'. + * + * Return TRUE if there is a match, FALSE if not. + */ + static int + nfa_regexec(rmp, line, col) + regmatch_T *rmp; + char_u *line; /* string to match against */ + colnr_T col; /* column to start looking for match */ + { + reg_match = rmp; + reg_mmatch = NULL; + reg_maxline = 0; + reg_line_lbr = FALSE; + reg_buf = curbuf; + reg_win = NULL; + ireg_ic = rmp->rm_ic; + #ifdef FEAT_MBYTE + ireg_icombine = FALSE; + #endif + ireg_maxcol = 0; + return (nfa_regexec_both(line, col) != 0); + } + + #if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) \ + || defined(FIND_REPLACE_DIALOG) || defined(PROTO) + + static int nfa_regexec_nl __ARGS((regmatch_T *rmp, char_u *line, colnr_T col)); + + /* + * Like nfa_regexec(), but consider a "\n" in "line" to be a line break. + */ + static int + nfa_regexec_nl(rmp, line, col) + regmatch_T *rmp; + char_u *line; /* string to match against */ + colnr_T col; /* column to start looking for match */ + { + reg_match = rmp; + reg_mmatch = NULL; + reg_maxline = 0; + reg_line_lbr = TRUE; + reg_buf = curbuf; + reg_win = NULL; + ireg_ic = rmp->rm_ic; + #ifdef FEAT_MBYTE + ireg_icombine = FALSE; + #endif + ireg_maxcol = 0; + return (nfa_regexec_both(line, col) != 0); + } + #endif + + + /* + * Match a regexp against multiple lines. + * "rmp->regprog" is a compiled regexp as returned by vim_regcomp(). + * Uses curbuf for line count and 'iskeyword'. + * + * Return zero if there is no match. Return number of lines contained in the + * match otherwise. + * + * Note: the body is the same as bt_regexec() except for nfa_regexec_both() + * + * ! Also NOTE : match may actually be in another line. e.g.: + * when r.e. is \nc, cursor is at 'a' and the text buffer looks like + * + * +-------------------------+ + * |a | + * |b | + * |c | + * | | + * +-------------------------+ + * + * then nfa_regexec_multi() returns 3. while the original + * vim_regexec_multi() returns 0 and a second call at line 2 will return 2. + * + * FIXME if this behavior is not compatible. + */ + static long + nfa_regexec_multi(rmp, win, buf, lnum, col, tm) + regmmatch_T *rmp; + win_T *win; /* window in which to search or NULL */ + buf_T *buf; /* buffer in which to search */ + linenr_T lnum; /* nr of line to start looking for match */ + colnr_T col; /* column to start looking for match */ + proftime_T *tm UNUSED; /* timeout limit or NULL */ + { + long r; + buf_T *save_curbuf = curbuf; + + reg_match = NULL; + reg_mmatch = rmp; + reg_buf = buf; + reg_win = win; + reg_firstlnum = lnum; + reg_maxline = reg_buf->b_ml.ml_line_count - lnum; + reg_line_lbr = FALSE; + ireg_ic = rmp->rmm_ic; + #ifdef FEAT_MBYTE + ireg_icombine = FALSE; + #endif + ireg_maxcol = rmp->rmm_maxcol; + + /* Need to switch to buffer "buf" to make vim_iswordc() work. */ + curbuf = buf; + r = nfa_regexec_both(NULL, col); + curbuf = save_curbuf; + + return r; + } + + #ifdef DEBUG + # undef ENABLE_LOG + #endif *** ../vim-7.3.969/src/structs.h 2013-05-15 15:12:25.000000000 +0200 --- src/structs.h 2013-05-17 18:54:19.000000000 +0200 *************** *** 63,77 **** #define GA_EMPTY {0, 0, 0, 0, NULL} - /* - * This is here because regexp.h needs pos_T and below regprog_T is used. - */ - #include "regexp.h" - typedef struct window_S win_T; typedef struct wininfo_S wininfo_T; typedef struct frame_S frame_T; typedef int scid_T; /* script ID */ /* * This is here because gui.h needs the pos_T and win_T, and win_T needs gui.h --- 63,78 ---- #define GA_EMPTY {0, 0, 0, 0, NULL} typedef struct window_S win_T; typedef struct wininfo_S wininfo_T; typedef struct frame_S frame_T; typedef int scid_T; /* script ID */ + typedef struct file_buffer buf_T; /* forward declaration */ + + /* + * This is here because regexp.h needs pos_T and below regprog_T is used. + */ + #include "regexp.h" /* * This is here because gui.h needs the pos_T and win_T, and win_T needs gui.h *************** *** 526,533 **** # endif } cmdmod_T; - typedef struct file_buffer buf_T; /* forward declaration */ - #define MF_SEED_LEN 8 struct memfile --- 527,532 ---- *** ../vim-7.3.969/src/testdir/Makefile 2013-04-24 12:56:13.000000000 +0200 --- src/testdir/Makefile 2013-05-18 14:22:50.000000000 +0200 *************** *** 29,35 **** test79.out test80.out test81.out test82.out test83.out \ test84.out test85.out test86.out test87.out test88.out \ test89.out test90.out test91.out test92.out test93.out \ ! test94.out SCRIPTS_GUI = test16.out --- 29,35 ---- test79.out test80.out test81.out test82.out test83.out \ test84.out test85.out test86.out test87.out test88.out \ test89.out test90.out test91.out test92.out test93.out \ ! test94.out test95.out SCRIPTS_GUI = test16.out *************** *** 85,97 **** fi" # Check if the test.out file matches test.ok. ! @/bin/sh -c "if test -f test.out; then\ if diff test.out $*.ok; \ then mv -f test.out $*.out; \ else echo $* FAILED >>test.log; mv -f test.out $*.failed; \ fi \ else echo $* NO OUTPUT >>test.log; \ fi" -rm -rf X* test.ok viminfo test49.out: test49.vim --- 85,100 ---- fi" # Check if the test.out file matches test.ok. ! @/bin/sh -c "if test -f test.out; then \ if diff test.out $*.ok; \ then mv -f test.out $*.out; \ else echo $* FAILED >>test.log; mv -f test.out $*.failed; \ fi \ else echo $* NO OUTPUT >>test.log; \ fi" + @/bin/sh -c "if test -f valgrind; then\ + mv -f valgrind valgrind.$*; \ + fi" -rm -rf X* test.ok viminfo test49.out: test49.vim *** ../vim-7.3.969/src/testdir/test64.in 2010-08-15 21:57:29.000000000 +0200 --- src/testdir/test64.in 2013-05-19 16:05:36.000000000 +0200 *************** *** 1,4 **** ! Test for regexp patterns. A pattern that gives the expected result produces OK, so that we know it was actually tried. --- 1,5 ---- ! Test for regexp patterns without multi-byte support. ! See test95 for multi-byte tests. A pattern that gives the expected result produces OK, so that we know it was actually tried. *************** *** 14,19 **** --- 15,25 ---- :" etc. :" When there is no match use only the first two items. :let tl = [] + + :"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + :"""" Previously written tests """""""""""""""""""""""""""""""" + :"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + :call add(tl, ['ab', 'aab', 'ab']) :call add(tl, ['b', 'abcdef', 'b']) :call add(tl, ['bc*', 'abccccdef', 'bcccc']) *************** *** 132,137 **** --- 138,301 ---- :" :call add(tl, ['\v(a*)+', 'aaaa', 'aaaa', '']) :call add(tl, ['x', 'abcdef']) + + :"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + :""""" Simple tests """"""""""""""""""""""""""""""""""""""""""" + :"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + + :" Search single groups + :call add(tl, ['ab', 'aab', 'ab']) + :call add(tl, ['ab', 'baced']) + :call add(tl, ['ab', ' ab ', 'ab']) + + :" Search multi-modifiers + :call add(tl, ['x*', 'xcd', 'x']) + :call add(tl, ['x*', 'xxxxxxxxxxxxxxxxsofijiojgf', 'xxxxxxxxxxxxxxxx']) + :call add(tl, ['x*', 'abcdoij', '']) " empty match is good + :call add(tl, ['x\+', 'abcdoin']) " no match here + :call add(tl, ['x\+', 'abcdeoijdfxxiuhfij', 'xx']) + :call add(tl, ['x\+', 'xxxxx', 'xxxxx']) + :call add(tl, ['x\+', 'abc x siufhiush xxxxxxxxx', 'x']) + :call add(tl, ['x\=', 'x sdfoij', 'x']) + :call add(tl, ['x\=', 'abc sfoij', '']) " empty match is good + :call add(tl, ['x\=', 'xxxxxxxxx c', 'x']) + :call add(tl, ['x\?', 'x sdfoij', 'x']) + :call add(tl, ['x\?', 'abc sfoij', '']) " empty match is good + :call add(tl, ['x\?', 'xxxxxxxxxx c', 'x']) + + :call add(tl, ['a\{0,0}', 'abcdfdoij', '']) + :call add(tl, ['a\{0,1}', 'asiubid axxxaaa', 'a']) " same thing as 'a?' + :call add(tl, ['a\{1,0}', 'asiubid axxxaaa', 'a']) " same thing as 'a\{0,1}' + :call add(tl, ['a\{3,6}', 'aa siofuh']) + :call add(tl, ['a\{3,6}', 'aaaaa asfoij afaa', 'aaaaa']) + :call add(tl, ['a\{3,6}', 'aaaaaaaa', 'aaaaaa']) + :call add(tl, ['a\{0}', 'asoiuj', '']) + :call add(tl, ['a\{2}', 'aaaa', 'aa']) + :call add(tl, ['a\{2}', 'iuash fiusahfliusah fiushfilushfi uhsaifuh askfj nasfvius afg aaaa sfiuhuhiushf', 'aa']) + :call add(tl, ['a\{2}', 'abcdefghijklmnopqrestuvwxyz1234567890']) + :call add(tl, ['a\{0,}', 'oij sdigfusnf', '']) " same thing as 'a*' + :call add(tl, ['a\{0,}', 'aaaaa aa', 'aaaaa']) + :call add(tl, ['a\{2,}', 'sdfiougjdsafg']) + :call add(tl, ['a\{2,}', 'aaaaasfoij ', 'aaaaa']) + :call add(tl, ['a\{,0}', 'oidfguih iuhi hiu aaaa', '']) + :call add(tl, ['a\{,5}', 'abcd', 'a']) + :call add(tl, ['a\{,5}', 'aaaaaaaaaa', 'aaaaa']) + :call add(tl, ['a\{}', 'bbbcddiuhfcd', '']) " same thing as 'a*' + :call add(tl, ['a\{}', 'aaaaioudfh coisf jda', 'aaaa']) + + :call add(tl, ['a\{-0,0}', 'abcdfdoij', '']) + :call add(tl, ['a\{-0,1}', 'asiubid axxxaaa', '']) " anti-greedy version of 'a?' + :call add(tl, ['a\{-3,6}', 'aa siofuh']) + :call add(tl, ['a\{-3,6}', 'aaaaa asfoij afaa', 'aaa']) + :call add(tl, ['a\{-3,6}', 'aaaaaaaa', 'aaa']) + :call add(tl, ['a\{-0}', 'asoiuj', '']) + :call add(tl, ['a\{-2}', 'aaaa', 'aa']) + :call add(tl, ['a\{-2}', 'abcdefghijklmnopqrestuvwxyz1234567890']) + :call add(tl, ['a\{-0,}', 'oij sdigfusnf', '']) + :call add(tl, ['a\{-0,}', 'aaaaa aa', '']) + :call add(tl, ['a\{-2,}', 'sdfiougjdsafg']) + :call add(tl, ['a\{-2,}', 'aaaaasfoij ', 'aa']) + :call add(tl, ['a\{-,0}', 'oidfguih iuhi hiu aaaa', '']) + :call add(tl, ['a\{-,5}', 'abcd', '']) + :call add(tl, ['a\{-,5}', 'aaaaaaaaaa', '']) + :call add(tl, ['a\{-}', 'bbbcddiuhfcd', '']) " anti-greedy version of 'a*' + :call add(tl, ['a\{-}', 'aaaaioudfh coisf jda', '']) + + :" Test groups of characters and submatches + :call add(tl, ['\(abc\)*', 'abcabcabc', 'abcabcabc', 'abc']) + :call add(tl, ['\(ab\)\+', 'abababaaaaa', 'ababab', 'ab']) + :call add(tl, ['\(abaaaaa\)*cd', 'cd', 'cd', '']) + :call add(tl, ['\(test1\)\? \(test2\)\?', 'test1 test3', 'test1 ', 'test1', '']) + :call add(tl, ['\(test1\)\= \(test2\) \(test4443\)\=', ' test2 test4443 yupiiiiiiiiiii', ' test2 test4443', '', 'test2', 'test4443']) + :call add(tl, ['\(\(sub1\) hello \(sub 2\)\)', 'asterix sub1 hello sub 2 obelix', 'sub1 hello sub 2', 'sub1 hello sub 2', 'sub1', 'sub 2']) + :call add(tl, ['\(\(\(yyxxzz\)\)\)', 'abcdddsfiusfyyzzxxyyxxzz', 'yyxxzz', 'yyxxzz', 'yyxxzz', 'yyxxzz']) + :call add(tl, ['\v((ab)+|c+)+', 'abcccaba', 'abcccab', 'ab', 'ab']) + :call add(tl, ['\v((ab)|c*)+', 'abcccaba', 'abcccab', '', 'ab']) + :call add(tl, ['\v(a(c*)+b)+', 'acbababaaa', 'acbabab', 'ab', '']) + :call add(tl, ['\v(a|b*)+', 'aaaa', 'aaaa', '']) + + :" Test greedy-ness and lazy-ness + :call add(tl, ['a\{-2,7}','aaaaaaaaaaaaa', 'aa']) + :call add(tl, ['a\{2,7}','aaaaaaaaaaaaaaaaaaaa', 'aaaaaaa']) + :call add(tl, ['\vx(.{-,8})yz(.*)','xayxayzxayzxayz','xayxayzxayzxayz','ayxa','xayzxayz']) + :call add(tl, ['\vx(.*)yz(.*)','xayxayzxayzxayz','xayxayzxayzxayz', 'ayxayzxayzxa','']) + :call add(tl, ['\v(a{1,2}){-2,3}','aaaaaaa','aaaa','aa']) + :call add(tl, ['\v(a{-1,3})+','aa','aa','a']) + + :" Test Character classes + :call add(tl, ['\d\+e\d\d','test 10e23 fd','10e23']) + + :" Test collections and character range [] + :call add(tl, ['\v[a]', 'abcd', 'a']) + :call add(tl, ['a[bcd]', 'abcd', 'ab']) + :call add(tl, ['a[b-d]', 'acbd', 'ac']) + :call add(tl, ['[a-d][e-f][x-x]d', 'cexdxx', 'cexd']) + :call add(tl, ['\v[[:alpha:]]+', 'abcdefghijklmnopqrstuvwxyz6','abcdefghijklmnopqrstuvwxyz']) + :call add(tl, ['[[:alpha:]\+]', '6x8','x']) + :call add(tl, ['[^abc]\+','abcabcabc']) + :call add(tl, ['[^abc]','defghiasijvoinasoiunbvb','d']) + :call add(tl, ['[^abc]\+','ddddddda','ddddddd']) + :call add(tl, ['[^a-d]\+','aaaAAAZIHFNCddd','AAAZIHFNC']) + :call add(tl, ['[a-f]*','iiiiiiii','']) + :call add(tl, ['[a-f]*','abcdefgh','abcdef']) + :call add(tl, ['[^a-f]\+','abcdefgh','gh']) + :call add(tl, ['[a-c]\{-3,6}','abcabc','abc']) + :call add(tl, ['[^[:alpha:]]\+','abcccadfoij7787ysf287yrnccdu','7787']) + :call add(tl, ['[-a]', '-', '-']) + :call add(tl, ['[a-]', '-', '-']) + :call add(tl, ['[-./[:alnum:]_~]\+', 'log13.file', 'log13.file']) " filename regexp + :call add(tl, ['[\]\^\-\\]\+', '\^\\\-\---^', '\^\\\-\---^']) " special chars + :call add(tl, ['[[.a.]]\+', 'aa', 'aa']) " collation elem + :call add(tl, ['abc[0-9]*ddd', 'siuhabc ii']) " middle of regexp + :call add(tl, ['abc[0-9]*ddd', 'adf abc44482ddd oijs', 'abc44482ddd']) + :call add(tl, ['\_[0-9]\+', 'asfi9888u', '9888']) + :call add(tl, ['[0-9\n]\+', 'asfi9888u', '9888']) + + + :"""" Test recognition of some character classes + :call add(tl, ['[0-9]', '8', '8']) + :call add(tl, ['[^0-9]', '8']) + :call add(tl, ['[0-9a-fA-F]*', '0a7', '0a7']) + :call add(tl, ['[^0-9A-Fa-f]\+', '0a7']) + :call add(tl, ['[a-z_A-Z0-9]\+', 'aso_sfoij', 'aso_sfoij']) + :call add(tl, ['[a-z]', 'a', 'a']) + :call add(tl, ['[a-zA-Z]', 'a', 'a']) + :call add(tl, ['[A-Z]', 'a']) + :call add(tl, ['\C[^A-Z]\+', 'ABCOIJDEOIFNSD jsfoij sa', ' jsfoij sa']) + + :"""" Tests for \z features + :call add(tl, ['xx \ze test', 'xx ']) " must match after \ze + :call add(tl, ['abc\zeend', 'oij abcend', 'abc']) + :call add(tl, ['abc\zsdd', 'ddabcddxyzt', 'dd']) + :call add(tl, ['aa \zsax', ' ax']) " must match before \zs + :call add(tl, ['abc \zsmatch\ze abc', 'abc abc abc match abc abc', 'match']) + :call add(tl, ['\v(a \zsif .*){2}', 'a if then a if last', 'if last', 'a if last']) + + :"""" Tests for \@ features + :call add(tl, ['abc\@=', 'abc', 'ab']) + :call add(tl, ['abc\@=cd', 'abcd', 'abcd']) + :call add(tl, ['abc\@=', 'ababc', 'ab']) + :call add(tl, ['abcd\@=e', 'abcd']) " will never match, no matter the input text + :call add(tl, ['abcd\@=e', 'any text in here ... ']) " will never match + :call add(tl, ['\v(abc)@=..', 'xabcd', 'ab', 'abc']) + :call add(tl, ['\(.*John\)\@=.*Bob', 'here is John, and here is B']) " no match + :call add(tl, ['\(John.*\)\@=.*Bob', 'John is Bobs friend', 'John is Bob', 'John is Bobs friend']) + :call add(tl, ['.*John\&.*Bob', 'here is John, and here is B']) " no match + :call add(tl, ['.*John\&.*Bob', 'John is Bobs friend', 'John is Bob']) + :call add(tl, ['\v(test1)@=.*yep', 'this is a test1, yep it is', 'test1, yep', 'test1']) + + :"""" Combining different tests and features + :call add(tl, ['[[:alpha:]]\{-2,6}', '787abcdiuhsasiuhb4', 'ab']) + :call add(tl, ['[^[=a=]]\+', 'ddaãâbcd', 'dd']) + :call add(tl, ['', 'abcd', '']) + :call add(tl, ['\v(())', 'any possible text', '']) + :call add(tl, ['\v%(ab(xyz)c)', ' abxyzc ', 'abxyzc', 'xyz']) + :call add(tl, ['\v(test|)empty', 'tesempty', 'empty', '']) + :call add(tl, ['\v(a|aa)(a|aa)', 'aaa', 'aa', 'a', 'a']) + + + :"""" Run the tests + :" :for t in tl : let l = matchlist(t[1], t[0]) *************** *** 143,149 **** : elseif len(t) > 2 && l[0] != t[2] : $put ='ERROR: pat: \"' . t[0] . '\", text: \"' . t[1] . '\", match: \"' . l[0] . '\", expected: \"' . t[2] . '\"' : else ! : $put ='OK' : endif : if len(l) > 0 :" check all the nine submatches --- 307,313 ---- : elseif len(t) > 2 && l[0] != t[2] : $put ='ERROR: pat: \"' . t[0] . '\", text: \"' . t[1] . '\", match: \"' . l[0] . '\", expected: \"' . t[2] . '\"' : else ! : $put ='OK - ' . t[0] : endif : if len(l) > 0 :" check all the nine submatches *************** *** 161,167 **** : endif :endfor :unlet t tl e l ! :/^Results/,$wq! test.out ENDTEST Results of test64: --- 325,341 ---- : endif :endfor :unlet t tl e l ! ! :" Check that \_[0-9] matching EOL does not break a following \> ! :" This only works on a buffer line, not with expression evaluation ! /^Find this ! /\<\(\(25\_[0-5]\|2\_[0-4]\_[0-9]\|\_[01]\?\_[0-9]\_[0-9]\?\)\.\)\{3\}\(25\_[0-5]\|2\_[0-4]\_[0-9]\|\_[01]\?\_[0-9]\_[0-9]\?\)\> ! y$Gop:" ! ! :/\%#=1^Results/,$wq! test.out ENDTEST + Find this: + localnet/192.168.0.1 + Results of test64: *** ../vim-7.3.969/src/testdir/test64.ok 2010-08-15 21:57:29.000000000 +0200 --- src/testdir/test64.ok 2013-05-19 16:05:16.000000000 +0200 *************** *** 1,102 **** Results of test64: ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK ! OK --- 1,230 ---- Results of test64: ! OK - ab ! OK - b ! OK - bc* ! OK - bc\{-} ! OK - bc\{-}\(d\) ! OK - bc* ! OK - c* ! OK - bc* ! OK - c* ! OK - bc\+ ! OK - bc\+ ! OK - a\|ab ! OK - c\? ! OK - bc\? ! OK - bc\? ! OK - \va{1} ! OK - \va{2} ! OK - \va{2} ! OK - \va{2} ! OK - \va{2} ! OK - \va{2} ! OK - \va{2} ! OK - \vb{1} ! OK - \vba{2} ! OK - \vba{3} ! OK - \v(ab){1} ! OK - \v(ab){1} ! OK - \v(ab){1} ! OK - \v(ab){0,2} ! OK - \v(ab){0,2} ! OK - \v(ab){1,2} ! OK - \v(ab){1,2} ! OK - \v(ab){2,4} ! OK - \v(ab){2,4} ! OK - \v(ab){2} ! OK - \v(ab){2} ! OK - \v(ab){2} ! OK - \v(ab){2} ! OK - \v((ab){2}){2} ! OK - \v((ab){2}){2} ! OK - \v(a{1}){1} ! OK - \v(a{2}){1} ! OK - \v(a{2}){1} ! OK - \v(a{2}){1} ! OK - \v(a{1}){2} ! OK - \v(a{1}){2} ! OK - \v(a{2})+ ! OK - \v(a{2})+ ! OK - \v(a{2}){1} ! OK - \v(a{1}){2} ! OK - \v(a{1}){1} ! OK - \v(a{2}){2} ! OK - \v(a{2}){2} ! OK - \v(a+){2} ! OK - \v(a{3}){2} ! OK - \v(a{1,2}){2} ! OK - \v(a{1,3}){2} ! OK - \v(a{1,3}){2} ! OK - \v(a{1,3}){3} ! OK - \v(a{1,2}){2} ! OK - \v(a+)+ ! OK - \v(a+)+ ! OK - \v(a+){1,2} ! OK - \v(a+)(a+) ! OK - \v(a{3})+ ! OK - \v(a|b|c)+ ! OK - \v(a|b|c){2} ! OK - \v(abc){2} ! OK - \v(abc){2} ! OK - a* ! OK - \v(a*)+ ! OK - \v((ab)+)+ ! OK - \v(((ab)+)+)+ ! OK - \v(((ab)+)+)+ ! OK - \v(a{0,2})+ ! OK - \v(a*)+ ! OK - \v((a*)+)+ ! OK - \v((ab)*)+ ! OK - \va{1,3} ! OK - \va{2,3} ! OK - \v((ab)+|c*)+ ! OK - \v(a{2})|(b{3}) ! OK - \va{2}|b{2} ! OK - \v(a)+|(c)+ ! OK - \vab{2,3}c ! OK - \vab{2,3}c ! OK - \vab{2,3}cd{2,3}e ! OK - \va(bc){2}d ! OK - \va*a{2} ! OK - \va*a{2} ! OK - \va*a{2} ! OK - \va*a{2} ! OK - \va*b*|a*c* ! OK - \va{1}b{1}|a{1}b{1} ! OK - \v(a) ! OK - \v(a)(b) ! OK - \v(ab)(b)(c) ! OK - \v((a)(b)) ! OK - \v(a)|(b) ! OK - \v(a*)+ ! OK - x ! OK - ab ! OK - ab ! OK - ab ! OK - x* ! OK - x* ! OK - x* ! OK - x\+ ! OK - x\+ ! OK - x\+ ! OK - x\+ ! OK - x\= ! OK - x\= ! OK - x\= ! OK - x\? ! OK - x\? ! OK - x\? ! OK - a\{0,0} ! OK - a\{0,1} ! OK - a\{1,0} ! OK - a\{3,6} ! OK - a\{3,6} ! OK - a\{3,6} ! OK - a\{0} ! OK - a\{2} ! OK - a\{2} ! OK - a\{2} ! OK - a\{0,} ! OK - a\{0,} ! OK - a\{2,} ! OK - a\{2,} ! OK - a\{,0} ! OK - a\{,5} ! OK - a\{,5} ! OK - a\{} ! OK - a\{} ! OK - a\{-0,0} ! OK - a\{-0,1} ! OK - a\{-3,6} ! OK - a\{-3,6} ! OK - a\{-3,6} ! OK - a\{-0} ! OK - a\{-2} ! OK - a\{-2} ! OK - a\{-0,} ! OK - a\{-0,} ! OK - a\{-2,} ! OK - a\{-2,} ! OK - a\{-,0} ! OK - a\{-,5} ! OK - a\{-,5} ! OK - a\{-} ! OK - a\{-} ! OK - \(abc\)* ! OK - \(ab\)\+ ! OK - \(abaaaaa\)*cd ! OK - \(test1\)\? \(test2\)\? ! OK - \(test1\)\= \(test2\) \(test4443\)\= ! OK - \(\(sub1\) hello \(sub 2\)\) ! OK - \(\(\(yyxxzz\)\)\) ! OK - \v((ab)+|c+)+ ! OK - \v((ab)|c*)+ ! OK - \v(a(c*)+b)+ ! OK - \v(a|b*)+ ! OK - a\{-2,7} ! OK - a\{2,7} ! OK - \vx(.{-,8})yz(.*) ! OK - \vx(.*)yz(.*) ! OK - \v(a{1,2}){-2,3} ! OK - \v(a{-1,3})+ ! OK - \d\+e\d\d ! OK - \v[a] ! OK - a[bcd] ! OK - a[b-d] ! OK - [a-d][e-f][x-x]d ! OK - \v[[:alpha:]]+ ! OK - [[:alpha:]\+] ! OK - [^abc]\+ ! OK - [^abc] ! OK - [^abc]\+ ! OK - [^a-d]\+ ! OK - [a-f]* ! OK - [a-f]* ! OK - [^a-f]\+ ! OK - [a-c]\{-3,6} ! OK - [^[:alpha:]]\+ ! OK - [-a] ! OK - [a-] ! OK - [-./[:alnum:]_~]\+ ! OK - [\]\^\-\\]\+ ! OK - [[.a.]]\+ ! OK - abc[0-9]*ddd ! OK - abc[0-9]*ddd ! OK - \_[0-9]\+ ! OK - [0-9\n]\+ ! OK - [0-9] ! OK - [^0-9] ! OK - [0-9a-fA-F]* ! OK - [^0-9A-Fa-f]\+ ! OK - [a-z_A-Z0-9]\+ ! OK - [a-z] ! OK - [a-zA-Z] ! OK - [A-Z] ! OK - \C[^A-Z]\+ ! OK - xx \ze test ! OK - abc\zeend ! OK - abc\zsdd ! OK - aa \zsax ! OK - abc \zsmatch\ze abc ! OK - \v(a \zsif .*){2} ! OK - abc\@= ! OK - abc\@=cd ! OK - abc\@= ! OK - abcd\@=e ! OK - abcd\@=e ! OK - \v(abc)@=.. ! OK - \(.*John\)\@=.*Bob ! OK - \(John.*\)\@=.*Bob ! OK - .*John\&.*Bob ! OK - .*John\&.*Bob ! OK - \v(test1)@=.*yep ! OK - [[:alpha:]]\{-2,6} ! OK - [^[=a=]]\+ ! OK - ! OK - \v(()) ! OK - \v%(ab(xyz)c) ! OK - \v(test|)empty ! OK - \v(a|aa)(a|aa) ! 192.168.0.1 *** ../vim-7.3.969/Filelist 2013-03-07 13:32:03.000000000 +0100 --- Filelist 2013-05-17 19:25:39.000000000 +0200 *************** *** 57,62 **** --- 57,63 ---- src/popupmnu.c \ src/quickfix.c \ src/regexp.c \ + src/regexp_nfa.c \ src/regexp.h \ src/screen.c \ src/search.c \ *** ../vim-7.3.969/runtime/doc/pattern.txt 2011-07-20 17:58:14.000000000 +0200 --- runtime/doc/pattern.txt 2013-05-17 22:57:02.000000000 +0200 *************** *** 21,27 **** 10. Highlighting matches |match-highlight| ============================================================================== ! 1. Search commands *search-commands* *E486* */* /{pattern}[/] Search forward for the [count]'th occurrence of --- 21,27 ---- 10. Highlighting matches |match-highlight| ============================================================================== ! 1. Search commands *search-commands* */* /{pattern}[/] Search forward for the [count]'th occurrence of *************** *** 150,155 **** --- 150,160 ---- All matches for the last used search pattern will be highlighted if you set the 'hlsearch' option. This can be suspended with the |:nohlsearch| command. + When no match is found you get the error: *E486* Pattern not found + Note that for the |:global| command this behaves like a normal message, for Vi + compatibility. For the |:s| command the "e" flag can be used to avoid the + error message |:s_flags|. + *search-offset* *{offset}* These commands search for the specified pattern. With "/" and "?" an additional offset may be given. There are two types of offsets: line offsets *************** *** 214,220 **** the search, possibly in another direction or with another count. Note that two patterns are remembered: One for 'normal' search commands and one for the substitute command ":s". Each time an empty pattern is given, the previously ! used pattern is used. The 'magic' option sticks with the last used pattern. If you change 'magic', this will not change how the last used pattern will be interpreted. --- 219,226 ---- the search, possibly in another direction or with another count. Note that two patterns are remembered: One for 'normal' search commands and one for the substitute command ":s". Each time an empty pattern is given, the previously ! used pattern is used. However, if there is no previous search command, a ! previous substitute pattern is used, if possible. The 'magic' option sticks with the last used pattern. If you change 'magic', this will not change how the last used pattern will be interpreted. *************** *** 344,349 **** --- 350,376 ---- or \z( pattern \) |/\z(| + */\%#=* *two-engines* + Vim includes two regexp engines: + 1. An old, backtracking engine that supports everything. + 2. A new, NFA engine that works much faster on some patterns, but does not + support everything. + + Vim will automatically select the right engine for you. However, if you run + into a problem or want to specifically select one engine or the other, you can + prepend one of the following to the pattern: + + \%#=0 Force automatic selection. Only has an effect when + 'regexpengine' has been set to a non-zero value. + \%#=1 Force using the old engine. + \%#=2 Force using the NFA engine. + + You can also use the 'regexpengine' option to change the default. + + *E864* *E868* *E874* *E875* *E876* *E877* *E878* + If selecting the NFA engine and it runs into something that is not implemented + the pattern will not match. This is only useful when debugging Vim. + ============================================================================== 3. Magic */magic* *************** *** 390,398 **** ============================================================================== 4. Overview of pattern items *pattern-overview* Overview of multi items. */multi* *E61* *E62* ! More explanation and examples below, follow the links. *E64* multi ~ 'magic' 'nomagic' matches of the preceding atom ~ --- 417,426 ---- ============================================================================== 4. Overview of pattern items *pattern-overview* + *E865* *E866* *E867* *E869* Overview of multi items. */multi* *E61* *E62* ! More explanation and examples below, follow the links. *E64* *E871* multi ~ 'magic' 'nomagic' matches of the preceding atom ~ *************** *** 498,513 **** x x a character with no special meaning matches itself |/[]| [] \[] any character specified inside the [] ! |/\%[]| \%[] \%[] a sequence of optionally matched atoms |/\c| \c \c ignore case, do not use the 'ignorecase' option |/\C| \C \C match case, do not use the 'ignorecase' option |/\m| \m \m 'magic' on for the following chars in the pattern |/\M| \M \M 'magic' off for the following chars in the pattern |/\v| \v \v the following chars in the pattern are "very magic" |/\V| \V \V the following chars in the pattern are "very nomagic" ! |/\Z| \Z \Z ignore differences in Unicode "combining characters". ! Useful when searching voweled Hebrew or Arabic text. |/\%d| \%d \%d match specified decimal character (eg \%d123) |/\%x| \%x \%x match specified hex character (eg \%x2a) --- 526,543 ---- x x a character with no special meaning matches itself |/[]| [] \[] any character specified inside the [] ! |/\%[]| \%[] \%[] a sequence of optionally matched atoms |/\c| \c \c ignore case, do not use the 'ignorecase' option |/\C| \C \C match case, do not use the 'ignorecase' option + |/\Z| \Z \Z ignore differences in Unicode "combining characters". + Useful when searching voweled Hebrew or Arabic text. + |/\m| \m \m 'magic' on for the following chars in the pattern |/\M| \M \M 'magic' off for the following chars in the pattern |/\v| \v \v the following chars in the pattern are "very magic" |/\V| \V \V the following chars in the pattern are "very nomagic" ! |/\%#=| \%#=1 \%#=1 select regexp engine |/zero-width| |/\%d| \%d \%d match specified decimal character (eg \%d123) |/\%x| \%x \%x match specified hex character (eg \%x2a) *************** *** 575,581 **** \? Just like \=. Cannot be used when searching backwards with the "?" command. {not in Vi} ! */\{* *E58* *E60* *E554* \{n,m} Matches n to m of the preceding atom, as many as possible \{n} Matches n of the preceding atom \{n,} Matches at least n of the preceding atom, as many as possible --- 605,611 ---- \? Just like \=. Cannot be used when searching backwards with the "?" command. {not in Vi} ! */\{* *E58* *E60* *E554* *E870* \{n,m} Matches n to m of the preceding atom, as many as possible \{n} Matches n of the preceding atom \{n,} Matches at least n of the preceding atom, as many as possible *************** *** 631,647 **** */\@!* \@! Matches with zero width if the preceding atom does NOT match at the current position. |/zero-width| {not in Vi} ! Like '(?!pattern)" in Perl. Example matches ~ foo\(bar\)\@! any "foo" not followed by "bar" ! a.\{-}p\@! "a", "ap", "app", etc. not followed by a "p" if \(\(then\)\@!.\)*$ "if " not followed by "then" Using "\@!" is tricky, because there are many places where a pattern does not match. "a.*p\@!" will match from an "a" to the end of the line, because ".*" can match all characters in the line and the "p" doesn't match at the end of the line. "a.\{-}p\@!" will match any ! "a", "ap", "aap", etc. that isn't followed by a "p", because the "." can match a "p" and "p\@!" doesn't match after that. You can't use "\@!" to look for a non-match before the matching --- 661,678 ---- */\@!* \@! Matches with zero width if the preceding atom does NOT match at the current position. |/zero-width| {not in Vi} ! Like "(?!pattern)" in Perl. Example matches ~ foo\(bar\)\@! any "foo" not followed by "bar" ! a.\{-}p\@! "a", "ap", "app", "appp", etc. not immediately ! followed by a "p" if \(\(then\)\@!.\)*$ "if " not followed by "then" Using "\@!" is tricky, because there are many places where a pattern does not match. "a.*p\@!" will match from an "a" to the end of the line, because ".*" can match all characters in the line and the "p" doesn't match at the end of the line. "a.\{-}p\@!" will match any ! "a", "ap", "app", etc. that isn't followed by a "p", because the "." can match a "p" and "p\@!" doesn't match after that. You can't use "\@!" to look for a non-match before the matching *************** *** 650,659 **** "foobar" you could use "\(foo\)\@!...bar", but that doesn't match a bar at the start of a line. Use "\(foo\)\@ + /^\%(.*bar\)\@!.*\zsfoo + < This pattern first checks that there is not a single position in the + line where "bar" matches. If ".*bar" matches somewhere the \@! will + reject the pattern. When there is no match any "foo" will be found. + The "\zs" is to have the match start just before "foo". + */\@<=* \@<= Matches with zero width if the preceding atom matches just before what follows. |/zero-width| {not in Vi} ! Like "(?<=pattern)" in Perl, but Vim allows non-fixed-width patterns. Example matches ~ \(an\_s\+\)\@<=file "file" after "an" and white space or an end-of-line *************** *** 677,683 **** before what follows. Thus this matches if there is no position in the current or previous line where the atom matches such that it ends just before what follows. |/zero-width| {not in Vi} ! Like '(? *** ../vim-7.3.969/src/option.c 2013-05-11 13:45:00.000000000 +0200 --- src/option.c 2013-05-17 22:23:14.000000000 +0200 *************** *** 2077,2082 **** --- 2077,2085 ---- (char_u *)NULL, PV_NONE, #endif {(char_u *)2000L, (char_u *)0L} SCRIPTID_INIT}, + {"regexpengine", "re", P_NUM|P_VI_DEF, + (char_u *)&p_re, PV_NONE, + {(char_u *)0L, (char_u *)0L} SCRIPTID_INIT}, {"relativenumber", "rnu", P_BOOL|P_VI_DEF|P_RWIN, (char_u *)VAR_WIN, PV_RNU, {(char_u *)FALSE, (char_u *)0L} SCRIPTID_INIT}, *************** *** 8604,8609 **** --- 8607,8617 ---- errmsg = e_positive; p_hi = 0; } + if (p_re < 0 || p_re > 2) + { + errmsg = e_invarg; + p_re = 0; + } if (p_report < 0) { errmsg = e_positive; *** ../vim-7.3.969/src/option.h 2013-03-19 16:46:59.000000000 +0100 --- src/option.h 2013-05-17 22:23:39.000000000 +0200 *************** *** 653,658 **** --- 653,659 ---- EXTERN long p_rdt; /* 'redrawtime' */ #endif EXTERN int p_remap; /* 'remap' */ + EXTERN long p_re; /* 'regexpengine' */ EXTERN long p_report; /* 'report' */ #if defined(FEAT_WINDOWS) && defined(FEAT_QUICKFIX) EXTERN long p_pvh; /* 'previewheight' */ *** ../vim-7.3.969/src/testdir/test95.in 2013-05-19 19:10:10.000000000 +0200 --- src/testdir/test95.in 2013-05-18 15:11:43.000000000 +0200 *************** *** 0 **** --- 1,63 ---- + Test for regexp patterns with multi-byte support. + See test64 for the non-multi-byte tests. + + A pattern that gives the expected result produces OK, so that we know it was + actually tried. + + STARTTEST + :so small.vim + :so mbyte.vim + :" tl is a List of Lists with: + :" regexp pattern + :" text to test the pattern on + :" expected match (optional) + :" expected submatch 1 (optional) + :" expected submatch 2 (optional) + :" etc. + :" When there is no match use only the first two items. + :let tl = [] + + :"""" Multi-byte character tests. These will fail unless vim is compiled + :"""" with Multibyte (FEAT_MBYTE) or BIG/HUGE features. + :call add(tl, ['[[:alpha:][=a=]]\+', '879 aiaãâaiuvna ', 'aiaãâaiuvna']) + :call add(tl, ['[[=a=]]\+', 'ddaãâbcd', 'aãâ']) " equivalence classes + :call add(tl, ['[^ม ]\+', 'มม oijasoifjos ifjoisj f osij j มมมมม abcd', 'oijasoifjos']) + :call add(tl, [' [^ ]\+', 'start มabcdม ', ' มabcdม']) + :call add(tl, ['[ม[:alpha:][=a=]]\+', '879 aiaãมâมaiuvna ', 'aiaãมâมaiuvna']) + + :"""" Run the tests + + :" + :for t in tl + : let l = matchlist(t[1], t[0]) + :" check the match itself + : if len(l) == 0 && len(t) > 2 + : $put ='ERROR: pat: \"' . t[0] . '\", text: \"' . t[1] . '\", did not match, expected: \"' . t[2] . '\"' + : elseif len(l) > 0 && len(t) == 2 + : $put ='ERROR: pat: \"' . t[0] . '\", text: \"' . t[1] . '\", match: \"' . l[0] . '\", expected no match' + : elseif len(t) > 2 && l[0] != t[2] + : $put ='ERROR: pat: \"' . t[0] . '\", text: \"' . t[1] . '\", match: \"' . l[0] . '\", expected: \"' . t[2] . '\"' + : else + : $put ='OK - ' . t[0] + : endif + : if len(l) > 0 + :" check all the nine submatches + : for i in range(1, 9) + : if len(t) <= i + 2 + : let e = '' + : else + : let e = t[i + 2] + : endif + : if l[i] != e + : $put ='ERROR: pat: \"' . t[0] . '\", text: \"' . t[1] . '\", submatch ' . i . ': \"' . l[i] . '\", expected: \"' . e . '\"' + : endif + : endfor + : unlet i + : endif + :endfor + :unlet t tl e l + + :/\%#=1^Results/,$wq! test.out + ENDTEST + + Results of test95: *** ../vim-7.3.969/src/testdir/test95.ok 2013-05-19 19:10:10.000000000 +0200 --- src/testdir/test95.ok 2013-05-18 14:30:49.000000000 +0200 *************** *** 0 **** --- 1,6 ---- + Results of test95: + OK - [[:alpha:][=a=]]\+ + OK - [[=a=]]\+ + OK - [^ม ]\+ + OK - [^ ]\+ + OK - [ม[:alpha:][=a=]]\+ *** ../vim-7.3.969/src/testdir/Make_amiga.mak 2013-04-12 13:44:49.000000000 +0200 --- src/testdir/Make_amiga.mak 2013-05-18 14:21:35.000000000 +0200 *************** *** 33,39 **** test76.out test77.out test78.out test79.out test80.out \ test81.out test82.out test83.out test84.out test88.out \ test89.out test90.out test91.out test92.out test93.out \ ! test94.out .SUFFIXES: .in .out --- 33,39 ---- test76.out test77.out test78.out test79.out test80.out \ test81.out test82.out test83.out test84.out test88.out \ test89.out test90.out test91.out test92.out test93.out \ ! test94.out test95.out .SUFFIXES: .in .out *************** *** 144,146 **** --- 144,147 ---- test92.out: test92.in test93.out: test93.in test94.out: test94.in + test95.out: test95.in *** ../vim-7.3.969/src/testdir/Make_dos.mak 2013-04-12 13:44:49.000000000 +0200 --- src/testdir/Make_dos.mak 2013-05-18 14:21:47.000000000 +0200 *************** *** 32,38 **** test79.out test80.out test81.out test82.out test83.out \ test84.out test85.out test86.out test87.out test88.out \ test89.out test90.out test91.out test92.out test93.out \ ! test94.out SCRIPTS32 = test50.out test70.out --- 32,38 ---- test79.out test80.out test81.out test82.out test83.out \ test84.out test85.out test86.out test87.out test88.out \ test89.out test90.out test91.out test92.out test93.out \ ! test94.out test95.out SCRIPTS32 = test50.out test70.out *** ../vim-7.3.969/src/testdir/Make_ming.mak 2013-04-12 13:44:49.000000000 +0200 --- src/testdir/Make_ming.mak 2013-05-18 14:21:56.000000000 +0200 *************** *** 52,58 **** test79.out test80.out test81.out test82.out test83.out \ test84.out test85.out test86.out test87.out test88.out \ test89.out test90.out test91.out test92.out test93.out \ ! test94.out SCRIPTS32 = test50.out test70.out --- 52,58 ---- test79.out test80.out test81.out test82.out test83.out \ test84.out test85.out test86.out test87.out test88.out \ test89.out test90.out test91.out test92.out test93.out \ ! test94.out test95.out SCRIPTS32 = test50.out test70.out *** ../vim-7.3.969/src/testdir/Make_os2.mak 2013-04-12 13:44:49.000000000 +0200 --- src/testdir/Make_os2.mak 2013-05-18 14:22:00.000000000 +0200 *************** *** 33,39 **** test76.out test77.out test78.out test79.out test80.out \ test81.out test82.out test83.out test84.out test88.out \ test89.out test90.out test91.out test92.out test93.out \ ! test94.out .SUFFIXES: .in .out --- 33,39 ---- test76.out test77.out test78.out test79.out test80.out \ test81.out test82.out test83.out test84.out test88.out \ test89.out test90.out test91.out test92.out test93.out \ ! test94.out test95.out .SUFFIXES: .in .out *** ../vim-7.3.969/src/testdir/Make_vms.mms 2013-04-12 13:44:49.000000000 +0200 --- src/testdir/Make_vms.mms 2013-05-18 14:22:11.000000000 +0200 *************** *** 4,10 **** # Authors: Zoltan Arpadffy, # Sandor Kopanyi, # ! # Last change: 2013 Apr 12 # # This has been tested on VMS 6.2 to 8.3 on DEC Alpha, VAX and IA64. # Edit the lines in the Configuration section below to select. --- 4,10 ---- # Authors: Zoltan Arpadffy, # Sandor Kopanyi, # ! # Last change: 2013 May 18 # # This has been tested on VMS 6.2 to 8.3 on DEC Alpha, VAX and IA64. # Edit the lines in the Configuration section below to select. *************** *** 77,83 **** test71.out test72.out test74.out test75.out test76.out \ test77.out test78.out test79.out test80.out test81.out \ test82.out test83.out test84.out test88.out test89.out \ ! test90.out test91.out test92.out test93.out test94.out # Known problems: # Test 30: a problem around mac format - unknown reason --- 77,84 ---- test71.out test72.out test74.out test75.out test76.out \ test77.out test78.out test79.out test80.out test81.out \ test82.out test83.out test84.out test88.out test89.out \ ! test90.out test91.out test92.out test93.out test94.out \ ! test95.out # Known problems: # Test 30: a problem around mac format - unknown reason *** ../vim-7.3.969/src/version.c 2013-05-18 20:55:31.000000000 +0200 --- src/version.c 2013-05-19 19:10:25.000000000 +0200 *************** *** 730,731 **** --- 730,733 ---- { /* Add new patch number below this line */ + /**/ + 970, /**/ -- BRIDGEKEEPER: What is the air-speed velocity of an unladen swallow? ARTHUR: What do you mean? An African or European swallow? BRIDGEKEEPER: Er ... I don't know that ... Aaaaarrrrrrggghhh! BRIDGEKEEPER is cast into the gorge. "Monty Python and the Holy Grail" PYTHON (MONTY) PICTURES LTD /// Bram Moolenaar -- Bram@Moolenaar.net -- http://www.Moolenaar.net \\\ /// sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\ \\\ an exciting new programming language -- http://www.Zimbu.org /// \\\ help me help AIDS victims -- http://ICCF-Holland.org ///