To: vim_dev@googlegroups.com
Subject: Patch 7.3.1088
Fcc: outbox
From: Bram Moolenaar <Bram@moolenaar.net>
Mime-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
------------
Patch 7.3.1088
Problem: New regexp engine: \@<= and \@<! are not implemented.
Solution: Implement look-behind matching. Fix off-by-one error in the
look-behind limit check of the old regexp engine.
Files: src/regexp.c, src/regexp_nfa.c, src/testdir/test64.in,
src/testdir/test64.ok
*** ../vim-7.3.1087/src/regexp.c 2013-06-01 14:42:51.000000000 +0200
--- src/regexp.c 2013-06-01 18:55:07.000000000 +0200
***************
*** 5576,5582 ****
limit = OPERAND_MIN(rp->rs_scan);
if (REG_MULTI)
{
! if (rp->rs_un.regsave.rs_u.pos.col == 0)
{
if (rp->rs_un.regsave.rs_u.pos.lnum
< behind_pos.rs_u.pos.lnum
--- 5576,5589 ----
limit = OPERAND_MIN(rp->rs_scan);
if (REG_MULTI)
{
! if (limit > 0
! && ((rp->rs_un.regsave.rs_u.pos.lnum
! < behind_pos.rs_u.pos.lnum
! ? (colnr_T)STRLEN(regline)
! : behind_pos.rs_u.pos.col)
! - rp->rs_un.regsave.rs_u.pos.col >= limit))
! no = FAIL;
! else if (rp->rs_un.regsave.rs_u.pos.col == 0)
{
if (rp->rs_un.regsave.rs_u.pos.lnum
< behind_pos.rs_u.pos.lnum
***************
*** 5601,5613 ****
else
#endif
--rp->rs_un.regsave.rs_u.pos.col;
- if (limit > 0
- && ((rp->rs_un.regsave.rs_u.pos.lnum
- < behind_pos.rs_u.pos.lnum
- ? (colnr_T)STRLEN(regline)
- : behind_pos.rs_u.pos.col)
- - rp->rs_un.regsave.rs_u.pos.col > limit))
- no = FAIL;
}
}
else
--- 5608,5613 ----
*** ../vim-7.3.1087/src/regexp_nfa.c 2013-06-01 14:42:51.000000000 +0200
--- src/regexp_nfa.c 2013-06-01 19:42:22.000000000 +0200
***************
*** 56,61 ****
--- 56,62 ----
NFA_NOPEN, /* Start of subexpression marked with \%( */
NFA_NCLOSE, /* End of subexpr. marked with \%( ... \) */
NFA_START_INVISIBLE,
+ NFA_START_INVISIBLE_BEFORE,
NFA_END_INVISIBLE,
NFA_COMPOSING, /* Next nodes in NFA are part of the
composing multibyte char */
***************
*** 1369,1402 ****
break;
case Magic('@'):
op = no_Magic(getchr());
switch(op)
{
case '=':
! EMIT(NFA_PREV_ATOM_NO_WIDTH);
break;
case '!':
! EMIT(NFA_PREV_ATOM_NO_WIDTH_NEG);
break;
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- case '8':
- case '9':
case '<':
case '>':
! /* Not supported yet */
! return FAIL;
! default:
! syntax_error = TRUE;
! EMSGN(_("E869: (NFA) Unknown operator '\\@%c'"), op);
return FAIL;
}
break;
case Magic('?'):
--- 1370,1412 ----
break;
case Magic('@'):
+ c2 = getdecchrs();
op = no_Magic(getchr());
+ i = 0;
switch(op)
{
case '=':
! /* \@= */
! i = NFA_PREV_ATOM_NO_WIDTH;
break;
case '!':
! /* \@! */
! i = NFA_PREV_ATOM_NO_WIDTH_NEG;
break;
case '<':
+ op = no_Magic(getchr());
+ if (op == '=')
+ /* \@<= */
+ i = NFA_PREV_ATOM_JUST_BEFORE;
+ else if (op == '!')
+ /* \@<! */
+ i = NFA_PREV_ATOM_JUST_BEFORE_NEG;
+ break;
case '>':
! /* \@> Not supported yet */
! /* i = NFA_PREV_ATOM_LIKE_PATTERN; */
return FAIL;
}
+ if (i == 0)
+ {
+ syntax_error = TRUE;
+ EMSGN(_("E869: (NFA) Unknown operator '\\@%c'"), op);
+ return FAIL;
+ }
+ EMIT(i);
+ if (i == NFA_PREV_ATOM_JUST_BEFORE
+ || i == NFA_PREV_ATOM_JUST_BEFORE_NEG)
+ EMIT(c2);
break;
case Magic('?'):
***************
*** 1734,1742 ****
--- 1744,1758 ----
STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break;
case NFA_PREV_ATOM_NO_WIDTH_NEG:
STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break;
+ case NFA_PREV_ATOM_JUST_BEFORE:
+ STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE"); break;
+ case NFA_PREV_ATOM_JUST_BEFORE_NEG:
+ STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE_NEG"); break;
case NFA_NOPEN: STRCPY(code, "NFA_NOPEN"); break;
case NFA_NCLOSE: STRCPY(code, "NFA_NCLOSE"); break;
case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break;
+ case NFA_START_INVISIBLE_BEFORE:
+ STRCPY(code, "NFA_START_INVISIBLE_BEFORE"); break;
case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break;
case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break;
***************
*** 2237,2243 ****
if (nfa_calc_size == FALSE)
{
/* Allocate space for the stack. Max states on the stack : nstate */
! stack = (Frag_T *) lalloc((nstate + 1) * sizeof(Frag_T), TRUE);
stackp = stack;
stack_end = stack + (nstate + 1);
}
--- 2253,2259 ----
if (nfa_calc_size == FALSE)
{
/* Allocate space for the stack. Max states on the stack : nstate */
! stack = (Frag_T *)lalloc((nstate + 1) * sizeof(Frag_T), TRUE);
stackp = stack;
stack_end = stack + (nstate + 1);
}
***************
*** 2370,2377 ****
--- 2386,2397 ----
case NFA_PREV_ATOM_NO_WIDTH:
case NFA_PREV_ATOM_NO_WIDTH_NEG:
+ case NFA_PREV_ATOM_JUST_BEFORE:
+ case NFA_PREV_ATOM_JUST_BEFORE_NEG:
/* The \@= operator: match the preceding atom with zero width.
* The \@! operator: no match for the preceding atom.
+ * The \@<= operator: match the preceding atom just before this position.
+ * The \@<! operator: no match for the preceding atom just before this.
* Surrounds the preceding atom with START_INVISIBLE and
* END_INVISIBLE, similarly to MOPEN. */
***************
*** 2389,2399 ****
s = new_state(NFA_START_INVISIBLE, e.start, s1);
if (s == NULL)
goto theend;
! if (*p == NFA_PREV_ATOM_NO_WIDTH_NEG)
{
s->negated = TRUE;
s1->negated = TRUE;
}
PUSH(frag(s, list1(&s1->out)));
break;
--- 2409,2426 ----
s = new_state(NFA_START_INVISIBLE, e.start, s1);
if (s == NULL)
goto theend;
! if (*p == NFA_PREV_ATOM_NO_WIDTH_NEG
! || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG)
{
s->negated = TRUE;
s1->negated = TRUE;
}
+ if (*p == NFA_PREV_ATOM_JUST_BEFORE
+ || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG)
+ {
+ s->val = *++p; /* get the count */
+ ++s->c; /* NFA_START_INVISIBLE -> NFA_START_INVISIBLE_BEFORE */
+ }
PUSH(frag(s, list1(&s1->out)));
break;
***************
*** 3307,3327 ****
return val == pos;
}
! static int nfa_regmatch __ARGS((nfa_state_T *start, regsub_T *submatch, regsub_T *m));
/*
* Main matching routine.
*
* Run NFA to determine whether it matches reginput.
*
* Return TRUE if there is a match, FALSE otherwise.
* Note: Caller must ensure that: start != NULL.
*/
static int
! nfa_regmatch(start, submatch, m)
nfa_state_T *start;
regsub_T *submatch;
regsub_T *m;
{
int result;
int size = 0;
--- 3334,3357 ----
return val == pos;
}
! static int nfa_regmatch __ARGS((nfa_state_T *start, regsub_T *submatch, regsub_T *m, save_se_T *endp));
/*
* Main matching routine.
*
* Run NFA to determine whether it matches reginput.
*
+ * When "endp" is not NULL it is a required end-of-match position.
+ *
* Return TRUE if there is a match, FALSE otherwise.
* Note: Caller must ensure that: start != NULL.
*/
static int
! nfa_regmatch(start, submatch, m, endp)
nfa_state_T *start;
regsub_T *submatch;
regsub_T *m;
+ save_se_T *endp;
{
int result;
int size = 0;
***************
*** 3532,3547 ****
}
case NFA_END_INVISIBLE:
! /* This is only encountered after a NFA_START_INVISIBLE node.
! * They surround a zero-width group, used with "\@=" and "\&".
* If we got here, it means that the current "invisible" group
* finished successfully, so return control to the parent
* nfa_regmatch(). Submatches are stored in *m, and used in
* the parent call. */
if (start->c == NFA_MOPEN + 0)
addstate_here(thislist, t->state->out, &t->sub, &listidx);
else
{
/* do not set submatches for \@! */
if (!t->state->negated)
/* TODO: only copy positions in use. */
--- 3562,3603 ----
}
case NFA_END_INVISIBLE:
! /* This is only encountered after a NFA_START_INVISIBLE or
! * NFA_START_INVISIBLE_BEFORE node.
! * They surround a zero-width group, used with "\@=", "\&",
! * "\@!", "\@<=" and "\@<!".
* If we got here, it means that the current "invisible" group
* finished successfully, so return control to the parent
* nfa_regmatch(). Submatches are stored in *m, and used in
* the parent call. */
if (start->c == NFA_MOPEN + 0)
+ /* TODO: do we ever get here? */
addstate_here(thislist, t->state->out, &t->sub, &listidx);
else
{
+ #ifdef ENABLE_LOG
+ if (endp != NULL)
+ {
+ if (REG_MULTI)
+ fprintf(log_fd, "Current lnum: %d, endp lnum: %d; current col: %d, endp col: %d\n",
+ (int)reglnum,
+ (int)endp->se_u.pos.lnum,
+ (int)(reginput - regline),
+ endp->se_u.pos.col);
+ else
+ fprintf(log_fd, "Current col: %d, endp col: %d\n",
+ (int)(reginput - regline),
+ (int)(endp->se_u.ptr - reginput));
+ }
+ #endif
+ /* It's only a match if it ends at "endp" */
+ if (endp != NULL && (REG_MULTI
+ ? (reglnum != endp->se_u.pos.lnum
+ || (int)(reginput - regline)
+ != endp->se_u.pos.col)
+ : reginput != endp->se_u.ptr))
+ break;
+
/* do not set submatches for \@! */
if (!t->state->negated)
/* TODO: only copy positions in use. */
***************
*** 3551,3561 ****
break;
case NFA_START_INVISIBLE:
{
! char_u *save_reginput = reginput;
! char_u *save_regline = regline;
! int save_reglnum = reglnum;
! int save_nfa_match = nfa_match;
/* Call nfa_regmatch() to check if the current concat matches
* at this position. The concat ends with the node
--- 3607,3676 ----
break;
case NFA_START_INVISIBLE:
+ case NFA_START_INVISIBLE_BEFORE:
{
! char_u *save_reginput = reginput;
! char_u *save_regline = regline;
! int save_reglnum = reglnum;
! int save_nfa_match = nfa_match;
! save_se_T endpos;
! save_se_T *endposp = NULL;
!
! if (t->state->c == NFA_START_INVISIBLE_BEFORE)
! {
! /* The recursive match must end at the current position. */
! endposp = &endpos;
! if (REG_MULTI)
! {
! endpos.se_u.pos.col = (int)(reginput - regline);
! endpos.se_u.pos.lnum = reglnum;
! }
! else
! endpos.se_u.ptr = reginput;
!
! /* Go back the specified number of bytes, or as far as the
! * start of the previous line, to try matching "\@<=" or
! * not matching "\@<!". */
! if (t->state->val <= 0)
! {
! if (REG_MULTI)
! {
! regline = reg_getline(--reglnum);
! if (regline == NULL)
! /* can't go before the first line */
! regline = reg_getline(++reglnum);
! }
! reginput = regline;
! }
! else
! {
! if (REG_MULTI
! && (int)(reginput - regline) < t->state->val)
! {
! /* Not enough bytes in this line, go to end of
! * previous line. */
! regline = reg_getline(--reglnum);
! if (regline == NULL)
! {
! /* can't go before the first line */
! regline = reg_getline(++reglnum);
! reginput = regline;
! }
! else
! reginput = regline + STRLEN(regline);
! }
! if ((int)(reginput - regline) >= t->state->val)
! {
! reginput -= t->state->val;
! #ifdef FEAT_MBYTE
! if (has_mbyte)
! reginput -= mb_head_off(regline, reginput);
! #endif
! }
! else
! reginput = regline;
! }
! }
/* Call nfa_regmatch() to check if the current concat matches
* at this position. The concat ends with the node
***************
*** 3579,3585 ****
* recursion. */
nfa_save_listids(start, listids);
nfa_set_null_listids(start);
! result = nfa_regmatch(t->state->out, submatch, m);
nfa_set_neg_listids(start);
nfa_restore_listids(start, listids);
--- 3694,3700 ----
* recursion. */
nfa_save_listids(start, listids);
nfa_set_null_listids(start);
! result = nfa_regmatch(t->state->out, submatch, m, endposp);
nfa_set_neg_listids(start);
nfa_restore_listids(start, listids);
***************
*** 4120,4130 ****
* matters!
* Do not add the start state in recursive calls of nfa_regmatch(),
* because recursive calls should only start in the first position.
* Also don't start a match past the first line. */
! if (nfa_match == FALSE && start->c == NFA_MOPEN + 0
! && reglnum == 0 && clen != 0
! && (ireg_maxcol == 0
! || (colnr_T)(reginput - regline) < ireg_maxcol))
{
#ifdef ENABLE_LOG
fprintf(log_fd, "(---) STARTSTATE\n");
--- 4235,4255 ----
* matters!
* Do not add the start state in recursive calls of nfa_regmatch(),
* because recursive calls should only start in the first position.
+ * Unless "endp" is not NULL, then we match the end position.
* Also don't start a match past the first line. */
! if (nfa_match == FALSE
! && ((start->c == NFA_MOPEN + 0
! && reglnum == 0
! && clen != 0
! && (ireg_maxcol == 0
! || (colnr_T)(reginput - regline) < ireg_maxcol))
! || (endp != NULL
! && (REG_MULTI
! ? (reglnum < endp->se_u.pos.lnum
! || (reglnum == endp->se_u.pos.lnum
! && (int)(reginput - regline)
! < endp->se_u.pos.col))
! : reginput < endp->se_u.ptr))))
{
#ifdef ENABLE_LOG
fprintf(log_fd, "(---) STARTSTATE\n");
***************
*** 4148,4154 ****
* finish. */
if (clen != 0)
reginput += clen;
! else if (go_to_nextline)
reg_nextline();
else
break;
--- 4273,4280 ----
* finish. */
if (clen != 0)
reginput += clen;
! else if (go_to_nextline || (endp != NULL && REG_MULTI
! && reglnum < endp->se_u.pos.lnum))
reg_nextline();
else
break;
***************
*** 4225,4231 ****
sub.in_use = 0;
m.in_use = 0;
! if (nfa_regmatch(start, &sub, &m) == FALSE)
return 0;
cleanup_subexpr();
--- 4351,4357 ----
sub.in_use = 0;
m.in_use = 0;
! if (nfa_regmatch(start, &sub, &m, NULL) == FALSE)
return 0;
cleanup_subexpr();
*** ../vim-7.3.1087/src/testdir/test64.in 2013-06-01 14:42:51.000000000 +0200
--- src/testdir/test64.in 2013-06-01 18:45:09.000000000 +0200
***************
*** 363,374 ****
:call add(tl, [2, '\(a\)\(b\)\(c\)\(dd\)\(e\)\(f\)\(g\)\(h\)\(i\)\1\2\3\4\5\6\7\8\9', 'xabcddefghiabcddefghix', 'abcddefghiabcddefghi', 'a', 'b', 'c', 'dd', 'e', 'f', 'g', 'h', 'i'])
:"
:"""" Look-behind with limit
! :call add(tl, [0, '<\@<=span.', 'xxspanxx<spanyyy', 'spany'])
! :call add(tl, [0, '<\@1<=span.', 'xxspanxx<spanyyy', 'spany'])
! :call add(tl, [0, '<\@2<=span.', 'xxspanxx<spanyyy', 'spany'])
! :call add(tl, [0, '\(<<\)\@<=span.', 'xxspanxxxx<spanxx<<spanyyy', 'spany', '<<'])
! :call add(tl, [0, '\(<<\)\@1<=span.', 'xxspanxxxx<spanxx<<spanyyy'])
! :call add(tl, [0, '\(<<\)\@2<=span.', 'xxspanxxxx<spanxx<<spanyyy', 'spany', '<<'])
:"
:"""" "\_" prepended negated collection matches EOL
:call add(tl, [2, '\_[^8-9]\+', "asfi\n9888", "asfi\n"])
--- 363,375 ----
:call add(tl, [2, '\(a\)\(b\)\(c\)\(dd\)\(e\)\(f\)\(g\)\(h\)\(i\)\1\2\3\4\5\6\7\8\9', 'xabcddefghiabcddefghix', 'abcddefghiabcddefghi', 'a', 'b', 'c', 'dd', 'e', 'f', 'g', 'h', 'i'])
:"
:"""" Look-behind with limit
! :call add(tl, [2, '<\@<=span.', 'xxspanxx<spanyyy', 'spany'])
! :call add(tl, [2, '<\@1<=span.', 'xxspanxx<spanyyy', 'spany'])
! :call add(tl, [2, '<\@2<=span.', 'xxspanxx<spanyyy', 'spany'])
! :call add(tl, [2, '\(<<\)\@<=span.', 'xxspanxxxx<spanxx<<spanyyy', 'spany', '<<'])
! :call add(tl, [2, '\(<<\)\@1<=span.', 'xxspanxxxx<spanxx<<spanyyy'])
! :call add(tl, [2, '\(<<\)\@2<=span.', 'xxspanxxxx<spanxx<<spanyyy', 'spany', '<<'])
! :call add(tl, [2, '\(foo\)\@<!bar.', 'xx foobar1 xbar2 xx', 'bar2'])
:"
:"""" "\_" prepended negated collection matches EOL
:call add(tl, [2, '\_[^8-9]\+', "asfi\n9888", "asfi\n"])
***************
*** 514,521 ****
asdfasd<yyy
xxstart1
asdfasd<yy
! xxxxstart2
asdfasd<yy
! xxxstart3
Results of test64:
--- 515,522 ----
asdfasd<yyy
xxstart1
asdfasd<yy
! xxxstart2
asdfasd<yy
! xxstart3
Results of test64:
*** ../vim-7.3.1087/src/testdir/test64.ok 2013-06-01 14:42:51.000000000 +0200
--- src/testdir/test64.ok 2013-06-01 18:55:43.000000000 +0200
***************
*** 817,832 ****
--- 817,841 ----
OK 2 - \(a\)\(b\)\(c\)\(dd\)\(e\)\(f\)\(g\)\(h\)\(i\)\1\2\3\4\5\6\7\8\9
OK 0 - <\@<=span.
OK 1 - <\@<=span.
+ OK 2 - <\@<=span.
OK 0 - <\@1<=span.
OK 1 - <\@1<=span.
+ OK 2 - <\@1<=span.
OK 0 - <\@2<=span.
OK 1 - <\@2<=span.
+ OK 2 - <\@2<=span.
OK 0 - \(<<\)\@<=span.
OK 1 - \(<<\)\@<=span.
+ OK 2 - \(<<\)\@<=span.
OK 0 - \(<<\)\@1<=span.
OK 1 - \(<<\)\@1<=span.
+ OK 2 - \(<<\)\@1<=span.
OK 0 - \(<<\)\@2<=span.
OK 1 - \(<<\)\@2<=span.
+ OK 2 - \(<<\)\@2<=span.
+ OK 0 - \(foo\)\@<!bar.
+ OK 1 - \(foo\)\@<!bar.
+ OK 2 - \(foo\)\@<!bar.
OK 0 - \_[^8-9]\+
OK 1 - \_[^8-9]\+
OK 2 - \_[^8-9]\+
***************
*** 844,850 ****
<T="7">Ac 7</Title>
ghi
! xxxstart3
-0-
ffo
bob
--- 853,859 ----
<T="7">Ac 7</Title>
ghi
! xxstart3
-0-
ffo
bob
*** ../vim-7.3.1087/src/version.c 2013-06-01 14:42:51.000000000 +0200
--- src/version.c 2013-06-01 18:37:11.000000000 +0200
***************
*** 730,731 ****
--- 730,733 ----
{ /* Add new patch number below this line */
+ /**/
+ 1088,
/**/
--
Seen it all, done it all, can't remember most of it.
/// Bram Moolenaar -- Bram@Moolenaar.net -- http://www.Moolenaar.net \\\
/// sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\
\\\ an exciting new programming language -- http://www.Zimbu.org ///
\\\ help me help AIDS victims -- http://ICCF-Holland.org ///