diff --git a/7.3.1011 b/7.3.1011 new file mode 100644 index 0000000..0e2304b --- /dev/null +++ b/7.3.1011 @@ -0,0 +1,475 @@ +To: vim_dev@googlegroups.com +Subject: Patch 7.3.1011 +Fcc: outbox +From: Bram Moolenaar +Mime-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit +------------ + +Patch 7.3.1011 +Problem: New regexp engine is inefficient with multi-byte characters. +Solution: Handle a character at a time instead of a byte at a time. Also + make \Z partly work. +Files: src/regexp_nfa.c, src/testdir/test95.in, src/testdir/test95.ok + + +*** ../vim-7.3.1010/src/regexp_nfa.c 2013-05-24 20:25:28.000000000 +0200 +--- src/regexp_nfa.c 2013-05-24 21:49:43.000000000 +0200 +*************** +*** 46,54 **** + NFA_NCLOSE, /* End of subexpr. marked with \%( ... \) */ + NFA_START_INVISIBLE, + NFA_END_INVISIBLE, +- NFA_MULTIBYTE, /* Next nodes in NFA are part of the same +- multibyte char */ +- NFA_END_MULTIBYTE, /* End of multibyte char in the NFA */ + NFA_COMPOSING, /* Next nodes in NFA are part of the + composing multibyte char */ + NFA_END_COMPOSING, /* End of a composing char in the NFA */ +--- 46,51 ---- +*************** +*** 195,220 **** + *post_ptr++ = c; \ + } while (0) + +- #define EMIT_MBYTE(c) \ +- len = (*mb_char2bytes)(c, buf); \ +- EMIT(buf[0]); \ +- for (i = 1; i < len; i++) \ +- { \ +- EMIT(buf[i]); \ +- EMIT(NFA_CONCAT); \ +- } \ +- EMIT(NFA_MULTIBYTE); +- +- #define EMIT_COMPOSING_UTF(input) \ +- len = utfc_ptr2len(input); \ +- EMIT(input[0]); \ +- for (i = 1; i < len; i++) \ +- { \ +- EMIT(input[i]); \ +- EMIT(NFA_CONCAT); \ +- } \ +- EMIT(NFA_COMPOSING); +- + /* + * Initialize internal variables before NFA compilation. + * Return OK on success, FAIL otherwise. +--- 192,197 ---- +*************** +*** 611,618 **** + #ifdef FEAT_MBYTE + char_u *old_regparse = regparse; + int clen; +- int len; +- static char_u buf[30]; + int i; + #endif + int extra = 0; +--- 588,593 ---- +*************** +*** 845,858 **** + return FAIL; + + c = coll_get_char(); +! #ifdef FEAT_MBYTE +! if ((*mb_char2len)(c) > 1) +! { +! EMIT_MBYTE(c); +! } +! else +! #endif +! EMIT(c); + break; + + /* Catch \%^ and \%$ regardless of where they appear in the +--- 820,826 ---- + return FAIL; + + c = coll_get_char(); +! EMIT(c); + break; + + /* Catch \%^ and \%$ regardless of where they appear in the +*************** +*** 1135,1146 **** + * skip it. */ + for (c = startc + 1; c <= endc; c++) + { +! if ((*mb_char2len)(c) > 1) +! { +! EMIT_MBYTE(c); +! } +! else +! EMIT(c); + TRY_NEG(); + EMIT_GLUE(); + } +--- 1103,1109 ---- + * skip it. */ + for (c = startc + 1; c <= endc; c++) + { +! EMIT(c); + TRY_NEG(); + EMIT_GLUE(); + } +*************** +*** 1187,1200 **** + if (got_coll_char == TRUE && startc == 0) + EMIT(0x0a); + else +! #ifdef FEAT_MBYTE +! if ((*mb_char2len)(startc) > 1) +! { +! EMIT_MBYTE(startc); +! } +! else +! #endif +! EMIT(startc); + TRY_NEG(); + EMIT_GLUE(); + } +--- 1150,1156 ---- + if (got_coll_char == TRUE && startc == 0) + EMIT(0x0a); + else +! EMIT(startc); + TRY_NEG(); + EMIT_GLUE(); + } +*************** +*** 1242,1271 **** + int plen; + + nfa_do_multibyte: +! /* length of current char, with composing chars, +! * from pointer */ +! plen = (*mb_ptr2len)(old_regparse); +! if (enc_utf8 && clen != plen) +! { +! /* A composing character is always handled as a +! * separate atom, surrounded by NFA_COMPOSING and +! * NFA_END_COMPOSING. Note that right now we are + * building the postfix form, not the NFA itself; + * a composing char could be: a, b, c, NFA_COMPOSING +! * where 'a', 'b', 'c' are chars with codes > 256. +! */ +! EMIT_COMPOSING_UTF(old_regparse); + regparse = old_regparse + plen; + } + else +- /* A multi-byte character is always handled as a +- * separate atom, surrounded by NFA_MULTIBYTE and +- * NFA_END_MULTIBYTE */ +- if (plen > 1) +- { +- EMIT_MBYTE(c); +- } +- else + #endif + { + c = no_Magic(c); +--- 1198,1227 ---- + int plen; + + nfa_do_multibyte: +! /* Length of current char with composing chars. */ +! if (enc_utf8 && clen != (plen = (*mb_ptr2len)(old_regparse))) +! { +! /* A base character plus composing characters. +! * This requires creating a separate atom as if enclosing +! * the characters in (), where NFA_COMPOSING is the ( and +! * NFA_END_COMPOSING is the ). Note that right now we are + * building the postfix form, not the NFA itself; + * a composing char could be: a, b, c, NFA_COMPOSING +! * where 'b' and 'c' are chars with codes > 256. */ +! i = 0; +! for (;;) +! { +! EMIT(c); +! if (i > 0) +! EMIT(NFA_CONCAT); +! if (i += utf_char2len(c) >= plen) +! break; +! c = utf_ptr2char(old_regparse + i); +! } +! EMIT(NFA_COMPOSING); + regparse = old_regparse + plen; + } + else + #endif + { + c = no_Magic(c); +*************** +*** 1702,1710 **** + case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break; + case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break; + +- case NFA_MULTIBYTE: STRCPY(code, "NFA_MULTIBYTE"); break; +- case NFA_END_MULTIBYTE: STRCPY(code, "NFA_END_MULTIBYTE"); break; +- + case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break; + case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break; + +--- 1658,1663 ---- +*************** +*** 2194,2200 **** + } + e1 = POP(); + e1.start->negated = TRUE; +! if (e1.start->c == NFA_MULTIBYTE || e1.start->c == NFA_COMPOSING) + e1.start->out1->negated = TRUE; + PUSH(e1); + break; +--- 2147,2153 ---- + } + e1 = POP(); + e1.start->negated = TRUE; +! if (e1.start->c == NFA_COMPOSING) + e1.start->out1->negated = TRUE; + PUSH(e1); + break; +*************** +*** 2311,2316 **** +--- 2264,2279 ---- + PUSH(frag(s, list1(&s1->out))); + break; + ++ case NFA_COMPOSING: /* char with composing char */ ++ #if 0 ++ /* TODO */ ++ if (regflags & RF_ICOMBINE) ++ { ++ goto normalchar; ++ } ++ #endif ++ /* FALLTHROUGH */ ++ + case NFA_MOPEN + 0: /* Submatch */ + case NFA_MOPEN + 1: + case NFA_MOPEN + 2: +*************** +*** 2322,2329 **** + case NFA_MOPEN + 8: + case NFA_MOPEN + 9: + case NFA_NOPEN: /* \%( "Invisible Submatch" */ +- case NFA_MULTIBYTE: /* mbyte char */ +- case NFA_COMPOSING: /* composing char */ + if (nfa_calc_size == TRUE) + { + nstate += 2; +--- 2285,2290 ---- +*************** +*** 2336,2344 **** + case NFA_NOPEN: + mclose = NFA_NCLOSE; + break; +- case NFA_MULTIBYTE: +- mclose = NFA_END_MULTIBYTE; +- break; + case NFA_COMPOSING: + mclose = NFA_END_COMPOSING; + break; +--- 2297,2302 ---- +*************** +*** 2377,2385 **** + goto theend; + patch(e.out, s1); + +! if (mopen == NFA_MULTIBYTE || mopen == NFA_COMPOSING) +! /* MULTIBYTE->out1 = END_MULTIBYTE +! * COMPOSING->out1 = END_COMPOSING */ + patch(list1(&s->out1), s1); + + PUSH(frag(s, list1(&s1->out))); +--- 2335,2342 ---- + goto theend; + patch(e.out, s1); + +! if (mopen == NFA_COMPOSING) +! /* COMPOSING->out1 = END_COMPOSING */ + patch(list1(&s->out1), s1); + + PUSH(frag(s, list1(&s1->out))); +*************** +*** 2540,2556 **** + case NFA_COMPOSING: + /* nfa_regmatch() will match all the bytes of this composing char. */ + break; +- +- case NFA_MULTIBYTE: +- /* nfa_regmatch() will match all the bytes of this multibyte char. */ +- break; + #endif + +- case NFA_END_MULTIBYTE: +- /* Successfully matched this mbyte char */ +- addstate(l, state->out, m, off, lid, match); +- break; +- + case NFA_NOPEN: + case NFA_NCLOSE: + addstate(l, state->out, m, off, lid, match); +--- 2497,2504 ---- +*************** +*** 2841,2847 **** + regsub_T *submatch; + regsub_T *m; + { +! int c = -1; + int n; + int i = 0; + int result; +--- 2789,2795 ---- + regsub_T *submatch; + regsub_T *m; + { +! int c; + int n; + int i = 0; + int result; +*************** +*** 2859,2865 **** + List *listtbl[2][2]; + List *ll; + int listid = 1; +- int endnode; + List *thislist; + List *nextlist; + List *neglist; +--- 2807,2812 ---- +*************** +*** 3190,3222 **** + break; + } + +! case NFA_MULTIBYTE: + case NFA_COMPOSING: +! endnode = t->state->c + 1; + result = OK; + sta = t->state->out; +! len = 1; +! while (sta->c != endnode && len <= n) + { +! if (reginput[len-1] != sta->c) +! { +! result = FAIL; + break; +! } +! len++; + sta = sta->out; + } + + /* if input char length doesn't match regexp char length */ +! if (len -1 < n || sta->c != endnode) + result = FAIL; +! end = t->state->out1; /* NFA_END_MULTIBYTE or +! NFA_END_COMPOSING */ + /* If \Z was present, then ignore composing characters */ +! if (ireg_icombine && endnode == NFA_END_COMPOSING) + result = 1 ^ sta->negated; + ADD_POS_NEG_STATE(end); + break; + + case NFA_NEWL: + if (!reg_line_lbr && REG_MULTI +--- 3137,3171 ---- + break; + } + +! #ifdef FEAT_MBYTE + case NFA_COMPOSING: +! { +! int mc = c; +! + result = OK; + sta = t->state->out; +! len = 0; +! while (sta->c != NFA_END_COMPOSING && len < n) + { +! if (len > 0) +! mc = mb_ptr2char(reginput + len); +! if (mc != sta->c) + break; +! len += mb_char2len(mc); + sta = sta->out; + } + + /* if input char length doesn't match regexp char length */ +! if (len < n || sta->c != NFA_END_COMPOSING) + result = FAIL; +! end = t->state->out1; /* NFA_END_COMPOSING */ + /* If \Z was present, then ignore composing characters */ +! if (ireg_icombine) + result = 1 ^ sta->negated; + ADD_POS_NEG_STATE(end); + break; ++ } ++ #endif + + case NFA_NEWL: + if (!reg_line_lbr && REG_MULTI +*************** +*** 3425,3430 **** +--- 3374,3387 ---- + if (!result) + result = ireg_ic == TRUE + && MB_TOLOWER(t->state->c) == MB_TOLOWER(c); ++ #ifdef FEAT_MBYTE ++ /* If there is a composing character which is not being ++ * ignored there can be no match. Match with composing ++ * character uses NFA_COMPOSING above. */ ++ if (result && enc_utf8 && !ireg_icombine ++ && n != utf_char2len(c)) ++ result = FALSE; ++ #endif + ADD_POS_NEG_STATE(t->state); + break; + } +*** ../vim-7.3.1010/src/testdir/test95.in 2013-05-24 20:25:28.000000000 +0200 +--- src/testdir/test95.in 2013-05-24 20:45:08.000000000 +0200 +*************** +*** 35,40 **** +--- 35,44 ---- + :call add(tl, ['\f\+', '&*Ÿfname ', 'fname']) + :call add(tl, ['\%#=1\f\+', '&*Ÿfname ', 'fname']) + ++ :"""" Test composing character matching ++ :call add(tl, ['.ม', 'xม่x yมy', 'yม']) ++ :call add(tl, ['.ม่', 'xม่x yมy', 'xม่']) ++ + :"""" Test \Z + :call add(tl, ['ú\Z', 'x']) + +*** ../vim-7.3.1010/src/testdir/test95.ok 2013-05-24 20:25:28.000000000 +0200 +--- src/testdir/test95.ok 2013-05-24 20:44:41.000000000 +0200 +*************** +*** 9,13 **** +--- 9,15 ---- + OK - \%#=1\i\+ + OK - \f\+ + OK - \%#=1\f\+ ++ OK - .ม ++ OK - .ม่ + OK - ú\Z + OK - [^[=a=]]\+ +*** ../vim-7.3.1010/src/version.c 2013-05-24 20:25:28.000000000 +0200 +--- src/version.c 2013-05-24 21:56:02.000000000 +0200 +*************** +*** 730,731 **** +--- 730,733 ---- + { /* Add new patch number below this line */ ++ /**/ ++ 1011, + /**/ + +-- +If you had to identify, in one word, the reason why the +human race has not achieved, and never will achieve, its +full potential, that word would be "meetings." + + /// Bram Moolenaar -- Bram@Moolenaar.net -- http://www.Moolenaar.net \\\ +/// sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\ +\\\ an exciting new programming language -- http://www.Zimbu.org /// + \\\ help me help AIDS victims -- http://ICCF-Holland.org ///