| To: vim_dev@googlegroups.com |
| Subject: Patch 7.4.001 |
| Fcc: outbox |
| From: Bram Moolenaar <Bram@moolenaar.net> |
| Mime-Version: 1.0 |
| Content-Type: text/plain; charset=UTF-8 |
| Content-Transfer-Encoding: 8bit |
| |
| |
| Patch 7.4.001 |
| Problem: Character classes such as [a-z] do not react to 'ignorecase'. |
| Breaks man page highlighting. (Mario Grgic) |
| Solution: Add separate items for classes that react to 'ignorecase'. Clean |
| up logic handling character classes. Add more tests. |
| Files: src/regexp_nfa.c, src/testdir/test64.in, src/testdir/test64.ok |
| |
| |
| |
| |
| |
| *** 29,34 **** |
| --- 29,37 ---- |
| # define NFA_REGEXP_DEBUG_LOG "nfa_regexp_debug.log" |
| #endif |
| |
| + /* Added to NFA_ANY - NFA_NUPPER_IC to include a NL. */ |
| + #define NFA_ADD_NL 31 |
| + |
| enum |
| { |
| NFA_SPLIT = -1024, |
| |
| *** 183,188 **** |
| --- 186,198 ---- |
| NFA_NLOWER, /* Match non-lowercase char */ |
| NFA_UPPER, /* Match uppercase char */ |
| NFA_NUPPER, /* Match non-uppercase char */ |
| + NFA_LOWER_IC, /* Match [a-z] */ |
| + NFA_NLOWER_IC, /* Match [^a-z] */ |
| + NFA_UPPER_IC, /* Match [A-Z] */ |
| + NFA_NUPPER_IC, /* Match [^A-Z] */ |
| + |
| + NFA_FIRST_NL = NFA_ANY + NFA_ADD_NL, |
| + NFA_LAST_NL = NFA_NUPPER_IC + NFA_ADD_NL, |
| |
| NFA_CURSOR, /* Match cursor pos */ |
| NFA_LNUM, /* Match line number */ |
| |
| *** 199,207 **** |
| NFA_MARK_LT, /* Match < mark */ |
| NFA_VISUAL, /* Match Visual area */ |
| |
| - NFA_FIRST_NL = NFA_ANY + ADD_NL, |
| - NFA_LAST_NL = NFA_NUPPER + ADD_NL, |
| - |
| /* Character classes [:alnum:] etc */ |
| NFA_CLASS_ALNUM, |
| NFA_CLASS_ALPHA, |
| --- 209,214 ---- |
| |
| *** 578,583 **** |
| --- 585,592 ---- |
| * On failure, return 0 (=FAIL) |
| * Start points to the first char of the range, while end should point |
| * to the closing brace. |
| + * Keep in mind that 'ignorecase' applies at execution time, thus [a-z] may |
| + * need to be interpreted as [a-zA-Z]. |
| */ |
| static int |
| nfa_recognize_char_class(start, end, extra_newl) |
| |
| *** 681,687 **** |
| return FAIL; |
| |
| if (newl == TRUE) |
| ! extra_newl = ADD_NL; |
| |
| switch (config) |
| { |
| --- 690,696 ---- |
| return FAIL; |
| |
| if (newl == TRUE) |
| ! extra_newl = NFA_ADD_NL; |
| |
| switch (config) |
| { |
| |
| *** 710,722 **** |
| case CLASS_not | CLASS_az | CLASS_AZ: |
| return extra_newl + NFA_NALPHA; |
| case CLASS_az: |
| ! return extra_newl + NFA_LOWER; |
| case CLASS_not | CLASS_az: |
| ! return extra_newl + NFA_NLOWER; |
| case CLASS_AZ: |
| ! return extra_newl + NFA_UPPER; |
| case CLASS_not | CLASS_AZ: |
| ! return extra_newl + NFA_NUPPER; |
| } |
| return FAIL; |
| } |
| --- 719,731 ---- |
| case CLASS_not | CLASS_az | CLASS_AZ: |
| return extra_newl + NFA_NALPHA; |
| case CLASS_az: |
| ! return extra_newl + NFA_LOWER_IC; |
| case CLASS_not | CLASS_az: |
| ! return extra_newl + NFA_NLOWER_IC; |
| case CLASS_AZ: |
| ! return extra_newl + NFA_UPPER_IC; |
| case CLASS_not | CLASS_AZ: |
| ! return extra_newl + NFA_NUPPER_IC; |
| } |
| return FAIL; |
| } |
| |
| *** 914,920 **** |
| break; |
| } |
| |
| ! extra = ADD_NL; |
| |
| /* "\_[" is collection plus newline */ |
| if (c == '[') |
| --- 923,929 ---- |
| break; |
| } |
| |
| ! extra = NFA_ADD_NL; |
| |
| /* "\_[" is collection plus newline */ |
| if (c == '[') |
| |
| *** 970,976 **** |
| } |
| #endif |
| EMIT(nfa_classcodes[p - classchars]); |
| ! if (extra == ADD_NL) |
| { |
| EMIT(NFA_NEWL); |
| EMIT(NFA_OR); |
| --- 979,985 ---- |
| } |
| #endif |
| EMIT(nfa_classcodes[p - classchars]); |
| ! if (extra == NFA_ADD_NL) |
| { |
| EMIT(NFA_NEWL); |
| EMIT(NFA_OR); |
| |
| *** 1240,1260 **** |
| { |
| /* |
| * Try to reverse engineer character classes. For example, |
| ! * recognize that [0-9] stands for \d and [A-Za-z_] with \h, |
| * and perform the necessary substitutions in the NFA. |
| */ |
| result = nfa_recognize_char_class(regparse, endp, |
| ! extra == ADD_NL); |
| if (result != FAIL) |
| { |
| ! if (result >= NFA_DIGIT && result <= NFA_NUPPER) |
| ! EMIT(result); |
| ! else /* must be char class + newline */ |
| { |
| ! EMIT(result - ADD_NL); |
| EMIT(NFA_NEWL); |
| EMIT(NFA_OR); |
| } |
| regparse = endp; |
| mb_ptr_adv(regparse); |
| return OK; |
| --- 1249,1269 ---- |
| { |
| /* |
| * Try to reverse engineer character classes. For example, |
| ! * recognize that [0-9] stands for \d and [A-Za-z_] for \h, |
| * and perform the necessary substitutions in the NFA. |
| */ |
| result = nfa_recognize_char_class(regparse, endp, |
| ! extra == NFA_ADD_NL); |
| if (result != FAIL) |
| { |
| ! if (result >= NFA_FIRST_NL && result <= NFA_LAST_NL) |
| { |
| ! EMIT(result - NFA_ADD_NL); |
| EMIT(NFA_NEWL); |
| EMIT(NFA_OR); |
| } |
| + else |
| + EMIT(result); |
| regparse = endp; |
| mb_ptr_adv(regparse); |
| return OK; |
| |
| *** 1504,1510 **** |
| * collection, add an OR below. But not for negated |
| * range. */ |
| if (!negated) |
| ! extra = ADD_NL; |
| } |
| else |
| { |
| --- 1513,1519 ---- |
| * collection, add an OR below. But not for negated |
| * range. */ |
| if (!negated) |
| ! extra = NFA_ADD_NL; |
| } |
| else |
| { |
| |
| *** 1537,1543 **** |
| EMIT(NFA_END_COLL); |
| |
| /* \_[] also matches \n but it's not negated */ |
| ! if (extra == ADD_NL) |
| { |
| EMIT(reg_string ? NL : NFA_NEWL); |
| EMIT(NFA_OR); |
| --- 1546,1552 ---- |
| EMIT(NFA_END_COLL); |
| |
| /* \_[] also matches \n but it's not negated */ |
| ! if (extra == NFA_ADD_NL) |
| { |
| EMIT(reg_string ? NL : NFA_NEWL); |
| EMIT(NFA_OR); |
| |
| *** 2011,2017 **** |
| if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL) |
| { |
| addnl = TRUE; |
| ! c -= ADD_NL; |
| } |
| |
| STRCPY(code, ""); |
| --- 2020,2026 ---- |
| if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL) |
| { |
| addnl = TRUE; |
| ! c -= NFA_ADD_NL; |
| } |
| |
| STRCPY(code, ""); |
| |
| *** 2217,2222 **** |
| --- 2226,2235 ---- |
| case NFA_NLOWER:STRCPY(code, "NFA_NLOWER"); break; |
| case NFA_UPPER: STRCPY(code, "NFA_UPPER"); break; |
| case NFA_NUPPER:STRCPY(code, "NFA_NUPPER"); break; |
| + case NFA_LOWER_IC: STRCPY(code, "NFA_LOWER_IC"); break; |
| + case NFA_NLOWER_IC: STRCPY(code, "NFA_NLOWER_IC"); break; |
| + case NFA_UPPER_IC: STRCPY(code, "NFA_UPPER_IC"); break; |
| + case NFA_NUPPER_IC: STRCPY(code, "NFA_NUPPER_IC"); break; |
| |
| default: |
| STRCPY(code, "CHAR(x)"); |
| |
| *** 2687,2692 **** |
| --- 2700,2709 ---- |
| case NFA_NLOWER: |
| case NFA_UPPER: |
| case NFA_NUPPER: |
| + case NFA_LOWER_IC: |
| + case NFA_NLOWER_IC: |
| + case NFA_UPPER_IC: |
| + case NFA_NUPPER_IC: |
| /* possibly non-ascii */ |
| #ifdef FEAT_MBYTE |
| if (has_mbyte) |
| |
| *** 3841,3846 **** |
| --- 3858,3867 ---- |
| case NFA_NLOWER: |
| case NFA_UPPER: |
| case NFA_NUPPER: |
| + case NFA_LOWER_IC: |
| + case NFA_NLOWER_IC: |
| + case NFA_UPPER_IC: |
| + case NFA_NUPPER_IC: |
| case NFA_START_COLL: |
| case NFA_START_NEG_COLL: |
| case NFA_NEWL: |
| |
| *** 5872,5877 **** |
| --- 5893,5920 ---- |
| ADD_STATE_IF_MATCH(t->state); |
| break; |
| |
| + case NFA_LOWER_IC: /* [a-z] */ |
| + result = ri_lower(curc) || (ireg_ic && ri_upper(curc)); |
| + ADD_STATE_IF_MATCH(t->state); |
| + break; |
| + |
| + case NFA_NLOWER_IC: /* [^a-z] */ |
| + result = curc != NUL |
| + && !(ri_lower(curc) || (ireg_ic && ri_upper(curc))); |
| + ADD_STATE_IF_MATCH(t->state); |
| + break; |
| + |
| + case NFA_UPPER_IC: /* [A-Z] */ |
| + result = ri_upper(curc) || (ireg_ic && ri_lower(curc)); |
| + ADD_STATE_IF_MATCH(t->state); |
| + break; |
| + |
| + case NFA_NUPPER_IC: /* [^A-Z] */ |
| + result = curc != NUL |
| + && !(ri_upper(curc) || (ireg_ic && ri_lower(curc))); |
| + ADD_STATE_IF_MATCH(t->state); |
| + break; |
| + |
| case NFA_BACKREF1: |
| case NFA_BACKREF2: |
| case NFA_BACKREF3: |
| |
| |
| |
| *** 289,303 **** |
| :call add(tl, [2, '.a\%$', " a\n "]) |
| :call add(tl, [2, '.a\%$', " a\n_a", "_a"]) |
| :" |
| ! :"""" Test recognition of some character classes |
| ! :call add(tl, [2, '[0-9]', '8', '8']) |
| ! :call add(tl, [2, '[^0-9]', '8']) |
| ! :call add(tl, [2, '[0-9a-fA-F]*', '0a7', '0a7']) |
| ! :call add(tl, [2, '[^0-9A-Fa-f]\+', '0a7']) |
| ! :call add(tl, [2, '[a-z_A-Z0-9]\+', 'aso_sfoij', 'aso_sfoij']) |
| ! :call add(tl, [2, '[a-z]', 'a', 'a']) |
| ! :call add(tl, [2, '[a-zA-Z]', 'a', 'a']) |
| ! :call add(tl, [2, '[A-Z]', 'a']) |
| :call add(tl, [2, '\C[^A-Z]\+', 'ABCOIJDEOIFNSD jsfoij sa', ' jsfoij sa']) |
| :" |
| :"""" Tests for \z features |
| --- 289,317 ---- |
| :call add(tl, [2, '.a\%$', " a\n "]) |
| :call add(tl, [2, '.a\%$', " a\n_a", "_a"]) |
| :" |
| ! :"""" Test recognition of character classes |
| ! :call add(tl, [2, '[0-7]\+', 'x0123456789x', '01234567']) |
| ! :call add(tl, [2, '[^0-7]\+', '0a;X+% 897', 'a;X+% 89']) |
| ! :call add(tl, [2, '[0-9]\+', 'x0123456789x', '0123456789']) |
| ! :call add(tl, [2, '[^0-9]\+', '0a;X+% 9', 'a;X+% ']) |
| ! :call add(tl, [2, '[0-9a-fA-F]\+', 'x0189abcdefg', '0189abcdef']) |
| ! :call add(tl, [2, '[^0-9A-Fa-f]\+', '0189g;X+% ab', 'g;X+% ']) |
| ! :call add(tl, [2, '[a-z_A-Z0-9]\+', ';+aso_SfOij ', 'aso_SfOij']) |
| ! :call add(tl, [2, '[^a-z_A-Z0-9]\+', 'aSo_;+% sfOij', ';+% ']) |
| ! :call add(tl, [2, '[a-z_A-Z]\+', '0abyz_ABYZ;', 'abyz_ABYZ']) |
| ! :call add(tl, [2, '[^a-z_A-Z]\+', 'abAB_09;+% yzYZ', '09;+% ']) |
| ! :call add(tl, [2, '[a-z]\+', '0abcxyz1', 'abcxyz']) |
| ! :call add(tl, [2, '[a-z]\+', 'AabxyzZ', 'abxyz']) |
| ! :call add(tl, [2, '[^a-z]\+', 'a;X09+% x', ';X09+% ']) |
| ! :call add(tl, [2, '[^a-z]\+', 'abX0;%yz', 'X0;%']) |
| ! :call add(tl, [2, '[a-zA-Z]\+', '0abABxzXZ9', 'abABxzXZ']) |
| ! :call add(tl, [2, '[^a-zA-Z]\+', 'ab09_;+ XZ', '09_;+ ']) |
| ! :call add(tl, [2, '[A-Z]\+', 'aABXYZz', 'ABXYZ']) |
| ! :call add(tl, [2, '[^A-Z]\+', 'ABx0;%YZ', 'x0;%']) |
| ! :call add(tl, [2, '[a-z]\+\c', '0abxyzABXYZ;', 'abxyzABXYZ']) |
| ! :call add(tl, [2, '[A-Z]\+\c', '0abABxzXZ9', 'abABxzXZ']) |
| ! :call add(tl, [2, '\c[^a-z]\+', 'ab09_;+ XZ', '09_;+ ']) |
| ! :call add(tl, [2, '\c[^A-Z]\+', 'ab09_;+ XZ', '09_;+ ']) |
| :call add(tl, [2, '\C[^A-Z]\+', 'ABCOIJDEOIFNSD jsfoij sa', ' jsfoij sa']) |
| :" |
| :"""" Tests for \z features |
| |
| |
| |
| *** 650,679 **** |
| OK 0 - .a\%$ |
| OK 1 - .a\%$ |
| OK 2 - .a\%$ |
| ! OK 0 - [0-9] |
| ! OK 1 - [0-9] |
| ! OK 2 - [0-9] |
| ! OK 0 - [^0-9] |
| ! OK 1 - [^0-9] |
| ! OK 2 - [^0-9] |
| ! OK 0 - [0-9a-fA-F]* |
| ! OK 1 - [0-9a-fA-F]* |
| ! OK 2 - [0-9a-fA-F]* |
| OK 0 - [^0-9A-Fa-f]\+ |
| OK 1 - [^0-9A-Fa-f]\+ |
| OK 2 - [^0-9A-Fa-f]\+ |
| OK 0 - [a-z_A-Z0-9]\+ |
| OK 1 - [a-z_A-Z0-9]\+ |
| OK 2 - [a-z_A-Z0-9]\+ |
| ! OK 0 - [a-z] |
| ! OK 1 - [a-z] |
| ! OK 2 - [a-z] |
| ! OK 0 - [a-zA-Z] |
| ! OK 1 - [a-zA-Z] |
| ! OK 2 - [a-zA-Z] |
| ! OK 0 - [A-Z] |
| ! OK 1 - [A-Z] |
| ! OK 2 - [A-Z] |
| OK 0 - \C[^A-Z]\+ |
| OK 1 - \C[^A-Z]\+ |
| OK 2 - \C[^A-Z]\+ |
| --- 650,721 ---- |
| OK 0 - .a\%$ |
| OK 1 - .a\%$ |
| OK 2 - .a\%$ |
| ! OK 0 - [0-7]\+ |
| ! OK 1 - [0-7]\+ |
| ! OK 2 - [0-7]\+ |
| ! OK 0 - [^0-7]\+ |
| ! OK 1 - [^0-7]\+ |
| ! OK 2 - [^0-7]\+ |
| ! OK 0 - [0-9]\+ |
| ! OK 1 - [0-9]\+ |
| ! OK 2 - [0-9]\+ |
| ! OK 0 - [^0-9]\+ |
| ! OK 1 - [^0-9]\+ |
| ! OK 2 - [^0-9]\+ |
| ! OK 0 - [0-9a-fA-F]\+ |
| ! OK 1 - [0-9a-fA-F]\+ |
| ! OK 2 - [0-9a-fA-F]\+ |
| OK 0 - [^0-9A-Fa-f]\+ |
| OK 1 - [^0-9A-Fa-f]\+ |
| OK 2 - [^0-9A-Fa-f]\+ |
| OK 0 - [a-z_A-Z0-9]\+ |
| OK 1 - [a-z_A-Z0-9]\+ |
| OK 2 - [a-z_A-Z0-9]\+ |
| ! OK 0 - [^a-z_A-Z0-9]\+ |
| ! OK 1 - [^a-z_A-Z0-9]\+ |
| ! OK 2 - [^a-z_A-Z0-9]\+ |
| ! OK 0 - [a-z_A-Z]\+ |
| ! OK 1 - [a-z_A-Z]\+ |
| ! OK 2 - [a-z_A-Z]\+ |
| ! OK 0 - [^a-z_A-Z]\+ |
| ! OK 1 - [^a-z_A-Z]\+ |
| ! OK 2 - [^a-z_A-Z]\+ |
| ! OK 0 - [a-z]\+ |
| ! OK 1 - [a-z]\+ |
| ! OK 2 - [a-z]\+ |
| ! OK 0 - [a-z]\+ |
| ! OK 1 - [a-z]\+ |
| ! OK 2 - [a-z]\+ |
| ! OK 0 - [^a-z]\+ |
| ! OK 1 - [^a-z]\+ |
| ! OK 2 - [^a-z]\+ |
| ! OK 0 - [^a-z]\+ |
| ! OK 1 - [^a-z]\+ |
| ! OK 2 - [^a-z]\+ |
| ! OK 0 - [a-zA-Z]\+ |
| ! OK 1 - [a-zA-Z]\+ |
| ! OK 2 - [a-zA-Z]\+ |
| ! OK 0 - [^a-zA-Z]\+ |
| ! OK 1 - [^a-zA-Z]\+ |
| ! OK 2 - [^a-zA-Z]\+ |
| ! OK 0 - [A-Z]\+ |
| ! OK 1 - [A-Z]\+ |
| ! OK 2 - [A-Z]\+ |
| ! OK 0 - [^A-Z]\+ |
| ! OK 1 - [^A-Z]\+ |
| ! OK 2 - [^A-Z]\+ |
| ! OK 0 - [a-z]\+\c |
| ! OK 1 - [a-z]\+\c |
| ! OK 2 - [a-z]\+\c |
| ! OK 0 - [A-Z]\+\c |
| ! OK 1 - [A-Z]\+\c |
| ! OK 2 - [A-Z]\+\c |
| ! OK 0 - \c[^a-z]\+ |
| ! OK 1 - \c[^a-z]\+ |
| ! OK 2 - \c[^a-z]\+ |
| ! OK 0 - \c[^A-Z]\+ |
| ! OK 1 - \c[^A-Z]\+ |
| ! OK 2 - \c[^A-Z]\+ |
| OK 0 - \C[^A-Z]\+ |
| OK 1 - \C[^A-Z]\+ |
| OK 2 - \C[^A-Z]\+ |
| |
| |
| |
| *** 729,730 **** |
| --- 729,732 ---- |
| { /* Add new patch number below this line */ |
| + /**/ |
| + 1, |
| /**/ |
| |
| -- |
| How many light bulbs does it take to change a person? |
| |
| /// Bram Moolenaar -- Bram@Moolenaar.net -- http://www.Moolenaar.net \\\ |
| /// sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\ |
| \\\ an exciting new programming language -- http://www.Zimbu.org /// |
| \\\ help me help AIDS victims -- http://ICCF-Holland.org /// |