| To: vim_dev@googlegroups.com |
| Subject: Patch 7.3.1137 |
| Fcc: outbox |
| From: Bram Moolenaar <Bram@moolenaar.net> |
| Mime-Version: 1.0 |
| Content-Type: text/plain; charset=UTF-8 |
| Content-Transfer-Encoding: 8bit |
| |
| |
| Patch 7.3.1137 |
| Problem: New regexp engine: collections are slow. |
| Solution: Handle all characters in one go. |
| Files: src/regexp_nfa.c |
| |
| |
| |
| |
| |
| *** 34,48 **** |
| NFA_SPLIT = -1024, |
| NFA_MATCH, |
| NFA_SKIP_CHAR, /* matches a 0-length char */ |
| - NFA_END_NEG_RANGE, /* Used when expanding [^ab] */ |
| |
| ! NFA_CONCAT, |
| NFA_OR, |
| NFA_STAR, /* greedy * */ |
| NFA_STAR_NONGREEDY, /* non-greedy * */ |
| NFA_QUEST, /* greedy \? */ |
| NFA_QUEST_NONGREEDY, /* non-greedy \? */ |
| - NFA_NOT, /* used for [^ab] negated char ranges */ |
| |
| NFA_BOL, /* ^ Begin line */ |
| NFA_EOL, /* $ End line */ |
| --- 34,56 ---- |
| NFA_SPLIT = -1024, |
| NFA_MATCH, |
| NFA_SKIP_CHAR, /* matches a 0-length char */ |
| |
| ! NFA_START_COLL, /* [abc] start */ |
| ! NFA_END_COLL, /* [abc] end */ |
| ! NFA_START_NEG_COLL, /* [^abc] start */ |
| ! NFA_END_NEG_COLL, /* [^abc] end (only used in postfix) */ |
| ! NFA_RANGE, /* range of the two previous items (only |
| ! * used in postfix) */ |
| ! NFA_RANGE_MIN, /* low end of a range */ |
| ! NFA_RANGE_MAX, /* high end of a range */ |
| ! |
| ! NFA_CONCAT, /* concatenate two previous items (only |
| ! * used in postfix) */ |
| NFA_OR, |
| NFA_STAR, /* greedy * */ |
| NFA_STAR_NONGREEDY, /* non-greedy * */ |
| NFA_QUEST, /* greedy \? */ |
| NFA_QUEST_NONGREEDY, /* non-greedy \? */ |
| |
| NFA_BOL, /* ^ Begin line */ |
| NFA_EOL, /* $ End line */ |
| |
| *** 260,266 **** |
| static int nfa_get_reganch __ARGS((nfa_state_T *start, int depth)); |
| static int nfa_get_regstart __ARGS((nfa_state_T *start, int depth)); |
| static int nfa_recognize_char_class __ARGS((char_u *start, char_u *end, int extra_newl)); |
| ! static int nfa_emit_equi_class __ARGS((int c, int neg)); |
| static int nfa_regatom __ARGS((void)); |
| static int nfa_regpiece __ARGS((void)); |
| static int nfa_regconcat __ARGS((void)); |
| --- 268,274 ---- |
| static int nfa_get_reganch __ARGS((nfa_state_T *start, int depth)); |
| static int nfa_get_regstart __ARGS((nfa_state_T *start, int depth)); |
| static int nfa_recognize_char_class __ARGS((char_u *start, char_u *end, int extra_newl)); |
| ! static int nfa_emit_equi_class __ARGS((int c)); |
| static int nfa_regatom __ARGS((void)); |
| static int nfa_regpiece __ARGS((void)); |
| static int nfa_regconcat __ARGS((void)); |
| |
| *** 664,684 **** |
| * NOTE! When changing this function, also update reg_equi_class() |
| */ |
| static int |
| ! nfa_emit_equi_class(c, neg) |
| int c; |
| - int neg; |
| { |
| ! int first = TRUE; |
| ! int glue = neg == TRUE ? NFA_CONCAT : NFA_OR; |
| ! #define EMIT2(c) \ |
| ! EMIT(c); \ |
| ! if (neg == TRUE) { \ |
| ! EMIT(NFA_NOT); \ |
| ! } \ |
| ! if (first == FALSE) \ |
| ! EMIT(glue); \ |
| ! else \ |
| ! first = FALSE; \ |
| |
| #ifdef FEAT_MBYTE |
| if (enc_utf8 || STRCMP(p_enc, "latin1") == 0 |
| --- 672,681 ---- |
| * NOTE! When changing this function, also update reg_equi_class() |
| */ |
| static int |
| ! nfa_emit_equi_class(c) |
| int c; |
| { |
| ! #define EMIT2(c) EMIT(c); EMIT(NFA_CONCAT); |
| |
| #ifdef FEAT_MBYTE |
| if (enc_utf8 || STRCMP(p_enc, "latin1") == 0 |
| |
| *** 687,770 **** |
| { |
| switch (c) |
| { |
| ! case 'A': case '\300': case '\301': case '\302': |
| ! case '\303': case '\304': case '\305': |
| ! EMIT2('A'); EMIT2('\300'); EMIT2('\301'); |
| ! EMIT2('\302'); EMIT2('\303'); EMIT2('\304'); |
| ! EMIT2('\305'); |
| return OK; |
| |
| ! case 'C': case '\307': |
| ! EMIT2('C'); EMIT2('\307'); |
| return OK; |
| |
| ! case 'E': case '\310': case '\311': case '\312': case '\313': |
| ! EMIT2('E'); EMIT2('\310'); EMIT2('\311'); |
| ! EMIT2('\312'); EMIT2('\313'); |
| return OK; |
| |
| ! case 'I': case '\314': case '\315': case '\316': case '\317': |
| ! EMIT2('I'); EMIT2('\314'); EMIT2('\315'); |
| ! EMIT2('\316'); EMIT2('\317'); |
| return OK; |
| |
| ! case 'N': case '\321': |
| ! EMIT2('N'); EMIT2('\321'); |
| return OK; |
| |
| ! case 'O': case '\322': case '\323': case '\324': case '\325': |
| ! case '\326': |
| ! EMIT2('O'); EMIT2('\322'); EMIT2('\323'); |
| ! EMIT2('\324'); EMIT2('\325'); EMIT2('\326'); |
| return OK; |
| |
| ! case 'U': case '\331': case '\332': case '\333': case '\334': |
| ! EMIT2('U'); EMIT2('\331'); EMIT2('\332'); |
| ! EMIT2('\333'); EMIT2('\334'); |
| return OK; |
| |
| ! case 'Y': case '\335': |
| ! EMIT2('Y'); EMIT2('\335'); |
| return OK; |
| |
| ! case 'a': case '\340': case '\341': case '\342': |
| ! case '\343': case '\344': case '\345': |
| ! EMIT2('a'); EMIT2('\340'); EMIT2('\341'); |
| ! EMIT2('\342'); EMIT2('\343'); EMIT2('\344'); |
| ! EMIT2('\345'); |
| return OK; |
| |
| ! case 'c': case '\347': |
| ! EMIT2('c'); EMIT2('\347'); |
| return OK; |
| |
| ! case 'e': case '\350': case '\351': case '\352': case '\353': |
| ! EMIT2('e'); EMIT2('\350'); EMIT2('\351'); |
| ! EMIT2('\352'); EMIT2('\353'); |
| return OK; |
| |
| ! case 'i': case '\354': case '\355': case '\356': case '\357': |
| ! EMIT2('i'); EMIT2('\354'); EMIT2('\355'); |
| ! EMIT2('\356'); EMIT2('\357'); |
| return OK; |
| |
| ! case 'n': case '\361': |
| ! EMIT2('n'); EMIT2('\361'); |
| return OK; |
| |
| ! case 'o': case '\362': case '\363': case '\364': case '\365': |
| ! case '\366': |
| ! EMIT2('o'); EMIT2('\362'); EMIT2('\363'); |
| ! EMIT2('\364'); EMIT2('\365'); EMIT2('\366'); |
| return OK; |
| |
| ! case 'u': case '\371': case '\372': case '\373': case '\374': |
| ! EMIT2('u'); EMIT2('\371'); EMIT2('\372'); |
| ! EMIT2('\373'); EMIT2('\374'); |
| return OK; |
| |
| ! case 'y': case '\375': case '\377': |
| ! EMIT2('y'); EMIT2('\375'); EMIT2('\377'); |
| return OK; |
| |
| default: |
| --- 684,767 ---- |
| { |
| switch (c) |
| { |
| ! case 'A': case 0300: case 0301: case 0302: |
| ! case 0303: case 0304: case 0305: |
| ! EMIT2('A'); EMIT2(0300); EMIT2(0301); |
| ! EMIT2(0302); EMIT2(0303); EMIT2(0304); |
| ! EMIT2(0305); |
| return OK; |
| |
| ! case 'C': case 0307: |
| ! EMIT2('C'); EMIT2(0307); |
| return OK; |
| |
| ! case 'E': case 0310: case 0311: case 0312: case 0313: |
| ! EMIT2('E'); EMIT2(0310); EMIT2(0311); |
| ! EMIT2(0312); EMIT2(0313); |
| return OK; |
| |
| ! case 'I': case 0314: case 0315: case 0316: case 0317: |
| ! EMIT2('I'); EMIT2(0314); EMIT2(0315); |
| ! EMIT2(0316); EMIT2(0317); |
| return OK; |
| |
| ! case 'N': case 0321: |
| ! EMIT2('N'); EMIT2(0321); |
| return OK; |
| |
| ! case 'O': case 0322: case 0323: case 0324: case 0325: |
| ! case 0326: |
| ! EMIT2('O'); EMIT2(0322); EMIT2(0323); |
| ! EMIT2(0324); EMIT2(0325); EMIT2(0326); |
| return OK; |
| |
| ! case 'U': case 0331: case 0332: case 0333: case 0334: |
| ! EMIT2('U'); EMIT2(0331); EMIT2(0332); |
| ! EMIT2(0333); EMIT2(0334); |
| return OK; |
| |
| ! case 'Y': case 0335: |
| ! EMIT2('Y'); EMIT2(0335); |
| return OK; |
| |
| ! case 'a': case 0340: case 0341: case 0342: |
| ! case 0343: case 0344: case 0345: |
| ! EMIT2('a'); EMIT2(0340); EMIT2(0341); |
| ! EMIT2(0342); EMIT2(0343); EMIT2(0344); |
| ! EMIT2(0345); |
| return OK; |
| |
| ! case 'c': case 0347: |
| ! EMIT2('c'); EMIT2(0347); |
| return OK; |
| |
| ! case 'e': case 0350: case 0351: case 0352: case 0353: |
| ! EMIT2('e'); EMIT2(0350); EMIT2(0351); |
| ! EMIT2(0352); EMIT2(0353); |
| return OK; |
| |
| ! case 'i': case 0354: case 0355: case 0356: case 0357: |
| ! EMIT2('i'); EMIT2(0354); EMIT2(0355); |
| ! EMIT2(0356); EMIT2(0357); |
| return OK; |
| |
| ! case 'n': case 0361: |
| ! EMIT2('n'); EMIT2(0361); |
| return OK; |
| |
| ! case 'o': case 0362: case 0363: case 0364: case 0365: |
| ! case 0366: |
| ! EMIT2('o'); EMIT2(0362); EMIT2(0363); |
| ! EMIT2(0364); EMIT2(0365); EMIT2(0366); |
| return OK; |
| |
| ! case 'u': case 0371: case 0372: case 0373: case 0374: |
| ! EMIT2('u'); EMIT2(0371); EMIT2(0372); |
| ! EMIT2(0373); EMIT2(0374); |
| return OK; |
| |
| ! case 'y': case 0375: case 0377: |
| ! EMIT2('y'); EMIT2(0375); EMIT2(0377); |
| return OK; |
| |
| default: |
| |
| *** 811,824 **** |
| char_u *old_regparse = regparse; |
| #endif |
| int extra = 0; |
| - int first; |
| int emit_range; |
| int negated; |
| int result; |
| int startc = -1; |
| int endc = -1; |
| int oldstartc = -1; |
| - int glue; /* ID that will "glue" nodes together */ |
| |
| c = getchr(); |
| switch (c) |
| --- 808,819 ---- |
| |
| *** 927,934 **** |
| |
| case Magic('n'): |
| if (reg_string) |
| ! /* In a string "\n" matches a newline character. */ |
| ! EMIT(NL); |
| else |
| { |
| /* In buffer text "\n" matches the end of a line. */ |
| --- 922,929 ---- |
| |
| case Magic('n'): |
| if (reg_string) |
| ! /* In a string "\n" matches a newline character. */ |
| ! EMIT(NL); |
| else |
| { |
| /* In buffer text "\n" matches the end of a line. */ |
| |
| *** 1160,1191 **** |
| case Magic('['): |
| collection: |
| /* |
| ! * Glue is emitted between several atoms from the []. |
| ! * It is either NFA_OR, or NFA_CONCAT. |
| ! * |
| ! * [abc] expands to 'a b NFA_OR c NFA_OR' (in postfix notation) |
| ! * [^abc] expands to 'a NFA_NOT b NFA_NOT NFA_CONCAT c NFA_NOT |
| ! * NFA_CONCAT NFA_END_NEG_RANGE NFA_CONCAT' (in postfix |
| ! * notation) |
| ! * |
| */ |
| |
| - |
| - /* Emit negation atoms, if needed. |
| - * The CONCAT below merges the NOT with the previous node. */ |
| - #define TRY_NEG() \ |
| - if (negated == TRUE) \ |
| - { \ |
| - EMIT(NFA_NOT); \ |
| - } |
| - |
| - /* Emit glue between important nodes : CONCAT or OR. */ |
| - #define EMIT_GLUE() \ |
| - if (first == FALSE) \ |
| - EMIT(glue); \ |
| - else \ |
| - first = FALSE; |
| - |
| p = regparse; |
| endp = skip_anyof(p); |
| if (*endp == ']') |
| --- 1155,1169 ---- |
| case Magic('['): |
| collection: |
| /* |
| ! * [abc] uses NFA_START_COLL - NFA_END_COLL |
| ! * [^abc] uses NFA_START_NEG_COLL - NFA_END_NEG_COLL |
| ! * Each character is produced as a regular state, using |
| ! * NFA_CONCAT to bind them together. |
| ! * Besides normal characters there can be: |
| ! * - character classes NFA_CLASS_* |
| ! * - ranges, two characters followed by NFA_RANGE. |
| */ |
| |
| p = regparse; |
| endp = skip_anyof(p); |
| if (*endp == ']') |
| |
| *** 1216,1236 **** |
| * version that turns [abc] into 'a' OR 'b' OR 'c' |
| */ |
| startc = endc = oldstartc = -1; |
| - first = TRUE; /* Emitting first atom in this sequence? */ |
| negated = FALSE; |
| - glue = NFA_OR; |
| if (*regparse == '^') /* negated range */ |
| { |
| negated = TRUE; |
| - glue = NFA_CONCAT; |
| mb_ptr_adv(regparse); |
| } |
| if (*regparse == '-') |
| { |
| startc = '-'; |
| EMIT(startc); |
| ! TRY_NEG(); |
| ! EMIT_GLUE(); |
| mb_ptr_adv(regparse); |
| } |
| /* Emit the OR branches for each character in the [] */ |
| --- 1194,1213 ---- |
| * version that turns [abc] into 'a' OR 'b' OR 'c' |
| */ |
| startc = endc = oldstartc = -1; |
| negated = FALSE; |
| if (*regparse == '^') /* negated range */ |
| { |
| negated = TRUE; |
| mb_ptr_adv(regparse); |
| + EMIT(NFA_START_NEG_COLL); |
| } |
| + else |
| + EMIT(NFA_START_COLL); |
| if (*regparse == '-') |
| { |
| startc = '-'; |
| EMIT(startc); |
| ! EMIT(NFA_CONCAT); |
| mb_ptr_adv(regparse); |
| } |
| /* Emit the OR branches for each character in the [] */ |
| |
| *** 1306,1325 **** |
| EMIT(NFA_CLASS_ESCAPE); |
| break; |
| } |
| ! TRY_NEG(); |
| ! EMIT_GLUE(); |
| continue; |
| } |
| /* Try equivalence class [=a=] and the like */ |
| if (equiclass != 0) |
| { |
| ! result = nfa_emit_equi_class(equiclass, negated); |
| if (result == FAIL) |
| { |
| /* should never happen */ |
| EMSG_RET_FAIL(_("E868: Error building NFA with equivalence class!")); |
| } |
| - EMIT_GLUE(); |
| continue; |
| } |
| /* Try collating class like [. .] */ |
| --- 1283,1300 ---- |
| EMIT(NFA_CLASS_ESCAPE); |
| break; |
| } |
| ! EMIT(NFA_CONCAT); |
| continue; |
| } |
| /* Try equivalence class [=a=] and the like */ |
| if (equiclass != 0) |
| { |
| ! result = nfa_emit_equi_class(equiclass); |
| if (result == FAIL) |
| { |
| /* should never happen */ |
| EMSG_RET_FAIL(_("E868: Error building NFA with equivalence class!")); |
| } |
| continue; |
| } |
| /* Try collating class like [. .] */ |
| |
| *** 1391,1409 **** |
| startc = oldstartc; |
| if (startc > endc) |
| EMSG_RET_FAIL(_(e_invrange)); |
| #ifdef FEAT_MBYTE |
| ! if (has_mbyte && ((*mb_char2len)(startc) > 1 |
| || (*mb_char2len)(endc) > 1)) |
| { |
| ! if (endc > startc + 256) |
| ! EMSG_RET_FAIL(_(e_invrange)); |
| ! /* Emit the range. "startc" was already emitted, so |
| ! * skip it. */ |
| for (c = startc + 1; c <= endc; c++) |
| { |
| EMIT(c); |
| ! TRY_NEG(); |
| ! EMIT_GLUE(); |
| } |
| } |
| else |
| --- 1366,1397 ---- |
| startc = oldstartc; |
| if (startc > endc) |
| EMSG_RET_FAIL(_(e_invrange)); |
| + |
| + if (endc > startc + 2) |
| + { |
| + /* Emit a range instead of the sequence of |
| + * individual characters. */ |
| + if (startc == 0) |
| + /* \x00 is translated to \x0a, start at \x01. */ |
| + EMIT(1); |
| + else |
| + --post_ptr; /* remove NFA_CONCAT */ |
| + EMIT(endc); |
| + EMIT(NFA_RANGE); |
| + EMIT(NFA_CONCAT); |
| + } |
| + else |
| #ifdef FEAT_MBYTE |
| ! if (has_mbyte && ((*mb_char2len)(startc) > 1 |
| || (*mb_char2len)(endc) > 1)) |
| { |
| ! /* Emit the characters in the range. |
| ! * "startc" was already emitted, so skip it. |
| ! * */ |
| for (c = startc + 1; c <= endc; c++) |
| { |
| EMIT(c); |
| ! EMIT(NFA_CONCAT); |
| } |
| } |
| else |
| |
| *** 1425,1432 **** |
| #endif |
| { |
| EMIT(c); |
| ! TRY_NEG(); |
| ! EMIT_GLUE(); |
| } |
| } |
| emit_range = FALSE; |
| --- 1413,1419 ---- |
| #endif |
| { |
| EMIT(c); |
| ! EMIT(NFA_CONCAT); |
| } |
| } |
| emit_range = FALSE; |
| |
| *** 1434,1456 **** |
| } |
| else |
| { |
| ! /* |
| ! * This char (startc) is not part of a range. Just |
| * emit it. |
| - * |
| * Normally, simply emit startc. But if we get char |
| * code=0 from a collating char, then replace it with |
| * 0x0a. |
| - * |
| * This is needed to completely mimic the behaviour of |
| ! * the backtracking engine. |
| ! */ |
| ! if (got_coll_char == TRUE && startc == 0) |
| ! EMIT(0x0a); |
| else |
| ! EMIT(startc); |
| ! TRY_NEG(); |
| ! EMIT_GLUE(); |
| } |
| |
| mb_ptr_adv(regparse); |
| --- 1421,1449 ---- |
| } |
| else |
| { |
| ! /* This char (startc) is not part of a range. Just |
| * emit it. |
| * Normally, simply emit startc. But if we get char |
| * code=0 from a collating char, then replace it with |
| * 0x0a. |
| * This is needed to completely mimic the behaviour of |
| ! * the backtracking engine. */ |
| ! if (startc == NFA_NEWL) |
| ! { |
| ! /* Line break can't be matched as part of the |
| ! * collection, add an OR below. But not for negated |
| ! * range. */ |
| ! if (!negated) |
| ! extra = ADD_NL; |
| ! } |
| else |
| ! { |
| ! if (got_coll_char == TRUE && startc == 0) |
| ! EMIT(0x0a); |
| ! else |
| ! EMIT(startc); |
| ! EMIT(NFA_CONCAT); |
| ! } |
| } |
| |
| mb_ptr_adv(regparse); |
| |
| *** 1460,1479 **** |
| if (*regparse == '-') /* if last, '-' is just a char */ |
| { |
| EMIT('-'); |
| ! TRY_NEG(); |
| ! EMIT_GLUE(); |
| } |
| mb_ptr_adv(regparse); |
| |
| /* skip the trailing ] */ |
| regparse = endp; |
| mb_ptr_adv(regparse); |
| if (negated == TRUE) |
| ! { |
| ! /* Mark end of negated char range */ |
| ! EMIT(NFA_END_NEG_RANGE); |
| ! EMIT(NFA_CONCAT); |
| ! } |
| |
| /* \_[] also matches \n but it's not negated */ |
| if (extra == ADD_NL) |
| --- 1453,1471 ---- |
| if (*regparse == '-') /* if last, '-' is just a char */ |
| { |
| EMIT('-'); |
| ! EMIT(NFA_CONCAT); |
| } |
| mb_ptr_adv(regparse); |
| |
| /* skip the trailing ] */ |
| regparse = endp; |
| mb_ptr_adv(regparse); |
| + |
| + /* Mark end of the collection. */ |
| if (negated == TRUE) |
| ! EMIT(NFA_END_NEG_COLL); |
| ! else |
| ! EMIT(NFA_END_COLL); |
| |
| /* \_[] also matches \n but it's not negated */ |
| if (extra == ADD_NL) |
| |
| *** 1532,1540 **** |
| } |
| } |
| |
| - #undef TRY_NEG |
| - #undef EMIT_GLUE |
| - |
| return OK; |
| } |
| |
| --- 1524,1529 ---- |
| |
| *** 2091,2100 **** |
| case NFA_STAR_NONGREEDY: STRCPY(code, "NFA_STAR_NONGREEDY "); break; |
| case NFA_QUEST: STRCPY(code, "NFA_QUEST"); break; |
| case NFA_QUEST_NONGREEDY: STRCPY(code, "NFA_QUEST_NON_GREEDY"); break; |
| - case NFA_NOT: STRCPY(code, "NFA_NOT "); break; |
| case NFA_SKIP_CHAR: STRCPY(code, "NFA_SKIP_CHAR"); break; |
| case NFA_OR: STRCPY(code, "NFA_OR"); break; |
| ! case NFA_END_NEG_RANGE: STRCPY(code, "NFA_END_NEG_RANGE"); break; |
| case NFA_CLASS_ALNUM: STRCPY(code, "NFA_CLASS_ALNUM"); break; |
| case NFA_CLASS_ALPHA: STRCPY(code, "NFA_CLASS_ALPHA"); break; |
| case NFA_CLASS_BLANK: STRCPY(code, "NFA_CLASS_BLANK"); break; |
| --- 2080,2096 ---- |
| case NFA_STAR_NONGREEDY: STRCPY(code, "NFA_STAR_NONGREEDY "); break; |
| case NFA_QUEST: STRCPY(code, "NFA_QUEST"); break; |
| case NFA_QUEST_NONGREEDY: STRCPY(code, "NFA_QUEST_NON_GREEDY"); break; |
| case NFA_SKIP_CHAR: STRCPY(code, "NFA_SKIP_CHAR"); break; |
| case NFA_OR: STRCPY(code, "NFA_OR"); break; |
| ! |
| ! case NFA_START_COLL: STRCPY(code, "NFA_START_COLL"); break; |
| ! case NFA_END_COLL: STRCPY(code, "NFA_END_COLL"); break; |
| ! case NFA_START_NEG_COLL: STRCPY(code, "NFA_START_NEG_COLL"); break; |
| ! case NFA_END_NEG_COLL: STRCPY(code, "NFA_END_NEG_COLL"); break; |
| ! case NFA_RANGE: STRCPY(code, "NFA_RANGE"); break; |
| ! case NFA_RANGE_MIN: STRCPY(code, "NFA_RANGE_MIN"); break; |
| ! case NFA_RANGE_MAX: STRCPY(code, "NFA_RANGE_MAX"); break; |
| ! |
| case NFA_CLASS_ALNUM: STRCPY(code, "NFA_CLASS_ALNUM"); break; |
| case NFA_CLASS_ALPHA: STRCPY(code, "NFA_CLASS_ALPHA"); break; |
| case NFA_CLASS_BLANK: STRCPY(code, "NFA_CLASS_BLANK"); break; |
| |
| *** 2231,2238 **** |
| fprintf(debugf, " %s", p); |
| |
| nfa_set_code(state->c); |
| ! fprintf(debugf, "%s%s (%d) (id=%d)\n", |
| ! state->negated ? "NOT " : "", code, state->c, abs(state->id)); |
| if (state->id < 0) |
| return; |
| |
| --- 2227,2238 ---- |
| fprintf(debugf, " %s", p); |
| |
| nfa_set_code(state->c); |
| ! fprintf(debugf, "%s%s (%d) (id=%d) val=%d\n", |
| ! state->negated ? "NOT " : "", |
| ! code, |
| ! state->c, |
| ! abs(state->id), |
| ! state->val); |
| if (state->id < 0) |
| return; |
| |
| |
| *** 2325,2330 **** |
| --- 2325,2331 ---- |
| s->c = c; |
| s->out = out; |
| s->out1 = out1; |
| + s->val = 0; |
| |
| s->id = istate; |
| s->lastlist[0] = 0; |
| |
| *** 2565,2577 **** |
| switch (*p) |
| { |
| case NFA_CONCAT: |
| ! /* Catenation. |
| ! * Pay attention: this operator does not exist |
| ! * in the r.e. itself (it is implicit, really). |
| ! * It is added when r.e. is translated to postfix |
| ! * form in re2post(). |
| ! * |
| ! * No new state added here. */ |
| if (nfa_calc_size == TRUE) |
| { |
| /* nstate += 0; */ |
| --- 2566,2575 ---- |
| switch (*p) |
| { |
| case NFA_CONCAT: |
| ! /* Concatenation. |
| ! * Pay attention: this operator does not exist in the r.e. itself |
| ! * (it is implicit, really). It is added when r.e. is translated |
| ! * to postfix form in re2post(). */ |
| if (nfa_calc_size == TRUE) |
| { |
| /* nstate += 0; */ |
| |
| *** 2583,2604 **** |
| PUSH(frag(e1.start, e2.out)); |
| break; |
| |
| - case NFA_NOT: |
| - /* Negation of a character */ |
| - if (nfa_calc_size == TRUE) |
| - { |
| - /* nstate += 0; */ |
| - break; |
| - } |
| - e1 = POP(); |
| - e1.start->negated = TRUE; |
| - #ifdef FEAT_MBYTE |
| - if (e1.start->c == NFA_COMPOSING) |
| - e1.start->out1->negated = TRUE; |
| - #endif |
| - PUSH(e1); |
| - break; |
| - |
| case NFA_OR: |
| /* Alternation */ |
| if (nfa_calc_size == TRUE) |
| --- 2581,2586 ---- |
| |
| *** 2672,2677 **** |
| --- 2654,2696 ---- |
| PUSH(frag(s, append(e.out, list1(&s->out)))); |
| break; |
| |
| + case NFA_END_COLL: |
| + case NFA_END_NEG_COLL: |
| + /* On the stack is the sequence starting with NFA_START_COLL or |
| + * NFA_START_NEG_COLL and all possible characters. Patch it to |
| + * add the output to the start. */ |
| + if (nfa_calc_size == TRUE) |
| + { |
| + nstate++; |
| + break; |
| + } |
| + e = POP(); |
| + s = alloc_state(NFA_END_COLL, NULL, NULL); |
| + if (s == NULL) |
| + goto theend; |
| + patch(e.out, s); |
| + e.start->out1 = s; |
| + PUSH(frag(e.start, list1(&s->out))); |
| + break; |
| + |
| + case NFA_RANGE: |
| + /* Before this are two characters, the low and high end of a |
| + * range. Turn them into two states with MIN and MAX. */ |
| + if (nfa_calc_size == TRUE) |
| + { |
| + /* nstate += 0; */ |
| + break; |
| + } |
| + e2 = POP(); |
| + e1 = POP(); |
| + e2.start->val = e2.start->c; |
| + e2.start->c = NFA_RANGE_MAX; |
| + e1.start->val = e1.start->c; |
| + e1.start->c = NFA_RANGE_MIN; |
| + patch(e1.out, e2.start); |
| + PUSH(frag(e1.start, e2.out)); |
| + break; |
| + |
| case NFA_SKIP_CHAR: |
| /* Symbol of 0-length, Used in a repetition |
| * with max/min count of 0 */ |
| |
| *** 2990,2995 **** |
| --- 3009,3016 ---- |
| matchstate = &state_ptr[istate++]; /* the match state */ |
| matchstate->c = NFA_MATCH; |
| matchstate->out = matchstate->out1 = NULL; |
| + matchstate->negated = FALSE; |
| + matchstate->id = 0; |
| |
| patch(e.out, matchstate); |
| ret = e.start; |
| |
| *** 3308,3314 **** |
| switch (state->c) |
| { |
| case NFA_SPLIT: |
| - case NFA_NOT: |
| case NFA_NOPEN: |
| case NFA_SKIP_CHAR: |
| case NFA_NCLOSE: |
| --- 3329,3334 ---- |
| |
| *** 3782,3788 **** |
| |
| default: |
| /* should not be here :P */ |
| ! EMSG_RET_FAIL(_("E877: (NFA regexp) Invalid character class ")); |
| } |
| return FAIL; |
| } |
| --- 3802,3809 ---- |
| |
| default: |
| /* should not be here :P */ |
| ! EMSGN("E877: (NFA regexp) Invalid character class: %ld", class); |
| ! return FAIL; |
| } |
| return FAIL; |
| } |
| |
| *** 4320,4327 **** |
| addstate(thislist, start, m, 0); |
| |
| /* There are two cases when the NFA advances: 1. input char matches the |
| ! * NFA node and 2. input char does not match the NFA node, but the next |
| ! * node is NFA_NOT. The following macro calls addstate() according to |
| * these rules. It is used A LOT, so use the "listtbl" table for speed */ |
| listtbl[0][0] = NULL; |
| listtbl[0][1] = neglist; |
| --- 4341,4348 ---- |
| addstate(thislist, start, m, 0); |
| |
| /* There are two cases when the NFA advances: 1. input char matches the |
| ! * NFA node and 2. input char does not match the NFA node and the state |
| ! * has the negated flag. The following macro calls addstate() according to |
| * these rules. It is used A LOT, so use the "listtbl" table for speed */ |
| listtbl[0][0] = NULL; |
| listtbl[0][1] = neglist; |
| |
| *** 4845,4860 **** |
| ADD_POS_NEG_STATE(t->state); |
| break; |
| |
| ! case NFA_END_NEG_RANGE: |
| ! /* This follows a series of negated nodes, like: |
| ! * NOT CHAR(x), NOT CHAR(y), etc. */ |
| ! if (curc > 0) |
| { |
| ll = nextlist; |
| ! add_state = t->state->out; |
| add_off = clen; |
| } |
| break; |
| |
| case NFA_ANY: |
| /* Any char except '\0', (end of input) does not match. */ |
| --- 4866,4944 ---- |
| ADD_POS_NEG_STATE(t->state); |
| break; |
| |
| ! case NFA_START_COLL: |
| ! case NFA_START_NEG_COLL: |
| ! { |
| ! /* What follows is a list of characters, until NFA_END_COLL. |
| ! * One of them must match or none of them must match. */ |
| ! nfa_state_T *state; |
| ! int result_if_matched; |
| ! int c1, c2; |
| ! |
| ! /* Never match EOL. If it's part of the collection it is added |
| ! * as a separate state with an OR. */ |
| ! if (curc == NUL) |
| ! break; |
| ! |
| ! state = t->state->out; |
| ! result_if_matched = (t->state->c == NFA_START_COLL); |
| ! for (;;) |
| { |
| + if (state->c == NFA_END_COLL) |
| + { |
| + result = !result_if_matched; |
| + break; |
| + } |
| + if (state->c == NFA_RANGE_MIN) |
| + { |
| + c1 = state->val; |
| + state = state->out; /* advance to NFA_RANGE_MAX */ |
| + c2 = state->val; |
| + #ifdef ENABLE_LOG |
| + fprintf(log_fd, "NFA_RANGE_MIN curc=%d c1=%d c2=%d\n", |
| + curc, c1, c2); |
| + #endif |
| + if (curc >= c1 && curc <= c2) |
| + { |
| + result = result_if_matched; |
| + break; |
| + } |
| + if (ireg_ic) |
| + { |
| + int curc_low = MB_TOLOWER(curc); |
| + int done = FALSE; |
| + |
| + for ( ; c1 <= c2; ++c1) |
| + if (MB_TOLOWER(c1) == curc_low) |
| + { |
| + result = result_if_matched; |
| + done = TRUE; |
| + break; |
| + } |
| + if (done) |
| + break; |
| + } |
| + } |
| + else if (state->c < 0 ? check_char_class(state->c, curc) |
| + : (curc == state->c |
| + || (ireg_ic && MB_TOLOWER(curc) |
| + == MB_TOLOWER(state->c)))) |
| + { |
| + result = result_if_matched; |
| + break; |
| + } |
| + state = state->out; |
| + } |
| + if (result) |
| + { |
| + /* next state is in out of the NFA_END_COLL, out1 of |
| + * START points to the END state */ |
| ll = nextlist; |
| ! add_state = t->state->out1->out; |
| add_off = clen; |
| } |
| break; |
| + } |
| |
| case NFA_ANY: |
| /* Any char except '\0', (end of input) does not match. */ |
| |
| |
| |
| *** 730,731 **** |
| --- 730,733 ---- |
| { /* Add new patch number below this line */ |
| + /**/ |
| + 1137, |
| /**/ |
| |
| -- |
| From "know your smileys": |
| :.-( Crying |
| |
| /// Bram Moolenaar -- Bram@Moolenaar.net -- http://www.Moolenaar.net \\\ |
| /// sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\ |
| \\\ an exciting new programming language -- http://www.Zimbu.org /// |
| \\\ help me help AIDS victims -- http://ICCF-Holland.org /// |