diff --git a/jinja2/utils.py b/jinja2/utils.py index 49e9e9a..0e67d88 100644 --- a/jinja2/utils.py +++ b/jinja2/utils.py @@ -11,6 +11,8 @@ import re import sys import errno +from string import ascii_letters as _letters +from string import digits as _digits try: from thread import allocate_lock except ImportError: @@ -19,19 +21,6 @@ from collections import deque from itertools import imap -_word_split_re = re.compile(r'(\s+)') -_punctuation_re = re.compile( - '^(?P(?:%s)*)(?P.*?)(?P(?:%s)*)$' % ( - '|'.join(imap(re.escape, ('(', '<', '<'))), - '|'.join(imap(re.escape, ('.', ',', ')', '>', '\n', '>'))) - ) -) -_simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$') -_striptags_re = re.compile(r'(|<[^>]*>)') -_entity_re = re.compile(r'&([^;]+);') -_letters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' -_digits = '0123456789' - # special singleton representing missing values for the runtime missing = type('MissingType', (), {'__repr__': lambda x: 'missing'})() @@ -271,32 +260,61 @@ def urlize(text, trim_url_limit=None, nofollow=False): trim_url = lambda x, limit=trim_url_limit: limit is not None \ and (x[:limit] + (len(x) >=limit and '...' or '')) or x - words = _word_split_re.split(unicode(escape(text))) + words = re.split(r"(\s+)", unicode(escape(text))) nofollow_attr = nofollow and ' rel="nofollow"' or '' for i, word in enumerate(words): - match = _punctuation_re.match(word) + head, middle, tail = "", word, "" + match = re.match(r"^([(<]|<)+", middle) + if match: - lead, middle, trail = match.groups() - if middle.startswith('www.') or ( - '@' not in middle and - not middle.startswith('http://') and - len(middle) > 0 and - middle[0] in _letters + _digits and ( - middle.endswith('.org') or - middle.endswith('.net') or - middle.endswith('.com') - )): - middle = '%s' % (middle, - nofollow_attr, trim_url(middle)) - if middle.startswith('http://') or \ - middle.startswith('https://'): - middle = '%s' % (middle, - nofollow_attr, trim_url(middle)) - if '@' in middle and not middle.startswith('www.') and \ - not ':' in middle and _simple_email_re.match(middle): - middle = '%s' % (middle, middle) - if lead + middle + trail != word: - words[i] = lead + middle + trail + head = match.group() + middle = middle[match.end() :] + + # Unlike lead, which is anchored to the start of the string, + # need to check that the string ends with any of the characters + # before trying to match all of them, to avoid backtracking. + if middle.endswith((")", ">", ".", ",", "\n", ">")): + match = re.search(r"([)>.,\n]|>)+$", middle) + + if match: + tail = match.group() + middle = middle[: match.start()] + + if middle.startswith("www.") or ( + "@" not in middle + and not middle.startswith("http://") + and not middle.startswith("https://") + and len(middle) > 0 + and middle[0] in _letters + _digits + and ( + middle.endswith(".org") + or middle.endswith(".net") + or middle.endswith(".com") + ) + ): + middle = '%s' % ( + middle, + nofollow_attr, + trim_url(middle), + ) + + if middle.startswith("http://") or middle.startswith("https://"): + middle = '%s' % ( + middle, + nofollow_attr, + trim_url(middle), + ) + + if ( + "@" in middle + and not middle.startswith("www.") + and ":" not in middle + and re.match(r"^\S@\w[\w.-]*\.\w$", middle) + ): + middle = '%s' % (middle, middle) + + words[i] = head + middle + tail + return u''.join(words)