From 725268a12dacbb153dacf9d8cc22cfe37ff230ff Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Wed, 6 Jan 2021 10:20:09 +0100 Subject: [PATCH] CVE-2020-27783 --- src/lxml/html/clean.py | 29 ++++++++++++++++++++--------- src/lxml/html/tests/test_clean.py | 20 ++++++++++++++++++++ src/lxml/html/tests/test_clean.txt | 12 ++++++++++-- 3 files changed, 50 insertions(+), 11 deletions(-) diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py index adc3f45..0492fca 100644 --- a/src/lxml/html/clean.py +++ b/src/lxml/html/clean.py @@ -4,8 +4,9 @@ Removes unwanted tags and content. See the `Cleaner` class for details. """ -import re import copy +import re +import sys try: from urlparse import urlsplit except ImportError: @@ -61,12 +62,16 @@ __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html', # This is an IE-specific construct you can have in a stylesheet to # run some Javascript: -_css_javascript_re = re.compile( - r'expression\s*\(.*?\)', re.S|re.I) +_replace_css_javascript = re.compile( + r'expression\s*\(.*?\)', re.S|re.I).sub # Do I have to worry about @\nimport? -_css_import_re = re.compile( - r'@\s*import', re.I) +_replace_css_import = re.compile( + r'@\s*import', re.I).sub + +_looks_like_tag_content = re.compile( + r'= 3 else ())).search # All kinds of schemes besides just javascript: that can cause # execution: @@ -292,8 +297,8 @@ class Cleaner(object): if not self.inline_style: for el in _find_styled_elements(doc): old = el.get('style') - new = _css_javascript_re.sub('', old) - new = _css_import_re.sub('', new) + new = _replace_css_javascript('', old) + new = _replace_css_import('', new) if self._has_sneaky_javascript(new): # Something tricky is going on... del el.attrib['style'] @@ -305,9 +310,9 @@ class Cleaner(object): el.drop_tree() continue old = el.text or '' - new = _css_javascript_re.sub('', old) + new = _replace_css_javascript('', old) # The imported CSS can do anything; we just can't allow: - new = _css_import_re.sub('', old) + new = _replace_css_import('', new) if self._has_sneaky_javascript(new): # Something tricky is going on... el.text = '/* deleted */' @@ -509,6 +514,12 @@ class Cleaner(object): return True if 'expression(' in style: return True + if '' + return True return False def clean_html(self, html): diff --git a/src/lxml/html/tests/test_clean.py b/src/lxml/html/tests/test_clean.py index 3bcaaf5..451eec2 100644 --- a/src/lxml/html/tests/test_clean.py +++ b/src/lxml/html/tests/test_clean.py @@ -69,6 +69,26 @@ class CleanerTest(unittest.TestCase): s = lxml.html.fromstring('child') self.assertEqual('child', clean_html(s).text_content()) + def test_sneaky_noscript_in_style(self): + # This gets parsed as through into the output. + html = '', + lxml.html.tostring(clean_html(s))) + + def test_sneaky_js_in_math_style(self): + # This gets parsed as -> + # thus passing any tag/script/whatever content through into the output. + html = '' + s = lxml.html.fragment_fromstring(html) + + self.assertEqual( + b'', + lxml.html.tostring(clean_html(s))) + def test_suite(): suite = unittest.TestSuite() diff --git a/src/lxml/html/tests/test_clean.txt b/src/lxml/html/tests/test_clean.txt index c78ab4f..c901871 100644 --- a/src/lxml/html/tests/test_clean.txt +++ b/src/lxml/html/tests/test_clean.txt @@ -104,7 +104,11 @@ >>> print(Cleaner(page_structure=False, safe_attrs_only=False).clean_html(doc)) - + a link @@ -168,7 +172,11 @@ - + a link -- 2.29.2