c7fb88
From 725268a12dacbb153dacf9d8cc22cfe37ff230ff Mon Sep 17 00:00:00 2001
c7fb88
From: Lumir Balhar <lbalhar@redhat.com>
c7fb88
Date: Wed, 6 Jan 2021 10:20:09 +0100
c7fb88
Subject: [PATCH] CVE-2020-27783
c7fb88
c7fb88
---
c7fb88
 src/lxml/html/clean.py             | 29 ++++++++++++++++++++---------
c7fb88
 src/lxml/html/tests/test_clean.py  | 20 ++++++++++++++++++++
c7fb88
 src/lxml/html/tests/test_clean.txt | 12 ++++++++++--
c7fb88
 3 files changed, 50 insertions(+), 11 deletions(-)
c7fb88
c7fb88
diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py
c7fb88
index adc3f45..0492fca 100644
c7fb88
--- a/src/lxml/html/clean.py
c7fb88
+++ b/src/lxml/html/clean.py
c7fb88
@@ -4,8 +4,9 @@ Removes unwanted tags and content.  See the `Cleaner` class for
c7fb88
 details.
c7fb88
 """
c7fb88
 
c7fb88
-import re
c7fb88
 import copy
c7fb88
+import re
c7fb88
+import sys
c7fb88
 try:
c7fb88
     from urlparse import urlsplit
c7fb88
 except ImportError:
c7fb88
@@ -61,12 +62,16 @@ __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
c7fb88
 
c7fb88
 # This is an IE-specific construct you can have in a stylesheet to
c7fb88
 # run some Javascript:
c7fb88
-_css_javascript_re = re.compile(
c7fb88
-    r'expression\s*\(.*?\)', re.S|re.I)
c7fb88
+_replace_css_javascript = re.compile(
c7fb88
+    r'expression\s*\(.*?\)', re.S|re.I).sub
c7fb88
 
c7fb88
 # Do I have to worry about @\nimport?
c7fb88
-_css_import_re = re.compile(
c7fb88
-    r'@\s*import', re.I)
c7fb88
+_replace_css_import = re.compile(
c7fb88
+    r'@\s*import', re.I).sub
c7fb88
+
c7fb88
+_looks_like_tag_content = re.compile(
c7fb88
+    r'</?[a-zA-Z]+|\son[a-zA-Z]+\s*=',
c7fb88
+    *((re.ASCII,) if sys.version_info[0] >= 3 else ())).search
c7fb88
 
c7fb88
 # All kinds of schemes besides just javascript: that can cause
c7fb88
 # execution:
c7fb88
@@ -292,8 +297,8 @@ class Cleaner(object):
c7fb88
             if not self.inline_style:
c7fb88
                 for el in _find_styled_elements(doc):
c7fb88
                     old = el.get('style')
c7fb88
-                    new = _css_javascript_re.sub('', old)
c7fb88
-                    new = _css_import_re.sub('', new)
c7fb88
+                    new = _replace_css_javascript('', old)
c7fb88
+                    new = _replace_css_import('', new)
c7fb88
                     if self._has_sneaky_javascript(new):
c7fb88
                         # Something tricky is going on...
c7fb88
                         del el.attrib['style']
c7fb88
@@ -305,9 +310,9 @@ class Cleaner(object):
c7fb88
                         el.drop_tree()
c7fb88
                         continue
c7fb88
                     old = el.text or ''
c7fb88
-                    new = _css_javascript_re.sub('', old)
c7fb88
+                    new = _replace_css_javascript('', old)
c7fb88
                     # The imported CSS can do anything; we just can't allow:
c7fb88
-                    new = _css_import_re.sub('', old)
c7fb88
+                    new = _replace_css_import('', new)
c7fb88
                     if self._has_sneaky_javascript(new):
c7fb88
                         # Something tricky is going on...
c7fb88
                         el.text = '/* deleted */'
c7fb88
@@ -509,6 +514,12 @@ class Cleaner(object):
c7fb88
             return True
c7fb88
         if 'expression(' in style:
c7fb88
             return True
c7fb88
+        if '</noscript' in style:
c7fb88
+            # e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
c7fb88
+            return True
c7fb88
+        if _looks_like_tag_content(style):
c7fb88
+            # e.g. '<math><style><img src=x onerror=alert(1)></style></math>'
c7fb88
+            return True
c7fb88
         return False
c7fb88
 
c7fb88
     def clean_html(self, html):
c7fb88
diff --git a/src/lxml/html/tests/test_clean.py b/src/lxml/html/tests/test_clean.py
c7fb88
index 3bcaaf5..451eec2 100644
c7fb88
--- a/src/lxml/html/tests/test_clean.py
c7fb88
+++ b/src/lxml/html/tests/test_clean.py
c7fb88
@@ -69,6 +69,26 @@ class CleanerTest(unittest.TestCase):
c7fb88
         s = lxml.html.fromstring('<invalid tag>child</another>')
c7fb88
         self.assertEqual('child', clean_html(s).text_content())
c7fb88
 
c7fb88
+    def test_sneaky_noscript_in_style(self):
c7fb88
+        # This gets parsed as <noscript> -> <style>"...</noscript>..."</style>
c7fb88
+        # thus passing the </noscript> through into the output.
c7fb88
+        html = '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
c7fb88
+        s = lxml.html.fragment_fromstring(html)
c7fb88
+
c7fb88
+        self.assertEqual(
c7fb88
+            b'<noscript><style>/* deleted */</style></noscript>',
c7fb88
+            lxml.html.tostring(clean_html(s)))
c7fb88
+
c7fb88
+    def test_sneaky_js_in_math_style(self):
c7fb88
+        # This gets parsed as <math> -> <style>"..."</style>
c7fb88
+        # thus passing any tag/script/whatever content through into the output.
c7fb88
+        html = '<math><style><img src=x onerror=alert(1)></style></math>'
c7fb88
+        s = lxml.html.fragment_fromstring(html)
c7fb88
+
c7fb88
+        self.assertEqual(
c7fb88
+            b'<math><style>/* deleted */</style></math>',
c7fb88
+            lxml.html.tostring(clean_html(s)))
c7fb88
+
c7fb88
 
c7fb88
 def test_suite():
c7fb88
     suite = unittest.TestSuite()
c7fb88
diff --git a/src/lxml/html/tests/test_clean.txt b/src/lxml/html/tests/test_clean.txt
c7fb88
index c78ab4f..c901871 100644
c7fb88
--- a/src/lxml/html/tests/test_clean.txt
c7fb88
+++ b/src/lxml/html/tests/test_clean.txt
c7fb88
@@ -104,7 +104,11 @@
c7fb88
 >>> print(Cleaner(page_structure=False, safe_attrs_only=False).clean_html(doc))
c7fb88
 <html>
c7fb88
   <head>
c7fb88
-    <style>/* deleted */</style>
c7fb88
+    <style>
c7fb88
+      body {background-image: url()};
c7fb88
+      div {background-image: url()};
c7fb88
+      div {color: };
c7fb88
+    </style>
c7fb88
   </head>
c7fb88
   <body>
c7fb88
     a link
c7fb88
@@ -168,7 +172,11 @@
c7fb88
     <link rel="alternate" type="text/rss" src="evil-rss">
c7fb88
     <link rel="alternate" type="text/rss" href="http://example.com">
c7fb88
     <link rel="stylesheet" type="text/rss" href="http://example.com">
c7fb88
-    <style>/* deleted */</style>
c7fb88
+    <style>
c7fb88
+      body {background-image: url()};
c7fb88
+      div {background-image: url()};
c7fb88
+      div {color: };
c7fb88
+    </style>
c7fb88
   </head>
c7fb88
   <body>
c7fb88
     a link
c7fb88
-- 
c7fb88
2.29.2
c7fb88