Skip to content

Commit a105ab8

Browse files
committed
Prevent combinations of <math/svg> and <style> to sneak JavaScript through the HTML cleaner.
1 parent c053dc1 commit a105ab8

File tree

4 files changed

+50
-11
lines changed

4 files changed

+50
-11
lines changed

CHANGES.txt

+11
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,17 @@
22
lxml changelog
33
==============
44

5+
4.6.2 (2020-11-26)
6+
==================
7+
8+
Bugs fixed
9+
----------
10+
11+
* A vulnerability (CVE-2020-27783) was discovered in the HTML Cleaner by Yaniv Nizry,
12+
which allowed JavaScript to pass through. The cleaner now removes more sneaky
13+
"style" content.
14+
15+
516
4.6.1 (2020-10-18)
617
==================
718

src/lxml/html/clean.py

+14-8
Original file line numberDiff line numberDiff line change
@@ -61,12 +61,15 @@
6161

6262
# This is an IE-specific construct you can have in a stylesheet to
6363
# run some Javascript:
64-
_css_javascript_re = re.compile(
65-
r'expression\s*\(.*?\)', re.S|re.I)
64+
_replace_css_javascript = re.compile(
65+
r'expression\s*\(.*?\)', re.S|re.I).sub
6666

6767
# Do I have to worry about @\nimport?
68-
_css_import_re = re.compile(
69-
r'@\s*import', re.I)
68+
_replace_css_import = re.compile(
69+
r'@\s*import', re.I).sub
70+
71+
_looks_like_tag_content = re.compile(
72+
r'</?[a-zA-Z]+|\son[a-zA-Z]+\s*=', re.ASCII).search
7073

7174
# All kinds of schemes besides just javascript: that can cause
7275
# execution:
@@ -304,8 +307,8 @@ def __call__(self, doc):
304307
if not self.inline_style:
305308
for el in _find_styled_elements(doc):
306309
old = el.get('style')
307-
new = _css_javascript_re.sub('', old)
308-
new = _css_import_re.sub('', new)
310+
new = _replace_css_javascript('', old)
311+
new = _replace_css_import('', new)
309312
if self._has_sneaky_javascript(new):
310313
# Something tricky is going on...
311314
del el.attrib['style']
@@ -317,9 +320,9 @@ def __call__(self, doc):
317320
el.drop_tree()
318321
continue
319322
old = el.text or ''
320-
new = _css_javascript_re.sub('', old)
323+
new = _replace_css_javascript('', old)
321324
# The imported CSS can do anything; we just can't allow:
322-
new = _css_import_re.sub('', old)
325+
new = _replace_css_import('', new)
323326
if self._has_sneaky_javascript(new):
324327
# Something tricky is going on...
325328
el.text = '/* deleted */'
@@ -539,6 +542,9 @@ def _has_sneaky_javascript(self, style):
539542
if '</noscript' in style:
540543
# e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
541544
return True
545+
if _looks_like_tag_content(style):
546+
# e.g. '<math><style><img src=x onerror=alert(1)></style></math>'
547+
return True
542548
return False
543549

544550
def clean_html(self, html):

src/lxml/html/tests/test_clean.py

+10
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,16 @@ def test_sneaky_noscript_in_style(self):
113113
b'<noscript><style>/* deleted */</style></noscript>',
114114
lxml.html.tostring(clean_html(s)))
115115

116+
def test_sneaky_js_in_math_style(self):
117+
# This gets parsed as <math> -> <style>"..."</style>
118+
# thus passing any tag/script/whatever content through into the output.
119+
html = '<math><style><img src=x onerror=alert(1)></style></math>'
120+
s = lxml.html.fragment_fromstring(html)
121+
122+
self.assertEqual(
123+
b'<math><style>/* deleted */</style></math>',
124+
lxml.html.tostring(clean_html(s)))
125+
116126

117127
def test_suite():
118128
suite = unittest.TestSuite()

src/lxml/html/tests/test_clean.txt

+15-3
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,11 @@
104104
>>> print(Cleaner(page_structure=False, comments=False).clean_html(doc))
105105
<html>
106106
<head>
107-
<style>/* deleted */</style>
107+
<style>
108+
body {background-image: url()};
109+
div {background-image: url()};
110+
div {color: };
111+
</style>
108112
</head>
109113
<body>
110114
<!-- I am interpreted for EVIL! -->
@@ -126,7 +130,11 @@
126130
>>> print(Cleaner(page_structure=False, safe_attrs_only=False).clean_html(doc))
127131
<html>
128132
<head>
129-
<style>/* deleted */</style>
133+
<style>
134+
body {background-image: url()};
135+
div {background-image: url()};
136+
div {color: };
137+
</style>
130138
</head>
131139
<body>
132140
<a href="">a link</a>
@@ -190,7 +198,11 @@
190198
<link rel="alternate" type="text/rss" src="evil-rss">
191199
<link rel="alternate" type="text/rss" href="http://example.com">
192200
<link rel="stylesheet" type="text/rss" href="http://example.com">
193-
<style>/* deleted */</style>
201+
<style>
202+
body {background-image: url()};
203+
div {background-image: url()};
204+
div {color: };
205+
</style>
194206
</head>
195207
<body>
196208
<a href="">a link</a>

0 commit comments

Comments
 (0)