Kaydet (Commit) 2e3607c1 authored tarafından Ezio Melotti's avatar Ezio Melotti

#7311: fix html.parser to accept non-ASCII attribute values.

üst 9b5ac3ef
...@@ -28,7 +28,7 @@ tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*') ...@@ -28,7 +28,7 @@ tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# make it correctly strict without breaking backward compatibility. # make it correctly strict without breaking backward compatibility.
attrfind = re.compile( attrfind = re.compile(
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?') r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile( attrfind_tolerant = re.compile(
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?') r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
......
...@@ -217,6 +217,23 @@ DOCTYPE html [ ...@@ -217,6 +217,23 @@ DOCTYPE html [
("starttag", "a", [("href", "mailto:xyz@example.com")]), ("starttag", "a", [("href", "mailto:xyz@example.com")]),
]) ])
def test_attr_nonascii(self):
# see issue 7311
self._run_check("<img src=/foo/bar.png alt=\u4e2d\u6587>", [
("starttag", "img", [("src", "/foo/bar.png"),
("alt", "\u4e2d\u6587")]),
])
self._run_check("<a title='\u30c6\u30b9\u30c8' "
"href='\u30c6\u30b9\u30c8.html'>", [
("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
("href", "\u30c6\u30b9\u30c8.html")]),
])
self._run_check('<a title="\u30c6\u30b9\u30c8" '
'href="\u30c6\u30b9\u30c8.html">', [
("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
("href", "\u30c6\u30b9\u30c8.html")]),
])
def test_attr_entity_replacement(self): def test_attr_entity_replacement(self):
self._run_check("""<a b='&amp;&gt;&lt;&quot;&apos;'>""", [ self._run_check("""<a b='&amp;&gt;&lt;&quot;&apos;'>""", [
("starttag", "a", [("b", "&><\"'")]), ("starttag", "a", [("b", "&><\"'")]),
......
...@@ -49,6 +49,8 @@ Core and Builtins ...@@ -49,6 +49,8 @@ Core and Builtins
Library Library
------- -------
- Issue #7311: fix html.parser to accept non-ASCII attribute values.
- Issue #11605: email.parser.BytesFeedParser was incorrectly converting multipart - Issue #11605: email.parser.BytesFeedParser was incorrectly converting multipart
subparts with an 8bit CTE into unicode instead of preserving the bytes. subparts with an 8bit CTE into unicode instead of preserving the bytes.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment