Commit 6fc16d81, authored by Ezio Melotti

#21047: set the default value for the *convert_charrefs* argument of HTMLParser…

#21047: set the default value for the *convert_charrefs* argument of HTMLParser to True.  Patch by Berker Peksag.
parent 11bec7a1
...@@ -16,15 +16,13 @@ ...@@ -16,15 +16,13 @@
This module defines a class :class:`HTMLParser` which serves as the basis for This module defines a class :class:`HTMLParser` which serves as the basis for
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
.. class:: HTMLParser(*, convert_charrefs=False) .. class:: HTMLParser(*, convert_charrefs=True)
Create a parser instance able to parse invalid markup. Create a parser instance able to parse invalid markup.
If *convert_charrefs* is ``True`` (default: ``False``), all character If *convert_charrefs* is ``True`` (the default), all character
references (except the ones in ``script``/``style`` elements) are references (except the ones in ``script``/``style`` elements) are
automatically converted to the corresponding Unicode characters. automatically converted to the corresponding Unicode characters.
The use of ``convert_charrefs=True`` is encouraged and will become
the default in Python 3.5.
An :class:`.HTMLParser` instance is fed HTML data and calls handler methods An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
when start tags, end tags, text, comments, and other markup elements are when start tags, end tags, text, comments, and other markup elements are
...@@ -37,6 +35,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. ...@@ -37,6 +35,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
.. versionchanged:: 3.4 .. versionchanged:: 3.4
*convert_charrefs* keyword argument added. *convert_charrefs* keyword argument added.
.. versionchanged:: 3.5
The default value for argument *convert_charrefs* is now ``True``.
Example HTML Parser Application Example HTML Parser Application
------------------------------- -------------------------------
......
...@@ -59,7 +59,6 @@ endendtag = re.compile('>') ...@@ -59,7 +59,6 @@ endendtag = re.compile('>')
endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
_default_sentinel = object()
class HTMLParser(_markupbase.ParserBase): class HTMLParser(_markupbase.ParserBase):
"""Find tags and other markup and call handler functions. """Find tags and other markup and call handler functions.
...@@ -85,17 +84,12 @@ class HTMLParser(_markupbase.ParserBase): ...@@ -85,17 +84,12 @@ class HTMLParser(_markupbase.ParserBase):
CDATA_CONTENT_ELEMENTS = ("script", "style") CDATA_CONTENT_ELEMENTS = ("script", "style")
def __init__(self, *, convert_charrefs=_default_sentinel): def __init__(self, *, convert_charrefs=True):
"""Initialize and reset this instance. """Initialize and reset this instance.
If convert_charrefs is True (default: False), all character references If convert_charrefs is True (the default), all character references
are automatically converted to the corresponding Unicode characters. are automatically converted to the corresponding Unicode characters.
""" """
if convert_charrefs is _default_sentinel:
convert_charrefs = False # default
warnings.warn("The value of convert_charrefs will become True in "
"3.5. You are encouraged to set the value explicitly.",
DeprecationWarning, stacklevel=2)
self.convert_charrefs = convert_charrefs self.convert_charrefs = convert_charrefs
self.reset() self.reset()
......
...@@ -346,7 +346,8 @@ text ...@@ -346,7 +346,8 @@ text
self._run_check(html, expected) self._run_check(html, expected)
def test_convert_charrefs(self): def test_convert_charrefs(self):
collector = lambda: EventCollectorCharrefs(convert_charrefs=True) # default value for convert_charrefs is now True
collector = lambda: EventCollectorCharrefs()
self.assertTrue(collector().convert_charrefs) self.assertTrue(collector().convert_charrefs)
charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22'] charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
# check charrefs in the middle of the text/attributes # check charrefs in the middle of the text/attributes
...@@ -383,10 +384,6 @@ text ...@@ -383,10 +384,6 @@ text
self._run_check('no charrefs here', [('data', 'no charrefs here')], self._run_check('no charrefs here', [('data', 'no charrefs here')],
collector=collector()) collector=collector())
def test_deprecation_warnings(self):
with self.assertWarns(DeprecationWarning):
EventCollector() # convert_charrefs not passed explicitly
# the remaining tests were for the "tolerant" parser (which is now # the remaining tests were for the "tolerant" parser (which is now
# the default), and check various kind of broken markup # the default), and check various kind of broken markup
def test_tolerant_parsing(self): def test_tolerant_parsing(self):
......
...@@ -121,6 +121,9 @@ Core and Builtins ...@@ -121,6 +121,9 @@ Core and Builtins
Library Library
------- -------
- Issue #21047: set the default value for the *convert_charrefs* argument
of HTMLParser to True. Patch by Berker Peksag.
- Add an __all__ to html.entities. - Add an __all__ to html.entities.
- Issue #15114: the strict mode and argument of HTMLParser, HTMLParser.error, - Issue #15114: the strict mode and argument of HTMLParser, HTMLParser.error,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment