Commit 7f6b67c2, authored by Georg Brandl

patch #1462498: handle entityrefs in attribute values.

üst 48d5e508
...@@ -95,12 +95,15 @@ lower case, and the \var{method} argument is the bound method which ...@@ -95,12 +95,15 @@ lower case, and the \var{method} argument is the bound method which
should be used to support semantic interpretation of the start tag. should be used to support semantic interpretation of the start tag.
The \var{attributes} argument is a list of \code{(\var{name}, The \var{attributes} argument is a list of \code{(\var{name},
\var{value})} pairs containing the attributes found inside the tag's \var{value})} pairs containing the attributes found inside the tag's
\code{<>} brackets. The \var{name} has been translated to lower case \code{<>} brackets. The \var{name} has been translated to lower case.
and double quotes and backslashes in the \var{value} have been interpreted. Double quotes and backslashes in the \var{value} have been interpreted,
as well as known entity and character references.
For instance, for the tag \code{<A HREF="http://www.cwi.nl/">}, this For instance, for the tag \code{<A HREF="http://www.cwi.nl/">}, this
method would be called as \samp{unknown_starttag('a', [('href', method would be called as \samp{unknown_starttag('a', [('href',
'http://www.cwi.nl/')])}. The base implementation simply calls 'http://www.cwi.nl/')])}. The base implementation simply calls
\var{method} with \var{attributes} as the only argument. \var{method} with \var{attributes} as the only argument.
\versionadded[Handling of entity and character references within
attribute values]{2.5}
\end{methoddesc} \end{methoddesc}
\begin{methoddesc}{handle_endtag}{tag, method} \begin{methoddesc}{handle_endtag}{tag, method}
......
...@@ -269,9 +269,37 @@ class SGMLParser(markupbase.ParserBase): ...@@ -269,9 +269,37 @@ class SGMLParser(markupbase.ParserBase):
attrname, rest, attrvalue = match.group(1, 2, 3) attrname, rest, attrvalue = match.group(1, 2, 3)
if not rest: if not rest:
attrvalue = attrname attrvalue = attrname
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ else:
attrvalue[:1] == '"' == attrvalue[-1:]: if (attrvalue[:1] == "'" == attrvalue[-1:] or
attrvalue = attrvalue[1:-1] attrvalue[:1] == '"' == attrvalue[-1:]):
# strip quotes
attrvalue = attrvalue[1:-1]
l = 0
new_attrvalue = ''
while l < len(attrvalue):
av_match = entityref.match(attrvalue, l)
if (av_match and av_match.group(1) in self.entitydefs and
attrvalue[av_match.end(1)] == ';'):
# only substitute entityrefs ending in ';' since
# otherwise we may break <a href='?p=x&q=y'>
# which is very common
new_attrvalue += self.entitydefs[av_match.group(1)]
l = av_match.end(0)
continue
ch_match = charref.match(attrvalue, l)
if ch_match:
try:
char = chr(int(ch_match.group(1)))
new_attrvalue += char
l = ch_match.end(0)
continue
except ValueError:
# invalid character reference, don't substitute
pass
# all other cases
new_attrvalue += attrvalue[l]
l += 1
attrvalue = new_attrvalue
attrs.append((attrname.lower(), attrvalue)) attrs.append((attrname.lower(), attrvalue))
k = match.end(0) k = match.end(0)
if rawdata[j] == '>': if rawdata[j] == '>':
......
...@@ -214,6 +214,20 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN' ...@@ -214,6 +214,20 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
("starttag", "e", [("a", "rgb(1,2,3)")]), ("starttag", "e", [("a", "rgb(1,2,3)")]),
]) ])
def test_attr_values_entities(self):
"""Substitution of entities and charrefs in attribute values"""
# SF bug #1452246
self.check_events("""<a b=&lt; c=&lt;&gt; d=&lt-&gt; e='&lt; '
f="&xxx;" g='&#32;&#33;' h='&#500;' i='x?a=b&c=d;'>""",
[("starttag", "a", [("b", "<"),
("c", "<>"),
("d", "&lt->"),
("e", "< "),
("f", "&xxx;"),
("g", " !"),
("h", "&#500;"),
("i", "x?a=b&c=d;"), ])])
def test_attr_funky_names(self): def test_attr_funky_names(self):
self.check_events("""<a a.b='v' c:d=v e-f=v>""", [ self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]), ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
......
...@@ -489,6 +489,9 @@ Extension Modules ...@@ -489,6 +489,9 @@ Extension Modules
Library Library
------- -------
- Patch #1462498: sgmllib now handles entity and character references
in attribute values.
- Added the sqlite3 package. This is based on pysqlite2.1.3, and provides - Added the sqlite3 package. This is based on pysqlite2.1.3, and provides
a DB-API interface in the standard library. You'll need sqlite 3.2.2 or a DB-API interface in the standard library. You'll need sqlite 3.2.2 or
later to build this - if you have an earlier version, the C extension later to build this - if you have an earlier version, the C extension
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment