Skip to content
Projeler
Gruplar
Parçacıklar
Yardım
Yükleniyor...
Oturum aç / Kaydol
Gezinmeyi değiştir
C
cpython
Proje
Proje
Ayrıntılar
Etkinlik
Cycle Analytics
Depo (repository)
Depo (repository)
Dosyalar
Kayıtlar (commit)
Dallar (branch)
Etiketler
Katkıda bulunanlar
Grafik
Karşılaştır
Grafikler
Konular (issue)
0
Konular (issue)
0
Liste
Pano
Etiketler
Kilometre Taşları
Birleştirme (merge) Talepleri
0
Birleştirme (merge) Talepleri
0
CI / CD
CI / CD
İş akışları (pipeline)
İşler
Zamanlamalar
Grafikler
Paketler
Paketler
Wiki
Wiki
Parçacıklar
Parçacıklar
Üyeler
Üyeler
Collapse sidebar
Close sidebar
Etkinlik
Grafik
Grafikler
Yeni bir konu (issue) oluştur
İşler
Kayıtlar (commit)
Konu (issue) Panoları
Kenar çubuğunu aç
Batuhan Osman TASKAYA
cpython
Commits
7903913f
Kaydet (Commit)
7903913f
authored
Kas 01, 2011
tarafından
Ezio Melotti
Dosyalara gözat
Seçenekler
Dosyalara Gözat
İndir
Sade Fark
#670664: merge with 3.2.
üst
49ce0685
7de56f6a
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
55 additions
and
19 deletions
+55
-19
html.parser.rst
Doc/library/html.parser.rst
+2
-1
parser.py
Lib/html/parser.py
+18
-4
test_htmlparser.py
Lib/test/test_htmlparser.py
+30
-12
NEWS
Misc/NEWS
+5
-2
No files found.
Doc/library/html.parser.rst
Dosyayı görüntüle @
7903913f
...
@@ -115,7 +115,8 @@ An exception is defined as well:
...
@@ -115,7 +115,8 @@ An exception is defined as well:
.. method:: HTMLParser.handle_data(data)
.. method:: HTMLParser.handle_data(data)
This method is called to process arbitrary data. It is intended to be
This method is called to process arbitrary data (e.g. the content of
``
<script>
...
</script>
`` and ``
<style>
...
</style>
``). It is intended to be
overridden by a derived class; the base class implementation does nothing.
overridden by a derived class; the base class implementation does nothing.
...
...
Lib/html/parser.py
Dosyayı görüntüle @
7903913f
...
@@ -62,6 +62,8 @@ locatestarttagend_tolerant = re.compile(r"""
...
@@ -62,6 +62,8 @@ locatestarttagend_tolerant = re.compile(r"""
\s* # trailing whitespace
\s* # trailing whitespace
"""
,
re
.
VERBOSE
)
"""
,
re
.
VERBOSE
)
endendtag
=
re
.
compile
(
'>'
)
endendtag
=
re
.
compile
(
'>'
)
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind
=
re
.
compile
(
'</
\
s*([a-zA-Z][-.a-zA-Z0-9:_]*)
\
s*>'
)
endtagfind
=
re
.
compile
(
'</
\
s*([a-zA-Z][-.a-zA-Z0-9:_]*)
\
s*>'
)
...
@@ -121,6 +123,7 @@ class HTMLParser(_markupbase.ParserBase):
...
@@ -121,6 +123,7 @@ class HTMLParser(_markupbase.ParserBase):
self
.
rawdata
=
''
self
.
rawdata
=
''
self
.
lasttag
=
'???'
self
.
lasttag
=
'???'
self
.
interesting
=
interesting_normal
self
.
interesting
=
interesting_normal
self
.
cdata_elem
=
None
_markupbase
.
ParserBase
.
reset
(
self
)
_markupbase
.
ParserBase
.
reset
(
self
)
def
feed
(
self
,
data
):
def
feed
(
self
,
data
):
...
@@ -145,11 +148,13 @@ class HTMLParser(_markupbase.ParserBase):
...
@@ -145,11 +148,13 @@ class HTMLParser(_markupbase.ParserBase):
"""Return full source of start tag: '<...>'."""
"""Return full source of start tag: '<...>'."""
return
self
.
__starttag_text
return
self
.
__starttag_text
def
set_cdata_mode
(
self
):
def
set_cdata_mode
(
self
,
elem
):
self
.
interesting
=
interesting_cdata
self
.
interesting
=
interesting_cdata
self
.
cdata_elem
=
elem
.
lower
()
def
clear_cdata_mode
(
self
):
def
clear_cdata_mode
(
self
):
self
.
interesting
=
interesting_normal
self
.
interesting
=
interesting_normal
self
.
cdata_elem
=
None
# Internal -- handle data as far as reasonable. May leave state
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
# and data to be processed by a subsequent call. If 'end' is
...
@@ -314,7 +319,7 @@ class HTMLParser(_markupbase.ParserBase):
...
@@ -314,7 +319,7 @@ class HTMLParser(_markupbase.ParserBase):
else
:
else
:
self
.
handle_starttag
(
tag
,
attrs
)
self
.
handle_starttag
(
tag
,
attrs
)
if
tag
in
self
.
CDATA_CONTENT_ELEMENTS
:
if
tag
in
self
.
CDATA_CONTENT_ELEMENTS
:
self
.
set_cdata_mode
()
self
.
set_cdata_mode
(
tag
)
return
endpos
return
endpos
# Internal -- check to see if we have a complete starttag; return end
# Internal -- check to see if we have a complete starttag; return end
...
@@ -371,6 +376,9 @@ class HTMLParser(_markupbase.ParserBase):
...
@@ -371,6 +376,9 @@ class HTMLParser(_markupbase.ParserBase):
j
=
match
.
end
()
j
=
match
.
end
()
match
=
endtagfind
.
match
(
rawdata
,
i
)
# </ + tag + >
match
=
endtagfind
.
match
(
rawdata
,
i
)
# </ + tag + >
if
not
match
:
if
not
match
:
if
self
.
cdata_elem
is
not
None
:
self
.
handle_data
(
rawdata
[
i
:
j
])
return
j
if
self
.
strict
:
if
self
.
strict
:
self
.
error
(
"bad end tag:
%
r"
%
(
rawdata
[
i
:
j
],))
self
.
error
(
"bad end tag:
%
r"
%
(
rawdata
[
i
:
j
],))
k
=
rawdata
.
find
(
'<'
,
i
+
1
,
j
)
k
=
rawdata
.
find
(
'<'
,
i
+
1
,
j
)
...
@@ -380,8 +388,14 @@ class HTMLParser(_markupbase.ParserBase):
...
@@ -380,8 +388,14 @@ class HTMLParser(_markupbase.ParserBase):
j
=
i
+
1
j
=
i
+
1
self
.
handle_data
(
rawdata
[
i
:
j
])
self
.
handle_data
(
rawdata
[
i
:
j
])
return
j
return
j
tag
=
match
.
group
(
1
)
self
.
handle_endtag
(
tag
.
lower
())
elem
=
match
.
group
(
1
)
.
lower
()
# script or style
if
self
.
cdata_elem
is
not
None
:
if
elem
!=
self
.
cdata_elem
:
self
.
handle_data
(
rawdata
[
i
:
j
])
return
j
self
.
handle_endtag
(
elem
.
lower
())
self
.
clear_cdata_mode
()
self
.
clear_cdata_mode
()
return
j
return
j
...
...
Lib/test/test_htmlparser.py
Dosyayı görüntüle @
7903913f
...
@@ -321,18 +321,36 @@ DOCTYPE html [
...
@@ -321,18 +321,36 @@ DOCTYPE html [
(
"starttag_text"
,
s
)])
(
"starttag_text"
,
s
)])
def
test_cdata_content
(
self
):
def
test_cdata_content
(
self
):
s
=
"""<script> <!-- not a comment --> ¬-an-entity-ref; </script>"""
contents
=
[
self
.
_run_check
(
s
,
[
'<!-- not a comment --> ¬-an-entity-ref;'
,
(
"starttag"
,
"script"
,
[]),
"<not a='start tag'>"
,
(
"data"
,
" <!-- not a comment --> ¬-an-entity-ref; "
),
'<a href="" /> <p> <span></span>'
,
(
"endtag"
,
"script"
),
'foo = "</scr" + "ipt>";'
,
])
'foo = "</SCRIPT" + ">";'
,
s
=
"""<script> <not a='start tag'> </script>"""
'foo = <
\n
/script> '
,
self
.
_run_check
(
s
,
[
'<!-- document.write("</scr" + "ipt>"); -->'
,
(
"starttag"
,
"script"
,
[]),
(
'
\n
//<![CDATA[
\n
'
(
"data"
,
" <not a='start tag'> "
),
'document.write(
\'
<s
\'
+
\'
cript type="text/javascript" '
(
"endtag"
,
"script"
),
'src="http://www.example.org/r=
\'
+new '
])
'Date().getTime()+
\'
"><
\\
/s
\'
+
\'
cript>
\'
);
\n
//]]>'
),
'
\n
<!-- //
\n
var foo = 3.14;
\n
// -->
\n
'
,
'foo = "</sty" + "le>";'
,
'<!--
\u2603
-->'
,
# these two should be invalid according to the HTML 5 spec,
# section 8.1.2.2
#'foo = </\nscript>',
#'foo = </ script>',
]
elements
=
[
'script'
,
'style'
,
'SCRIPT'
,
'STYLE'
,
'Script'
,
'Style'
]
for
content
in
contents
:
for
element
in
elements
:
element_lower
=
element
.
lower
()
s
=
'<{element}>{content}</{element}>'
.
format
(
element
=
element
,
content
=
content
)
self
.
_run_check
(
s
,
[(
"starttag"
,
element_lower
,
[]),
(
"data"
,
content
),
(
"endtag"
,
element_lower
)])
def
test_entityrefs_in_attributes
(
self
):
def
test_entityrefs_in_attributes
(
self
):
self
.
_run_check
(
"<html foo='€&aa&unsupported;'>"
,
[
self
.
_run_check
(
"<html foo='€&aa&unsupported;'>"
,
[
...
...
Misc/NEWS
Dosyayı görüntüle @
7903913f
...
@@ -350,10 +350,13 @@ Core and Builtins
...
@@ -350,10 +350,13 @@ Core and Builtins
Library
Library
-------
-------
-
Issue
10817
:
Fix
urlretrieve
function
to
raise
ContentTooShortError
even
-
Issue
#
670664
:
Fix
HTMLParser
to
correctly
handle
the
content
of
``<
script
>...</
script
>``
and
``<
style
>...</
style
>``.
-
Issue
#
10817
:
Fix
urlretrieve
function
to
raise
ContentTooShortError
even
when
reporthook
is
None
.
Patch
by
Jyrki
Pulliainen
.
when
reporthook
is
None
.
Patch
by
Jyrki
Pulliainen
.
-
Issue
13296
:
Fix
IDLE
to
clear
compile
__future__
flags
on
shell
restart
.
-
Issue
#
13296
:
Fix
IDLE
to
clear
compile
__future__
flags
on
shell
restart
.
(
Patch
by
Roger
Serwy
)
(
Patch
by
Roger
Serwy
)
-
Fix
the
xmlrpc
.
client
user
agent
to
return
something
similar
to
-
Fix
the
xmlrpc
.
client
user
agent
to
return
something
similar
to
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment