Skip to content
Projeler
Gruplar
Parçacıklar
Yardım
Yükleniyor...
Oturum aç / Kaydol
Gezinmeyi değiştir
C
cpython
Proje
Proje
Ayrıntılar
Etkinlik
Cycle Analytics
Depo (repository)
Depo (repository)
Dosyalar
Kayıtlar (commit)
Dallar (branch)
Etiketler
Katkıda bulunanlar
Grafik
Karşılaştır
Grafikler
Konular (issue)
0
Konular (issue)
0
Liste
Pano
Etiketler
Kilometre Taşları
Birleştirme (merge) Talepleri
0
Birleştirme (merge) Talepleri
0
CI / CD
CI / CD
İş akışları (pipeline)
İşler
Zamanlamalar
Grafikler
Paketler
Paketler
Wiki
Wiki
Parçacıklar
Parçacıklar
Üyeler
Üyeler
Collapse sidebar
Close sidebar
Etkinlik
Grafik
Grafikler
Yeni bir konu (issue) oluştur
İşler
Kayıtlar (commit)
Konu (issue) Panoları
Kenar çubuğunu aç
Batuhan Osman TASKAYA
cpython
Commits
4a9ee267
Kaydet (Commit)
4a9ee267
authored
Kas 19, 2013
tarafından
Ezio Melotti
Dosyalara gözat
Seçenekler
Dosyalara Gözat
İndir
Eposta Yamaları
Sade Fark
#2927: Added the unescape() function to the html module.
üst
5160da1a
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
215 additions
and
49 deletions
+215
-49
html.entities.rst
Doc/library/html.entities.rst
+1
-0
html.rst
Doc/library/html.rst
+11
-0
__init__.py
Lib/html/__init__.py
+113
-1
parser.py
Lib/html/parser.py
+5
-33
test_html.py
Lib/test/test_html.py
+83
-3
test_htmlparser.py
Lib/test/test_htmlparser.py
+0
-12
NEWS
Misc/NEWS
+2
-0
No files found.
Doc/library/html.entities.rst
Dosyayı görüntüle @
4a9ee267
...
@@ -20,6 +20,7 @@ This module defines four dictionaries, :data:`html5`,
...
@@ -20,6 +20,7 @@ This module defines four dictionaries, :data:`html5`,
Note that the trailing semicolon is included in the name (e.g. ``'gt;'``),
Note that the trailing semicolon is included in the name (e.g. ``'gt;'``),
however some of the names are accepted by the standard even without the
however some of the names are accepted by the standard even without the
semicolon: in this case the name is present with and without the ``';'``.
semicolon: in this case the name is present with and without the ``';'``.
See also :func:`html.unescape`.
.. versionadded:: 3.3
.. versionadded:: 3.3
...
...
Doc/library/html.rst
Dosyayı görüntüle @
4a9ee267
...
@@ -20,6 +20,17 @@ This module defines utilities to manipulate HTML.
...
@@ -20,6 +20,17 @@ This module defines utilities to manipulate HTML.
.. versionadded:: 3.2
.. versionadded:: 3.2
.. function:: unescape(s)
   Convert all named and numeric character references (e.g. ``&gt;``,
   ``&#62;``, ``&#x3e;``) in the string *s* to the corresponding unicode
   characters.  This function uses the rules defined by the HTML 5 standard
   for both valid and invalid character references, and the :data:`list of
   HTML 5 named character references <html.entities.html5>`.
.. versionadded:: 3.4
--------------
--------------
Submodules in the ``html`` package are:
Submodules in the ``html`` package are:
...
...
Lib/html/__init__.py
Dosyayı görüntüle @
4a9ee267
...
@@ -2,7 +2,12 @@
...
@@ -2,7 +2,12 @@
General functions for HTML manipulation.
General functions for HTML manipulation.
"""
"""
# NB: this is a candidate for a bytes/string polymorphic interface
import
re
as
_re
from
html.entities
import
html5
as
_html5
__all__
=
[
'escape'
,
'unescape'
]
def
escape
(
s
,
quote
=
True
):
def
escape
(
s
,
quote
=
True
):
"""
"""
...
@@ -18,3 +23,110 @@ def escape(s, quote=True):
...
@@ -18,3 +23,110 @@ def escape(s, quote=True):
s
=
s
.
replace
(
'"'
,
"""
)
s
=
s
.
replace
(
'"'
,
"""
)
s
=
s
.
replace
(
'
\'
'
,
"'"
)
s
=
s
.
replace
(
'
\'
'
,
"'"
)
return
s
return
s
# see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references
# Numeric character references that must not be decoded to the code point
# they literally name.  Keys are the referenced numbers, values are the text
# to substitute instead; _replace_charref() consults this table before any
# other check on a numeric reference.
_invalid_charrefs = {
    0x00: '\ufffd',  # REPLACEMENT CHARACTER
    0x0d: '\r',      # CARRIAGE RETURN
    0x80: '\u20ac',  # EURO SIGN
    0x81: '\x81',    # <control>
    0x82: '\u201a',  # SINGLE LOW-9 QUOTATION MARK
    0x83: '\u0192',  # LATIN SMALL LETTER F WITH HOOK
    0x84: '\u201e',  # DOUBLE LOW-9 QUOTATION MARK
    0x85: '\u2026',  # HORIZONTAL ELLIPSIS
    0x86: '\u2020',  # DAGGER
    0x87: '\u2021',  # DOUBLE DAGGER
    0x88: '\u02c6',  # MODIFIER LETTER CIRCUMFLEX ACCENT
    0x89: '\u2030',  # PER MILLE SIGN
    0x8a: '\u0160',  # LATIN CAPITAL LETTER S WITH CARON
    0x8b: '\u2039',  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x8c: '\u0152',  # LATIN CAPITAL LIGATURE OE
    0x8d: '\x8d',    # <control>
    0x8e: '\u017d',  # LATIN CAPITAL LETTER Z WITH CARON
    0x8f: '\x8f',    # <control>
    0x90: '\x90',    # <control>
    0x91: '\u2018',  # LEFT SINGLE QUOTATION MARK
    0x92: '\u2019',  # RIGHT SINGLE QUOTATION MARK
    0x93: '\u201c',  # LEFT DOUBLE QUOTATION MARK
    0x94: '\u201d',  # RIGHT DOUBLE QUOTATION MARK
    0x95: '\u2022',  # BULLET
    0x96: '\u2013',  # EN DASH
    0x97: '\u2014',  # EM DASH
    0x98: '\u02dc',  # SMALL TILDE
    0x99: '\u2122',  # TRADE MARK SIGN
    0x9a: '\u0161',  # LATIN SMALL LETTER S WITH CARON
    0x9b: '\u203a',  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x9c: '\u0153',  # LATIN SMALL LIGATURE OE
    0x9d: '\x9d',    # <control>
    0x9e: '\u017e',  # LATIN SMALL LETTER Z WITH CARON
    0x9f: '\u0178',  # LATIN CAPITAL LETTER Y WITH DIAERESIS
}
# Code points for which _replace_charref() returns the empty string when
# they are named by a numeric character reference (checked after
# _invalid_charrefs and after the surrogate / out-of-range test).
_invalid_codepoints = {
    # 0x0001 to 0x0008
    0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
    # 0x000E to 0x001F
    0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
    0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    # 0x007F to 0x009F
    0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a,
    0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
    0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    # 0xFDD0 to 0xFDEF
    0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7, 0xfdd8,
    0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf, 0xfde0, 0xfde1,
    0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7, 0xfde8, 0xfde9, 0xfdea,
    0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef,
    # others
    0xb, 0xfffe, 0xffff, 0x1fffe, 0x1ffff, 0x2fffe, 0x2ffff, 0x3fffe, 0x3ffff,
    0x4fffe, 0x4ffff, 0x5fffe, 0x5ffff, 0x6fffe, 0x6ffff, 0x7fffe, 0x7ffff,
    0x8fffe, 0x8ffff, 0x9fffe, 0x9ffff, 0xafffe, 0xaffff, 0xbfffe, 0xbffff,
    0xcfffe, 0xcffff, 0xdfffe, 0xdffff, 0xefffe, 0xeffff, 0xffffe, 0xfffff,
    0x10fffe, 0x10ffff
}
def _replace_charref(s):
    """Return the replacement text for one character-reference match.

    *s* is a match object produced by the module's ``_charref`` pattern;
    its group 1 is the reference without the leading ``&`` (for example
    ``'#62;'``, ``'#x3e'`` or ``'gt;'``).

    Numeric references are resolved in this order: an entry in
    ``_invalid_charrefs`` wins; surrogates and values above 0x10FFFF give
    U+FFFD; values in ``_invalid_codepoints`` give the empty string;
    anything else gives ``chr(num)``.

    Named references are looked up in ``_html5``; if the whole name is not
    known, the longest known prefix is replaced and the rest is kept.  A
    reference with no known prefix is returned unchanged (with its ``&``).
    """
    s = s.group(1)
    if s[0] == '#':
        # numeric charref
        if s[1] in 'xX':
            digits, base, max_digits = s[2:].rstrip(';'), 16, 6
        else:
            digits, base, max_digits = s[1:].rstrip(';'), 10, 7
        # Leading zeros never change the value.  0x10FFFF has 6 hex digits
        # (7 decimal digits), so any longer run of significant digits is
        # certainly out of range.  Deciding that from the length avoids
        # converting arbitrarily long input, which can raise ValueError on
        # interpreters that limit int/str conversion length.
        significant = digits.lstrip('0')
        if len(significant) > max_digits:
            return '\uFFFD'
        num = int(significant or '0', base)
        if num in _invalid_charrefs:
            return _invalid_charrefs[num]
        if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:
            return '\uFFFD'
        if num in _invalid_codepoints:
            return ''
        return chr(num)
    # named charref
    if s in _html5:
        return _html5[s]
    # find the longest matching name (as defined by the standard)
    for x in range(len(s) - 1, 1, -1):
        if s[:x] in _html5:
            return _html5[s[:x]] + s[x:]
    return '&' + s
# Matches one character reference; group 1 is everything after the '&'.
# Alternatives, in order: a decimal reference, a hexadecimal reference, and
# a candidate name of 1 to 32 characters that contains no tab, newline,
# form feed, space, '<', '&', '#' or ';'.  The trailing ';' is optional in
# all three.
_charref = _re.compile(r'&(#[0-9]+;?'
                       r'|#[xX][0-9a-fA-F]+;?'
                       r'|[^\t\n\f <&#;]{1,32};?)')
def unescape(s):
    """
    Convert all named and numeric character references (e.g. &gt;, &#62;,
    &#x3e;) in the string s to the corresponding unicode characters.
    This function uses the rules defined by the HTML 5 standard
    for both valid and invalid character references, and the list of
    HTML 5 named character references defined in html.entities.html5.
    """
    # Fast path: without an ampersand there is nothing to substitute.
    if '&' not in s:
        return s
    return _charref.sub(_replace_charref, s)
Lib/html/parser.py
Dosyayı görüntüle @
4a9ee267
...
@@ -8,9 +8,12 @@
...
@@ -8,9 +8,12 @@
# and CDATA (character data -- only end tags are special).
# and CDATA (character data -- only end tags are special).
import
_markupbase
import
re
import
re
import
warnings
import
warnings
import
_markupbase
from
html
import
unescape
__all__
=
[
'HTMLParser'
]
__all__
=
[
'HTMLParser'
]
...
@@ -357,7 +360,7 @@ class HTMLParser(_markupbase.ParserBase):
...
@@ -357,7 +360,7 @@ class HTMLParser(_markupbase.ParserBase):
attrvalue
[:
1
]
==
'"'
==
attrvalue
[
-
1
:]:
attrvalue
[:
1
]
==
'"'
==
attrvalue
[
-
1
:]:
attrvalue
=
attrvalue
[
1
:
-
1
]
attrvalue
=
attrvalue
[
1
:
-
1
]
if
attrvalue
:
if
attrvalue
:
attrvalue
=
self
.
unescape
(
attrvalue
)
attrvalue
=
unescape
(
attrvalue
)
attrs
.
append
((
attrname
.
lower
(),
attrvalue
))
attrs
.
append
((
attrname
.
lower
(),
attrvalue
))
k
=
m
.
end
()
k
=
m
.
end
()
...
@@ -510,34 +513,3 @@ class HTMLParser(_markupbase.ParserBase):
...
@@ -510,34 +513,3 @@ class HTMLParser(_markupbase.ParserBase):
def
unknown_decl
(
self
,
data
):
def
unknown_decl
(
self
,
data
):
if
self
.
strict
:
if
self
.
strict
:
self
.
error
(
"unknown declaration:
%
r"
%
(
data
,))
self
.
error
(
"unknown declaration:
%
r"
%
(
data
,))
# Internal -- helper to remove special character quoting
def unescape(self, s):
    """Replace character references in *s* and return the result.

    Numeric references (``&#NN;`` / ``&#xHH;``) become ``chr()`` of their
    value; a reference whose number cannot be parsed or cannot be turned
    into a character is left in the text as written.  Named references are
    looked up in ``html.entities.html5``; an unknown name that ends in
    ``;`` is left alone, otherwise the shortest known prefix is replaced
    and the remainder kept.
    """
    if '&' not in s:
        return s

    def replaceEntities(s):
        s = s.groups()[0]
        try:
            if s[0] == "#":
                s = s[1:]
                if s[0] in ['x', 'X']:
                    c = int(s[1:].rstrip(';'), 16)
                else:
                    c = int(s.rstrip(';'))
                return chr(c)
        except (ValueError, OverflowError):
            # int() raises ValueError for non-numeric text and chr() raises
            # ValueError above 0x10FFFF; chr() raises OverflowError for
            # numbers too large for a C int.  Either way keep the
            # reference verbatim.
            return '&#' + s
        else:
            from html.entities import html5
            if s in html5:
                return html5[s]
            elif s.endswith(';'):
                return '&' + s
            for x in range(2, len(s)):
                if s[:x] in html5:
                    return html5[s[:x]] + s[x:]
            else:
                return '&' + s

    return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
                  replaceEntities, s, flags=re.ASCII)
Lib/test/test_html.py
Dosyayı görüntüle @
4a9ee267
...
@@ -16,9 +16,89 @@ class HtmlTests(unittest.TestCase):
...
@@ -16,9 +16,89 @@ class HtmlTests(unittest.TestCase):
html
.
escape
(
'
\'
<script>"&foo;"</script>
\'
'
,
False
),
html
.
escape
(
'
\'
<script>"&foo;"</script>
\'
'
,
False
),
'
\'
<script>"&foo;"</script>
\'
'
)
'
\'
<script>"&foo;"</script>
\'
'
)
def test_unescape(self):
    """Exercise html.unescape() on valid, invalid and partial references."""
    numeric_formats = ['&#%d', '&#%d;', '&#x%x', '&#x%x;']
    errmsg = 'unescape(%r) should have returned %r'

    def check(text, expected):
        self.assertEqual(html.unescape(text), expected,
                         msg=errmsg % (text, expected))

    def check_num(num, expected):
        # Try the number in decimal and hex, with and without the ';'.
        for format in numeric_formats:
            text = format % num
            self.assertEqual(html.unescape(text), expected,
                             msg=errmsg % (text, expected))

    # check text with no character references
    check('no character references', 'no character references')
    # check & followed by invalid chars
    check('&\n&\t& &&', '&\n&\t& &&')
    # check & followed by numbers and letters
    check('&0 &9 &a &0; &9; &a;', '&0 &9 &a &0; &9; &a;')
    # check incomplete entities at the end of the string
    for x in ['&', '&#', '&#x', '&#X', '&#y', '&#xy', '&#Xy']:
        check(x, x)
        check(x + ';', x + ';')
    # check several combinations of numeric character references,
    # possibly followed by different characters
    formats = ['&#%d', '&#%07d', '&#%d;', '&#%07d;',
               '&#x%x', '&#x%06x', '&#x%x;', '&#x%06x;',
               '&#x%X', '&#x%06X', '&#X%x;', '&#X%06x;']
    for num, char in zip([65, 97, 34, 38, 0x2603, 0x101234],
                         ['A', 'a', '"', '&', '\u2603', '\U00101234']):
        for s in formats:
            check(s % num, char)
            for end in [' ', 'X']:
                check((s + end) % num, char + end)
    # check invalid codepoints
    for cp in [0xD800, 0xDB00, 0xDC00, 0xDFFF, 0x110000]:
        check_num(cp, '\uFFFD')
    # check more invalid codepoints
    for cp in [0x1, 0xb, 0xe, 0x7f, 0xfffe, 0xffff, 0x10fffe, 0x10ffff]:
        check_num(cp, '')
    # check invalid numbers
    for num, ch in zip([0x0d, 0x80, 0x95, 0x9d], '\r\u20ac\u2022\x9d'):
        check_num(num, ch)
    # check small numbers
    check_num(0, '\uFFFD')
    check_num(9, '\t')
    # check a big number
    check_num(1000000000000000000, '\uFFFD')
    # check that multiple trailing semicolons are handled correctly
    for e in ['&quot;;', '&#34;;', '&#x22;;', '&#X22;;']:
        check(e, '";')
    # check that semicolons in the middle don't create problems
    for e in ['&quot;quot;', '&#34;quot;', '&#x22;quot;', '&#X22;quot;']:
        check(e, '"quot;')
    # check triple adjacent charrefs
    for e in ['&quot', '&#34', '&#x22', '&#X22']:
        check(e * 3, '"""')
        check((e + ';') * 3, '"""')
    # check that the case is respected
    for e in ['&amp', '&amp;', '&AMP', '&AMP;']:
        check(e, '&')
    for e in ['&Amp', '&Amp;']:
        check(e, e)
    # check that non-existent named entities are returned unchanged
    check('&svadilfari;', '&svadilfari;')
    # the following examples are in the html5 specs
    check('&notit', '¬it')
    check('&notit;', '¬it;')
    check('&notin', '¬in')
    check('&notin;', '∉')
    # a similar example with a long name
    check('&notReallyAnExistingNamedCharacterReference;',
          '¬ReallyAnExistingNamedCharacterReference;')
    # longest valid name
    check('&CounterClockwiseContourIntegral;', '∳')
    # check a charref that maps to two unicode chars
    check('&acE;', '\u223E\u0333')
    check('&acE', '&acE')
    # see #12888
    check('&#123; ' * 1050, '{ ' * 1050)
    # see #15156
    check('&Eacute;ric&Eacute;ric&alphacentauri&alpha;centauri',
          'ÉricÉric&alphacentauriαcentauri')
    check('&co;', '&co;')
def test_main():
    """Run the HtmlTests test case via run_unittest()."""
    run_unittest(HtmlTests)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
test_
main
()
unittest
.
main
()
Lib/test/test_htmlparser.py
Dosyayı görüntüle @
4a9ee267
...
@@ -569,18 +569,6 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
...
@@ -569,18 +569,6 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
for
html
,
expected
in
data
:
for
html
,
expected
in
data
:
self
.
_run_check
(
html
,
expected
)
self
.
_run_check
(
html
,
expected
)
def test_unescape_function(self):
    """Check the parser's unescape() helper on a few character references."""
    p = self.get_collector()
    # an unparsable numeric reference is left as written
    self.assertEqual(p.unescape('&#bad;'), '&#bad;')
    self.assertEqual(p.unescape('&#0038;'), '&')
    # see #12888
    self.assertEqual(p.unescape('&#123; ' * 1050), '{ ' * 1050)
    # see #15156
    self.assertEqual(p.unescape('&Eacute;ric&Eacute;ric'
                                '&alphacentauri&alpha;centauri'),
                     'ÉricÉric&alphacentauriαcentauri')
    self.assertEqual(p.unescape('&co;'), '&co;')
def
test_broken_comments
(
self
):
def
test_broken_comments
(
self
):
html
=
(
'<! not really a comment >'
html
=
(
'<! not really a comment >'
'<! not a comment either -->'
'<! not a comment either -->'
...
...
Misc/NEWS
Dosyayı görüntüle @
4a9ee267
...
@@ -59,6 +59,8 @@ Library
...
@@ -59,6 +59,8 @@ Library
-
Issue
#
19449
:
in
csv
's writerow, handle non-string keys when generating the
-
Issue
#
19449
:
in
csv
's writerow, handle non-string keys when generating the
error message that certain keys are not in the '
fieldnames
' list.
error message that certain keys are not in the '
fieldnames
' list.
- Issue #2927: Added the unescape() function to the html module.
- Issue #8402: Added the escape() function to the glob module.
- Issue #8402: Added the escape() function to the glob module.
- Issue #17618: Add Base85 and Ascii85 encoding/decoding to the base64 module.
- Issue #17618: Add Base85 and Ascii85 encoding/decoding to the base64 module.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment