Skip to content
Projeler
Gruplar
Parçacıklar
Yardım
Yükleniyor...
Oturum aç / Kaydol
Gezinmeyi değiştir
C
cpython
Proje
Proje
Ayrıntılar
Etkinlik
Cycle Analytics
Depo (repository)
Depo (repository)
Dosyalar
Kayıtlar (commit)
Dallar (branch)
Etiketler
Katkıda bulunanlar
Grafik
Karşılaştır
Grafikler
Konular (issue)
0
Konular (issue)
0
Liste
Pano
Etiketler
Kilometre Taşları
Birleştirme (merge) Talepleri
0
Birleştirme (merge) Talepleri
0
CI / CD
CI / CD
İş akışları (pipeline)
İşler
Zamanlamalar
Grafikler
Paketler
Paketler
Wiki
Wiki
Parçacıklar
Parçacıklar
Üyeler
Üyeler
Collapse sidebar
Close sidebar
Etkinlik
Grafik
Grafikler
Yeni bir konu (issue) oluştur
İşler
Kayıtlar (commit)
Konu (issue) Panoları
Kenar çubuğunu aç
Batuhan Osman TASKAYA
cpython
Commits
fab461a4
Kaydet (Commit)
fab461a4
authored
Haz 16, 2006
tarafından
Fred Drake
Dosyalara gözat
Seçenekler
Dosyalara Gözat
İndir
Eposta Yamaları
Sade Fark
SF patch 1504676: Make sgmllib char and entity references pluggable
(implementation/tests contributed by Sam Ruby)
üst
274facfd
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
115 additions
and
48 deletions
+115
-48
libsgmllib.tex
Doc/lib/libsgmllib.tex
+43
-11
sgmllib.py
Lib/sgmllib.py
+44
-37
test_sgmllib.py
Lib/test/test_sgmllib.py
+27
-0
ACKS
Misc/ACKS
+1
-0
No files found.
Doc/lib/libsgmllib.tex
Dosyayı görüntüle @
fab461a4
...
...
@@ -132,27 +132,59 @@ nothing.
\begin{methoddesc}
{
handle
_
charref
}{
ref
}
This method is called to process a character reference of the form
\samp
{
\&\#\var
{
ref
}
;
}
. In the base implementation,
\var
{
ref
}
must
be a decimal number in the
range 0-255. It translates the character to
\ASCII
{}
and calls the
method
\method
{
handle
_
data()
}
with the character as argument. If
\var
{
ref
}
is invalid or out of range, the method
\code
{
unknown
_
charref(
\var
{
ref
}
)
}
is called to handle the error. A
subclass must override this method to provide support for named
character entities.
\samp
{
\&\#\var
{
ref
}
;
}
. The base implementation uses
\method
{
convert
_
charref()
}
to convert the reference to a string. If
that method returns a string, it is passed to
\method
{
handle
_
data()
}
,
otherwise
\method
{
unknown
_
charref(
\var
{
ref
}
)
}
is called to handle the
error.
\versionchanged
[Use
\method
{
convert
_
charref()
}
instead of hard-coding
the conversion]
{
2.5
}
\end{methoddesc}
\begin{methoddesc}
{
convert
_
charref
}{
ref
}
Convert a character reference to a string, or
\code
{
None
}
.
\var
{
ref
}
is the reference passed in as a string. In the base implementation,
\var
{
ref
}
must be a decimal number in the range 0-255. It converts
the code point found using the
\method
{
convert
_
codepoint()
}
method.
If
\var
{
ref
}
is invalid or out of range, this method returns
\code
{
None
}
. This method is called by the default
\method
{
handle
_
charref()
}
implementation and by the attribute value
parser.
\versionadded
{
2.5
}
\end{methoddesc}
\begin{methoddesc}
{
convert
_
codepoint
}{
codepoint
}
Convert a codepoint to a
\class
{
str
}
value. Encodings can be handled
here if appropriate, though the rest of
\module
{
sgmllib
}
is oblivious
on this matter.
\versionadded
{
2.5
}
\end{methoddesc}
\begin{methoddesc}
{
handle
_
entityref
}{
ref
}
This method is called to process a general entity reference of the
form
\samp
{
\&\var
{
ref
}
;
}
where
\var
{
ref
}
is a general entity
reference. It looks for
\var
{
ref
}
in the instance (or class)
variable
\member
{
entitydefs
}
which should be a mapping from entity
names to corresponding translations. If a translation is found, it
reference. It converts
\var
{
ref
}
by passing it to
\method
{
convert
_
entityref()
}
. If a translation is returned, it
calls the method
\method
{
handle
_
data()
}
with the translation;
otherwise, it calls the method
\code
{
unknown
_
entityref(
\var
{
ref
}
)
}
.
The default
\member
{
entitydefs
}
defines translations for
\code
{
\&
amp;
}
,
\code
{
\&
apos;
}
,
\code
{
\&
gt;
}
,
\code
{
\&
lt;
}
, and
\code
{
\&
quot;
}
.
\versionchanged
[Use
\method
{
convert
_
entityref()
}
instead of hard-coding
the conversion]
{
2.5
}
\end{methoddesc}
\begin{methoddesc}
{
convert
_
entityref
}{
ref
}
Convert a named entity reference to a
\class
{
str
}
value, or
\code
{
None
}
. The resulting value will not be parsed.
\var
{
ref
}
will
be only the name of the entity. The default implementation looks for
\var
{
ref
}
in the instance (or class) variable
\member
{
entitydefs
}
which should be a mapping from entity names to corresponding
translations. If no translation is available for
\var
{
ref
}
, this
method returns
\code
{
None
}
. This method is called by the default
\method
{
handle
_
entityref()
}
implementation and by the attribute value
parser.
\versionadded
{
2.5
}
\end{methoddesc}
\begin{methoddesc}
{
handle
_
comment
}{
comment
}
...
...
Lib/sgmllib.py
Dosyayı görüntüle @
fab461a4
...
...
@@ -53,6 +53,10 @@ class SGMLParseError(RuntimeError):
# self.handle_entityref() with the entity reference as argument.
class
SGMLParser
(
markupbase
.
ParserBase
):
# Definition of entities -- derived classes may override
entity_or_charref
=
re
.
compile
(
'&(?:'
'([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
')(;?)'
)
def
__init__
(
self
,
verbose
=
0
):
"""Initialize and reset this instance."""
...
...
@@ -277,32 +281,8 @@ class SGMLParser(markupbase.ParserBase):
attrvalue
[:
1
]
==
'"'
==
attrvalue
[
-
1
:]):
# strip quotes
attrvalue
=
attrvalue
[
1
:
-
1
]
l
=
0
new_attrvalue
=
''
while
l
<
len
(
attrvalue
):
av_match
=
entityref
.
match
(
attrvalue
,
l
)
if
(
av_match
and
av_match
.
group
(
1
)
in
self
.
entitydefs
and
attrvalue
[
av_match
.
end
(
1
)]
==
';'
):
# only substitute entityrefs ending in ';' since
# otherwise we may break <a href='?p=x&q=y'>
# which is very common
new_attrvalue
+=
self
.
entitydefs
[
av_match
.
group
(
1
)]
l
=
av_match
.
end
(
0
)
continue
ch_match
=
charref
.
match
(
attrvalue
,
l
)
if
ch_match
:
try
:
char
=
chr
(
int
(
ch_match
.
group
(
1
)))
new_attrvalue
+=
char
l
=
ch_match
.
end
(
0
)
continue
except
ValueError
:
# invalid character reference, don't substitute
pass
# all other cases
new_attrvalue
+=
attrvalue
[
l
]
l
+=
1
attrvalue
=
new_attrvalue
attrvalue
=
self
.
entity_or_charref
.
sub
(
self
.
_convert_ref
,
attrvalue
)
attrs
.
append
((
attrname
.
lower
(),
attrvalue
))
k
=
match
.
end
(
0
)
if
rawdata
[
j
]
==
'>'
:
...
...
@@ -311,6 +291,17 @@ class SGMLParser(markupbase.ParserBase):
self
.
finish_starttag
(
tag
,
attrs
)
return
j
# Internal -- convert entity or character reference
def
_convert_ref
(
self
,
match
):
if
match
.
group
(
2
):
return
self
.
convert_charref
(
match
.
group
(
2
))
or
\
'&#
%
s
%
s'
%
match
.
groups
()[
1
:]
elif
match
.
group
(
3
):
return
self
.
convert_entityref
(
match
.
group
(
1
))
or
\
'&
%
s;'
%
match
.
group
(
1
)
else
:
return
'&
%
s'
%
match
.
group
(
1
)
# Internal -- parse endtag
def
parse_endtag
(
self
,
i
):
rawdata
=
self
.
rawdata
...
...
@@ -394,35 +385,51 @@ class SGMLParser(markupbase.ParserBase):
print
'*** Unbalanced </'
+
tag
+
'>'
print
'*** Stack:'
,
self
.
stack
def
handle
_charref
(
self
,
name
):
"""
Handle character reference, no need to override
."""
def
convert
_charref
(
self
,
name
):
"""
Convert character reference, may be overridden
."""
try
:
n
=
int
(
name
)
except
ValueError
:
self
.
unknown_charref
(
name
)
return
if
not
0
<=
n
<=
255
:
self
.
unknown_charref
(
name
)
return
self
.
handle_data
(
chr
(
n
))
return
self
.
convert_codepoint
(
n
)
def
convert_codepoint
(
self
,
codepoint
):
return
chr
(
codepoint
)
def handle_charref(self, name):
    """Handle a character reference; there is no need to override this.

    *name* is the reference text passed in as a string.  It is converted
    once through ``self.convert_charref()``.  A ``None`` result is
    reported via ``self.unknown_charref(name)``; any other result is
    passed to ``self.handle_data()``.
    """
    # Must go through self: a bare convert_charref() is not a global name
    # and would raise NameError.  Converting once also avoids repeating
    # side effects of an overridden converter.
    replacement = self.convert_charref(name)
    if replacement is None:
        self.unknown_charref(name)
    else:
        self.handle_data(replacement)
# Definition of entities -- derived classes may override
entitydefs
=
\
{
'lt'
:
'<'
,
'gt'
:
'>'
,
'amp'
:
'&'
,
'quot'
:
'"'
,
'apos'
:
'
\'
'
}
def
handle
_entityref
(
self
,
name
):
"""
Handle
entity references.
def
convert
_entityref
(
self
,
name
):
"""
Convert
entity references.
There should be no need to override this method; it can b
e
tailored
by setting up the self.entitydefs mapping appropriately.
As an alternative to overriding this method; one can tailor th
e
results
by setting up the self.entitydefs mapping appropriately.
"""
table
=
self
.
entitydefs
if
name
in
table
:
self
.
handle_data
(
table
[
name
])
return
table
[
name
]
else
:
self
.
unknown_entityref
(
name
)
return
def handle_entityref(self, name):
    """Handle an entity reference; there is no need to override this.

    *name* is the entity name only.  It is converted once through
    ``self.convert_entityref()``.  A ``None`` result is reported via
    ``self.unknown_entityref(name)``; any other result is passed to
    ``self.handle_data()``.
    """
    # Must go through self: a bare convert_entityref() is not a global
    # name and would raise NameError.  Converting once also avoids
    # repeating side effects of an overridden converter.
    replacement = self.convert_entityref(name)
    if replacement is None:
        self.unknown_entityref(name)
    else:
        self.handle_data(replacement)
# Example -- handle data, should be overridden
def
handle_data
(
self
,
data
):
pass
...
...
Lib/test/test_sgmllib.py
Dosyayı görüntüle @
fab461a4
...
...
@@ -64,6 +64,23 @@ class CDATAEventCollector(EventCollector):
self
.
setliteral
()
class
HTMLEntityCollector
(
EventCollector
):
import
re
,
htmlentitydefs
entity_or_charref
=
re
.
compile
(
'(?:&([a-zA-Z][-.a-zA-Z0-9]*)'
'|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)'
)
def
convert_charref
(
self
,
name
):
self
.
append
((
"charref"
,
"convert"
,
name
))
if
name
.
startswith
(
'x'
):
return
unichr
(
int
(
name
[
1
:],
16
))
else
:
return
unichr
(
int
(
name
))
def
convert_entityref
(
self
,
name
):
self
.
append
((
"entityref"
,
"convert"
,
name
))
return
unichr
(
self
.
htmlentitydefs
.
name2codepoint
[
name
])
class
SGMLParserTestCase
(
unittest
.
TestCase
):
collector
=
EventCollector
...
...
@@ -233,6 +250,16 @@ DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
(
"k"
,
"*"
),
])])
def
test_convert_overrides
(
self
):
self
.
collector
=
HTMLEntityCollector
self
.
check_events
(
'<a title="“test”">foo</a>'
,
[
(
'entityref'
,
'convert'
,
'ldquo'
),
(
'charref'
,
'convert'
,
'x201d'
),
(
'starttag'
,
'a'
,
[(
'title'
,
u'
\u201c
test
\u201d
'
)]),
(
'data'
,
'foo'
),
(
'endtag'
,
'a'
),
])
def
test_attr_funky_names
(
self
):
self
.
check_events
(
"""<a a.b='v' c:d=v e-f=v>"""
,
[
(
"starttag"
,
"a"
,
[(
"a.b"
,
"v"
),
(
"c:d"
,
"v"
),
(
"e-f"
,
"v"
)]),
...
...
Misc/ACKS
Dosyayı görüntüle @
fab461a4
...
...
@@ -528,6 +528,7 @@ Hugo van Rossum
Saskia van Rossum
Donald Wallace Rouse II
Liam Routt
Sam Ruby
Paul Rubin
Audun S. Runde
Jeff Rush
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment