Skip to content
Projeler
Gruplar
Parçacıklar
Yardım
Yükleniyor...
Oturum aç / Kaydol
Gezinmeyi değiştir
C
cpython
Proje
Proje
Ayrıntılar
Etkinlik
Cycle Analytics
Depo (repository)
Depo (repository)
Dosyalar
Kayıtlar (commit)
Dallar (branch)
Etiketler
Katkıda bulunanlar
Grafik
Karşılaştır
Grafikler
Konular (issue)
0
Konular (issue)
0
Liste
Pano
Etiketler
Kilometre Taşları
Birleştirme (merge) Talepleri
0
Birleştirme (merge) Talepleri
0
CI / CD
CI / CD
İş akışları (pipeline)
İşler
Zamanlamalar
Grafikler
Paketler
Paketler
Wiki
Wiki
Parçacıklar
Parçacıklar
Üyeler
Üyeler
Collapse sidebar
Close sidebar
Etkinlik
Grafik
Grafikler
Yeni bir konu (issue) oluştur
İşler
Kayıtlar (commit)
Konu (issue) Panoları
Kenar çubuğunu aç
Batuhan Osman TASKAYA
cpython
Commits
3163a3b4
Kaydet (Commit)
3163a3b4
authored
Mar 30, 2003
tarafından
Martin v. Löwis
Dosyalara gözat
Seçenekler
Dosyalara Gözat
İndir
Eposta Yamaları
Sade Fark
Patch #545300: Support marked sections.
üst
a9656493
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
107 additions
and
16 deletions
+107
-16
markupbase.py
Lib/markupbase.py
+69
-2
sgmllib.py
Lib/sgmllib.py
+8
-14
test_htmllib.py
Lib/test/test_htmllib.py
+27
-0
NEWS
Misc/NEWS
+3
-0
No files found.
Lib/markupbase.py
Dosyayı görüntüle @
3163a3b4
...
...
@@ -4,6 +4,13 @@ import re
# Pre-compiled patterns used by the declaration scanner.
# The two *_match names hold the bound .match method of the compiled pattern,
# so they are called as _declname_match(string, pos); the *close names hold
# the pattern objects themselves and are used through .search().

# A declaration name: a letter, then letters, digits, '-', '_' or '.',
# plus any whitespace that follows it.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
# A single- or double-quoted string literal plus any trailing whitespace.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
# End of a comment: '--', optional whitespace, '>'.
_commentclose = re.compile(r'--\s*>')
# End of a standard marked section: ']' ']' '>' with optional whitespace
# between the three characters.
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf
# End of an MS Office style marked section: ']', optional whitespace, '>'.
_msmarkedsectionclose = re.compile(r']\s*>')

# The patterns above are already compiled; drop the module name from this
# namespace.
del re
...
...
@@ -53,6 +60,13 @@ class ParserBase:
# This is some sort of declaration; in "HTML as
# deployed," this should only be the document type
# declaration ("<!DOCTYPE html...>").
# ISO 8879:1986, however, has more complex
# declaration syntax for elements in <!...>, including:
# --comment--
# [marked section]
# name in the following list: ENTITY, DOCTYPE, ELEMENT,
# ATTLIST, NOTATION, SHORTREF, USEMAP,
# LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
rawdata
=
self
.
rawdata
j
=
i
+
2
assert
rawdata
[
i
:
j
]
==
"<!"
,
"unexpected call to parse_declaration"
...
...
@@ -60,9 +74,19 @@ class ParserBase:
# Start of comment followed by buffer boundary,
# or just a buffer boundary.
return
-
1
#
in practice, this should look like: ((name|stringlit) S*)
+ '>'
#
A simple, practical version could look like: ((name|stringlit) S*)
+ '>'
n
=
len
(
rawdata
)
decltype
,
j
=
self
.
_scan_name
(
j
,
i
)
if
rawdata
[
j
:
j
+
1
]
==
'--'
:
#comment
# Locate --.*-- as the body of the comment
return
self
.
parse_comment
(
i
)
elif
rawdata
[
j
]
==
'['
:
#marked section
# Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
# Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
# Note that this is extended by Microsoft Office "Save as Web" function
# to include [if...] and [endif].
return
self
.
parse_marked_section
(
i
)
else
:
#all other declaration elements
decltype
,
j
=
self
.
_scan_name
(
j
,
i
)
if
j
<
0
:
return
j
if
decltype
==
"doctype"
:
...
...
@@ -87,8 +111,15 @@ class ParserBase:
elif
c
in
self
.
_decl_otherchars
:
j
=
j
+
1
elif
c
==
"["
:
# this could be handled in a separate doctype parser
if
decltype
==
"doctype"
:
j
=
self
.
_parse_doctype_subset
(
j
+
1
,
i
)
elif
decltype
in
(
"attlist"
,
"linktype"
,
"link"
,
"element"
):
# must tolerate []'d groups in a content model in an element declaration
# also in data attribute specifications of attlist declaration
# also link type declaration subsets in linktype declarations
# also link attribute specification lists in link declarations
self
.
error
(
"unsupported '[' char in
%
s declaration"
%
decltype
)
else
:
self
.
error
(
"unexpected '[' char in declaration"
)
else
:
...
...
@@ -98,6 +129,42 @@ class ParserBase:
return
j
return
-
1
# incomplete
# Internal -- parse a marked section
# Override this to handle MS-word extension syntax <![if word]>content<![endif]>
def parse_marked_section(self, i, report=1):
    """Parse the marked section that starts with '<![' at self.rawdata[i].

    The status keyword after '<![' selects the closing sequence that is
    searched for: temp, cdata, ignore, include and rcdata are closed by
    ']]>', while the MS Office keywords if, else and endif are closed
    by ']>'.

    Arguments:
    i      -- index of the opening '<![' in self.rawdata.
    report -- when true, the text between '<![' and the closing sequence
              is passed to self.unknown_decl().

    Returns the index just past the closing sequence.  A negative value is
    returned when the section could not be parsed: the keyword scan
    returned a negative position, no closing sequence was found in the
    buffer, or the keyword is unknown and self.error() returned instead
    of raising.
    """
    rawdata = self.rawdata
    assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
    sectName, j = self._scan_name(i+3, i)
    if j < 0:
        return j
    if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
        # look for standard ]]> ending
        match = _markedsectionclose.search(rawdata, i+3)
    elif sectName in ("if", "else", "endif"):
        # look for MS Office ]> ending
        match = _msmarkedsectionclose.search(rawdata, i+3)
    else:
        # %r yields the same text as the old backquote repr and is also
        # valid syntax on Python 3.
        self.error('unknown status keyword %r in marked section'
                   % rawdata[i+3:j])
        # error() can be overridden to return; keep `match` bound so the
        # check below reports failure instead of hitting an unbound local.
        match = None
    if not match:
        return -1
    if report:
        j = match.start(0)
        self.unknown_decl(rawdata[i+3:j])
    return match.end(0)
# Internal -- parse comment, return length or -1 if not terminated
def parse_comment(self, i, report=1):
    """Parse the comment that starts with '<!--' at self.rawdata[i].

    Arguments:
    i      -- index where the comment is expected to start; self.error()
              is called when the text there is not '<!--'.
    report -- when true, the text between '<!--' and the closing sequence
              is passed to self.handle_comment().

    Returns the index just past the comment close, or -1 when no comment
    close is found in the buffer.
    """
    text = self.rawdata
    body_start = i + 4
    if text[i:body_start] != '<!--':
        self.error('unexpected call to parse_comment()')
    closer = _commentclose.search(text, body_start)
    if closer is None:
        # comment is not terminated in the data seen so far
        return -1
    if report:
        self.handle_comment(text[body_start:closer.start(0)])
    return closer.end(0)
# Internal -- scan past the internal subset in a <!DOCTYPE declaration,
# returning the index just past any whitespace following the trailing ']'.
def
_parse_doctype_subset
(
self
,
i
,
declstartpos
):
...
...
Lib/sgmllib.py
Dosyayı görüntüle @
3163a3b4
...
...
@@ -30,7 +30,6 @@ shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
# A complete short tag '<name/text/': group 1 is the tag name, group 2 the
# text up to the next '/'.
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# The '>' that ends a processing instruction.
piclose = re.compile('>')
# Either angle bracket.
endbracket = re.compile('[<>]')
# End of a comment: '--', optional whitespace, '>'.
commentclose = re.compile(r'--\s*>')
# A tag name: a letter followed by letters, digits, '-', '_' or '.'.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
attrfind
=
re
.
compile
(
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
...
...
@@ -145,6 +144,10 @@ class SGMLParser(markupbase.ParserBase):
break
continue
if
rawdata
.
startswith
(
"<!--"
,
i
):
# Strictly speaking, a comment is --.*--
# within a declaration tag <!...>.
# This should be removed,
# and comments handled only in parse_declaration.
k
=
self
.
parse_comment
(
i
)
if
k
<
0
:
break
i
=
k
...
...
@@ -202,19 +205,6 @@ class SGMLParser(markupbase.ParserBase):
self
.
rawdata
=
rawdata
[
i
:]
# XXX if end: check for empty stack
# Internal -- parse comment, return length or -1 if not terminated
def parse_comment(self, i, report=1):
    """Parse a '<!--' comment located at index i of self.rawdata.

    self.error() is called when the text at i is not '<!--'.  When report
    is true the comment body (the text between '<!--' and the comment
    close) is handed to self.handle_comment().

    Returns the index just past the comment close, or -1 if the comment
    is not terminated in the buffer.
    """
    buf = self.rawdata
    opener = buf[i:i+4]
    if opener != '<!--':
        self.error('unexpected call to parse_comment()')
    found = commentclose.search(buf, i+4)
    if found is None:
        return -1
    end_of_body = found.start(0)
    if report:
        self.handle_comment(buf[i+4:end_of_body])
    return found.end(0)
# Extensions for the DOCTYPE scanner:
_decl_otherchars
=
'='
...
...
@@ -471,6 +461,10 @@ class TestSGMLParser(SGMLParser):
self
.
flush
()
print
'*** unknown char ref: &#'
+
ref
+
';'
def unknown_decl(self, data):
    """Call self.flush(), then print data as '*** unknown decl: [data]'."""
    self.flush()
    print '*** unknown decl: [' + data + ']'
def
close
(
self
):
SGMLParser
.
close
(
self
)
self
.
flush
()
...
...
Lib/test/test_htmllib.py
Dosyayı görüntüle @
3163a3b4
...
...
@@ -16,6 +16,17 @@ class AnchorCollector(htmllib.HTMLParser):
def anchor_bgn(self, *args):
    """Append the positional arguments, as one tuple, to the anchor list."""
    self.__anchors.append(args)
class DeclCollector(htmllib.HTMLParser):
    """HTMLParser subclass that records each text passed to unknown_decl()."""

    def __init__(self, *args, **kwargs):
        # The list is created before the base constructor runs, and all
        # constructor arguments are forwarded to it unchanged.
        self.__decls = []
        htmllib.HTMLParser.__init__(self, *args, **kwargs)

    def get_decl_info(self):
        """Return the list of collected declaration texts."""
        return self.__decls

    def unknown_decl(self, data):
        """Append data to the list of collected declaration texts."""
        self.__decls.append(data)
class
HTMLParserTestCase
(
unittest
.
TestCase
):
def
test_anchor_collection
(
self
):
...
...
@@ -33,6 +44,22 @@ class HTMLParserTestCase(unittest.TestCase):
(
''
,
'frob'
,
''
),
])
def test_decl_collection(self):
    # See SF patch #545300
    # Feed a document holding two MS Office style marked sections and
    # check that both are reported through unknown_decl(), in order.
    parser = DeclCollector(formatter.NullFormatter(), verbose=1)
    parser.feed(
        """<html>
<body>
hallo
<![if !supportEmptyParas]> <![endif]>
</body>
</html>
""")
    parser.close()
    # assertEqual is the supported spelling; assertEquals is only a
    # deprecated alias of it.
    self.assertEqual(parser.get_decl_info(),
                     ["if !supportEmptyParas",
                      "endif"])
def
test_main
():
test_support
.
run_unittest
(
HTMLParserTestCase
)
...
...
Misc/NEWS
Dosyayı görüntüle @
3163a3b4
...
...
@@ -67,6 +67,9 @@ Extension modules
Library
-------
- sgmllib now supports SGML marked sections, in particular the
MS Office extensions.
- The urllib module now offers support for the iterator protocol.
SF patch 698520 contributed by Brett Cannon.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment