"""A parser for SGML, using the derived class as a static DTD."""

# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).  RCDATA is
# not supported at all.


# Emit the deprecation notice at import time.  stacklevel=2 attributes the
# warning to the importing module rather than to this file.
from warnings import warnpy3k
warnpy3k("the sgmllib module has been removed in Python 3.0",
         stacklevel=2)
# Keep the helper out of this module's namespace.
del warnpy3k

import markupbase
import re

# Public API of this module.
__all__ = ["SGMLParser", "SGMLParseError"]
# Regular expressions used for parsing

# Next character that may start markup: an entity/char reference or a tag.
interesting = re.compile('[&<]')
# A prefix of a reference, tag, end tag or declaration that has no
# terminator yet; used to decide whether to wait for more input.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                           '<([a-zA-Z][^<>]*|'
                              '/([a-zA-Z][^<>]*)?|'
                              '![^<>]*)?')

# Named entity reference; group 1 is the name.  The pattern also consumes
# the single character that terminates the name (';' or anything else).
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Decimal character reference; group 1 is the digits.  As above, one
# terminating character is consumed.
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' followed by a letter, or the '<>' shorthand.
starttagopen = re.compile('<[>a-zA-Z]')
# Start of the SGML short-tag form '<tag/'.
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
# Complete short tag '<tag/data/'; groups are (tag, data).
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# End of a processing instruction.
piclose = re.compile('>')
# Either bracket; used to find where a tag stops.
endbracket = re.compile('[<>]')
# Tag name.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: group 1 is the name, group 2 the optional '=value' part
# (including the '='), group 3 the value with any surrounding quotes.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')

class SGMLParseError(RuntimeError):
    """Signals any failure encountered while parsing SGML input."""


# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The DTD is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.)  The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks).  Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.

class SGMLParser(markupbase.ParserBase):
    """Incremental SGML tokenizer that dispatches to handler methods.

    Feed text with feed() and finish with close().  Start tags are routed
    to start_<tag>(attrs) or do_<tag>(attrs), end tags to end_<tag>(), and
    everything else to the handle_*/unknown_* hooks defined below.
    """

    # Matches a named entity or a decimal character reference inside an
    # attribute value.  Groups: (entity name, char-ref digits, optional ';').
    entity_or_charref = re.compile('&(?:'
      '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
      ')(;?)')

    def __init__(self, verbose=0):
        """Initialize and reset this instance.

        verbose -- when true, report_unbalanced() prints diagnostics.
        """
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.__starttag_text = None
        self.rawdata = ''       # text fed but not yet processed
        self.stack = []         # open tags that have a start_<tag> handler
        self.lasttag = '???'    # most recent start tag, for the '<>' shorthand
        self.nomoretags = 0     # set by setnomoretags()
        self.literal = 0        # set by setliteral(); cleared by an end tag
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Any positional arguments are accepted and ignored.
        Intended for derived classes only.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include newlines).  (This just saves the text,
        all the processing is done by goahead().)
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def error(self, message):
        """Raise SGMLParseError carrying *message*."""
        raise SGMLParseError(message)

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Process as much of self.rawdata as possible.

        Whatever could not be processed (an unterminated tag or reference)
        is kept in self.rawdata for the next call.  When *end* is true the
        remainder is passed to handle_data() instead of being kept.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # Everything up to EOF is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            match = interesting.search(rawdata, i)
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # In literal mode a start tag is just text.
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    # Any end tag terminates literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    # NOTE(review): parse_comment is not defined in this
                    # class; presumably inherited from markupbase.ParserBase.
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    # parse_pi() returns a length, not an index.
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    # NOTE(review): parse_declaration is not defined in this
                    # class; presumably inherited from markupbase.ParserBase.
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern swallowed one terminator character; give
                    # it back unless it was the ';' that belongs to the ref.
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    # Same terminator handling as for character references.
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Extensions for the DOCTYPE scanner:
    _decl_otherchars = '='

    # Internal -- parse processing instr, return length or -1 if not terminated
    def parse_pi(self, i):
        """Parse the processing instruction starting at index *i*.

        Calls handle_pi() with the text between '<?' and the next '>'.
        Returns the number of characters consumed, or -1 if no '>' has
        been seen yet.
        """
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    def get_starttag_text(self):
        """Return the text recorded by the latest parse_starttag() call.

        The value is None after reset() and while a non-shorthand start
        tag is still being scanned.
        """
        return self.__starttag_text

    # Internal -- handle starttag, return the index just past the tag
    # or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag beginning at index *i* of self.rawdata.

        Returns the index at which scanning should resume (an absolute
        index, not a length), or -1 if the tag is not complete yet.
        """
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            # Recorded before dispatch so that handlers invoked by
            # finish_shorttag() can call get_starttag_text().
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        # As a shortcut way to exit, this isn't so bad, but shouldn't
        # be used to locate the actual end of the start tag since the
        # < or > characters may be embedded in an attribute value.
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # Attribute given without a value: use its own name.
                attrvalue = attrname
            else:
                if (attrvalue[:1] == "'" == attrvalue[-1:] or
                    attrvalue[:1] == '"' == attrvalue[-1:]):
                    # strip quotes
                    attrvalue = attrvalue[1:-1]
                attrvalue = self.entity_or_charref.sub(
                    self._convert_ref, attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        # Consume a closing '>'.  A '<' is left in place so that the next
        # tag is parsed from there.
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    # Internal -- convert entity or character reference
    def _convert_ref(self, match):
        """Return the replacement for one entity_or_charref match.

        Used as the re.sub() callback for attribute values.  A reference
        that cannot be converted (the converter returns None) is left as
        written.  Named entities without a trailing ';' are never converted.
        """
        name, number, semicolon = match.groups()
        if number:
            replacement = self.convert_charref(number)
            # Test for None explicitly, as handle_charref() does, so that a
            # legitimately empty replacement string is not discarded.
            if replacement is None:
                return '&#%s%s' % (number, semicolon)
            return replacement
        elif semicolon:
            replacement = self.convert_entityref(name)
            if replacement is None:
                return '&%s;' % name
            return replacement
        else:
            return '&%s' % name

    # Internal -- parse endtag
    def parse_endtag(self, i):
        """Parse the end tag starting at index *i*.

        Returns the index at which scanning should resume, or -1 if no
        closing bracket has been seen yet.
        """
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
    def finish_shorttag(self, tag, data):
        """Dispatch a short tag as a start tag, its data, and an end tag."""
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    # Internal -- finish processing of start tag
    # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
    def finish_starttag(self, tag, attrs):
        """Dispatch a start tag to start_<tag>, do_<tag> or unknown_starttag.

        Only tags handled by start_<tag> are pushed on self.stack.
        Returns 1 for start_<tag>, 0 for do_<tag>, and -1 when neither
        method exists.
        """
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    # Internal -- finish processing of end tag
    def finish_endtag(self, tag):
        """Close *tag* and every tag opened after it.

        An empty *tag* (from '</>') closes the most recently opened tag.
        A tag that is not on the stack is reported via report_unbalanced()
        if an end_<tag> method exists, otherwise via unknown_endtag().
        """
        if not tag:
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                try:
                    # Only the existence of the handler matters here.
                    getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # After the loop 'found' is the index of the last (innermost)
            # occurrence of tag on the stack.
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag:
                    found = i
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        """Call the start_<tag>/do_<tag> handler *method* with *attrs*."""
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        """Call the end_<tag> handler *method*."""
        method()

    # Example -- report an unbalanced </...> tag.
    def report_unbalanced(self, tag):
        """Print a diagnostic for an unbalanced end tag if self.verbose."""
        if self.verbose:
            # Single-argument parenthesised prints: same output as the
            # multi-item print statement on Python 2.
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack: %s' % (self.stack,))

    def convert_charref(self, name):
        """Convert character reference, may be overridden.

        Returns None when *name* is not a decimal number in 0..127.
        """
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127:
            return
        return self.convert_codepoint(n)

    def convert_codepoint(self, codepoint):
        """Return the character for *codepoint*."""
        return chr(codepoint)

    def handle_charref(self, name):
        """Handle character reference, no need to override."""
        replacement = self.convert_charref(name)
        if replacement is None:
            self.unknown_charref(name)
        else:
            self.handle_data(replacement)

    # Definition of entities -- derived classes may override
    entitydefs = \
            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def convert_entityref(self, name):
        """Convert entity references.

        As an alternative to overriding this method; one can tailor the
        results by setting up the self.entitydefs mapping appropriately.
        Returns None when *name* is not in self.entitydefs.
        """
        table = self.entitydefs
        if name in table:
            return table[name]
        else:
            return None

    def handle_entityref(self, name):
        """Handle entity references, no need to override."""
        replacement = self.convert_entityref(name)
        if replacement is None:
            self.unknown_entityref(name)
        else:
            self.handle_data(replacement)

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle declaration, could be overridden
    def handle_decl(self, decl):
        pass

    # Example -- handle processing instruction, could be overridden
    def handle_pi(self, data):
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass


class TestSGMLParser(SGMLParser):
    """Parser that prints every event to stdout; used by test().

    Character data is buffered in self.testdata and printed in chunks.
    All output uses single-argument parenthesised prints, which produce
    the same text as the multi-item print statement on Python 2.
    """

    def __init__(self, verbose=0):
        # testdata must exist before SGMLParser.__init__() runs reset().
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer *data*; flush once its repr reaches 70 characters."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear any buffered character data."""
        data = self.testdata
        if data:
            self.testdata = ""
            print('data: ' + repr(data))

    def handle_comment(self, data):
        """Print the comment, eliding the middle of a long repr."""
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment: ' + r)

    def unknown_starttag(self, tag, attrs):
        """Print the start tag followed by its name="value" attributes."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            # One space between items and before the closing '>'.
            pieces = ['start tag: <' + tag]
            for name, value in attrs:
                pieces.append(name + '=' + '"' + value + '"')
            pieces.append('>')
            print(' '.join(pieces))

    def unknown_endtag(self, tag):
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        """Finish parsing, then print whatever data is still buffered."""
        SGMLParser.close(self)
        self.flush()


def test(args = None):
    """Parse a file and print the events; command-line self-test.

    args -- argument list, defaulting to sys.argv[1:].  A leading '-s'
            selects the silent SGMLParser instead of TestSGMLParser.  The
            next argument is the file to read ('-' for stdin); the default
            is 'test.html'.

    Exits with status 1 if the file cannot be opened.
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        f = sys.stdin
    else:
        try:
            f = open(filename, 'r')
        except IOError as msg:
            print('%s : %s' % (filename, msg))
            sys.exit(1)

    try:
        data = f.read()
    finally:
        # Close the file even if read() fails, but never close stdin.
        if f is not sys.stdin:
            f.close()

    x = klass()
    # Feed one character at a time to exercise incremental parsing.
    for c in data:
        x.feed(c)
    x.close()


# Run the self-test when executed as a script.
if __name__ == '__main__':
    test()