"""A parser for SGML, using the derived class as a static DTD."""

# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).  RCDATA is
# not supported at all.


import markupbase
import re

# Public API: the parser base class and the exception its error() method
# raises, so that "from sgmllib import *" users can catch parse errors.
__all__ = ["SGMLParser", "SGMLParseError"]

# Regular expressions used for parsing

# Next character that needs special handling: the start of a reference
# ('&') or of markup ('<').
interesting = re.compile('[&<]')
# Prefix of a reference, start tag, end tag or declaration.  goahead()
# treats a match that reaches the end of the buffer as unfinished input
# and waits for more data.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                           '<([a-zA-Z][^<>]*|'
                              '/([a-zA-Z][^<>]*)?|'
                              '![^<>]*)?')

# Entity and numeric character references.  Both patterns require one
# terminating character after the name/digits (';' or any other character
# that cannot continue the reference); group 1 is the name or the digits.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' followed by a letter, or the '<>' shorthand.
starttagopen = re.compile('<[>a-zA-Z]')
# SGML short tag '<tag/data/': the opening part, and the complete form
# (group 1 is the tag name, group 2 the data).
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# Terminators: processing instruction, tag, and comment.
piclose = re.compile('>')
endbracket = re.compile('[<>]')
commentclose = re.compile(r'--\s*>')
# Tag name at the start of a tag.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: group 1 is the name, group 2 the optional '=value' part,
# group 3 the value (still including its quotes, if any).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~\'"]*))?')


class SGMLParseError(RuntimeError):
    """Signal a parse error detected by the SGML parser.

    SGMLParser.error() raises this exception with the error message.
    """


# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The DTD is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.)  The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks).  Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.

class SGMLParser(markupbase.ParserBase):
    """Find SGML tags in fed text and call handler methods for them.

    Usage: p = SGMLParser(); p.feed(data); ...; p.close().

    A derived class acts as the DTD by defining specially named methods:
    start_foo(attrs) and end_foo() handle <foo> and </foo>, do_foo(attrs)
    handles a <foo> that has no matching end tag.  Tag and attribute names
    are converted to lower case before dispatch.  Text between tags is
    passed to handle_data(), possibly split into arbitrary chunks.
    """

    def __init__(self, verbose=0):
        """Initialize and reset this instance.

        verbose -- when true, report_unbalanced() prints diagnostics.
        """
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.rawdata = ''       # text fed but not yet consumed by goahead()
        self.stack = []         # open tags that have a start_* handler
        self.lasttag = '???'    # last start tag seen; used for '<>'
        self.nomoretags = 0     # true after setnomoretags()
        self.literal = 0        # true while in literal (CDATA) mode
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Any positional arguments are accepted and ignored.  goahead()
        leaves literal mode again after the next end tag.

        Intended for derived classes only.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').  (This just saves the text,
        all the processing is done by goahead().)
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def error(self, message):
        """Raise SGMLParseError with the given message."""
        raise SGMLParseError(message)

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # Everything up to EOF is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            # Pass on the plain text before the next '&' or '<'.
            match = interesting.search(rawdata, i)
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # Start tags are not markup in literal mode.
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    # Any end tag terminates literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # parse_comment() returns the index after the comment.
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    # parse_pi() returns a length, not an index.
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    # parse_declaration() is not defined in this class; it
                    # is presumably inherited from markupbase.ParserBase.
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    # References are not expanded in literal mode.
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern consumed one terminating character; give
                    # it back unless it was the ';' that ends the reference.
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    # Same terminator handling as for character references.
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            # At EOF whatever is left over is passed on as plain data.
            self.handle_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Internal -- parse comment starting at index i; return the index just
    # past the comment, or -1 if it is not terminated yet
    def parse_comment(self, i, report=1):
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
        match = commentclose.search(rawdata, i+4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Extensions for the DOCTYPE scanner (not referenced in this class;
    # presumably consulted by the inherited declaration parser):
    _decl_otherchars = '='

    # Internal -- parse processing instr starting at index i; return its
    # length (not an index), or -1 if it is not terminated yet
    def parse_pi(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    __starttag_text = None
    def get_starttag_text(self):
        """Return the source text of the most recently parsed start tag.

        The value is None before any start tag has been parsed.
        """
        return self.__starttag_text

    # Internal -- handle starttag at index i; return the index just past
    # the tag, or -1 if it is not terminated yet
    def parse_starttag(self, i):
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            # Value seen by handlers that call get_starttag_text() while
            # the short tag is being processed.
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # Attribute without '=value': the name doubles as value.
                attrvalue = attrname
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the surrounding pair of quotes.
                attrvalue = attrvalue[1:-1]
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        # A tag may also be cut short by the '<' of the next tag; only a
        # real '>' is consumed.
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    # Internal -- parse endtag at index i; return the index just past the
    # tag, or -1 if it is not terminated yet
    def parse_endtag(self, i):
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
    def finish_shorttag(self, tag, data):
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    # Internal -- finish processing of start tag
    # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
    def finish_starttag(self, tag, attrs):
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            # Only tags with a start_* handler are tracked on the stack.
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    # Internal -- finish processing of end tag
    def finish_endtag(self, tag):
        if not tag:
            # Empty end tag </>: close the innermost open element.
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                # Not open: distinguish unknown tags from known but
                # unbalanced ones by looking for an end_* handler.
                try:
                    getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # Find the innermost (last) occurrence of the tag.
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag:
                    found = i
        # Close the tag and every element opened after it.
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        method()

    # Example -- report an unbalanced </...> tag.
    def report_unbalanced(self, tag):
        if self.verbose:
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack: %s' % (self.stack,))

    def handle_charref(self, name):
        """Handle character reference, no need to override.

        References that are not decimal numbers in the range 0..255 are
        passed to unknown_charref(); others become data via handle_data().
        """
        try:
            n = int(name)
        except ValueError:
            self.unknown_charref(name)
            return
        if not 0 <= n <= 255:
            self.unknown_charref(name)
            return
        self.handle_data(chr(n))

    # Definition of entities -- derived classes may override
    entitydefs = \
            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def handle_entityref(self, name):
        """Handle entity references.

        There should be no need to override this method; it can be
        tailored by setting up the self.entitydefs mapping appropriately.
        Names missing from the mapping go to unknown_entityref().
        """
        table = self.entitydefs
        if name in table:
            self.handle_data(table[name])
        else:
            self.unknown_entityref(name)

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle declaration, could be overridden
    # (not called in this class; presumably called by the inherited
    # declaration parser)
    def handle_decl(self, decl):
        pass

    # Example -- handle processing instruction, could be overridden
    def handle_pi(self, data):
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass
    def unknown_decl(self, data): pass


class TestSGMLParser(SGMLParser):
    """Demo parser that prints the events it receives to standard output.

    Character data is collected in self.testdata and printed in chunks;
    every other event first flushes the pending data so that output stays
    in document order.
    """

    def __init__(self, verbose=0):
        # testdata must exist before the base initializer runs reset().
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer data; print it once its repr reaches 70 characters."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear the buffered character data, if any."""
        data = self.testdata
        if data:
            self.testdata = ""
            print('data: ' + repr(data))

    def handle_comment(self, data):
        """Print the repr of a comment, eliding the middle of long ones."""
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment: ' + r)

    def unknown_starttag(self, tag, attrs):
        """Print a start tag together with its attributes."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            # Build the whole line first; the pieces are separated by
            # single spaces, including one before the closing '>'.
            parts = ['start tag: <' + tag]
            for name, value in attrs:
                parts.append(name + '=' + '"' + value + '"')
            parts.append('>')
            print(' '.join(parts))

    def unknown_endtag(self, tag):
        """Print an end tag."""
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        """Print a note about an entity reference that is not defined."""
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        """Print a note about a character reference that was rejected."""
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def close(self):
        """Process the remaining input, then print any buffered data."""
        SGMLParser.close(self)
        self.flush()


def test(args = None):
    """Parse a file and, unless '-s' is given, print the events found.

    args -- command-line style argument list; when empty or None,
            sys.argv[1:] is used.  A leading '-s' selects the silent
            SGMLParser instead of TestSGMLParser.  The next argument is
            the file to read ('-' for standard input); it defaults to
            'test.html'.

    Exits with status 1 if the file cannot be opened.
    """
    import sys

    if not args:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        f = sys.stdin
    else:
        try:
            f = open(filename, 'r')
        except IOError:
            # sys.exc_info() works regardless of the except-clause syntax
            # supported by the running interpreter.
            msg = sys.exc_info()[1]
            print('%s : %s' % (filename, msg))
            sys.exit(1)

    try:
        data = f.read()
    finally:
        # Close the file even if reading fails; never close stdin.
        if f is not sys.stdin:
            f.close()

    x = klass()
    # Feed one character at a time to exercise incremental parsing.
    for c in data:
        x.feed(c)
    x.close()


# Run the demo when this file is executed as a script.
if __name__ == '__main__':
    test()