xmllib.py 26.6 KB
Newer Older
1
# A parser for XML, using the derived class as static DTD.
2
# Author: Sjoerd Mullender.
3 4 5 6 7

import re
import string


Guido van Rossum's avatar
Guido van Rossum committed
8 9
version = '0.1'

10 11 12 13 14
# Regular expressions used for parsing

_S = '[ \t\r\n]+'
_opS = '[ \t\r\n]*'
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'
Guido van Rossum's avatar
Guido van Rossum committed
15 16 17 18 19
illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
interesting = re.compile('[]&<]')

amp = re.compile('&')
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
20 21
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
Guido van Rossum's avatar
Guido van Rossum committed
22
space = re.compile(_S + '$')
23 24 25 26 27
newline = re.compile('\n')

starttagopen = re.compile('<' + _Name)
endtagopen = re.compile('</')
starttagend = re.compile(_opS + '(?P<slash>/?)>')
Guido van Rossum's avatar
Guido van Rossum committed
28
endbracket = re.compile(_opS + '>')
29
tagfind = re.compile(_Name)
30 31
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
Guido van Rossum's avatar
Guido van Rossum committed
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
# this matches one of the following:
# SYSTEM SystemLiteral
# PUBLIC PubidLiteral SystemLiteral
_SystemLiteral = '(?P<%s>\'[^\']*\'|"[^"]*")'
_PublicLiteral = '(?P<%s>"[-\'()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
                        "'[-()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
_ExternalId = '(?:SYSTEM|' \
                 'PUBLIC'+_S+_PublicLiteral%'pubid'+ \
              ')'+_S+_SystemLiteral%'syslit'
doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')'
                     '(?:'+_S+_ExternalId+')?'+_opS)
xmldecl = re.compile('<\?xml'+_S+
                     'version'+_opS+'='+_opS+'(?P<version>\'[^\']*\'|"[^"]*")'+
                     '(?:'+_S+'encoding'+_opS+'='+_opS+
                        "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
                        '"[A-Za-z][-A-Za-z0-9._]*"))?'
                     '(?:'+_S+'standalone'+_opS+'='+_opS+
                        '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
                     _opS+'\?>')
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
52
procclose = re.compile(_opS + r'\?>')
53 54 55 56 57 58 59
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')
attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')'
    '(' + _opS + '=' + _opS +
    '(?P<value>\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9.:+*%?!()_#=~]+))')
Guido van Rossum's avatar
Guido van Rossum committed
60
attrtrans = string.maketrans(' \r\n\t', '    ')
61 62 63 64


# XML parser base class -- find tags and call handler functions.
# Usage: p = XMLParser(); p.feed(data); ...; p.close().
65 66 67 68 69 70 71
# The dtd is defined by deriving a class which defines methods with
# special names to handle tags: start_foo and end_foo to handle <foo>
# and </foo>, respectively.  The data between tags is passed to the
# parser by calling self.handle_data() with some data as argument (the
# data may be split up in arbitrary chunks).  Entity references are
# passed by calling self.handle_entityref() with the entity reference
# as argument.
72 73 74 75 76

class XMLParser:

    # Interface -- initialize and reset this instance
    def __init__(self, verbose=0):
77 78
        self.verbose = verbose
        self.reset()
79 80 81

    # Interface -- reset this instance.  Loses all unprocessed data
    def reset(self):
82 83 84 85 86 87 88 89
        self.rawdata = ''
        self.stack = []
        self.nomoretags = 0
        self.literal = 0
        self.lineno = 1
        self.__at_start = 1
        self.__seen_doctype = None
        self.__seen_starttag = 0
90 91 92

    # For derived classes only -- enter literal mode (CDATA) till EOF
    def setnomoretags(self):
93
        self.nomoretags = self.literal = 1
94 95 96

    # For derived classes only -- enter literal mode (CDATA)
    def setliteral(self, *args):
97
        self.literal = 1
98 99 100 101 102 103

    # Interface -- feed some data to the parser.  Call this as
    # often as you want, with as little or as much text as you
    # want (may include '\n').  (This just saves the text, all the
    # processing is done by goahead().)
    def feed(self, data):
104 105
        self.rawdata = self.rawdata + data
        self.goahead(0)
106 107 108

    # Interface -- handle the remaining data
    def close(self):
109
        self.goahead(1)
110 111

    # Interface -- translate references
Guido van Rossum's avatar
Guido van Rossum committed
112
    def translate_references(self, data, all = 1):
        """Return `data` with references expanded.

        Character references (&#NN; and &#xHH;) are always replaced by
        the character they denote.  Entity references are replaced by
        their value from self.entitydefs only when `all` is true; the
        substituted text is then rescanned, so entity values may
        themselves contain references.  Unknown entities are reported
        through syntax_error() and left in the text.

        Problems are reported via self.syntax_error(); if that method
        returns, translation continues.
        """
        i = 0
        while 1:
            res = amp.search(data, i)
            if res is None:
                return data
            # Remember where the ampersand is: every position below is
            # relative to it, not to where the search started.
            start = res.start(0)
            res = ref.match(data, start)
            if res is None:
                self.syntax_error("bogus `&'")
                # Resume just past this ampersand so it is reported
                # exactly once.
                i = start + 1
                continue
            i = res.end(0)
            # The pattern consumes one character after the reference;
            # if it is not the terminating ';' it belongs to the text.
            if data[i - 1] != ';':
                self.syntax_error("`;' missing after entity/char reference")
                i = i-1
            name = res.group(1)
            pre = data[:start]
            post = data[i:]
            if name[0] == '#':
                if name[1] == 'x':
                    char = chr(string.atoi(name[2:], 16))
                else:
                    char = chr(string.atoi(name[1:]))
                data = pre + char + post
                i = start + len(char)
            elif all:
                if self.entitydefs.has_key(name):
                    data = pre + self.entitydefs[name] + post
                    i = start           # rescan substituted text
                else:
                    self.syntax_error('reference to unknown entity')
                    # can't do it, so keep the entity ref in
                    data = pre + '&' + name + ';' + post
                    i = start + len(name) + 2
            else:
                # just translating character references
                pass                    # i is already positioned correctly
149 150 151 152 153

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Parse as much of self.rawdata as possible.

        Character data, tags, comments, CDATA sections, declarations,
        processing instructions and references are recognized and
        dispatched to the corresponding parse_*/handle_* methods.
        Whatever cannot be parsed yet (an incomplete construct) is left
        in self.rawdata for a later call.  If `end` is true the input is
        treated as complete: leftover characters are reported as bogus
        and passed on as data, and end-of-document checks are made.

        self.lineno is advanced by the number of newlines in everything
        that is consumed.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if i > 0:
                self.__at_start = 0
            if self.nomoretags:
                # No markup is recognized any more: all remaining input
                # is character data.
                data = rawdata[i:n]
                self.handle_data(data)
                self.lineno = self.lineno + string.count(data, '\n')
                i = n
                break
            # Find the next character that may need special treatment.
            res = interesting.search(rawdata, i)
            if res:
                j = res.start(0)
            else:
                j = n
            if i < j:
                # Plain character data between i and j.
                if self.__at_start:
                    self.syntax_error('illegal data at start of file')
                self.__at_start = 0
                data = rawdata[i:j]
                if not self.stack and not space.match(data):
                    self.syntax_error('data not in content')
                if illegal.search(data):
                    self.syntax_error('illegal character in content')
                self.handle_data(data)
                self.lineno = self.lineno + string.count(data, '\n')
            i = j
            if i == n: break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # In literal mode a start tag is just data.
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + string.count(data, '\n')
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0: break
                    self.__seen_starttag = 1
                    self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
                    i = k
                    continue
                if endtagopen.match(rawdata, i):
                    k = self.parse_endtag(i)
                    if k < 0: break
                    self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
                    i = k
                    # An end tag always terminates literal mode.
                    self.literal = 0
                    continue
                if commentopen.match(rawdata, i):
                    if self.literal:
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + string.count(data, '\n')
                        i = i+1
                        continue
                    k = self.parse_comment(i)
                    if k < 0: break
                    self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
                    i = k
                    continue
                if cdataopen.match(rawdata, i):
                    k = self.parse_cdata(i)
                    if k < 0: break
                    # Count the newlines of the whole section (i..k).
                    self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
                    i = k
                    continue
                res = xmldecl.match(rawdata, i)
                if res:
                    if not self.__at_start:
                        self.syntax_error("<?xml?> declaration not at start of document")
                    version, encoding, standalone = res.group('version',
                                                              'encoding',
                                                              'standalone')
                    # The groups include the surrounding quotes.
                    if version[1:-1] != '1.0':
                        raise RuntimeError('only XML version 1.0 supported')
                    if encoding: encoding = encoding[1:-1]
                    if standalone: standalone = standalone[1:-1]
                    self.handle_xml(encoding, standalone)
                    # The declaration may span several lines.
                    self.lineno = self.lineno + string.count(res.group(0), '\n')
                    i = res.end(0)
                    continue
                res = procopen.match(rawdata, i)
                if res:
                    k = self.parse_proc(i)
                    if k < 0: break
                    self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
                    i = k
                    continue
                res = doctype.match(rawdata, i)
                if res:
                    if self.literal:
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + string.count(data, '\n')
                        i = i+1
                        continue
                    if self.__seen_doctype:
                        self.syntax_error('multiple DOCTYPE elements')
                    if self.__seen_starttag:
                        self.syntax_error('DOCTYPE not at beginning of document')
                    k = self.parse_doctype(res)
                    if k < 0: break
                    self.__seen_doctype = res.group('name')
                    self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
                    i = k
                    continue
            elif rawdata[i] == '&':
                res = charref.match(rawdata, i)
                if res is not None:
                    i = res.end(0)
                    # The pattern consumes one character after the
                    # reference; give it back unless it is the ';'.
                    if rawdata[i-1] != ';':
                        self.syntax_error("`;' missing in charref")
                        i = i-1
                    if not self.stack:
                        self.syntax_error('data not in content')
                    self.handle_charref(res.group('char')[:-1])
                    self.lineno = self.lineno + string.count(res.group(0), '\n')
                    continue
                res = entityref.match(rawdata, i)
                if res is not None:
                    i = res.end(0)
                    if rawdata[i-1] != ';':
                        self.syntax_error("`;' missing in entityref")
                        i = i-1
                    name = res.group('name')
                    if self.entitydefs.has_key(name):
                        # Splice the entity value into the input and
                        # rescan from where the reference started.
                        self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]
                        n = len(rawdata)
                        i = res.start(0)
                    else:
                        self.syntax_error('reference to unknown entity')
                        self.unknown_entityref(name)
                    self.lineno = self.lineno + string.count(res.group(0), '\n')
                    continue
            elif rawdata[i] == ']':
                # Need three characters to decide whether this is `]]>'.
                if n-i < 3:
                    break
                if cdataclose.match(rawdata, i):
                    self.syntax_error("bogus `]]>'")
                self.handle_data(rawdata[i])
                i = i+1
                continue
            else:
                raise RuntimeError('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            break
        # end while
        if i > 0:
            self.__at_start = 0
        if end and i < n:
            # No more input will arrive, so the character we are stuck
            # on cannot start anything valid: report it, pass it on as
            # data and carry on with the rest.
            data = rawdata[i]
            self.syntax_error("bogus `%s'" % data)
            if illegal.search(data):
                self.syntax_error('illegal character in content')
            self.handle_data(data)
            self.lineno = self.lineno + string.count(data, '\n')
            self.rawdata = rawdata[i+1:]
            return self.goahead(end)
        self.rawdata = rawdata[i:]
        if end:
            if not self.__seen_starttag:
                self.syntax_error('no elements in file')
            if self.stack:
                self.syntax_error('missing end tags')
                while self.stack:
                    self.finish_endtag(self.stack[-1])
323 324 325

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i):
        """Parse the comment starting at index `i` of self.rawdata.

        The comment text (without the `<!--' and `-->' delimiters) is
        passed to handle_comment().  Returns the index just past the
        closing `-->', or -1 if the comment is not terminated yet.
        Raises RuntimeError if rawdata does not hold `<!--' at `i`.
        """
        rawdata = self.rawdata
        body = i + 4
        if rawdata[i:body] != '<!--':
            raise RuntimeError('unexpected call to parse_comment')
        res = commentclose.search(rawdata, body)
        if not res:
            return -1
        stop = res.start(0)
        if doubledash.search(rawdata, body, stop):
            self.syntax_error("`--' inside comment")
        # Only look at the last character of the comment text; for an
        # empty comment (`<!---->') the preceding dash belongs to the
        # opening delimiter and must not be counted.
        if stop > body and rawdata[stop-1] == '-':
            self.syntax_error('comment cannot end in three dashes')
        if illegal.search(rawdata, body, stop):
            self.syntax_error('illegal character in comment')
        self.handle_comment(rawdata[body:stop])
        return res.end(0)
340

341 342
    # Internal -- handle DOCTYPE tag, return length or -1 if not terminated
    def parse_doctype(self, res):
        """Finish parsing a DOCTYPE declaration matched by `res`.

        `res` is the match object of the module-level `doctype` pattern
        on self.rawdata.  handle_doctype() is called with the document
        type name, the public and system identifiers (quotes removed,
        None when absent) and the text of the internal subset between
        `[' and `]' (None when there is no subset).  Returns the index
        just past the closing `>', or -1 if the declaration is not
        complete yet.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        name = res.group('name')
        pubid, syslit = res.group('pubid', 'syslit')
        if pubid is not None:
            # remove quotes, then normalize runs of whitespace
            pubid = string.join(string.split(pubid[1:-1]))
        if syslit is not None:
            syslit = syslit[1:-1]       # remove quotes
        start = pos = res.end(0)
        if pos >= n:
            return -1
        if rawdata[pos] == '[':
            # Scan the internal subset for its closing `]', skipping
            # quoted strings and nested <...> declarations.
            depth = 0
            in_dq = in_sq = 0
            pos = pos + 1
            while pos < n:
                c = rawdata[pos]
                if c == '"' and not in_sq:
                    in_dq = not in_dq
                elif c == "'" and not in_dq:
                    in_sq = not in_sq
                elif in_sq or in_dq:
                    pass
                elif c == ']' and depth <= 0:
                    close = endbracket.match(rawdata, pos+1)
                    if not close:
                        return -1
                    self.handle_doctype(name, pubid, syslit,
                                        rawdata[start+1:pos])
                    return close.end(0)
                elif c == '<':
                    depth = depth + 1
                elif c == '>':
                    depth = depth - 1
                    if depth < 0:
                        self.syntax_error("bogus `>' in DOCTYPE")
                pos = pos + 1
        close = endbracket.search(rawdata, pos)
        if not close:
            return -1
        if close.start(0) != pos:
            self.syntax_error('garbage in DOCTYPE')
        self.handle_doctype(name, pubid, syslit, None)
        return close.end(0)
386 387

    # Internal -- handle CDATA tag, return length or -1 if not terminated
388
    def parse_cdata(self, i):
        """Parse the CDATA section starting at index `i` of self.rawdata.

        The section's text is passed to handle_cdata().  Returns the
        index just past the closing `]]>', or -1 if the section is not
        terminated yet.  Raises RuntimeError if rawdata does not hold
        `<![CDATA[' at `i`.
        """
        rawdata = self.rawdata
        body = i + 9                    # first character after `<![CDATA['
        if rawdata[i:body] != '<![CDATA[':
            raise RuntimeError('unexpected call to parse_cdata')
        close = cdataclose.search(rawdata, body)
        if not close:
            return -1
        stop = close.start(0)
        if illegal.search(rawdata, body, stop):
            self.syntax_error('illegal character in CDATA')
        if not self.stack:
            self.syntax_error('CDATA not in content')
        self.handle_cdata(rawdata[body:stop])
        return close.end(0)
401

402 403 404
    __xml_attributes = {'version': '1.0', 'standalone': 'no', 'encoding': None}
    # Internal -- handle a processing instruction tag
    def parse_proc(self, i):
        """Parse the processing instruction starting at index `i`.

        handle_proc() is called with the target name and the text
        between the name and the closing `?>'.  Returns the index just
        past `?>', or -1 if the instruction is not terminated yet.
        Raises RuntimeError if no name follows the opening `<?'.
        """
        rawdata = self.rawdata
        close = procclose.search(rawdata, i)
        if not close:
            return -1
        stop = close.start(0)
        if illegal.search(rawdata, i+2, stop):
            self.syntax_error('illegal character in processing instruction')
        target = tagfind.match(rawdata, i+2)
        if not target:
            raise RuntimeError('unexpected call to parse_proc')
        name = target.group(0)
        # Any target containing "xml" (in any case) is reported.
        if string.find(string.lower(name), 'xml') >= 0:
            self.syntax_error('illegal processing instruction target name')
        self.handle_proc(name, rawdata[target.end(0):stop])
        return close.end(0)
421 422 423

    # Internal -- parse attributes between i and j
    def parse_attributes(self, tag, k, j, attributes = None):
        """Parse the attributes of a start tag.

        Scans self.rawdata from index `k` up to `j` for attribute
        specifications.  `tag` is only used in error messages.
        `attributes` describes the attributes allowed for the element:
        None (anything goes), a sequence of allowed names, or a mapping
        from allowed names to default values (None meaning "no
        default").

        Returns (attrdict, k): the dictionary of attribute values, with
        whitespace normalized and references translated, and the index
        at which scanning stopped.
        """
        rawdata = self.rawdata
        # Now parse the data between k and j into a tag and attrs
        attrdict = {}
        # A plain sequence of names is turned into a dictionary without
        # defaults.  Mappings are left alone: iterating over one would
        # yield its keys where mappings are iterable, and the conversion
        # would then silently throw the declared defaults away.
        if attributes is not None and not hasattr(attributes, 'has_key'):
            try:
                d = {}
                for a in attributes:
                    d[a] = None
                attributes = d
            except TypeError:
                pass
        while k < j:
            res = attrfind.match(rawdata, k)
            if not res: break
            attrname, attrvalue = res.group('name', 'value')
            if attrvalue is None:
                self.syntax_error('no attribute value specified')
                attrvalue = attrname
            elif attrvalue[:1] == "'" == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            else:
                self.syntax_error('attribute value not quoted')
            if attributes is not None and not attributes.has_key(attrname):
                self.syntax_error('unknown attribute %s of element %s' %
                                  (attrname, tag))
            if attrdict.has_key(attrname):
                self.syntax_error('attribute specified twice')
            # Every whitespace character in the value becomes a space.
            attrvalue = string.translate(attrvalue, attrtrans)
            attrdict[attrname] = self.translate_references(attrvalue)
            k = res.end(0)
        if attributes is not None:
            # fill in with default attributes
            for key, val in attributes.items():
                if val is not None and not attrdict.has_key(key):
                    attrdict[key] = val
        return attrdict, k
461 462 463

    # Internal -- handle starttag, return length or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag beginning at index `i` of self.rawdata.

        The tag name and its attributes are passed to finish_starttag();
        for an empty-element tag (`<name .../>') finish_endtag() is
        called as well.  Returns the index just past the closing `>', or
        -1 if the tag is not terminated yet.  Raises RuntimeError if no
        name follows the `<'.
        """
        rawdata = self.rawdata
        # i points to start of tag
        close = endbracket.search(rawdata, i+1)
        if not close:
            return -1
        found = tagfind.match(rawdata, i+1)
        if not found:
            raise RuntimeError('unexpected call to parse_starttag')
        tag = found.group(0)
        # The first element must be the one named in the DOCTYPE.
        if not self.__seen_starttag and self.__seen_doctype \
           and tag != self.__seen_doctype:
            self.syntax_error('starttag does not match DOCTYPE')
        if self.__seen_starttag and not self.stack:
            self.syntax_error('multiple elements on top level')
        # An optional <tag>_attributes attribute on the parser declares
        # the attributes of the element.
        attributes = None
        if hasattr(self, tag + '_attributes'):
            attributes = getattr(self, tag + '_attributes')
        attrdict, k = self.parse_attributes(tag, found.end(0),
                                            close.start(0), attributes)
        tail = starttagend.match(rawdata, k)
        if not tail:
            self.syntax_error('garbage in start tag')
        self.finish_starttag(tag, attrdict)
        if tail and tail.group('slash') == '/':
            self.finish_endtag(tag)
        return close.end(0)
492 493 494

    # Internal -- parse endtag
    def parse_endtag(self, i):
        """Parse the end tag beginning at index `i` of self.rawdata.

        The tag name ('' when none is given) is passed to
        finish_endtag().  Returns the index just past the closing `>',
        or -1 if the tag is not terminated yet.
        """
        rawdata = self.rawdata
        close = endbracket.search(rawdata, i+1)
        if not close:
            return -1
        found = tagfind.match(rawdata, i+2)
        if found:
            tag = found.group(0)
            k = found.end(0)
        else:
            self.syntax_error('no name specified in end tag')
            tag = ''
            k = i+2
        # Nothing but optional whitespace may follow the name.
        if k != close.start(0):
            self.syntax_error('garbage in end tag')
        self.finish_endtag(tag)
        return close.end(0)
511 512 513 514

    # Internal -- finish processing of start tag
    # Return -1 for unknown tag, 1 for balanced tag
    def finish_starttag(self, tag, attrs):
515
        self.stack.append(tag)
Guido van Rossum's avatar
Guido van Rossum committed
516 517 518
        methodname = 'start_' + tag
        if hasattr(self, methodname):
            method = getattr(self, methodname)
519 520
            self.handle_starttag(tag, method, attrs)
            return 1
Guido van Rossum's avatar
Guido van Rossum committed
521 522 523
        else:
            self.unknown_starttag(tag, attrs)
            return -1
524 525 526

    # Internal -- finish processing of end tag
    def finish_endtag(self, tag):
Guido van Rossum's avatar
Guido van Rossum committed
527
        methodname = 'end_' + tag
528 529 530 531 532 533 534 535 536
        if not tag:
            self.syntax_error('name-less end tag')
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                self.syntax_error('unopened end tag')
Guido van Rossum's avatar
Guido van Rossum committed
537 538 539 540
                if hasattr(self, methodname):
                    method = getattr(self, methodname)
                    self.handle_endtag(tag, method)
                else:
541 542 543 544 545 546 547 548 549 550
                    self.unknown_endtag(tag)
                return
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag:
                    found = i
        while len(self.stack) > found:
            if found < len(self.stack) - 1:
                self.syntax_error('missing close tag for %s' % self.stack[-1])
            tag = self.stack[-1]
Guido van Rossum's avatar
Guido van Rossum committed
551 552
            if hasattr(self, methodname):
                method = getattr(self, methodname)
553 554 555 556
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]
557

558 559
    # Overridable -- handle xml processing instruction
    def handle_xml(self, encoding, standalone):
560
        pass
561 562

    # Overridable -- handle DOCTYPE
Guido van Rossum's avatar
Guido van Rossum committed
563
    def handle_doctype(self, tag, pubid, syslit, data):
564
        pass
565

566 567
    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
568
        method(attrs)
569 570 571

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
572
        method()
573 574 575

    # Example -- handle character reference, no need to override
    def handle_charref(self, name):
576 577 578 579 580 581 582 583 584 585 586 587
        try:
            if name[0] == 'x':
                n = string.atoi(name[1:], 16)
            else:
                n = string.atoi(name)
        except string.atoi_error:
            self.unknown_charref(name)
            return
        if not 0 <= n <= 255:
            self.unknown_charref(name)
            return
        self.handle_data(chr(n))
588 589

    # Definition of entities -- derived classes may override
Guido van Rossum's avatar
Guido van Rossum committed
590 591 592 593 594 595
    entitydefs = {'lt': '&#60;',        # must use charref
                  'gt': '&#62;',
                  'amp': '&#38;',       # must use charref
                  'quot': '&#34;',
                  'apos': '&#39;',
                  }
596 597 598

    # Example -- handle entity reference, no need to override
    def handle_entityref(self, name):
599 600 601 602 603 604
        table = self.entitydefs
        if table.has_key(name):
            self.handle_data(table[name])
        else:
            self.unknown_entityref(name)
            return
605 606 607

    # Example -- handle data, should be overridden
    def handle_data(self, data):
608
        pass
609 610 611

    # Example -- handle cdata, could be overridden
    def handle_cdata(self, data):
612
        pass
613 614 615

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
616
        pass
617 618 619

    # Example -- handle processing instructions, could be overridden
    def handle_proc(self, name, data):
620
        pass
621 622

    # Example -- handle relatively harmless syntax errors, could be overridden
623
    def syntax_error(self, message):
624
        raise RuntimeError, 'Syntax error at line %d: %s' % (self.lineno, message)
625 626 627 628 629 630 631 632 633 634 635

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass


class TestXMLParser(XMLParser):

    def __init__(self, verbose=0):
636 637
        self.testdata = ""
        XMLParser.__init__(self, verbose)
638

639
    def handle_xml(self, encoding, standalone):
640 641
        self.flush()
        print 'xml: encoding =',encoding,'standalone =',standalone
642

Guido van Rossum's avatar
Guido van Rossum committed
643
    def handle_doctype(self, tag, pubid, syslit, data):
644 645
        self.flush()
        print 'DOCTYPE:',tag, `data`
646

Guido van Rossum's avatar
Guido van Rossum committed
647 648 649 650
    def handle_entity(self, name, strval, pubid, syslit, ndata):
        self.flush()
        print 'ENTITY:',`data`

651
    def handle_data(self, data):
652 653 654
        self.testdata = self.testdata + data
        if len(`self.testdata`) >= 70:
            self.flush()
655 656

    def flush(self):
657 658 659 660
        data = self.testdata
        if data:
            self.testdata = ""
            print 'data:', `data`
661 662

    def handle_cdata(self, data):
663 664
        self.flush()
        print 'cdata:', `data`
665 666

    def handle_proc(self, name, data):
667 668
        self.flush()
        print 'processing:',name,`data`
669 670

    def handle_comment(self, data):
671 672 673 674 675
        self.flush()
        r = `data`
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print 'comment:', r
676

677
    def syntax_error(self, message):
678
        print 'error at line %d:' % self.lineno, message
679 680

    def unknown_starttag(self, tag, attrs):
681 682 683 684 685 686 687 688
        self.flush()
        if not attrs:
            print 'start tag: <' + tag + '>'
        else:
            print 'start tag: <' + tag,
            for name, value in attrs.items():
                print name + '=' + '"' + value + '"',
            print '>'
689 690

    def unknown_endtag(self, tag):
691 692
        self.flush()
        print 'end tag: </' + tag + '>'
693 694

    def unknown_entityref(self, ref):
695 696
        self.flush()
        print '*** unknown entity ref: &' + ref + ';'
697 698

    def unknown_charref(self, ref):
699 700
        self.flush()
        print '*** unknown char ref: &#' + ref + ';'
701 702

    def close(self):
703 704
        XMLParser.close(self)
        self.flush()
705 706 707 708 709

def test(args = None):
    import sys

    if not args:
710
        args = sys.argv[1:]
711 712

    if args and args[0] == '-s':
713 714
        args = args[1:]
        klass = XMLParser
715
    else:
716
        klass = TestXMLParser
717 718

    if args:
719
        file = args[0]
720
    else:
721
        file = 'test.xml'
722 723

    if file == '-':
724
        f = sys.stdin
725
    else:
726 727 728 729 730
        try:
            f = open(file, 'r')
        except IOError, msg:
            print file, ":", msg
            sys.exit(1)
731 732 733

    data = f.read()
    if f is not sys.stdin:
734
        f.close()
735 736

    x = klass()
Guido van Rossum's avatar
Guido van Rossum committed
737 738 739 740 741 742 743
    try:
        for c in data:
            x.feed(c)
        x.close()
    except RuntimeError, msg:
        print msg
        sys.exit(1)
744 745 746 747


if __name__ == '__main__':
    test()