parser.py 19.1 KB
Newer Older
1
"""A parser for HTML and XHTML."""
2 3 4 5 6 7 8 9 10

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).


11
import _markupbase
12
import re
13
import warnings
14 15 16 17

# Regular expressions used for parsing

# Characters that interrupt plain text outside of CDATA elements.
interesting_normal = re.compile('[&<]')
# Start of something that may become an entity or character reference.
incomplete = re.compile('&[a-zA-Z#]')

# Both reference patterns also consume the character FOLLOWING the
# reference (normally ';'), so they cannot match at the very end of the
# buffer; callers step back by one when that character is not ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Raw string: '\s' is a regular-expression escape, not a valid Python
# string escape, and triggers a warning in a plain string literal.
tagfind = re.compile(r'([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
30 31 32 33 34 35
# Note:
#  1) the strict attrfind isn't really strict, but we can't make it
#     correctly strict without breaking backward compatibility;
#  2) if you change attrfind remember to update locatestarttagend too;
#  3) if you change attrfind and/or locatestarttagend the parser will
#     explode, so don't do it.
36 37
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
38
    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
39
attrfind_tolerant = re.compile(
40
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
41
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
42 43 44 45 46 47 48 49
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
50
         )
51 52 53 54 55
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
56 57
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
58 59
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
60
      (?:\s*=+\s*                    # value indicator
61
        (?:'[^']*'                   # LITA-enclosed value
62 63
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
64 65
         )
         (?:\s*,)*                   # possibly followed by a comma
66
       )?(?:\s|/(?!>))*
67 68
     )*
   )?
69 70
  \s*                                # trailing whitespace
""", re.VERBOSE)
71
endendtag = re.compile('>')
72 73
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')


class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg: the error description passed to the constructor.
        lineno: first item of *position*, or None when not supplied.
        offset: second item of *position*, or None when not supplied;
            str() reports it as ``offset + 1``.
    """

    def __init__(self, msg, position=(None, None)):
        """Store *msg* and the (lineno, offset) pair given as *position*."""
        assert msg
        # Populate Exception.args; without it the exception cannot be
        # re-created from __reduce__() (pickle/copy) and repr() is empty.
        super().__init__(msg)
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message followed by whatever position is known."""
        parts = [self.msg]
        if self.lineno is not None:
            parts.append(", at line %d" % self.lineno)
        if self.offset is not None:
            parts.append(", column %d" % (self.offset + 1))
        return "".join(parts)


95
class HTMLParser(_markupbase.ParserBase):
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """
114 115 116

    CDATA_CONTENT_ELEMENTS = ("script", "style")

117
    def __init__(self, strict=False):
        """Create the parser and bring it to its initial state.

        When *strict* is false (the default) invalid markup is parsed as
        well as possible; when it is true an error is raised instead.
        Strict mode is deprecated: asking for it issues a
        DeprecationWarning attributed to the caller.
        """
        if strict:
            message = "The strict mode is deprecated."
            # stacklevel=2 points the warning at the code that
            # instantiated the parser rather than at this method.
            warnings.warn(message, DeprecationWarning, stacklevel=2)
        self.strict = strict
        self.reset()

    def reset(self):
        """Restore the initial parser state; unprocessed input is dropped."""
        # Empty input buffer and placeholder for the last start tag seen.
        self.rawdata, self.lasttag = '', '???'
        # Not inside a CDATA content element: scan for '&' and '<'.
        self.interesting, self.cdata_elem = interesting_normal, None
        _markupbase.ParserBase.reset(self)
137 138

    def feed(self, data):
139
        r"""Feed data to the parser.
140 141 142 143

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
144 145 146 147
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
148
        """Handle any buffered data."""
149 150
        self.goahead(1)

151 152
    def error(self, message):
        """Raise HTMLParseError for *message* at the current parse position."""
        position = self.getpos()
        raise HTMLParseError(message, position)
153 154 155 156

    __starttag_text = None

    def get_starttag_text(self):
157
        """Return full source of start tag: '<...>'."""
158 159
        return self.__starttag_text

160 161
    def set_cdata_mode(self, elem):
        self.cdata_elem = elem.lower()
162
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
163 164 165

    def clear_cdata_mode(self):
        """Leave CDATA mode: forget the element and scan for '&' and '<' again."""
        self.cdata_elem = None
        self.interesting = interesting_normal
167 168 169 170 171 172 173 174 175 176 177 178 179

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Parse as much of self.rawdata as possible and dispatch handlers.

        Text, tags, comments, declarations, processing instructions and
        entity/character references are recognised in turn and passed to
        the matching handle_*()/parse_*() method.  If *end* is false,
        parsing stops at the first construct that may still be completed
        by later input, and the unprocessed tail is kept in self.rawdata.
        If *end* is true the buffer is treated as complete: in strict
        mode an unterminated construct raises HTMLParseError, otherwise
        it is reported as plain data.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                # Inside a CDATA element nothing is emitted until the
                # closing tag has been seen.
                if self.cdata_elem:
                    break
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    if self.strict:
                        k = self.parse_declaration(i)
                    else:
                        k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    # A '<' that cannot start markup is ordinary text.
                    self.handle_data("<")
                    k = i + 1
                else:
                    # Lone '<' at the end of the buffer: wait for more.
                    break
                if k < 0:
                    # The construct is not terminated in the buffer.
                    if not end:
                        break
                    if self.strict:
                        self.error("EOF in middle of construct")
                    # Tolerant recovery: emit the text up to and including
                    # the next '>', else up to the next '<', else just
                    # the single '<' character.
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # charref also consumed the character after the
                    # reference; give it back unless it was the ';'.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if rawdata.find(";", i) != -1:  # bail by consuming &#
                        # Use offsets relative to i: the original sliced
                        # rawdata[0:2] and jumped to absolute position 2,
                        # emitting the wrong text and moving backwards.
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i + 2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # Same give-back rule as for character references.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        if self.strict:
                            self.error("EOF in middle of entity or char ref")
                        # Tolerant mode: leave i untouched so the EOF
                        # flush below reports the dangling reference as
                        # text.  (The original read 'k' here, which could
                        # be unbound, and skipped the '&'.)
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                # Raised explicitly so the check survives "python -O";
                # otherwise i would never advance and the loop would spin.
                raise AssertionError("interesting.search() lied")
        # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

272 273 274 275 276
    # Internal -- parse html declarations, return end or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        rawdata = self.rawdata
277 278
        assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                        'parse_html_declaration()')
279
        if rawdata[i:i+4] == '<!--':
280
            # this case is actually already handled in goahead()
281 282 283 284 285
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
286
            gtpos = rawdata.find('>', i+9)
287 288 289 290 291 292 293
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

294 295 296 297
    # Internal -- parse bogus comment, return end or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
298 299
        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                                'parse_comment()')
300 301 302 303 304 305 306
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332
    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse the processing instruction starting at index *i*.

        The text between '<?' and the next '>' is passed to handle_pi().
        Returns the index just past the '>', or -1 if there is no '>' yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        closing = piclose.search(rawdata, i+2)  # >
        if closing is None:
            return -1
        self.handle_pi(rawdata[i+2:closing.start()])
        return closing.end()

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag beginning at index *i* of self.rawdata.

        Returns the index just past the tag, or the negative value from
        check_for_whole_start_tag() when the tag is not complete yet.
        The tag name and attribute names are lowercased; each attribute
        is a (name, value) pair whose value is None when no '=value'
        part is present, and otherwise has one pair of matching
        surrounding quotes removed and is passed through unescape().
        A tag ending in '/>' is reported via handle_startendtag(), any
        other via handle_starttag(); tags listed in
        CDATA_CONTENT_ELEMENTS then switch the parser to CDATA mode.
        If unparsable text remains before the closing '>', strict mode
        raises HTMLParseError and tolerant mode reports the whole tag as
        data.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        # The mode cannot change while one tag is parsed, so pick the
        # attribute pattern once instead of on every iteration.
        attr_re = attrfind if self.strict else attrfind_tolerant
        while k < endpos:
            m = attr_re.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Junk between the last attribute and the closing '>'.  (The
            # line/offset bookkeeping that used to live here was never
            # read by anything, so it has been removed.)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Locate the end of the start tag beginning at index *i*.

        Returns the index just past the tag's closing '>' or '/>', or -1
        when the buffer may hold only part of the tag.  When the text
        after the recognised part cannot belong to a tag, strict mode
        raises HTMLParseError and tolerant mode returns the index where
        recognition stopped.
        """
        rawdata = self.rawdata
        if self.strict:
            m = locatestarttagend.match(rawdata, i)
        else:
            m = locatestarttagend_tolerant.match(rawdata, i)
        if not m:
            raise AssertionError("we should not get here!")
        j = m.end()
        # Named 'following' rather than 'next' to avoid shadowing the
        # builtin.
        following = rawdata[j:j+1]
        if following == ">":
            return j + 1
        if following == "/":
            if rawdata.startswith("/>", j):
                return j + 2
            # A '/' that is not followed by '>' has always been reported
            # as incomplete: the original follow-up test
            # rawdata.startswith("/", j) could not fail here, so the
            # "malformed empty start tag" code behind it was unreachable
            # and has been removed.
            return -1
        if following == "":
            # end of input
            return -1
        if following in ("abcdefghijklmnopqrstuvwxyz=/"
                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
            # end of input in or before attribute value, or we have the
            # '/' from a '/>' ending
            return -1
        if self.strict:
            self.updatepos(i, j)
            self.error("malformed start tag")
        if j > i:
            return j
        else:
            return i + 1
418 419 420 421 422 423 424 425

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag starting at index *i* of self.rawdata.

        Returns the index just past the construct, or -1 when no '>'
        follows yet.  A well-formed '</name>' is reported through
        handle_endtag() with the lowercased name and ends CDATA mode;
        while in CDATA mode, any other end tag is reported as data.
        Outside CDATA mode a malformed end tag raises HTMLParseError in
        strict mode; in tolerant mode '</>' is skipped, a tag without a
        recognisable name is treated as a bogus comment, and otherwise
        everything between the name and the next '>' is ignored.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        closing = endendtag.search(rawdata, i+1)  # >
        if closing is None:
            return -1
        gtpos = closing.end()
        in_cdata = self.cdata_elem is not None
        wellformed = endtagfind.match(rawdata, i)  # </ + tag + >

        if wellformed is not None:
            elem = wellformed.group(1).lower()  # script or style
            if in_cdata and elem != self.cdata_elem:
                # Some other element's end tag inside CDATA content.
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            self.handle_endtag(elem)
            self.clear_cdata_mode()
            return gtpos

        # Malformed end tag from here on.
        if in_cdata:
            self.handle_data(rawdata[i:gtpos])
            return gtpos
        if self.strict:
            self.error("bad end tag: %r" % (rawdata[i:gtpos],))
        # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
        namematch = tagfind_tolerant.match(rawdata, i+2)
        if namematch is None:
            # w3.org/TR/html5/tokenization.html#end-tag-open-state
            if rawdata[i:i+3] == '</>':
                return i+3
            return self.parse_bogus_comment(i)
        tagname = namematch.group().lower()
        # consume and ignore other stuff between the name and the >
        # Note: this is not 100% correct, since we might have things like
        # </tag attr=">">, but looking for > after the name should cover
        # most of the cases and is much simpler
        gtpos = rawdata.find('>', namematch.end())
        self.handle_endtag(tagname)
        return gtpos+1
460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Handle a start tag; does nothing by default.

        *tag* is the lowercased tag name and *attrs* a list of
        (name, value) pairs as built by parse_starttag().
        """

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Handle an end tag (*tag* is the lowercased name); no-op by default."""

    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Handle a numeric character reference; no-op by default.

        *name* is the text between '&#' and the terminating character,
        e.g. '62' or 'x3E'.
        """

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Handle a named entity reference; no-op by default.

        *name* is the entity name without the leading '&' and the
        terminating character, e.g. 'amp'.
        """

    # Overridable -- handle data
    def handle_data(self, data):
        """Handle a chunk of text *data*; does nothing by default."""

    # Overridable -- handle comment
    def handle_comment(self, data):
        """Handle the text *data* of a comment; does nothing by default."""

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        """Handle a declaration; does nothing by default.

        *decl* is the declaration text without the surrounding '<!' and
        '>', e.g. 'DOCTYPE html'.
        """

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Handle a processing instruction; does nothing by default.

        *data* is the text between '<?' and the closing '>'.
        """

498
    def unknown_decl(self, data):
499 500
        if self.strict:
            self.error("unknown declaration: %r" % (data,))
501

502 503 504 505
    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        """Return *s* with character and entity references replaced.

        Numeric references ('&#65;', '&#x41;') become the corresponding
        character.  Named references are looked up in
        html.entities.html5; a name without ';' that merely starts with
        a known entity has that prefix replaced.  References that cannot
        be converted are left in the text unchanged.
        """
        if '&' not in s:
            return s

        def replaceEntities(match):
            ref = match.group(1)
            if ref[0] == "#":
                digits = ref[1:]
                try:
                    if digits[0] in ('x', 'X'):
                        c = int(digits[1:].rstrip(';'), 16)
                    else:
                        c = int(digits.rstrip(';'))
                    return chr(c)
                except (ValueError, OverflowError):
                    # ValueError: not a number, or outside the Unicode
                    # range.  OverflowError: chr() of a number too large
                    # for a C integer, which used to escape and abort
                    # parsing.  Either way keep the original text.
                    return '&#' + digits
            from html.entities import html5
            if ref in html5:
                return html5[ref]
            if ref.endswith(';'):
                return '&' + ref
            # No ';': substitute the shortest known entity prefix, if any.
            for x in range(2, len(ref)):
                if ref[:x] in html5:
                    return html5[ref[:x]] + ref[x:]
            return '&' + ref

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
                      replaceEntities, s, flags=re.ASCII)