"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""

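# A minimal usage sketch (illustrative only, not part of the module):
#
#     from io import BytesIO
#     from tokenize import tokenize
#
#     for tok in tokenize(BytesIO(b"x = 1\n").readline):
#         print(tok.type, tok.string, tok.start, tok.end)
#
# The first tuple produced is always the ENCODING token ('utf-8' here).
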
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
import re
import sys
from token import *
from codecs import lookup, BOM_UTF8
import collections
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

import token
__all__ = [x for x in dir(token) if not x.startswith("_")]
__all__.extend(["COMMENT", "tokenize", "detect_encoding", "NL", "untokenize",
                "ENCODING", "TokenInfo"])
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
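
# For example, group('a', 'b') yields '(a|b)', any('x') yields '(x)*', and
# maybe('y') yields '(y)?'; the patterns below are built by concatenating
# such pieces.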

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

def _compile(expr):
    return re.compile(expr, re.UNICODE)

tokenprog, pseudoprog, single3prog, double3prog = map(
    _compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": _compile(Single), '"': _compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"' ):
    single_quoted[t] = t

del _compile

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

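    # Fallback path used by untokenize() when the iterable yields bare
    # (type, string) pairs: without start/end positions, spacing can only
    # be approximated.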
    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token

        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only those two elements are passed, the result can only approximate
    the original whitespace.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out
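
# Round-trip sketch (illustrative only): feeding the full 5-tuples produced by
# tokenize() back through untokenize() reproduces the input bytes.
#
#     from io import BytesIO
#     source = b"x = 1\n"
#     toks = list(tokenize(BytesIO(source).readline))
#     assert untokenize(toks) == source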


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP-0263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, a SyntaxError will also be raised.  Note that if a UTF-8
    BOM is found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
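
# Example (illustrative only): a coding cookie on the first line is detected,
# normalized, and returned together with the raw line(s) already consumed.
#
#     from io import BytesIO
#     buf = BytesIO(b"# -*- coding: latin-1 -*-\nprint('hi')\n")
#     encoding, lines = detect_encoding(buf.readline)
#     # encoding == 'iso-8859-1'; lines == [b"# -*- coding: latin-1 -*-\n"]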


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable object that signals the end of input by raising
    StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
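
# Typical usage sketch (illustrative only; assumes a readable 'example.py'):
#
#     with open('example.py', 'rb') as f:
#         for tok in tokenize(f.readline):
#             print(tok)
#
# The stream starts with an ENCODING token and ends with an ENDMARKER token.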


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:             # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if (initial in numchars or                  # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards-compatible API for the places in the standard
# library that expect to be able to use tokenize with strings.
def generate_tokens(readline):
    return _tokenize(readline, None)
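
# Sketch of the string-based entry point (illustrative only): the readline
# callable returns str rather than bytes, and no ENCODING token is emitted.
#
#     import io
#     for tok in generate_tokens(io.StringIO("x = 1\n").readline):
#         print(tok)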

if __name__ == "__main__":
    # Quick sanity check
    s = b'''def parseline(self, line):
            """Parse the line into a command name and a string containing
            the arguments.  Returns a tuple containing (command, args, line).
            'command' and 'args' may be None if the line couldn't be parsed.
            """
            line = line.strip()
            if not line:
                return None, None, line
            elif line[0] == '?':
                line = 'help ' + line[1:]
            elif line[0] == '!':
                if hasattr(self, 'do_shell'):
                    line = 'shell ' + line[1:]
                else:
                    return None, None, line
            i, n = 0, len(line)
            while i < n and line[i] in self.identchars: i = i+1
            cmd, arg = line[:i], line[i:].strip()
            return cmd, arg, line
    '''
    for tok in tokenize(iter(s.splitlines()).__next__):
        print(tok)