"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
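
# A minimal usage sketch (illustrative only, not part of the module): feed
# tokenize() a readline callable over bytes and iterate the TokenInfo
# 5-tuples described in the module docstring above.
#
#     from io import BytesIO
#     from tokenize import tokenize
#
#     for tok in tokenize(BytesIO(b"x = 1\n").readline):
#         print(tok.type, tok.string, tok.start, tok.end)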
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
from itertools import chain
import re
import sys
from token import *

cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3
EXACT_TOKEN_TYPES = {
    '(':   LPAR,
    ')':   RPAR,
    '[':   LSQB,
    ']':   RSQB,
    ':':   COLON,
    ',':   COMMA,
    ';':   SEMI,
    '+':   PLUS,
    '-':   MINUS,
    '*':   STAR,
    '/':   SLASH,
    '|':   VBAR,
    '&':   AMPER,
    '<':   LESS,
    '>':   GREATER,
    '=':   EQUAL,
    '.':   DOT,
    '%':   PERCENT,
    '{':   LBRACE,
    '}':   RBRACE,
    '==':  EQEQUAL,
    '!=':  NOTEQUAL,
    '<=':  LESSEQUAL,
    '>=':  GREATEREQUAL,
    '~':   TILDE,
    '^':   CIRCUMFLEX,
    '<<':  LEFTSHIFT,
    '>>':  RIGHTSHIFT,
    '**':  DOUBLESTAR,
    '+=':  PLUSEQUAL,
    '-=':  MINEQUAL,
    '*=':  STAREQUAL,
    '/=':  SLASHEQUAL,
    '%=':  PERCENTEQUAL,
    '&=':  AMPEREQUAL,
    '|=':  VBAREQUAL,
    '^=':  CIRCUMFLEXEQUAL,
    '<<=': LEFTSHIFTEQUAL,
    '>>=': RIGHTSHIFTEQUAL,
    '**=': DOUBLESTAREQUAL,
    '//':  DOUBLESLASH,
    '//=': DOUBLESLASHEQUAL,
    '@':   AT,
    '@=':  ATEQUAL,
}

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type
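
# Illustrative sketch (assumed to be run from outside this module):
# exact_type refines the generic OP type into the specific operator token.
#
#     from io import BytesIO
#     from tokenize import tokenize, tok_name
#
#     plus = next(t for t in tokenize(BytesIO(b"1 + 2\n").readline)
#                 if t.string == '+')
#     print(tok_name[plus.type])        # 'OP'
#     print(tok_name[plus.exact_type])  # 'PLUS'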

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
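
# For illustration: group('a', 'b') yields '(a|b)', any('x') yields '(x)*',
# and maybe('x') yields '(x)?'.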

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

def _compile(expr):
    return re.compile(expr, re.UNICODE)

endpats = {"'": Single, '"': Double,
           "'''": Single3, '"""': Double3,
           "r'''": Single3, 'r"""': Double3,
           "b'''": Single3, 'b"""': Double3,
           "R'''": Single3, 'R"""': Double3,
           "B'''": Single3, 'B"""': Double3,
           "br'''": Single3, 'br"""': Double3,
           "bR'''": Single3, 'bR"""': Double3,
           "Br'''": Single3, 'Br"""': Double3,
           "BR'''": Single3, 'BR"""': Double3,
           "rb'''": Single3, 'rb"""': Double3,
           "Rb'''": Single3, 'Rb"""': Double3,
           "rB'''": Single3, 'rB"""': Double3,
           "RB'''": Single3, 'RB"""': Double3,
           "u'''": Single3, 'u"""': Double3,
           "U'''": Single3, 'U"""': Double3,
           'r': None, 'R': None, 'b': None, 'B': None,
           'u': None, 'U': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',
          "rb'''", 'rb"""', "rB'''", 'rB"""',
          "Rb'''", 'Rb"""', "RB'''", 'RB"""',
          "u'''", 'u"""', "U'''", 'U"""',
          ):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"' ,
          "rb'", 'rb"', "rB'", 'rB"',
          "Rb'", 'Rb"', "RB'", 'RB"' ,
          "u'", 'u"', "U'", 'U"',
          ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out
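
# A round-trip sketch (illustrative only) of the "limited input" invariant
# described in the docstring above:
#
#     from io import BytesIO
#     from tokenize import tokenize, untokenize
#
#     source = b"x = 1\n"
#     t1 = [tok[:2] for tok in tokenize(BytesIO(source).readline)]
#     t2 = [tok[:2] for tok in tokenize(BytesIO(untokenize(t1)).readline)]
#     assert t1 == t2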


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263.  If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                        encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
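
# Sketch of detect_encoding() on an in-memory buffer (contents are made up):
#
#     from io import BytesIO
#     from tokenize import detect_encoding
#
#     buf = BytesIO(b"# -*- coding: latin-1 -*-\nx = 1\n")
#     enc, lines = detect_encoding(buf.readline)
#     # enc == 'iso-8859-1'; lines holds the single line already consumed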


_builtin_open = open

def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = _builtin_open(filename, 'rb')
    encoding, lines = detect_encoding(buffer.readline)
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, line_buffering=True)
    text.mode = 'r'
    return text
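
# Usage sketch (the path 'example.py' is hypothetical): tokenize.open()
# returns a text stream decoded with the detected encoding.
#
#     import tokenize
#
#     with tokenize.open('example.py') as f:
#         source = f.read()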


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:             # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or                  # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = _compile(endpats[initial] or
                                           endpats[token[1]] or
                                           endpats[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards compatible, API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)
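
# Illustrative sketch: generate_tokens() consumes str lines instead of bytes,
# so no ENCODING token is produced.
#
#     import io
#     from tokenize import generate_tokens
#
#     for tok in generate_tokens(io.StringIO("x = 1\n").readline):
#         print(tok)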

def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with _builtin_open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise
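
# Command-line usage sketch (the file name is hypothetical):
#
#     python -m tokenize example.py        # default token names
#     python -m tokenize -e example.py     # exact operator token names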

if __name__ == "__main__":
    main()