"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""

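# A quick usage sketch (illustrative only; 'example.py' is a hypothetical file):
#
#     import tokenize
#     with open('example.py', 'rb') as f:
#         for tok in tokenize.tokenize(f.readline):
#             print(tok.type, tok.string, tok.start, tok.end)
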
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
import builtins
import re
import sys
from token import *
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3
EXACT_TOKEN_TYPES = {
    '(':   LPAR,
    ')':   RPAR,
    '[':   LSQB,
    ']':   RSQB,
    ':':   COLON,
    ',':   COMMA,
    ';':   SEMI,
    '+':   PLUS,
    '-':   MINUS,
    '*':   STAR,
    '/':   SLASH,
    '|':   VBAR,
    '&':   AMPER,
    '<':   LESS,
    '>':   GREATER,
    '=':   EQUAL,
    '.':   DOT,
    '%':   PERCENT,
    '{':   LBRACE,
    '}':   RBRACE,
    '==':  EQEQUAL,
    '!=':  NOTEQUAL,
    '<=':  LESSEQUAL,
    '>=':  GREATEREQUAL,
    '~':   TILDE,
    '^':   CIRCUMFLEX,
    '<<':  LEFTSHIFT,
    '>>':  RIGHTSHIFT,
    '**':  DOUBLESTAR,
    '+=':  PLUSEQUAL,
    '-=':  MINEQUAL,
    '*=':  STAREQUAL,
    '/=':  SLASHEQUAL,
    '%=':  PERCENTEQUAL,
    '&=':  AMPEREQUAL,
    '|=':  VBAREQUAL,
    '^=':  CIRCUMFLEXEQUAL,
    '<<=': LEFTSHIFTEQUAL,
    '>>=': RIGHTSHIFTEQUAL,
    '**=': DOUBLESTAREQUAL,
    '//':  DOUBLESLASH,
    '//=': DOUBLESLASHEQUAL,
    '@':   AT
}

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type
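
    # Illustrative sketch of exact_type (values are made up for the example):
    #
    #     tok = TokenInfo(OP, '+', (1, 2), (1, 3), 'a + b\n')
    #     tok.type        # OP
    #     tok.exact_type  # PLUS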

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")
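
# A sketch of why ordering matters (standalone snippet, not used by this module):
# with '=' listed before '==', '==' would be matched as two separate '=' tokens:
#
#     re.findall(group('=', '=='), 'a == b')   # ['=', '='] instead of ['==']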

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

def _compile(expr):
    return re.compile(expr, re.UNICODE)

endpats = {"'": Single, '"': Double,
           "'''": Single3, '"""': Double3,
           "r'''": Single3, 'r"""': Double3,
           "b'''": Single3, 'b"""': Double3,
           "R'''": Single3, 'R"""': Double3,
           "B'''": Single3, 'B"""': Double3,
           "br'''": Single3, 'br"""': Double3,
           "bR'''": Single3, 'bR"""': Double3,
           "Br'''": Single3, 'Br"""': Double3,
           "BR'''": Single3, 'BR"""': Double3,
           "rb'''": Single3, 'rb"""': Double3,
           "Rb'''": Single3, 'Rb"""': Double3,
           "rB'''": Single3, 'rB"""': Double3,
           "RB'''": Single3, 'RB"""': Double3,
           "u'''": Single3, 'u"""': Double3,
           "U'''": Single3, 'U"""': Double3,
           'r': None, 'R': None, 'b': None, 'B': None,
           'u': None, 'U': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',
          "rb'''", 'rb"""', "rB'''", 'rB"""',
          "Rb'''", 'Rb"""', "RB'''", 'RB"""',
          "u'''", 'u"""', "U'''", 'U"""',
          ):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"',
          "rb'", 'rb"', "rB'", 'rB"',
          "Rb'", 'Rb"', "RB'", 'RB"',
          "u'", 'u"', "U'", 'U"',
          ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token

        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out
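
# Round-trip usage sketch ('example.py' is a hypothetical file):
#
#     with builtins.open('example.py', 'rb') as f:
#         toks = list(tokenize(f.readline))
#     source = untokenize(toks)    # bytes, encoded per the ENCODING token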


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP-0263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, a SyntaxError will also be raised.  Note that if a UTF-8
    BOM is found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                        encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
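
# Usage sketch for detect_encoding() ('example.py' is a hypothetical file):
#
#     with builtins.open('example.py', 'rb') as f:
#         enc, first_lines = detect_encoding(f.readline)   # e.g. ('utf-8', [...])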


def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = builtins.open(filename, 'rb')
    encoding, lines = detect_encoding(buffer.readline)
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, line_buffering=True)
    text.mode = 'r'
    return text
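
# Usage sketch: this open() mirrors builtins.open() but decodes the file using
# the detected source encoding ('example.py' is a hypothetical file):
#
#     with open('example.py') as f:
#         source_text = f.read()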


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
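
# Sketch of the stream shape produced by tokenize(): the first token is always
# ENCODING and the last is ENDMARKER (the example uses an in-memory buffer):
#
#     from io import BytesIO
#     toks = list(tokenize(BytesIO(b"x = 1\n").readline))
#     toks[0].type    # ENCODING
#     toks[-1].type   # ENDMARKER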


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:             # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    # not a comment, so this is a blank line: emit a single NL token
                    yield TokenInfo(NL, line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or                  # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = _compile(endpats[initial] or
                                           endpats[token[1]] or
                                           endpats[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards-compatible API for all the places in the standard
# library that expect to be able to use tokenize with strings.
def generate_tokens(readline):
    return _tokenize(readline, None)
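
# Usage sketch: unlike tokenize(), generate_tokens() takes a readline that yields
# str (e.g. from io.StringIO) and emits no ENCODING token:
#
#     import io
#     toks = list(generate_tokens(io.StringIO("x = 1\n").readline))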

def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with builtins.open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except IOError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise
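
# Command-line usage sketch (options taken from the argparse setup above;
# 'example.py' is a hypothetical file):
#
#     python -m tokenize example.py        # tokenize a file
#     python -m tokenize -e example.py     # report exact types for operator tokens
#     python -m tokenize < example.py      # read source from standard input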

if __name__ == "__main__":
    main()