"""Tokenization help for Python programs.

3 4 5
tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.
6

7 8 9
It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:
10 11 12 13 14 15 16 17 18

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
19 20 21
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
import builtins
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
from itertools import chain
import re
import sys
from token import *

cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
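# blank_re (above) matches a line that is blank or holds only a comment;
# detect_encoding() uses it to decide whether to read a second line when
# looking for an encoding cookie.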

import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3
EXACT_TOKEN_TYPES = {
    '(':   LPAR,
    ')':   RPAR,
    '[':   LSQB,
    ']':   RSQB,
    ':':   COLON,
    ',':   COMMA,
    ';':   SEMI,
    '+':   PLUS,
    '-':   MINUS,
    '*':   STAR,
    '/':   SLASH,
    '|':   VBAR,
    '&':   AMPER,
    '<':   LESS,
    '>':   GREATER,
    '=':   EQUAL,
    '.':   DOT,
    '%':   PERCENT,
    '{':   LBRACE,
    '}':   RBRACE,
    '==':  EQEQUAL,
    '!=':  NOTEQUAL,
    '<=':  LESSEQUAL,
    '>=':  GREATEREQUAL,
    '~':   TILDE,
    '^':   CIRCUMFLEX,
    '<<':  LEFTSHIFT,
    '>>':  RIGHTSHIFT,
    '**':  DOUBLESTAR,
    '+=':  PLUSEQUAL,
    '-=':  MINEQUAL,
    '*=':  STAREQUAL,
    '/=':  SLASHEQUAL,
    '%=':  PERCENTEQUAL,
    '&=':  AMPEREQUAL,
    '|=':  VBAREQUAL,
    '^=':  CIRCUMFLEXEQUAL,
    '<<=': LEFTSHIFTEQUAL,
    '>>=': RIGHTSHIFTEQUAL,
    '**=': DOUBLESTAREQUAL,
    '//':  DOUBLESLASH,
    '//=': DOUBLESLASHEQUAL,
    '@':   AT,
    '@=':  ATEQUAL,
}

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type
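
# Illustrative example of exact_type: for an OP token the exact operator type
# is recovered from EXACT_TOKEN_TYPES, e.g.
#
#     tok = TokenInfo(OP, '+=', (1, 2), (1, 4), 'x += 1\n')
#     assert tok.type == OP and tok.exact_type == PLUSEQUAL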

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
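# For example, group('a', 'b') produces '(a|b)', any('a', 'b') produces
# '(a|b)*', and maybe('a', 'b') produces '(a|b)?'.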

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
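# PseudoToken is what _tokenize() matches at each position; group 1 of the
# match spans the token text that follows any leading whitespace.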

def _compile(expr):
    return re.compile(expr, re.UNICODE)

endpats = {"'": Single, '"': Double,
           "'''": Single3, '"""': Double3,
           "r'''": Single3, 'r"""': Double3,
           "b'''": Single3, 'b"""': Double3,
           "R'''": Single3, 'R"""': Double3,
           "B'''": Single3, 'B"""': Double3,
           "br'''": Single3, 'br"""': Double3,
           "bR'''": Single3, 'bR"""': Double3,
           "Br'''": Single3, 'Br"""': Double3,
           "BR'''": Single3, 'BR"""': Double3,
           "rb'''": Single3, 'rb"""': Double3,
           "Rb'''": Single3, 'Rb"""': Double3,
           "rB'''": Single3, 'rB"""': Double3,
           "RB'''": Single3, 'RB"""': Double3,
           "u'''": Single3, 'u"""': Double3,
           "R'''": Single3, 'R"""': Double3,
           "U'''": Single3, 'U"""': Double3,
           'r': None, 'R': None, 'b': None, 'B': None,
           'u': None, 'U': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',
          "rb'''", 'rb"""', "rB'''", 'rB"""',
          "Rb'''", 'Rb"""', "RB'''", 'RB"""',
          "u'''", 'u"""', "U'''", 'U"""',
          ):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"',
          "rb'", 'rb"', "rB'", 'rB"',
          "Rb'", 'Rb"', "RB'", 'RB"',
          "u'", 'u"', "U'", 'U"',
          ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two elements are passed, exact positions are lost, so the
    output only round-trips at the token level (see the limited-input
    invariant below).

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, a SyntaxError will also be raised.  Note that if a UTF-8
    BOM is found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                        encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
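
# A usage sketch for detect_encoding() (the file name below is hypothetical):
#
#     with builtins.open("example.py", "rb") as fp:
#         encoding, first_lines = detect_encoding(fp.readline)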


def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = builtins.open(filename, 'rb')
    encoding, lines = detect_encoding(buffer.readline)
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, line_buffering=True)
    text.mode = 'r'
    return text
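
# For instance (illustrative), tokenize.open("example.py") returns a text
# stream already decoded with the detected source encoding.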


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:             # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or                  # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = _compile(endpats[initial] or
                                           endpats[token[1]] or
                                           endpats[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards compatible, API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)
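
# For example (illustrative), generate_tokens(io.StringIO("x = 1\n").readline)
# yields tokens from a str stream; no ENCODING token is produced because the
# encoding is None.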

def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()
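
    # Example invocation (illustrative): python -m tokenize -e example.py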

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with builtins.open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()