"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from token import *

import token
# Public API: everything token.py exports, plus the extra names defined here.
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "generate_tokens", "NL", "untokenize"]
del x  # relies on Python 2 list comprehensions leaking the loop variable
del token

# Two token types beyond those in token.py: COMMENT for comment text, and
# NL for non-logical newlines (blank lines and newlines inside brackets).
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices):
    """Join regex alternatives into one parenthesized group."""
    return '(%s)' % '|'.join(choices)

def any(*choices):
    """A group of alternatives, repeated zero or more times."""
    return group(*choices) + '*'

def maybe(*choices):
    """A group of alternatives, made optional."""
    return group(*choices) + '?'

# Whitespace within a line (newlines are significant and handled separately).
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
# Skippable text: whitespace, backslash-newline continuations, and a comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

# Integer literals; the optional l/L suffix marks a Python 2 long.
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
# Float literals: with a decimal point (exponent optional) or exponent only.
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
# Imaginary/float alternatives must precede Intnumber so the longest form wins.
Number = group(Imagnumber, Floatnumber, Intnumber)

# The "tail" patterns match the remainder of a string once the opening
# quote (and any u/r prefix) has already been consumed.
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Opening delimiter of a triple-quoted string, with optional u/r prefixes.
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
# Line terminators and one-character punctuation (`@` is the decorator marker).
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

# A complete token, optionally preceded by skippable text.
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string: either a closing quote on the same
# line, or a backslash-newline continuation.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# Scanner pattern: group 1 captures the token text; used by generate_tokens.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# Pre-compiled scanner programs.
tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
# Map an opening quote sequence (with optional u/r prefixes, in any case mix
# the grammar allows) to the compiled pattern matching the string's tail.
# Bare prefix characters map to None; generate_tokens falls back to the
# quote character itself in that case.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}

# Dicts used as sets (fast membership tests) of every legal opening
# delimiter for triple-quoted and single-quoted strings respectively.
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"' ):
    single_quoted[t] = t

# Number of columns a tab advances to (next multiple of tabsize).
tabsize = 8

class TokenError(Exception):
    """Raised when EOF is hit inside an unterminated string or statement."""

class StopTokenizing(Exception):
    """May be raised by a tokeneater callback to abort tokenize() early."""

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    """Default tokeneater: print one "row,col-row,col: TYPE repr" line per token.

    Uses Python 2 tuple-parameter unpacking for the two position 2-tuples.
    """
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        # StopTokenizing is the documented way for a tokeneater to stop
        # tokenization early, so it is swallowed silently.
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Feed every 5-tuple from generate_tokens(readline) to tokeneater."""
    for tok in generate_tokens(readline):
        tokeneater(*tok)


def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.

    Round-trip invariant:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """

    startline = False          # True just after a NEWLINE/COMMENT/NL token
    indents = []               # stack of indentation strings from INDENT tokens
    toks = []
    toks_append = toks.append  # bound method hoisted out of the loop
    for tok in iterable:
        toknum, tokval = tok[:2]

        # A trailing space keeps adjacent names/keywords from fusing
        # together in the regenerated source.
        if toknum == NAME:
            tokval += ' '

        if toknum == INDENT:
            indents.append(tokval)
            continue
        elif toknum == DEDENT:
            indents.pop()
            continue
        elif toknum in (NEWLINE, COMMENT, NL):
            startline = True
        elif startline and indents:
            # First token of a new line: re-emit the current indentation.
            toks_append(indents[-1])
            startline = False
        toks_append(tokval)
    return ''.join(toks)

203
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    # lnum: current line number; parenlev: bracket nesting depth;
    # continued: set when the previous line ended in a backslash.
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    # contstr accumulates a string spanning lines; needcont is set for
    # single-quoted strings, which require a backslash to continue.
    contstr, needcont = '', 0
    contline = None
    indents = [0]                              # stack of indentation columns

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                           strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # Single-quoted string whose line did not end in a
                # backslash: the string is malformed.
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break               # line was pure whitespace + EOF

            if line[pos] in '#\r\n':           # skip comments or blank lines
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level")
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # Inside brackets a newline is non-logical (NL).
                    yield (parenlev > 0 and NL or NEWLINE,
                               token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Pick the tail pattern for the actual quote char,
                        # skipping over any u/r prefix characters.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

Guido van Rossum's avatar
Guido van Rossum committed
340 341 342

if __name__ == '__main__':                     # testing
    import sys
    # Tokenize the named file, or standard input when no argument is given.
    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)