tokenize.py 10.4 KB
Newer Older
1 2
"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
24

25 26
from __future__ import generators

27
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
Ka-Ping Yee's avatar
Ka-Ping Yee committed
28 29
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
30

31
import string, re
Guido van Rossum's avatar
Guido van Rossum committed
32
from token import *
Guido van Rossum's avatar
Guido van Rossum committed
33

34 35 36 37
import token
# Public API: every public name of the token module (re-exported through
# "from token import *") plus the names this module adds.  generate_tokens
# is listed because it is the primary entry point described in the module
# docstring.
__all__ = [x for x in dir(token) if x[0] != '_'] + \
          ["COMMENT", "tokenize", "generate_tokens", "NL"]
del token

38 39
# Two token types beyond those defined in token.py.  COMMENT is produced
# for comments; NL is produced for blank lines and for newlines inside
# brackets.  They take the next two free numbers after the standard tokens,
# and N_TOKENS is advanced past them.
COMMENT, NL = N_TOKENS, N_TOKENS + 1
tok_name[COMMENT] = 'COMMENT'
tok_name[NL] = 'NL'
N_TOKENS = N_TOKENS + 2
43

44
def group(*choices):
    """Return a regex that matches any one of *choices*: '(a|b|...)'."""
    return '(' + '|'.join(choices) + ')'

def any(*choices):
    """Return a regex matching zero or more repetitions of the choices."""
    # Star-call syntax replaces the deprecated apply() builtin.
    return group(*choices) + '*'

def maybe(*choices):
    """Return a regex matching zero or one occurrence of the choices."""
    return group(*choices) + '?'
Guido van Rossum's avatar
Guido van Rossum committed
47

48 49 50 51
# Horizontal whitespace only: space, form feed and tab (no line endings).
Whitespace = r'[ \f\t]*'
# A comment runs from '#' up to, but not including, the line ending.
Comment = r'#[^\r\n]*'
# Skippable text in front of a token: whitespace, any number of
# backslash-newline continuations (each followed by more whitespace),
# then an optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
# Identifier: a letter or underscore followed by word characters.
Name = r'[a-zA-Z_]\w*'
Guido van Rossum's avatar
Guido van Rossum committed
52

53 54 55
# Integer literals, each with an optional long suffix (l or L).
# NOTE(review): Hexnumber uses '*', so a bare '0x' matches; left unchanged.
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
# Floating point literals.  The integer part of an exponent float may
# start with a zero (e.g. 0e0), so it is \d+ rather than [1-9]\d*;
# otherwise '0e0' would be split into the number '0' and the name 'e0'.
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
# Imaginary literals: any run of digits, or any float, followed by j or J.
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
# Alternatives are tried in order, so the forms that can extend a shorter
# one (imaginary over float, float over integer) must come first.
Number = group(Imagnumber, Floatnumber, Intnumber)
Guido van Rossum's avatar
Guido van Rossum committed
63

64 65 66 67 68 69 70 71
# The four patterns below match the remainder of a string literal, i.e. the
# text after the opening quote(s), up to and including the closing quote(s).
# A backslash together with the character after it (anything but a newline)
# is consumed as a unit, so an escaped quote does not end the string.
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.  A lone quote is allowed inside the body as long
# as it is not the start of the closing triple quote.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
72
# Opening delimiter of a triple-quoted string: optional u/U prefix, optional
# r/R prefix, then three single or three double quotes.
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
73
# Single-line ' or " string.
74 75
# A complete one-line string literal: optional u/U and r/R prefixes, the
# opening quote, a body with no raw newline or unescaped quote, and the
# matching closing quote.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
Guido van Rossum's avatar
Guido van Rossum committed
76

77 78 79 80 81 82
# Regex alternation takes the first alternative that matches at a given
# position, not the longest one, so be sure to put the longest operators
# first (e.g., if = came before ==, == would get recognized as two
# instances of =).  For the same reason // and //= must precede the
# single-character class, or they would be split into two / operators.
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")
83

Guido van Rossum's avatar
Guido van Rossum committed
84
# Any single bracket character.  The ']' comes first inside the character
# class so that it is taken literally instead of closing the class.
Bracket = '[][(){}]'
85
# A line ending (with optional carriage return) or one punctuation character.
Special = group(r'\r?\n', r'[:;.,`]')
Guido van Rossum's avatar
Guido van Rossum committed
86
# Every non-alphanumeric token: operators, brackets, newline and punctuation.
Funny = group(Operator, Bracket, Special)
Guido van Rossum's avatar
Guido van Rossum committed
87

88
# One complete token of any kind; strings here must fit on a single line.
PlainToken = group(Number, Funny, String, Name)
Guido van Rossum's avatar
Guido van Rossum committed
89
# A token preceded by any skippable whitespace, continuations and comment.
Token = Ignore + PlainToken
Guido van Rossum's avatar
Guido van Rossum committed
90

91
# First (or only) line of ' or " string.
92 93 94 95
# Like String, but the body may end either with the closing quote (a
# complete string) or with a backslash-newline, meaning that the string
# continues on the next line.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
96 97
# Things the line scanner recognizes that are not complete tokens by
# themselves: a backslash-newline continuation, a comment, or the opening
# of a triple-quoted string.
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# Leading whitespace followed by one capturing group holding the token
# text; Whitespace contains no groups, so that group is number 1.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
98

99 100
# Compiled forms of the patterns used while scanning.
tokenprog = re.compile(Token)
pseudoprog = re.compile(PseudoToken)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

# endprogs maps the opening of a string literal to the compiled pattern
# that matches the rest of the string.  Bare prefix letters map to None so
# that a lookup can fall through to the next character of the token.
endprogs = {"'": re.compile(Single), '"': re.compile(Double)}
for _prefix in ('', 'r', 'R', 'u', 'U', 'ur', 'uR', 'Ur', 'UR'):
    endprogs[_prefix + "'''"] = single3prog
    endprogs[_prefix + '"""'] = double3prog
for _prefix in ('r', 'R', 'u', 'U'):
    endprogs[_prefix] = None
del _prefix
Guido van Rossum's avatar
Guido van Rossum committed
112 113

# Distance between tab stops when computing the indentation column of a line.
tabsize = 8
114

115 116 117
class TokenError(Exception):
    """Raised when the input ends inside a multi-line string or statement."""

class StopTokenizing(Exception):
    """Signal that tokenize() should stop; tokenize() catches it silently.

    NOTE(review): presumably meant to be raised by a tokeneater callback
    to end tokenizing early -- confirm against callers.
    """
118

119 120 121
def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))
Guido van Rossum's avatar
Guido van Rossum committed
122

123
def tokenize(readline, tokeneater=printtoken):
    """Tokenize the lines returned by readline, calling tokeneater per token.

    readline is called repeatedly to obtain the next line of input.
    tokeneater is called with the five token fields as five arguments and
    defaults to printtoken.  A StopTokenizing exception escaping from the
    loop ends tokenizing without an error.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        return                                 # deliberate early stop

129
# backwards compatible interface
130
def tokenize_loop(readline, tokeneater):
    """Call tokeneater once for every token produced from readline.

    Each 5-tuple yielded by generate_tokens(readline) is passed to
    tokeneater as five positional arguments.  Unlike tokenize(), this does
    not catch StopTokenizing.
    """
    for token_info in generate_tokens(readline):
        # Star-call syntax replaces the deprecated apply() builtin.
        tokeneater(*token_info)

def generate_tokens(readline):
    """Generate the tokens of the source text supplied by readline.

    readline is called with no arguments each time another line is needed;
    it must return the line including its line ending, or an empty string
    at end of input.

    Each token is yielded as a 5-tuple
        (type, string, (srow, scol), (erow, ecol), line)
    where rows are counted from 1, columns from 0, and line is the text of
    the physical line(s) the token was found on.

    Raises TokenError if the input ends inside a multi-line string or in
    the middle of a statement that is continued onto the next line.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.letters + '_', string.digits
    # contstr holds the text of a string literal that spans lines; needcont
    # is set when every further line of it must end in backslash-newline
    # (single-quoted strings).  contline accumulates the physical lines.
    contstr, needcont = '', 0
    contline = None
    indents = [0]                              # stack of indent columns

    # Constant lookup tables, built once instead of once per token.
    triple_openers = ("'''", '"""',
                      "r'''", 'r"""', "R'''", 'R"""',
                      "u'''", 'u"""', "U'''", 'U"""',
                      "ur'''", 'ur"""', "Ur'''", 'Ur"""',
                      "uR'''", 'uR"""', "UR'''", 'UR"""')
    one_letter_openers = ("r'", 'r"', "R'", 'R"',
                          "u'", 'u"', "U'", 'U"')
    two_letter_openers = ("ur'", 'ur"', "Ur'", 'Ur"',
                          "uR'", 'uR"', "UR'", 'UR"')

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, linelen = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
                # fall through: scan the rest of this line for tokens
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # A single-quoted string was continued with a backslash,
                # but this line neither closes it nor continues it.
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                # Reset needcont as well.  The triple-quoted branch below
                # does not set it, so a stale 1 would make the interior
                # lines of a later multi-line triple-quoted string come
                # out as ERRORTOKENs.
                contstr, needcont = '', 0
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < linelen:               # measure leading whitespace
                char = line[pos]
                if char == ' ':
                    column = column + 1
                elif char == '\t':
                    # advance to the next tab stop; // keeps this an
                    # integer division under any division semantics
                    column = (column // tabsize + 1) * tabsize
                elif char == '\f':
                    column = 0
                else:
                    break
                pos = pos + 1
            if pos == linelen:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    kind = COMMENT
                else:
                    kind = NL
                yield (kind, line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents.pop()
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < linelen:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                # group 1 is the token text without leading whitespace
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # a newline inside brackets does not end the statement
                    if parenlev > 0:
                        kind = NL
                    else:
                        kind = NEWLINE
                    yield (kind, token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_openers:              # triple-quoted
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in ("'", '"') or \
                     token[:2] in one_letter_openers or \
                     token[:3] in two_letter_openers:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Prefix letters map to None in endprogs, so this
                        # settles on the entry for the quote character.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev = parenlev + 1
                    elif initial in ')]}':
                        parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                # nothing matches here: report one character and move on
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
Guido van Rossum's avatar
Guido van Rossum committed
256 257 258

if __name__ == '__main__':                     # testing
    # Tokenize the file named on the command line, or standard input if no
    # file is given, printing every token with the default tokeneater.
    import sys
    if len(sys.argv) > 1:
        source = open(sys.argv[1])
        try:
            tokenize(source.readline)
        finally:
            source.close()                     # do not leak the file handle
    else:
        tokenize(sys.stdin.readline)