Commit 1aec3236 authored by Guido van Rossum

Ka-Ping's much improved version of March 26, 1997:

#     Ignore now accepts \f as whitespace.  Operator now includes '**'.
#     Ignore and Special now accept \n or \r\n at the end of a line.
#     Imagnumber is new.  Expfloat is corrected to reject '0e4'.
parent 24dacb38
"""Tokenization help for Python programs. """Tokenization help for Python programs.
This module compiles a regular expression that recognizes Python This module exports a function called 'tokenize()' that breaks a stream of
tokens in individual lines of text. The regular expression handles text into Python tokens. It accepts a readline-like method which is called
everything except indentation, continuations, and triple-quoted repeatedly to get the next line of input (or "" for EOF) and a "token-eater"
strings. The function 'tokenize.tokenize()' takes care of these function which is called once for each token found. The latter function is
things for streams of text. It accepts a readline-like function which passed the token type, a string containing the token, the starting and
is called repeatedly to come up with the next input line (or "" for ending (row, column) coordinates of the token, and the original line. It is
EOF), and a "token-eater" function which is called for each token designed to match the working of the Python tokenizer exactly, except that
found, passing its type, a string containing the token, the line it produces COMMENT tokens for comments and gives type OP for all operators.
number, the line, and the starting and ending positions of the token
within the line. It is designed to match the working of the Python For compatibility with the older 'tokenize' module, this also compiles a
tokenizer exactly. regular expression into 'tokenprog' that matches Python tokens in individual
lines of text, leaving the token in 'tokenprog.group(3)', but does not
""" handle indentation, continuations, or multi-line strings."""
__version__ = "Ka-Ping Yee, 4 March 1997, updated by GvR, 10 March 1997" __version__ = "Ka-Ping Yee, 26 March 1997"
import string, regex
from token import *

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'

# Changes from 1.3:
#     Ignore now accepts \f as whitespace.  Operator now includes '**'.
#     Ignore and Special now accept \n or \r\n at the end of a line.
#     Imagnumber is new.  Expfloat is corrected to reject '0e4'.

# Note: to get a quoted backslash in a regex, it must be enclosed in brackets.
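# For instance, '[\]\r?\n' (used in Ignore and PseudoExtras below) matches a
# literal backslash followed by an optional '\r' and a '\n'; the backslash is
# written inside brackets rather than doubled.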
def group(*choices): return '\(' + string.join(choices, '\|') + '\)'

Whitespace = '[ \f\t]*'
Comment = '\(#[^\r\n]*\)'
Ignore = Whitespace + group('[\]\r?\n' + Whitespace) + '*' + Comment + '?'
Name = '[a-zA-Z_][a-zA-Z0-9_]*'

Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
Octnumber = '0[0-7]*[lL]?'
Decnumber = '[1-9][0-9]*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = '[eE][-+]?[0-9]+'
Pointfloat = group('[0-9]+\.[0-9]*', '\.[0-9]+') + group(Exponent) + '?'
Expfloat = '[1-9][0-9]*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group('0[jJ]', '[1-9][0-9]*[jJ]', Floatnumber + '[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
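# Illustrative sample matches (not exhaustive): Hexnumber matches '0xffL',
# Octnumber '0777', Decnumber '42L', Pointfloat '3.14' and '.5e-2',
# Expfloat '1e10' (but not '0e4', by design), Imagnumber '10j' and '3.5j'.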
Single = group("^'", "[^\]'")
Double = group('^"', '[^\]"')
Single3 = group("^'''", "[^\]'''")
Double3 = group('^"""', '[^\]"""')
Triple = group("'''", '"""')
String = group("'" + group('[\].', "[^\n'\]") + "*'",
               '"' + group('[\].', '[^\n"\]') + '*"')

Operator = group('\+', '\-', '\*\*', '\*', '\^', '~', '/', '%', '&', '|',
                 '<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>')
Bracket = '[][(){}]'
Special = group('\r?\n', '[:;.,`]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Name, Number, String, Funny)
Token = Ignore + PlainToken

ContStr = group("'" + group('[\].', "[^\n'\]") + '*' + group("'", '[\]\r?\n'),
                '"' + group('[\].', '[^\n"\]') + '*' + group('"', '[\]\r?\n'))
PseudoExtras = group('[\]\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Name, Number, ContStr, Funny)
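# Since Whitespace contains no groups, PseudoToken's alternatives form
# group 1, so a successful pseudoprog.match() leaves the token's extent in
# pseudoprog.regs[1]; that is how tokenize() extracts each token below.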
try:
    saved_syntax = regex.set_syntax(0)       # use default syntax
    tokenprog = regex.compile(Token)
    pseudoprog = regex.compile(PseudoToken)
    endprogs = { '\'': regex.compile(Single), '"': regex.compile(Double),
                 '\'\'\'': regex.compile(Single3), '"""': regex.compile(Double3) }
finally:
    regex.set_syntax(saved_syntax)           # restore original syntax

tabsize = 8
TokenError = 'TokenError'
def printtoken(type, token, (srow, scol), (erow, ecol), line):  # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    lnum = parenlev = continued = 0
    namechars, numchars = string.letters + '_', string.digits
    contstr = ''
    indents = [0]
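    # Descriptive summary of the state: contstr buffers a string token that
    # spans lines; parenlev tracks open-bracket nesting, so newlines inside
    # brackets do not end a statement; indents is the stack of indentation
    # columns seen so far.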
    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        if line[-2:] == '\r\n': line = line[:-2] + '\n'
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line: raise TokenError, "EOF within multi-line string"
            if contstr[-2:] == '\\\n': contstr = contstr[:-2] + '\n'
            if endprog.search(line) >= 0:
                pos = end = endprog.regs[0][1]
                tokeneater(STRING, contstr + line[:end],
                           strstart, (lnum, end), line)
                contstr = ''
            else:
                contstr = contstr + line
                continue
        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                tokeneater((NEWLINE, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                tokeneater(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents = indents[:-1]
                tokeneater(DEDENT, line[:pos], (lnum, 0), (lnum, pos), line)

        else:                                  # continued statement
            if not line: raise TokenError, "EOF within multi-line statement"
            continued = 0
        while pos < max:
            if pseudoprog.match(line, pos) > 0:            # scan for tokens
                start, end = pseudoprog.regs[1]
                spos, epos = (lnum, start), (lnum, end)
                token, initial = line[start:end], line[start]
                pos = end

                if initial in namechars:                   # ordinary name
                    tokeneater(NAME, token, spos, epos, line)
                elif initial in numchars:                  # ordinary number
                    tokeneater(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    tokeneater(NEWLINE, token, spos, epos, line)
                elif initial == '#':
                    tokeneater(COMMENT, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                elif token in ('\'\'\'', '"""'):           # triple-quoted
                    endprog = endprogs[token]
                    if endprog.search(line, pos) >= 0:     # all on one line
                        pos = endprog.regs[0][1]
                        token = line[start:pos]
                        tokeneater(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        break
                elif initial in '\'"':
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog, contstr = endprogs[initial], line[start:]
                        break
                    else:                                  # ordinary string
                        tokeneater(STRING, token, spos, epos, line)
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    tokeneater(OP, token, spos, epos, line)
            else:
                # use the current position, not a stale (or unbound) spos
                tokeneater(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        tokeneater(DEDENT, '', (lnum, 0), (lnum, 0), '')
if __name__ == '__main__':                     # testing
    import sys
    tokenize(open(sys.argv[-1]).readline)
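# Running the module on a file (illustrative invocation):
#
#     python tokenize.py spam.py
#
# prints one line per token in printtoken()'s format; for a file whose first
# line starts with 'def', the first token would appear as:
#
#     1,0-1,3:        NAME    'def'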