Kaydet (Commit) 040d7ca4 authored tarafından Guido van Rossum's avatar Guido van Rossum

Rewritten using the tokenize module, which gives us a real tokenizer

rather than a number of approximating regular expressions.
Alas, it is 3-4 times slower.  Let that be a challenge for the
tokenize module.
üst fd372aa8
...@@ -4,10 +4,11 @@ Parse enough of a Python file to recognize class and method ...@@ -4,10 +4,11 @@ Parse enough of a Python file to recognize class and method
definitions and to find out the superclasses of a class. definitions and to find out the superclasses of a class.
The interface consists of a single function: The interface consists of a single function:
readmodule(module, path) readmodule_ex(module [, path[, inpackage]])
module is the name of a Python module, path is an optional list of module is the name of a Python module, path is an optional list of
directories where the module is to be searched. If present, path is directories where the module is to be searched. If present, path is
prepended to the system search path sys.path. prepended to the system search path sys.path. (inpackage is used
internally to search for a submodule of a package.)
The return value is a dictionary. The keys of the dictionary are The return value is a dictionary. The keys of the dictionary are
the names of the classes defined in the module (including classes the names of the classes defined in the module (including classes
that are defined via the from XXX import YYY construct). The values that are defined via the from XXX import YYY construct). The values
...@@ -28,12 +29,10 @@ string giving the name of the super class. Since import statements ...@@ -28,12 +29,10 @@ string giving the name of the super class. Since import statements
are recognized and imported modules are scanned as well, this are recognized and imported modules are scanned as well, this
shouldn't happen often. shouldn't happen often.
XXX describe the Function class.
BUGS BUGS
- Continuation lines are not dealt with at all, except inside strings.
- Nested classes and functions can confuse it. - Nested classes and functions can confuse it.
- Code that doesn't pass tabnanny or python -t will confuse it, unless
you set the module TABWIDTH vrbl (default 8) to the correct tab width
for the file.
PACKAGE RELATED BUGS PACKAGE RELATED BUGS
- If you have a package and a module inside that or another package - If you have a package and a module inside that or another package
...@@ -52,69 +51,11 @@ PACKAGE RELATED BUGS ...@@ -52,69 +51,11 @@ PACKAGE RELATED BUGS
import sys import sys
import imp import imp
import re import tokenize # Python tokenizer
import string from token import NAME
__all__ = ["readmodule"] __all__ = ["readmodule"]
TABWIDTH = 8
_getnext = re.compile(r"""
(?P<String>
\""" [^"\\]* (?:
(?: \\. | "(?!"") )
[^"\\]*
)*
\"""
| ''' [^'\\]* (?:
(?: \\. | '(?!'') )
[^'\\]*
)*
'''
| " [^"\\\n]* (?: \\. [^"\\\n]*)* "
| ' [^'\\\n]* (?: \\. [^'\\\n]*)* '
)
| (?P<Method>
^
(?P<MethodIndent> [ \t]* )
def [ \t]+
(?P<MethodName> [a-zA-Z_] \w* )
[ \t]* \(
)
| (?P<Class>
^
(?P<ClassIndent> [ \t]* )
class [ \t]+
(?P<ClassName> [a-zA-Z_] \w* )
[ \t]*
(?P<ClassSupers> \( [^)\n]* \) )?
[ \t]* :
)
| (?P<Import>
^ import [ \t]+
(?P<ImportList> [^#;\n]+ )
)
| (?P<ImportFrom>
^ from [ \t]+
(?P<ImportFromPath>
[a-zA-Z_] \w*
(?:
[ \t]* \. [ \t]* [a-zA-Z_] \w*
)*
)
[ \t]+
import [ \t]+
(?P<ImportFromList> [^#;\n]+ )
)
""", re.VERBOSE | re.DOTALL | re.MULTILINE).search
_modules = {} # cache of modules we've seen _modules = {} # cache of modules we've seen
# each Python class is represented by an instance of this class # each Python class is represented by an instance of this class
...@@ -140,7 +81,7 @@ class Function(Class): ...@@ -140,7 +81,7 @@ class Function(Class):
def _addmethod(self, name, lineno): def _addmethod(self, name, lineno):
assert 0, "Function._addmethod() shouldn't be called" assert 0, "Function._addmethod() shouldn't be called"
def readmodule(module, path=[], inpackage=0): def readmodule(module, path=[], inpackage=False):
'''Backwards compatible interface. '''Backwards compatible interface.
Like readmodule_ex() but strips Function objects from the Like readmodule_ex() but strips Function objects from the
...@@ -153,7 +94,7 @@ def readmodule(module, path=[], inpackage=0): ...@@ -153,7 +94,7 @@ def readmodule(module, path=[], inpackage=0):
res[key] = value res[key] = value
return res return res
def readmodule_ex(module, path=[], inpackage=0): def readmodule_ex(module, path=[], inpackage=False):
'''Read a module file and return a dictionary of classes. '''Read a module file and return a dictionary of classes.
Search for MODULE in PATH and sys.path, read and parse the Search for MODULE in PATH and sys.path, read and parse the
...@@ -168,7 +109,7 @@ def readmodule_ex(module, path=[], inpackage=0): ...@@ -168,7 +109,7 @@ def readmodule_ex(module, path=[], inpackage=0):
package = module[:i].strip() package = module[:i].strip()
submodule = module[i+1:].strip() submodule = module[i+1:].strip()
parent = readmodule_ex(package, path, inpackage) parent = readmodule_ex(package, path, inpackage)
child = readmodule_ex(submodule, parent['__path__'], 1) child = readmodule_ex(submodule, parent['__path__'], True)
return child return child
if module in _modules: if module in _modules:
...@@ -204,129 +145,144 @@ def readmodule_ex(module, path=[], inpackage=0): ...@@ -204,129 +145,144 @@ def readmodule_ex(module, path=[], inpackage=0):
_modules[module] = dict _modules[module] = dict
classstack = [] # stack of (class, indent) pairs classstack = [] # stack of (class, indent) pairs
src = f.read()
f.close()
# To avoid having to stop the regexp at each newline, instead
# when we need a line number we simply count the number of
# newlines in the string since the last time we did this; i.e.,
# lineno += src.count('\n', last_lineno_pos, here)
# last_lineno_pos = here
lineno, last_lineno_pos = 1, 0
i = 0
while 1:
m = _getnext(src, i)
if not m:
break
start, i = m.span()
if m.start("Method") >= 0:
# found a method definition or function
thisindent = _indent(m.group("MethodIndent"))
meth_name = m.group("MethodName")
lineno += src.count('\n', last_lineno_pos, start)
last_lineno_pos = start
# close all classes indented at least as much
while classstack and \
classstack[-1][1] >= thisindent:
del classstack[-1]
if classstack:
# it's a class method
cur_class = classstack[-1][0]
cur_class._addmethod(meth_name, lineno)
else:
# it's a function
f = Function(module, meth_name,
file, lineno)
dict[meth_name] = f
elif m.start("String") >= 0:
pass
elif m.start("Class") >= 0: g = tokenize.generate_tokens(f.readline)
# we found a class definition try:
thisindent = _indent(m.group("ClassIndent")) for tokentype, token, start, end, line in g:
# close all classes indented at least as much if token == 'def':
while classstack and \ lineno, thisindent = start
classstack[-1][1] >= thisindent: tokentype, meth_name, start, end, line = g.next()
del classstack[-1] if tokentype != NAME:
lineno += src.count('\n', last_lineno_pos, start) continue # Syntax error
last_lineno_pos = start # close all classes indented at least as much
class_name = m.group("ClassName") while classstack and \
inherit = m.group("ClassSupers") classstack[-1][1] >= thisindent:
if inherit: del classstack[-1]
# the class inherits from other classes if classstack:
inherit = inherit[1:-1].strip() # it's a class method
names = [] cur_class = classstack[-1][0]
for n in inherit.split(','): cur_class._addmethod(meth_name, lineno)
n = n.strip() else:
if n in dict: # it's a function
# we know this super class dict[meth_name] = Function(module, meth_name, file, lineno)
n = dict[n] elif token == 'class':
else: lineno, thisindent = start
c = n.split('.') tokentype, class_name, start, end, line = g.next()
if len(c) > 1: if tokentype != NAME:
# super class continue # Syntax error
# is of the # close all classes indented at least as much
# form module.class: while classstack and \
# look in classstack[-1][1] >= thisindent:
# module for class del classstack[-1]
m = c[-2] # parse what follows the class name
c = c[-1] tokentype, token, start, end, line = g.next()
if m in _modules: inherit = None
d = _modules[m] if token == '(':
if c in d: names = [] # List of superclasses
n = d[c] # there's a list of superclasses
names.append(n) level = 1
inherit = names super = [] # Tokens making up current superclass
# remember this class while True:
cur_class = Class(module, class_name, inherit, tokentype, token, start, end, line = g.next()
file, lineno) if token in (')', ',') and level == 1:
dict[class_name] = cur_class n = "".join(super)
classstack.append((cur_class, thisindent)) if n in dict:
# we know this super class
elif m.start("Import") >= 0: n = dict[n]
# import module else:
for n in m.group("ImportList").split(','): c = n.split('.')
n = n.strip() if len(c) > 1:
# super class is of the form
# module.class: look in module for
# class
m = c[-2]
c = c[-1]
if m in _modules:
d = _modules[m]
if c in d:
n = d[c]
names.append(n)
if token == '(':
level += 1
elif token == ')':
level -= 1
if level == 0:
break
elif token == ',' and level == 1:
pass
else:
super.append(token)
inherit = names
cur_class = Class(module, class_name, inherit, file, lineno)
dict[class_name] = cur_class
classstack.append((cur_class, thisindent))
elif token == 'import' and start[1] == 0:
modules = _getnamelist(g)
for mod, mod2 in modules:
readmodule_ex(mod, path, inpackage)
elif token == 'from' and start[1] == 0:
mod, token = _getname(g)
if not mod or token != "import":
continue
names = _getnamelist(g)
try: try:
# recursively read the imported module # recursively read the imported module
d = readmodule_ex(n, path, inpackage) d = readmodule_ex(mod, path, inpackage)
except: except:
##print 'module', n, 'not found' continue
pass # add any classes that were defined in the imported module
# to our name space if they were mentioned in the list
elif m.start("ImportFrom") >= 0: for n, n2 in names:
# from module import stuff if n in d:
mod = m.group("ImportFromPath") dict[n2 or n] = d[n]
names = m.group("ImportFromList").split(',') elif n == '*':
try: # only add a name if not already there (to mimic
# recursively read the imported module # what Python does internally) also don't add
d = readmodule_ex(mod, path, inpackage) # names that start with _
except: for n in d:
##print 'module', mod, 'not found' if n[0] != '_' and not n in dict:
continue dict[n] = d[n]
# add any classes that were defined in the except StopIteration:
# imported module to our name space if they pass
# were mentioned in the list
for n in names:
n = n.strip()
if n in d:
dict[n] = d[n]
elif n == '*':
# only add a name if not
# already there (to mimic what
# Python does internally)
# also don't add names that
# start with _
for n in d:
if n[0] != '_' and \
not n in dict:
dict[n] = d[n]
else:
assert 0, "regexp _getnext found something unexpected"
f.close()
return dict return dict
def _indent(ws, _expandtabs=string.expandtabs): def _getnamelist(g):
return len(_expandtabs(ws, TABWIDTH)) # Helper to get a comma-separated list of dotted names plus 'as'
# clauses. Return a list of pairs (name, name2) where name2 is
# the 'as' name, or None if there is no 'as' clause.
names = []
while True:
name, token = _getname(g)
if not name:
break
if token == 'as':
name2, token = _getname(g)
else:
name2 = None
names.append((name, name2))
while token != "," and "\n" not in token:
tokentype, token, start, end, line = g.next()
if token != ",":
break
return names
def _getname(g):
# Helper to get a dotted name, return a pair (name, token) where
# name is the dotted name, or None if there was no dotted name,
# and token is the next input token.
parts = []
tokentype, token, start, end, line = g.next()
if tokentype != NAME and token != '*':
return (None, token)
parts.append(token)
while True:
tokentype, token, start, end, line = g.next()
if token != '.':
break
tokentype, token, start, end, line = g.next()
if tokentype != NAME:
break
parts.append(token)
return (".".join(parts), token)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment