Kaydet (Commit) 040d7ca4 authored tarafından Guido van Rossum's avatar Guido van Rossum

Rewritten using the tokenize module, which gives us a real tokenizer

rather than a number of approximating regular expressions.
Alas, it is 3-4 times slower.  Let that be a challenge for the
tokenize module.
üst fd372aa8
......@@ -4,10 +4,11 @@ Parse enough of a Python file to recognize class and method
definitions and to find out the superclasses of a class.
The interface consists of a single function:
readmodule(module, path)
readmodule_ex(module [, path[, inpackage]])
module is the name of a Python module, path is an optional list of
directories where the module is to be searched. If present, path is
prepended to the system search path sys.path.
prepended to the system search path sys.path. (inpackage is used
internally to search for a submodule of a package.)
The return value is a dictionary. The keys of the dictionary are
the names of the classes defined in the module (including classes
that are defined via the from XXX import YYY construct). The values
......@@ -28,12 +29,10 @@ string giving the name of the super class. Since import statements
are recognized and imported modules are scanned as well, this
shouldn't happen often.
XXX describe the Function class.
BUGS
- Continuation lines are not dealt with at all, except inside strings.
- Nested classes and functions can confuse it.
- Code that doesn't pass tabnanny or python -t will confuse it, unless
you set the module TABWIDTH vrbl (default 8) to the correct tab width
for the file.
PACKAGE RELATED BUGS
- If you have a package and a module inside that or another package
......@@ -52,69 +51,11 @@ PACKAGE RELATED BUGS
import sys
import imp
import re
import string
import tokenize # Python tokenizer
from token import NAME
__all__ = ["readmodule"]
TABWIDTH = 8
_getnext = re.compile(r"""
(?P<String>
\""" [^"\\]* (?:
(?: \\. | "(?!"") )
[^"\\]*
)*
\"""
| ''' [^'\\]* (?:
(?: \\. | '(?!'') )
[^'\\]*
)*
'''
| " [^"\\\n]* (?: \\. [^"\\\n]*)* "
| ' [^'\\\n]* (?: \\. [^'\\\n]*)* '
)
| (?P<Method>
^
(?P<MethodIndent> [ \t]* )
def [ \t]+
(?P<MethodName> [a-zA-Z_] \w* )
[ \t]* \(
)
| (?P<Class>
^
(?P<ClassIndent> [ \t]* )
class [ \t]+
(?P<ClassName> [a-zA-Z_] \w* )
[ \t]*
(?P<ClassSupers> \( [^)\n]* \) )?
[ \t]* :
)
| (?P<Import>
^ import [ \t]+
(?P<ImportList> [^#;\n]+ )
)
| (?P<ImportFrom>
^ from [ \t]+
(?P<ImportFromPath>
[a-zA-Z_] \w*
(?:
[ \t]* \. [ \t]* [a-zA-Z_] \w*
)*
)
[ \t]+
import [ \t]+
(?P<ImportFromList> [^#;\n]+ )
)
""", re.VERBOSE | re.DOTALL | re.MULTILINE).search
_modules = {} # cache of modules we've seen
# each Python class is represented by an instance of this class
......@@ -140,7 +81,7 @@ class Function(Class):
def _addmethod(self, name, lineno):
assert 0, "Function._addmethod() shouldn't be called"
def readmodule(module, path=[], inpackage=0):
def readmodule(module, path=[], inpackage=False):
'''Backwards compatible interface.
Like readmodule_ex() but strips Function objects from the
......@@ -153,7 +94,7 @@ def readmodule(module, path=[], inpackage=0):
res[key] = value
return res
def readmodule_ex(module, path=[], inpackage=0):
def readmodule_ex(module, path=[], inpackage=False):
'''Read a module file and return a dictionary of classes.
Search for MODULE in PATH and sys.path, read and parse the
......@@ -168,7 +109,7 @@ def readmodule_ex(module, path=[], inpackage=0):
package = module[:i].strip()
submodule = module[i+1:].strip()
parent = readmodule_ex(package, path, inpackage)
child = readmodule_ex(submodule, parent['__path__'], 1)
child = readmodule_ex(submodule, parent['__path__'], True)
return child
if module in _modules:
......@@ -204,28 +145,15 @@ def readmodule_ex(module, path=[], inpackage=0):
_modules[module] = dict
classstack = [] # stack of (class, indent) pairs
src = f.read()
f.close()
# To avoid having to stop the regexp at each newline, instead
# when we need a line number we simply count the number of
# newlines in the string since the last time we did this; i.e.,
# lineno += src.count('\n', last_lineno_pos, here)
# last_lineno_pos = here
lineno, last_lineno_pos = 1, 0
i = 0
while 1:
m = _getnext(src, i)
if not m:
break
start, i = m.span()
if m.start("Method") >= 0:
# found a method definition or function
thisindent = _indent(m.group("MethodIndent"))
meth_name = m.group("MethodName")
lineno += src.count('\n', last_lineno_pos, start)
last_lineno_pos = start
g = tokenize.generate_tokens(f.readline)
try:
for tokentype, token, start, end, line in g:
if token == 'def':
lineno, thisindent = start
tokentype, meth_name, start, end, line = g.next()
if tokentype != NAME:
continue # Syntax error
# close all classes indented at least as much
while classstack and \
classstack[-1][1] >= thisindent:
......@@ -236,41 +164,37 @@ def readmodule_ex(module, path=[], inpackage=0):
cur_class._addmethod(meth_name, lineno)
else:
# it's a function
f = Function(module, meth_name,
file, lineno)
dict[meth_name] = f
elif m.start("String") >= 0:
pass
elif m.start("Class") >= 0:
# we found a class definition
thisindent = _indent(m.group("ClassIndent"))
dict[meth_name] = Function(module, meth_name, file, lineno)
elif token == 'class':
lineno, thisindent = start
tokentype, class_name, start, end, line = g.next()
if tokentype != NAME:
continue # Syntax error
# close all classes indented at least as much
while classstack and \
classstack[-1][1] >= thisindent:
del classstack[-1]
lineno += src.count('\n', last_lineno_pos, start)
last_lineno_pos = start
class_name = m.group("ClassName")
inherit = m.group("ClassSupers")
if inherit:
# the class inherits from other classes
inherit = inherit[1:-1].strip()
names = []
for n in inherit.split(','):
n = n.strip()
# parse what follows the class name
tokentype, token, start, end, line = g.next()
inherit = None
if token == '(':
names = [] # List of superclasses
# there's a list of superclasses
level = 1
super = [] # Tokens making up current superclass
while True:
tokentype, token, start, end, line = g.next()
if token in (')', ',') and level == 1:
n = "".join(super)
if n in dict:
# we know this super class
n = dict[n]
else:
c = n.split('.')
if len(c) > 1:
# super class
# is of the
# form module.class:
# look in
# module for class
# super class is of the form
# module.class: look in module for
# class
m = c[-2]
c = c[-1]
if m in _modules:
......@@ -278,55 +202,87 @@ def readmodule_ex(module, path=[], inpackage=0):
if c in d:
n = d[c]
names.append(n)
if token == '(':
level += 1
elif token == ')':
level -= 1
if level == 0:
break
elif token == ',' and level == 1:
pass
else:
super.append(token)
inherit = names
# remember this class
cur_class = Class(module, class_name, inherit,
file, lineno)
cur_class = Class(module, class_name, inherit, file, lineno)
dict[class_name] = cur_class
classstack.append((cur_class, thisindent))
elif m.start("Import") >= 0:
# import module
for n in m.group("ImportList").split(','):
n = n.strip()
try:
# recursively read the imported module
d = readmodule_ex(n, path, inpackage)
except:
##print 'module', n, 'not found'
pass
elif m.start("ImportFrom") >= 0:
# from module import stuff
mod = m.group("ImportFromPath")
names = m.group("ImportFromList").split(',')
elif token == 'import' and start[1] == 0:
modules = _getnamelist(g)
for mod, mod2 in modules:
readmodule_ex(mod, path, inpackage)
elif token == 'from' and start[1] == 0:
mod, token = _getname(g)
if not mod or token != "import":
continue
names = _getnamelist(g)
try:
# recursively read the imported module
d = readmodule_ex(mod, path, inpackage)
except:
##print 'module', mod, 'not found'
continue
# add any classes that were defined in the
# imported module to our name space if they
# were mentioned in the list
for n in names:
n = n.strip()
# add any classes that were defined in the imported module
# to our name space if they were mentioned in the list
for n, n2 in names:
if n in d:
dict[n] = d[n]
dict[n2 or n] = d[n]
elif n == '*':
# only add a name if not
# already there (to mimic what
# Python does internally)
# also don't add names that
# start with _
# only add a name if not already there (to mimic
# what Python does internally) also don't add
# names that start with _
for n in d:
if n[0] != '_' and \
not n in dict:
if n[0] != '_' and not n in dict:
dict[n] = d[n]
else:
assert 0, "regexp _getnext found something unexpected"
except StopIteration:
pass
f.close()
return dict
def _indent(ws, _expandtabs=string.expandtabs):
return len(_expandtabs(ws, TABWIDTH))
def _getnamelist(g):
# Helper to get a comma-separated list of dotted names plus 'as'
# clauses. Return a list of pairs (name, name2) where name2 is
# the 'as' name, or None if there is no 'as' clause.
names = []
while True:
name, token = _getname(g)
if not name:
break
if token == 'as':
name2, token = _getname(g)
else:
name2 = None
names.append((name, name2))
while token != "," and "\n" not in token:
tokentype, token, start, end, line = g.next()
if token != ",":
break
return names
def _getname(g):
# Helper to get a dotted name, return a pair (name, token) where
# name is the dotted name, or None if there was no dotted name,
# and token is the next input token.
parts = []
tokentype, token, start, end, line = g.next()
if tokentype != NAME and token != '*':
return (None, token)
parts.append(token)
while True:
tokentype, token, start, end, line = g.next()
if token != '.':
break
tokentype, token, start, end, line = g.next()
if tokentype != NAME:
break
parts.append(token)
return (".".join(parts), token)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment