Kaydet (Commit) 53187f32 authored tarafından Jeremy Hylton's avatar Jeremy Hylton

now produces valid pyc files for a least a trivial subset of the

language.

CodeGenerator:
* modify to track stack depth
* add emit method that call's PythonVMCode's makeCodeObject
* thread filenames through in hackish way
* set flags for code objects for modules and functions
  XXX the docs for the flags seem out of date and/or incomplete

PythonVMCode:
* add doc string describing the elements of a real code object

LineAddrTable:
* creates an lnotab (no quite correctly though)
üst aa9d2d61
...@@ -11,6 +11,10 @@ import misc ...@@ -11,6 +11,10 @@ import misc
import marshal import marshal
import new import new
import string import string
import sys
import os
import stat
import struct
def parse(path): def parse(path):
f = open(path) f = open(path)
...@@ -60,7 +64,7 @@ class ASTVisitor: ...@@ -60,7 +64,7 @@ class ASTVisitor:
XXX Perhaps I can use a postorder walk for the code generator? XXX Perhaps I can use a postorder walk for the code generator?
""" """
VERBOSE = 0 VERBOSE = 1
def __init__(self): def __init__(self):
self.node = None self.node = None
...@@ -101,9 +105,34 @@ class ASTVisitor: ...@@ -101,9 +105,34 @@ class ASTVisitor:
return meth(node) return meth(node)
class CodeGenerator: class CodeGenerator:
def __init__(self): def __init__(self, filename=None):
self.code = PythonVMCode() self.filename = filename
self.code = PythonVMCode(filename=filename)
self.code.setFlags(0)
self.locals = misc.Stack() self.locals = misc.Stack()
# track the current and max stack size
# XXX does this belong here or in the PythonVMCode?
self.curStack = 0
self.maxStack = 0
def emit(self):
"""Create a Python code object
XXX It is confusing that this method isn't related to the
method named emit in the PythonVMCode.
"""
return self.code.makeCodeObject(self.maxStack)
def push(self, n):
self.curStack = self.curStack + n
if self.curStack > self.maxStack:
self.maxStack = self.curStack
def pop(self, n):
if n >= self.curStack:
self.curStack = self.curStack - n
else:
self.curStack = 0
def visitDiscard(self, node): def visitDiscard(self, node):
return 1 return 1
...@@ -112,16 +141,16 @@ class CodeGenerator: ...@@ -112,16 +141,16 @@ class CodeGenerator:
lnf = walk(node.node, LocalNameFinder()) lnf = walk(node.node, LocalNameFinder())
self.locals.push(lnf.getLocals()) self.locals.push(lnf.getLocals())
self.visit(node.node) self.visit(node.node)
self.code.emit('LOAD_CONST', 'None') self.code.emit('LOAD_CONST', None)
self.code.emit('RETURN_VALUE') self.code.emit('RETURN_VALUE')
return 1 return 1
def visitFunction(self, node): def visitFunction(self, node):
codeBody = NestedCodeGenerator(node.code, node.argnames) codeBody = NestedCodeGenerator(node, filename=self.filename)
walk(node.code, codeBody) walk(node, codeBody)
self.code.setLineNo(node.lineno) self.code.setLineNo(node.lineno)
self.code.emit('LOAD_CONST', codeBody.code) self.code.emit('LOAD_CONST', codeBody)
self.code.emit('MAKE_FUNCTION') self.code.emit('MAKE_FUNCTION', 0)
self.code.emit('STORE_NAME', node.name) self.code.emit('STORE_NAME', node.name)
return 1 return 1
...@@ -212,6 +241,7 @@ class CodeGenerator: ...@@ -212,6 +241,7 @@ class CodeGenerator:
self.visit(node.left) self.visit(node.left)
self.visit(node.right) self.visit(node.right)
self.code.emit(op) self.code.emit(op)
self.pop(1)
return 1 return 1
def visitAdd(self, node): def visitAdd(self, node):
...@@ -232,9 +262,11 @@ class CodeGenerator: ...@@ -232,9 +262,11 @@ class CodeGenerator:
self.code.loadFast(node.name) self.code.loadFast(node.name)
else: else:
self.code.loadGlobal(node.name) self.code.loadGlobal(node.name)
self.push(1)
def visitConst(self, node): def visitConst(self, node):
self.code.loadConst(node.value) self.code.loadConst(node.value)
self.push(1)
def visitReturn(self, node): def visitReturn(self, node):
self.code.setLineNo(node.lineno) self.code.setLineNo(node.lineno)
...@@ -262,6 +294,7 @@ class CodeGenerator: ...@@ -262,6 +294,7 @@ class CodeGenerator:
for child in node.nodes: for child in node.nodes:
self.visit(child) self.visit(child)
self.code.emit('PRINT_ITEM') self.code.emit('PRINT_ITEM')
self.pop(len(node.nodes))
return 1 return 1
def visitPrintnl(self, node): def visitPrintnl(self, node):
...@@ -276,27 +309,39 @@ class NestedCodeGenerator(CodeGenerator): ...@@ -276,27 +309,39 @@ class NestedCodeGenerator(CodeGenerator):
""" """
super_init = CodeGenerator.__init__ super_init = CodeGenerator.__init__
def __init__(self, code, args): def __init__(self, func, filename='<?>'):
"""code and args of function or class being walked """code and args of function or class being walked
XXX need to separately pass to ASTVisitor. the constructor XXX need to separately pass to ASTVisitor. the constructor
only uses the code object to find the local names only uses the code object to find the local names
Copies code form parent __init__ rather than calling it.
""" """
self.super_init() self.name = func.name
lnf = walk(code, LocalNameFinder(args)) self.super_init(filename)
args = func.argnames
self.code = PythonVMCode(len(args), name=func.name,
filename=filename)
if func.varargs:
self.code.setVarArgs()
if func.kwargs:
self.code.setKWArgs()
lnf = walk(func.code, LocalNameFinder(args))
self.locals.push(lnf.getLocals()) self.locals.push(lnf.getLocals())
def __repr__(self):
return "<NestedCodeGenerator: %s>" % self.name
def visitFunction(self, node): def visitFunction(self, node):
lnf = walk(node.code, LocalNameFinder(node.argnames)) lnf = walk(node.code, LocalNameFinder(node.argnames))
self.locals.push(lnf.getLocals()) self.locals.push(lnf.getLocals())
# XXX need to handle def foo((a, b)): # XXX need to handle def foo((a, b)):
self.code.setLineNo(node.lineno) self.code.setLineNo(node.lineno)
self.visit(node.code) self.visit(node.code)
self.code.emit('LOAD_CONST', 'None') self.code.emit('LOAD_CONST', None)
self.code.emit('RETURN_VALUE') self.code.emit('RETURN_VALUE')
return 1 return 1
class LocalNameFinder: class LocalNameFinder:
def __init__(self, names=()): def __init__(self, names=()):
self.names = misc.Set() self.names = misc.Set()
...@@ -353,64 +398,86 @@ class ForwardRef: ...@@ -353,64 +398,86 @@ class ForwardRef:
def resolve(self): def resolve(self):
return self.val return self.val
class CompiledModule: def add_hook(hooks, type, meth):
"""Store the code object for a compiled module """Helper function for PythonVMCode _emit_hooks"""
l = hooks.get(type, [])
l.append(meth)
hooks[type] = l
XXX Not clear how the code objects will be stored. Seems possible class PythonVMCode:
that a single code attribute is sufficient, because it will """Creates Python code objects
contains references to all the need code objects. That might be
messy, though. The new module is used to create the code object. The following
attribute definitions are included from the reference manual:
co_name gives the function name
co_argcount is the number of positional arguments (including
arguments with default values)
co_nlocals is the number of local variables used by the function
(including arguments)
co_varnames is a tuple containing the names of the local variables
(starting with the argument names)
co_code is a string representing the sequence of bytecode instructions
co_consts is a tuple containing the literals used by the bytecode
co_names is a tuple containing the names used by the bytecode
co_filename is the filename from which the code was compiled
co_firstlineno is the first line number of the function
co_lnotab is a string encoding the mapping from byte code offsets
to line numbers (for detais see the source code of the
interpreter)
see code com_set_lineno and com_add_lnotab
it's a string with 2bytes per set_lineno
co_stacksize is the required stack size (including local variables)
co_flags is an integer encoding a number of flags for the
interpreter.
The following flag bits are defined for co_flags: bit 2 is set if
the function uses the "*arguments" syntax to accept an arbitrary
number of positional arguments; bit 3 is set if the function uses
the "**keywords" syntax to accept arbitrary keyword arguments;
other bits are used internally or reserved for future use.
If a code object represents a function, the first item in
co_consts is the documentation string of the function, or None if
undefined.
""" """
MAGIC = (20121 | (ord('\r')<<16) | (ord('\n')<<24))
def __init__(self):
self.code = None
def addCode(self, code):
"""addCode(self: SelfType, code: PythonVMCode)"""
def dump(self, path):
"""create a .pyc file"""
f = open(path, 'wb')
f.write(self._pyc_header())
marshal.dump(self.code, f)
f.close()
def _pyc_header(self, path): # XXX flag bits
# compile.c uses marshal to write a long directly, with VARARGS = 0x04
# calling the interface that would also generate a 1-byte code KWARGS = 0x08
# to indicate the type of the value. simplest way to get the
# same effect is to call marshal and then skip the code.
buf = marshal.dumps(self.MAGIC)[1:]
# skip the mtime for now, since I don't have the write
# structure to pass the filename being compiled into this
# instance
return buf + chr(0) * 4
class PythonVMCode:
def __init__(self): def __init__(self, argcount=0, name='?', filename='<?>',
docstring=None):
# XXX why is the default value for flags 3?
self.insts = [] self.insts = []
# used by makeCodeObject # used by makeCodeObject
self.argcount = 0 self.argcount = argcount
self.code = '' self.code = ''
self.consts = [] self.consts = [docstring]
self.filename = '' self.filename = filename
self.firstlineno = 0 self.flags = 3
self.flags = 0 self.name = name
self.lnotab = None
self.name = ''
self.names = [] self.names = []
self.nlocals = 0
self.stacksize = 2
self.varnames = [] self.varnames = []
# lnotab support
self.firstlineno = 0
self.lastlineno = 0
self.last_addr = 0
self.lnotab = ''
def __repr__(self): def __repr__(self):
return "<bytecode: %d instrs>" % len(self.insts) return "<bytecode: %d instrs>" % len(self.insts)
def emit(self, *args): def setFlags(self, val):
print "emit", args """XXX for module's function"""
self.insts.append(args) self.flags = 0
def setVarArgs(self):
self.flags = self.flags | self.VARARGS
def setKWArgs(self):
self.flags = self.flags | self.KWARGS
def getCurInst(self): def getCurInst(self):
return len(self.insts) return len(self.insts)
...@@ -418,23 +485,70 @@ class PythonVMCode: ...@@ -418,23 +485,70 @@ class PythonVMCode:
def getNextInst(self): def getNextInst(self):
return len(self.insts) + 1 return len(self.insts) + 1
def convert(self): def dump(self, io=sys.stdout):
"""Convert human-readable names to real bytecode""" i = 0
pass for inst in self.insts:
if inst[0] == 'SET_LINENO':
io.write("\n")
io.write(" %3d " % i)
if len(inst) == 1:
io.write("%s\n" % inst)
else:
io.write("%-15.15s\t%s\n" % inst)
i = i + 1
def makeCodeObject(self, stacksize):
"""Make a Python code object
This creates a Python code object using the new module. This
seems simpler than reverse-engineering the way marshal dumps
code objects into .pyc files. One of the key difficulties is
figuring out how to layout references to code objects that
appear on the VM stack; e.g.
3 SET_LINENO 1
6 LOAD_CONST 0 (<code object fact at 8115878 [...]
9 MAKE_FUNCTION 0
12 STORE_NAME 0 (fact)
"""
def makeCodeObject(self):
"""Make a Python code object"""
code = []
self._findOffsets() self._findOffsets()
lnotab = LineAddrTable()
for t in self.insts: for t in self.insts:
opname = t[0] opname = t[0]
if len(t) == 1: if len(t) == 1:
code.append(chr(self.opnum[opname])) lnotab.addCode(chr(self.opnum[opname]))
elif len(t) == 2: elif len(t) == 2:
oparg = self._convertArg(opname, t[1]) oparg = self._convertArg(opname, t[1])
if opname == 'SET_LINENO':
lnotab.nextLine(oparg)
hi, lo = divmod(oparg, 256) hi, lo = divmod(oparg, 256)
code.append(chr(self.opnum[opname]) + chr(lo) + chr(hi)) lnotab.addCode(chr(self.opnum[opname]) + chr(lo) +
return string.join(code, '') chr(hi))
# why is a module a special case?
if self.flags == 0:
nlocals = 0
else:
nlocals = len(self.varnames)
co = new.code(self.argcount, nlocals, stacksize,
self.flags, lnotab.getCode(), self._getConsts(),
tuple(self.names), tuple(self.varnames),
self.filename, self.name, self.firstlineno,
lnotab.getTable())
return co
def _getConsts(self):
"""Return a tuple for the const slot of a code object
Converts PythonVMCode objects to code objects
"""
l = []
for elt in self.consts:
if isinstance(elt, CodeGenerator):
l.append(elt.emit())
else:
l.append(elt)
return tuple(l)
def _findOffsets(self): def _findOffsets(self):
"""Find offsets for use in resolving ForwardRefs""" """Find offsets for use in resolving ForwardRefs"""
...@@ -464,6 +578,9 @@ class PythonVMCode: ...@@ -464,6 +578,9 @@ class PythonVMCode:
if op == 'LOAD_CONST': if op == 'LOAD_CONST':
return self._lookupName(arg, self.consts) return self._lookupName(arg, self.consts)
if op == 'LOAD_FAST': if op == 'LOAD_FAST':
if arg in self.names:
return self._lookupName(arg, self.varnames)
else:
return self._lookupName(arg, self.varnames, self.names) return self._lookupName(arg, self.varnames, self.names)
if op == 'LOAD_GLOBAL': if op == 'LOAD_GLOBAL':
return self._lookupName(arg, self.names) return self._lookupName(arg, self.names)
...@@ -475,7 +592,6 @@ class PythonVMCode: ...@@ -475,7 +592,6 @@ class PythonVMCode:
return self.offsets[arg.resolve()] return self.offsets[arg.resolve()]
if self.hasjabs.has_elt(op): if self.hasjabs.has_elt(op):
return self.offsets[arg.resolve()] - arg.__offset return self.offsets[arg.resolve()] - arg.__offset
print op, arg
return arg return arg
def _lookupName(self, name, list, list2=None): def _lookupName(self, name, list, list2=None):
...@@ -511,6 +627,11 @@ class PythonVMCode: ...@@ -511,6 +627,11 @@ class PythonVMCode:
# it seems redundant to add a function for each opcode, # it seems redundant to add a function for each opcode,
# particularly because the method and opcode basically have the # particularly because the method and opcode basically have the
# same name. # same name.
# on the other hand, we need to track things like stack depth in
# order to generator code objects. if we wrap instructions in a
# method, we get an easy way to track these. a simpler
# approach, however, would be to define hooks that can be called
# by emit.
def setLineNo(self, num): def setLineNo(self, num):
self.emit('SET_LINENO', num) self.emit('SET_LINENO', num)
...@@ -557,15 +678,120 @@ class PythonVMCode: ...@@ -557,15 +678,120 @@ class PythonVMCode:
def callFunction(self, num): def callFunction(self, num):
self.emit('CALL_FUNCTION', num) self.emit('CALL_FUNCTION', num)
# this version of emit + arbitrary hooks might work, but it's damn
# messy.
def emit(self, *args):
self._emitDispatch(args[0], args[1:])
self.insts.append(args)
def _emitDispatch(self, type, args):
for func in self._emit_hooks.get(type, []):
func(self, args)
_emit_hooks = {}
class LineAddrTable:
"""lnotab
This class builds the lnotab, which is undocumented but described
by com_set_lineno in compile.c. Here's an attempt at explanation:
For each SET_LINENO instruction after the first one, two bytes are
added to lnotab. (In some cases, multiple two-byte entries are
added.) The first byte is the distance in bytes between the
instruction for the last SET_LINENO and the current SET_LINENO.
The second byte is offset in line numbers. If either offset is
greater than 255, multiple two-byte entries are added -- one entry
for each factor of 255.
"""
def __init__(self):
self.code = []
self.codeOffset = 0
self.firstline = 0
self.lastline = 0
self.lastoff = 0
self.lnotab = []
def addCode(self, code):
self.code.append(code)
self.codeOffset = self.codeOffset + len(code)
def nextLine(self, lineno):
if self.firstline == 0:
self.firstline = lineno
self.lastline = lineno
else:
# compute deltas
addr = self.codeOffset - self.lastoff
line = lineno - self.lastline
while addr > 0 or line > 0:
# write the values in 1-byte chunks that sum
# to desired value
trunc_addr = addr
trunc_line = line
if trunc_addr > 255:
trunc_addr = 255
if trunc_line > 255:
trunc_line = 255
self.lnotab.append(trunc_addr)
self.lnotab.append(trunc_line)
addr = addr - trunc_addr
line = line - trunc_line
self.lastline = lineno
self.lastoff = self.codeOffset
def getCode(self):
return string.join(self.code, '')
def getTable(self):
return string.join(map(chr, self.lnotab), '')
class CompiledModule:
"""Store the code object for a compiled module
XXX Not clear how the code objects will be stored. Seems possible
that a single code attribute is sufficient, because it will
contains references to all the need code objects. That might be
messy, though.
"""
MAGIC = (20121 | (ord('\r')<<16) | (ord('\n')<<24))
def __init__(self, source, filename):
self.source = source
self.filename = filename
def compile(self):
t = transformer.Transformer()
self.ast = t.parsesuite(self.source)
cg = CodeGenerator(self.filename)
walk(self.ast, cg)
self.code = cg.emit()
def dump(self, path):
"""create a .pyc file"""
f = open(path, 'wb')
f.write(self._pyc_header())
marshal.dump(self.code, f)
f.close()
def _pyc_header(self):
# compile.c uses marshal to write a long directly, with
# calling the interface that would also generate a 1-byte code
# to indicate the type of the value. simplest way to get the
# same effect is to call marshal and then skip the code.
magic = marshal.dumps(self.MAGIC)[1:]
mtime = os.stat(self.filename)[stat.ST_MTIME]
mtime = struct.pack('i', mtime)
return magic + mtime
if __name__ == "__main__": if __name__ == "__main__":
tree = parse('test.py') if len(sys.argv) > 1:
cg = CodeGenerator() filename = sys.argv[1]
ASTVisitor.VERBOSE = 1 else:
w = walk(tree, cg) filename = 'test.py'
w.VERBOSE = 1 buf = open(filename).read()
for i in range(len(cg.code.insts)): mod = CompiledModule(buf, filename)
inst = cg.code.insts[i] mod.compile()
if inst[0] == 'SET_LINENO': mod.dump(filename + 'c')
print
print "%4d" % i, inst
code = cg.code.makeCodeObject()
...@@ -11,6 +11,10 @@ import misc ...@@ -11,6 +11,10 @@ import misc
import marshal import marshal
import new import new
import string import string
import sys
import os
import stat
import struct
def parse(path): def parse(path):
f = open(path) f = open(path)
...@@ -60,7 +64,7 @@ class ASTVisitor: ...@@ -60,7 +64,7 @@ class ASTVisitor:
XXX Perhaps I can use a postorder walk for the code generator? XXX Perhaps I can use a postorder walk for the code generator?
""" """
VERBOSE = 0 VERBOSE = 1
def __init__(self): def __init__(self):
self.node = None self.node = None
...@@ -101,9 +105,34 @@ class ASTVisitor: ...@@ -101,9 +105,34 @@ class ASTVisitor:
return meth(node) return meth(node)
class CodeGenerator: class CodeGenerator:
def __init__(self): def __init__(self, filename=None):
self.code = PythonVMCode() self.filename = filename
self.code = PythonVMCode(filename=filename)
self.code.setFlags(0)
self.locals = misc.Stack() self.locals = misc.Stack()
# track the current and max stack size
# XXX does this belong here or in the PythonVMCode?
self.curStack = 0
self.maxStack = 0
def emit(self):
"""Create a Python code object
XXX It is confusing that this method isn't related to the
method named emit in the PythonVMCode.
"""
return self.code.makeCodeObject(self.maxStack)
def push(self, n):
self.curStack = self.curStack + n
if self.curStack > self.maxStack:
self.maxStack = self.curStack
def pop(self, n):
if n >= self.curStack:
self.curStack = self.curStack - n
else:
self.curStack = 0
def visitDiscard(self, node): def visitDiscard(self, node):
return 1 return 1
...@@ -112,16 +141,16 @@ class CodeGenerator: ...@@ -112,16 +141,16 @@ class CodeGenerator:
lnf = walk(node.node, LocalNameFinder()) lnf = walk(node.node, LocalNameFinder())
self.locals.push(lnf.getLocals()) self.locals.push(lnf.getLocals())
self.visit(node.node) self.visit(node.node)
self.code.emit('LOAD_CONST', 'None') self.code.emit('LOAD_CONST', None)
self.code.emit('RETURN_VALUE') self.code.emit('RETURN_VALUE')
return 1 return 1
def visitFunction(self, node): def visitFunction(self, node):
codeBody = NestedCodeGenerator(node.code, node.argnames) codeBody = NestedCodeGenerator(node, filename=self.filename)
walk(node.code, codeBody) walk(node, codeBody)
self.code.setLineNo(node.lineno) self.code.setLineNo(node.lineno)
self.code.emit('LOAD_CONST', codeBody.code) self.code.emit('LOAD_CONST', codeBody)
self.code.emit('MAKE_FUNCTION') self.code.emit('MAKE_FUNCTION', 0)
self.code.emit('STORE_NAME', node.name) self.code.emit('STORE_NAME', node.name)
return 1 return 1
...@@ -212,6 +241,7 @@ class CodeGenerator: ...@@ -212,6 +241,7 @@ class CodeGenerator:
self.visit(node.left) self.visit(node.left)
self.visit(node.right) self.visit(node.right)
self.code.emit(op) self.code.emit(op)
self.pop(1)
return 1 return 1
def visitAdd(self, node): def visitAdd(self, node):
...@@ -232,9 +262,11 @@ class CodeGenerator: ...@@ -232,9 +262,11 @@ class CodeGenerator:
self.code.loadFast(node.name) self.code.loadFast(node.name)
else: else:
self.code.loadGlobal(node.name) self.code.loadGlobal(node.name)
self.push(1)
def visitConst(self, node): def visitConst(self, node):
self.code.loadConst(node.value) self.code.loadConst(node.value)
self.push(1)
def visitReturn(self, node): def visitReturn(self, node):
self.code.setLineNo(node.lineno) self.code.setLineNo(node.lineno)
...@@ -262,6 +294,7 @@ class CodeGenerator: ...@@ -262,6 +294,7 @@ class CodeGenerator:
for child in node.nodes: for child in node.nodes:
self.visit(child) self.visit(child)
self.code.emit('PRINT_ITEM') self.code.emit('PRINT_ITEM')
self.pop(len(node.nodes))
return 1 return 1
def visitPrintnl(self, node): def visitPrintnl(self, node):
...@@ -276,27 +309,39 @@ class NestedCodeGenerator(CodeGenerator): ...@@ -276,27 +309,39 @@ class NestedCodeGenerator(CodeGenerator):
""" """
super_init = CodeGenerator.__init__ super_init = CodeGenerator.__init__
def __init__(self, code, args): def __init__(self, func, filename='<?>'):
"""code and args of function or class being walked """code and args of function or class being walked
XXX need to separately pass to ASTVisitor. the constructor XXX need to separately pass to ASTVisitor. the constructor
only uses the code object to find the local names only uses the code object to find the local names
Copies code form parent __init__ rather than calling it.
""" """
self.super_init() self.name = func.name
lnf = walk(code, LocalNameFinder(args)) self.super_init(filename)
args = func.argnames
self.code = PythonVMCode(len(args), name=func.name,
filename=filename)
if func.varargs:
self.code.setVarArgs()
if func.kwargs:
self.code.setKWArgs()
lnf = walk(func.code, LocalNameFinder(args))
self.locals.push(lnf.getLocals()) self.locals.push(lnf.getLocals())
def __repr__(self):
return "<NestedCodeGenerator: %s>" % self.name
def visitFunction(self, node): def visitFunction(self, node):
lnf = walk(node.code, LocalNameFinder(node.argnames)) lnf = walk(node.code, LocalNameFinder(node.argnames))
self.locals.push(lnf.getLocals()) self.locals.push(lnf.getLocals())
# XXX need to handle def foo((a, b)): # XXX need to handle def foo((a, b)):
self.code.setLineNo(node.lineno) self.code.setLineNo(node.lineno)
self.visit(node.code) self.visit(node.code)
self.code.emit('LOAD_CONST', 'None') self.code.emit('LOAD_CONST', None)
self.code.emit('RETURN_VALUE') self.code.emit('RETURN_VALUE')
return 1 return 1
class LocalNameFinder: class LocalNameFinder:
def __init__(self, names=()): def __init__(self, names=()):
self.names = misc.Set() self.names = misc.Set()
...@@ -353,64 +398,86 @@ class ForwardRef: ...@@ -353,64 +398,86 @@ class ForwardRef:
def resolve(self): def resolve(self):
return self.val return self.val
class CompiledModule: def add_hook(hooks, type, meth):
"""Store the code object for a compiled module """Helper function for PythonVMCode _emit_hooks"""
l = hooks.get(type, [])
l.append(meth)
hooks[type] = l
XXX Not clear how the code objects will be stored. Seems possible class PythonVMCode:
that a single code attribute is sufficient, because it will """Creates Python code objects
contains references to all the need code objects. That might be
messy, though. The new module is used to create the code object. The following
attribute definitions are included from the reference manual:
co_name gives the function name
co_argcount is the number of positional arguments (including
arguments with default values)
co_nlocals is the number of local variables used by the function
(including arguments)
co_varnames is a tuple containing the names of the local variables
(starting with the argument names)
co_code is a string representing the sequence of bytecode instructions
co_consts is a tuple containing the literals used by the bytecode
co_names is a tuple containing the names used by the bytecode
co_filename is the filename from which the code was compiled
co_firstlineno is the first line number of the function
co_lnotab is a string encoding the mapping from byte code offsets
to line numbers (for detais see the source code of the
interpreter)
see code com_set_lineno and com_add_lnotab
it's a string with 2bytes per set_lineno
co_stacksize is the required stack size (including local variables)
co_flags is an integer encoding a number of flags for the
interpreter.
The following flag bits are defined for co_flags: bit 2 is set if
the function uses the "*arguments" syntax to accept an arbitrary
number of positional arguments; bit 3 is set if the function uses
the "**keywords" syntax to accept arbitrary keyword arguments;
other bits are used internally or reserved for future use.
If a code object represents a function, the first item in
co_consts is the documentation string of the function, or None if
undefined.
""" """
MAGIC = (20121 | (ord('\r')<<16) | (ord('\n')<<24))
def __init__(self):
self.code = None
def addCode(self, code):
"""addCode(self: SelfType, code: PythonVMCode)"""
def dump(self, path):
"""create a .pyc file"""
f = open(path, 'wb')
f.write(self._pyc_header())
marshal.dump(self.code, f)
f.close()
def _pyc_header(self, path): # XXX flag bits
# compile.c uses marshal to write a long directly, with VARARGS = 0x04
# calling the interface that would also generate a 1-byte code KWARGS = 0x08
# to indicate the type of the value. simplest way to get the
# same effect is to call marshal and then skip the code.
buf = marshal.dumps(self.MAGIC)[1:]
# skip the mtime for now, since I don't have the write
# structure to pass the filename being compiled into this
# instance
return buf + chr(0) * 4
class PythonVMCode:
def __init__(self): def __init__(self, argcount=0, name='?', filename='<?>',
docstring=None):
# XXX why is the default value for flags 3?
self.insts = [] self.insts = []
# used by makeCodeObject # used by makeCodeObject
self.argcount = 0 self.argcount = argcount
self.code = '' self.code = ''
self.consts = [] self.consts = [docstring]
self.filename = '' self.filename = filename
self.firstlineno = 0 self.flags = 3
self.flags = 0 self.name = name
self.lnotab = None
self.name = ''
self.names = [] self.names = []
self.nlocals = 0
self.stacksize = 2
self.varnames = [] self.varnames = []
# lnotab support
self.firstlineno = 0
self.lastlineno = 0
self.last_addr = 0
self.lnotab = ''
def __repr__(self): def __repr__(self):
return "<bytecode: %d instrs>" % len(self.insts) return "<bytecode: %d instrs>" % len(self.insts)
def emit(self, *args): def setFlags(self, val):
print "emit", args """XXX for module's function"""
self.insts.append(args) self.flags = 0
def setVarArgs(self):
self.flags = self.flags | self.VARARGS
def setKWArgs(self):
self.flags = self.flags | self.KWARGS
def getCurInst(self): def getCurInst(self):
return len(self.insts) return len(self.insts)
...@@ -418,23 +485,70 @@ class PythonVMCode: ...@@ -418,23 +485,70 @@ class PythonVMCode:
def getNextInst(self): def getNextInst(self):
return len(self.insts) + 1 return len(self.insts) + 1
def convert(self): def dump(self, io=sys.stdout):
"""Convert human-readable names to real bytecode""" i = 0
pass for inst in self.insts:
if inst[0] == 'SET_LINENO':
io.write("\n")
io.write(" %3d " % i)
if len(inst) == 1:
io.write("%s\n" % inst)
else:
io.write("%-15.15s\t%s\n" % inst)
i = i + 1
def makeCodeObject(self, stacksize):
"""Make a Python code object
This creates a Python code object using the new module. This
seems simpler than reverse-engineering the way marshal dumps
code objects into .pyc files. One of the key difficulties is
figuring out how to layout references to code objects that
appear on the VM stack; e.g.
3 SET_LINENO 1
6 LOAD_CONST 0 (<code object fact at 8115878 [...]
9 MAKE_FUNCTION 0
12 STORE_NAME 0 (fact)
"""
def makeCodeObject(self):
"""Make a Python code object"""
code = []
self._findOffsets() self._findOffsets()
lnotab = LineAddrTable()
for t in self.insts: for t in self.insts:
opname = t[0] opname = t[0]
if len(t) == 1: if len(t) == 1:
code.append(chr(self.opnum[opname])) lnotab.addCode(chr(self.opnum[opname]))
elif len(t) == 2: elif len(t) == 2:
oparg = self._convertArg(opname, t[1]) oparg = self._convertArg(opname, t[1])
if opname == 'SET_LINENO':
lnotab.nextLine(oparg)
hi, lo = divmod(oparg, 256) hi, lo = divmod(oparg, 256)
code.append(chr(self.opnum[opname]) + chr(lo) + chr(hi)) lnotab.addCode(chr(self.opnum[opname]) + chr(lo) +
return string.join(code, '') chr(hi))
# why is a module a special case?
if self.flags == 0:
nlocals = 0
else:
nlocals = len(self.varnames)
co = new.code(self.argcount, nlocals, stacksize,
self.flags, lnotab.getCode(), self._getConsts(),
tuple(self.names), tuple(self.varnames),
self.filename, self.name, self.firstlineno,
lnotab.getTable())
return co
def _getConsts(self):
"""Return a tuple for the const slot of a code object
Converts PythonVMCode objects to code objects
"""
l = []
for elt in self.consts:
if isinstance(elt, CodeGenerator):
l.append(elt.emit())
else:
l.append(elt)
return tuple(l)
def _findOffsets(self): def _findOffsets(self):
"""Find offsets for use in resolving ForwardRefs""" """Find offsets for use in resolving ForwardRefs"""
...@@ -464,6 +578,9 @@ class PythonVMCode: ...@@ -464,6 +578,9 @@ class PythonVMCode:
if op == 'LOAD_CONST': if op == 'LOAD_CONST':
return self._lookupName(arg, self.consts) return self._lookupName(arg, self.consts)
if op == 'LOAD_FAST': if op == 'LOAD_FAST':
if arg in self.names:
return self._lookupName(arg, self.varnames)
else:
return self._lookupName(arg, self.varnames, self.names) return self._lookupName(arg, self.varnames, self.names)
if op == 'LOAD_GLOBAL': if op == 'LOAD_GLOBAL':
return self._lookupName(arg, self.names) return self._lookupName(arg, self.names)
...@@ -475,7 +592,6 @@ class PythonVMCode: ...@@ -475,7 +592,6 @@ class PythonVMCode:
return self.offsets[arg.resolve()] return self.offsets[arg.resolve()]
if self.hasjabs.has_elt(op): if self.hasjabs.has_elt(op):
return self.offsets[arg.resolve()] - arg.__offset return self.offsets[arg.resolve()] - arg.__offset
print op, arg
return arg return arg
def _lookupName(self, name, list, list2=None): def _lookupName(self, name, list, list2=None):
...@@ -511,6 +627,11 @@ class PythonVMCode: ...@@ -511,6 +627,11 @@ class PythonVMCode:
# it seems redundant to add a function for each opcode, # it seems redundant to add a function for each opcode,
# particularly because the method and opcode basically have the # particularly because the method and opcode basically have the
# same name. # same name.
# on the other hand, we need to track things like stack depth in
# order to generator code objects. if we wrap instructions in a
# method, we get an easy way to track these. a simpler
# approach, however, would be to define hooks that can be called
# by emit.
def setLineNo(self, num): def setLineNo(self, num):
self.emit('SET_LINENO', num) self.emit('SET_LINENO', num)
...@@ -557,15 +678,120 @@ class PythonVMCode: ...@@ -557,15 +678,120 @@ class PythonVMCode:
def callFunction(self, num): def callFunction(self, num):
self.emit('CALL_FUNCTION', num) self.emit('CALL_FUNCTION', num)
# this version of emit + arbitrary hooks might work, but it's damn
# messy.
def emit(self, *args):
self._emitDispatch(args[0], args[1:])
self.insts.append(args)
def _emitDispatch(self, type, args):
for func in self._emit_hooks.get(type, []):
func(self, args)
_emit_hooks = {}
class LineAddrTable:
"""lnotab
This class builds the lnotab, which is undocumented but described
by com_set_lineno in compile.c. Here's an attempt at explanation:
For each SET_LINENO instruction after the first one, two bytes are
added to lnotab. (In some cases, multiple two-byte entries are
added.) The first byte is the distance in bytes between the
instruction for the last SET_LINENO and the current SET_LINENO.
The second byte is offset in line numbers. If either offset is
greater than 255, multiple two-byte entries are added -- one entry
for each factor of 255.
"""
def __init__(self):
self.code = []
self.codeOffset = 0
self.firstline = 0
self.lastline = 0
self.lastoff = 0
self.lnotab = []
def addCode(self, code):
self.code.append(code)
self.codeOffset = self.codeOffset + len(code)
def nextLine(self, lineno):
if self.firstline == 0:
self.firstline = lineno
self.lastline = lineno
else:
# compute deltas
addr = self.codeOffset - self.lastoff
line = lineno - self.lastline
while addr > 0 or line > 0:
# write the values in 1-byte chunks that sum
# to desired value
trunc_addr = addr
trunc_line = line
if trunc_addr > 255:
trunc_addr = 255
if trunc_line > 255:
trunc_line = 255
self.lnotab.append(trunc_addr)
self.lnotab.append(trunc_line)
addr = addr - trunc_addr
line = line - trunc_line
self.lastline = lineno
self.lastoff = self.codeOffset
def getCode(self):
return string.join(self.code, '')
def getTable(self):
return string.join(map(chr, self.lnotab), '')
class CompiledModule:
"""Store the code object for a compiled module
XXX Not clear how the code objects will be stored. Seems possible
that a single code attribute is sufficient, because it will
contains references to all the need code objects. That might be
messy, though.
"""
MAGIC = (20121 | (ord('\r')<<16) | (ord('\n')<<24))
def __init__(self, source, filename):
self.source = source
self.filename = filename
def compile(self):
t = transformer.Transformer()
self.ast = t.parsesuite(self.source)
cg = CodeGenerator(self.filename)
walk(self.ast, cg)
self.code = cg.emit()
def dump(self, path):
"""create a .pyc file"""
f = open(path, 'wb')
f.write(self._pyc_header())
marshal.dump(self.code, f)
f.close()
def _pyc_header(self):
# compile.c uses marshal to write a long directly, with
# calling the interface that would also generate a 1-byte code
# to indicate the type of the value. simplest way to get the
# same effect is to call marshal and then skip the code.
magic = marshal.dumps(self.MAGIC)[1:]
mtime = os.stat(self.filename)[stat.ST_MTIME]
mtime = struct.pack('i', mtime)
return magic + mtime
if __name__ == "__main__": if __name__ == "__main__":
tree = parse('test.py') if len(sys.argv) > 1:
cg = CodeGenerator() filename = sys.argv[1]
ASTVisitor.VERBOSE = 1 else:
w = walk(tree, cg) filename = 'test.py'
w.VERBOSE = 1 buf = open(filename).read()
for i in range(len(cg.code.insts)): mod = CompiledModule(buf, filename)
inst = cg.code.insts[i] mod.compile()
if inst[0] == 'SET_LINENO': mod.dump(filename + 'c')
print
print "%4d" % i, inst
code = cg.code.makeCodeObject()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment