Commit 68c04534 authored by Raymond Hettinger

Add untokenize() function to allow full round-trip tokenization.

Should significantly enhance the utility of the module by supporting
the creation of tools that modify the token stream and write back the
modified result.
parent bf7255ff
......@@ -45,6 +45,9 @@ An older entry point is retained for backward compatibility:
provides the same interface as the \method{readline()} method of
built-in file objects (see section~\ref{bltin-file-objects}). Each
call to the function should return one line of input as a string.
Alternately, \var{readline} may be a callable object that signals
completion by raising \exception{StopIteration}.
\versionchanged[Added StopIteration support]{2.5}
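For instance, the bound \method{next()} method of a file object can serve
as such a callable (a minimal sketch; the filename is hypothetical and the
default token printer is used):
\begin{verbatim}
tokenize(open('example.py').next)
\end{verbatim}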
The second parameter, \var{tokeneater}, must also be a callable
object. It is called once for each token, with five arguments,
......@@ -65,3 +68,52 @@ passed to the \var{tokeneater} function by \function{tokenize()}:
are generated when a logical line of code is continued over multiple
physical lines.
\end{datadesc}
Another function is provided to reverse the tokenization process.
This is useful for creating tools that tokenize a script, modify
the token stream, and write back the modified script.
\begin{funcdesc}{untokenize}{iterable}
Converts tokens back into Python source code. The \var{iterable}
must return sequences with at least two elements, the token type and
the token string. Any additional sequence elements are ignored.

The reconstructed script is returned as a single string. The
result is guaranteed to tokenize back to match the input so that
the conversion is lossless and round-trips are assured. The
guarantee applies only to the token type and token string, as the
spacing between tokens (column positions) may change.
\versionadded{2.5}
\end{funcdesc}
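The invariant can be checked directly; a minimal sketch, assuming \var{f}
is an already opened script file:
\begin{verbatim}
t1 = [tok[:2] for tok in generate_tokens(f.readline)]
newcode = untokenize(t1)
readline = iter(newcode.splitlines(1)).next
t2 = [tok[:2] for tok in generate_tokens(readline)]
assert t1 == t2
\end{verbatim}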
Example of a script re-writer that transforms float literals into
Decimal objects:
\begin{verbatim}
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print +21.3e-5*-.1234/81.7'
    >>> decistmt(s)
    "print +Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')"

    >>> exec(s)
    -3.21716034272e-007
    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    result = []
    g = generate_tokens(StringIO(s).readline)   # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result)
\end{verbatim}
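Because \function{untokenize()} returns ordinary source text, writing the
modified script back out is a plain file write; a hedged sketch with
hypothetical filenames:
\begin{verbatim}
text = open('original.py').read()
open('converted.py', 'w').write(decistmt(text))
\end{verbatim}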
......@@ -91,7 +91,9 @@ resources to test. Currently only the following are defined:
compiler - Test the compiler package by compiling all the source
in the standard library and test suite. This takes
a long time.
a long time. Enabling this resource also allows
test_tokenize to verify round-trip lexing on every
file in the test library.
subprocess Run all tests for the subprocess module.
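For example, a command along these lines (a sketch; the path to regrtest.py
depends on the checkout) enables the compiler resource and runs the full
round-trip pass over the test library:

    python regrtest.py -u compiler test_tokenize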
......
from test.test_support import verbose, findfile
import tokenize, os, sys
from test.test_support import verbose, findfile, is_resource_enabled
import os, glob, random
from tokenize import (tokenize, generate_tokens, untokenize,
                      NUMBER, NAME, OP, STRING)
if verbose:
    print 'starting...'
f = file(findfile('tokenize_tests' + os.extsep + 'txt'))
tokenize.tokenize(f.readline)
tokenize(f.readline)
f.close()
###### Test roundtrip for untokenize ##########################
def test_roundtrip(f):
    ## print 'Testing:', f
    f = file(f)
    try:
        fulltok = list(generate_tokens(f.readline))
    finally:
        f.close()
    t1 = [tok[:2] for tok in fulltok]
    newtext = untokenize(t1)
    readline = iter(newtext.splitlines(1)).next
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    assert t1 == t2

f = findfile('tokenize_tests' + os.extsep + 'txt')
test_roundtrip(f)

testdir = os.path.dirname(f) or os.curdir
testfiles = glob.glob(testdir + os.sep + 'test*.py')
if not is_resource_enabled('compiler'):
    testfiles = random.sample(testfiles, 10)
for f in testfiles:
    test_roundtrip(f)
###### Test example in the docs ###############################
from decimal import Decimal
from cStringIO import StringIO
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print +21.3e-5*-.1234/81.7'
    >>> decistmt(s)
    "print +Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')"

    >>> exec(s)
    -3.21716034272e-007
    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    result = []
    g = generate_tokens(StringIO(s).readline)   # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result)
import doctest
doctest.testmod()
if verbose:
    print 'finished'
......@@ -31,7 +31,7 @@ from token import *
import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "generate_tokens", "NL"]
           "generate_tokens", "NL", "untokenize"]
del x
del token
......@@ -159,12 +159,55 @@ def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.

    Round-trip invariant:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    startline = False
    indents = []
    toks = []
    toks_append = toks.append
    for tok in iterable:
        toknum, tokval = tok[:2]

        if toknum == NAME:
            tokval += ' '                  # keep adjacent names separated

        if toknum == INDENT:
            indents.append(tokval)         # remember the indentation string
            continue
        elif toknum == DEDENT:
            indents.pop()
            continue
        elif toknum in (NEWLINE, COMMENT, NL):
            startline = True
        elif startline and indents:
            toks_append(indents[-1])       # re-emit the current indentation
            startline = False
        toks_append(tokval)
    return ''.join(toks)
def generate_tokens(readline):
"""
The generate_tokens() generator requires one argment, readline, which
must be a callable object which provides the same interface as the
readline() method of built-in file objects. Each call to the function
should return one line of input as a string.
should return one line of input as a string. Alternately, readline
can be a callable function terminating with StopIteration:
readline = open(myfile).next # Example of alternate readline
The generator produces 5-tuples with these members: the token type; the
token string; a 2-tuple (srow, scol) of ints specifying the row and
......@@ -180,7 +223,10 @@ def generate_tokens(readline):
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)
......
......@@ -141,6 +141,11 @@ Extension Modules
Library
-------
- The tokenize module has a new untokenize() function to support a full
  round-trip from lexed tokens back to Python source code.  In addition,
  the generate_tokens() function now accepts a callable argument that
  terminates by raising StopIteration.
- Bug #1196315: fix weakref.WeakValueDictionary constructor.
- Bug #1213894: os.path.realpath didn't resolve symlinks that were the first
......