Commit de65527e authored by Guido van Rossum

Ping's latest. Fixes triple quoted strings ending in odd
#backslashes, and other stuff I don't know.
Parent 1836a620
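
[Annotation, not part of the commit] The bug being fixed: the old string patterns located a closing quote by searching for a quote not preceded by a backslash, so a string body ending in an escaped backslash (an even run of backslashes before the closer, e.g. '''spam\\''') never appeared to terminate. A rough illustration, transcribed into the modern re module (the module itself uses the old emacs-style regex module, where backslash is not special inside character classes):

    import re

    # Old approach, roughly: search for ''' at line start or after any
    # non-backslash character.
    old = re.compile(r"(^'''|[^\\]''')")
    # New approach: consume the body as escape-aware units, then demand '''.
    new = re.compile(r"([^'\\]|\\.|'[^'\\]|'\\.|''[^'\\]|''\\.)*'''")

    body = "spam\\\\'''"     # the characters spam\\''' -- escaped backslash, then the closer
    print(old.search(body))  # None: the ''' is preceded by a backslash
    print(new.match(body))   # matches: \\ is consumed as one escape pair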
@@ -14,7 +14,7 @@ regular expression into 'tokenprog' that matches Python tokens in individual
 lines of text, leaving the token in 'tokenprog.group(3)', but does not
 handle indentation, continuations, or multi-line strings."""
 
-__version__ = "Ka-Ping Yee, 26 March 1997"
+__version__ = "Ka-Ping Yee, 29 March 1997"
 
 import string, regex
 from token import *
@@ -46,13 +46,13 @@ Floatnumber = group(Pointfloat, Expfloat)
 Imagnumber = group('0[jJ]', '[1-9][0-9]*[jJ]', Floatnumber + '[jJ]')
 Number = group(Imagnumber, Floatnumber, Intnumber)
 
-Single = group("^'", "[^\]'")
-Double = group('^"', '[^\]"')
-Single3 = group("^'''", "[^\]'''")
-Double3 = group('^"""', '[^\]"""')
+Single = group("[^'\]", "[\].") + "*'"
+Double = group('[^"\]', '[\].') + '*"'
+Single3 = group("[^'\]","[\].","'[^'\]","'[\].","''[^'\]","''[\].") + "*'''"
+Double3 = group('[^"\]','[\].','"[^"\]','"[\].','""[^"\]','""[\].') + '*"""'
 Triple = group("'''", '"""')
-String = group("'" + group('[\].', "[^\n'\]") + "*'",
-               '"' + group('[\].', '[^\n"\]') + '*"')
+String = group("'" + group("[^\n'\]", "[\].") + "*'",
+               '"' + group('[^\n"\]', '[\].') + '*"')
 
 Operator = group('\+', '\-', '\*\*', '\*', '\^', '~', '/', '%', '&', '|',
                  '<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>')
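
[Annotation] Assuming group() is the module's helper that wraps its arguments in a regex alternation, the new Single expands to ([^'\]|[\].)*' in the old notation: the string body is zero or more units, each a plain character or a backslash escape, followed by the closing quote. A body ending in an escaped backslash therefore no longer hides the closer. In modern re syntax:

    import re

    Single = r"([^'\\]|\\.)*'"                # new Single, transcribed to re syntax
    print(re.match(Single, r"it\'s'").end())  # 6: the escaped quote is skipped,
                                              # the unescaped one terminates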
@@ -86,7 +86,7 @@ def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
 def tokenize(readline, tokeneater=printtoken):
     lnum = parenlev = continued = 0
     namechars, numchars = string.letters + '_', string.digits
-    contstr = ''
+    contstr, needcont = '', 0
     indents = [0]
 
     while 1:                                   # loop over lines in stream
@@ -95,12 +95,18 @@ def tokenize(readline, tokeneater=printtoken):
         pos, max = 0, len(line)
 
         if contstr:                            # continued string
-            if not line: raise TokenError, "EOF within multi-line string"
-            if endprog.search(line) >= 0:
+            if not line:
+                raise TokenError, ("EOF in multi-line string", strstart)
+            if endprog.match(line) >= 0:
                 pos = end = endprog.regs[0][1]
                 tokeneater(STRING, contstr + line[:end],
                            strstart, (lnum, end), line)
+                contstr, needcont = '', 0
+            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
+                tokeneater(ERRORTOKEN, contstr + line,
+                           strstart, (lnum, len(line)), line)
                 contstr = ''
+                continue
             else:
                 contstr = contstr + line
                 continue
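
[Annotation] The new needcont flag separates two kinds of continuation: a triple-quoted string may span lines freely, but a one-line string continued with a trailing backslash must keep ending in a backslash on every continuation line, or the tokenizer now reports ERRORTOKEN instead of silently swallowing source. A minimal sketch of the new test, assuming line retains its trailing newline as in the module:

    def continuation_ok(line):
        # the physical line must again end with a backslash before its
        # (Unix or DOS) line ending for the string to keep going
        return line[-2:] == '\\\n' or line[-3:] == '\\\r\n'

    print(continuation_ok('more of the string \\\n'))  # True: continue
    print(continuation_ok('forgot the backslash\n'))   # False: ERRORTOKEN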
@@ -114,7 +120,7 @@ def tokenize(readline, tokeneater=printtoken):
                 elif line[pos] == '\f': column = 0
                 else: break
                 pos = pos + 1
             if pos == max: break
 
             if line[pos] in '#\r\n':           # skip comments or blank lines
                 tokeneater((NEWLINE, COMMENT)[line[pos] == '#'], line[pos:],
@@ -126,22 +132,23 @@ def tokenize(readline, tokeneater=printtoken):
                 tokeneater(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
             while column < indents[-1]:
                 indents = indents[:-1]
-                tokeneater(DEDENT, line[:pos], (lnum, 0), (lnum, pos), line)
+                tokeneater(DEDENT, '', (lnum, pos), (lnum, pos), line)
 
         else:                                  # continued statement
-            if not line: raise TokenError, "EOF within multi-line statement"
+            if not line:
+                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
             continued = 0
 
         while pos < max:
             if pseudoprog.match(line, pos) > 0:            # scan for tokens
                 start, end = pseudoprog.regs[1]
-                spos, epos = (lnum, start), (lnum, end)
+                spos, epos, pos = (lnum, start), (lnum, end), end
                 token, initial = line[start:end], line[start]
-                pos = end
 
                 if initial in namechars:                   # ordinary name
                     tokeneater(NAME, token, spos, epos, line)
-                elif initial in numchars:                  # ordinary number
+                elif initial in numchars \
+                    or (initial == '.' and token != '.'):  # ordinary number
                     tokeneater(NUMBER, token, spos, epos, line)
                 elif initial in '\r\n':
                     tokeneater(NEWLINE, token, spos, epos, line)
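
[Annotation] Two behavior changes land in this hunk: DEDENT tokens now carry an empty string positioned at the current column instead of echoing the line's indentation, and the widened number branch lets a token starting with '.' (but not the lone dot operator) count as a NUMBER, so .5 is one token. The modern descendant of this module shows the same behavior:

    import io, tokenize

    for tok in tokenize.generate_tokens(io.StringIO('x = .5\n').readline):
        print(tokenize.tok_name[tok.type], repr(tok.string))
    # ... NUMBER '.5' arrives as a single token, not OP '.' plus NUMBER '5'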
@@ -151,7 +158,7 @@ def tokenize(readline, tokeneater=printtoken):
                     continued = 1
                 elif token in ('\'\'\'', '"""'):           # triple-quoted
                     endprog = endprogs[token]
-                    if endprog.search(line, pos) >= 0:     # all on one line
+                    if endprog.match(line, pos) >= 0:      # all on one line
                         pos = endprog.regs[0][1]
                         token = line[start:pos]
                         tokeneater(STRING, token, spos, (lnum, pos), line)
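
[Annotation] Switching endprog.search to endprog.match (here and in the contstr hunk above) anchors the scan at pos, just past the opening quotes: the closing pattern must consume the string body from there, escape by escape, rather than hunting for a closer anywhere later in the line. A sketch with the new-style body pattern in modern syntax:

    import re

    single3 = re.compile(r"([^'\\]|\\.|'[^'\\]|'\\.|''[^'\\]|''\\.)*'''")

    line = "x = '''it''s all here''' + y\n"
    pos = 7                                  # just past the opening ''' (hypothetical offset)
    print(single3.match(line, pos).end())    # 24: the string closes on this line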
@@ -162,7 +169,8 @@ def tokenize(readline, tokeneater=printtoken):
                 elif initial in '\'"':
                     if token[-1] == '\n':                  # continued string
                         strstart = (lnum, start)
-                        endprog, contstr = endprogs[initial], line[start:]
+                        endprog = endprogs[initial]
+                        contstr, needcont = line[start:], 1
                         break
                     else:                                  # ordinary string
                         tokeneater(STRING, token, spos, epos, line)
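
[Annotation] This is where a continued one-line string is detected: the pseudo-matcher stops at a backslash-newline, so the token text ends in a newline. The partial text is stashed in contstr and, new in this commit, needcont is raised so the next line faces the trailing-backslash check sketched earlier. Today's module still delivers such a string as one token spanning both lines:

    import io, tokenize

    src = "s = 'abc\\\n" "def'\n"    # a single-quoted string continued with a backslash
    for tok in tokenize.generate_tokens(io.StringIO(src).readline):
        print(tokenize.tok_name[tok.type], repr(tok.string))
    # STRING "'abc\\\ndef'" arrives as one token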
@@ -171,12 +179,15 @@ def tokenize(readline, tokeneater=printtoken):
                 elif initial in ')]}': parenlev = parenlev - 1
                 tokeneater(OP, token, spos, epos, line)
             else:
-                tokeneater(ERRORTOKEN, line[pos], spos, (lnum, pos+1), line)
+                tokeneater(ERRORTOKEN, line[pos],
+                           (lnum, pos), (lnum, pos+1), line)
                 pos = pos + 1
 
     for indent in indents[1:]:                 # pop remaining indent levels
         tokeneater(DEDENT, '', (lnum, 0), (lnum, 0), '')
+    tokeneater(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
 
 if __name__ == '__main__':                     # testing
     import sys
-    tokenize(open(sys.argv[-1]).readline)
+    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
+    else: tokenize(sys.stdin.readline)
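
[Annotation] Two small changes close the commit: the token stream now ends with an explicit ENDMARKER after the final DEDENTs, and the self-test reads standard input when no file name is given (python tokenize.py < somefile.py). The explicit end marker is easy to observe in the modern module:

    import io, tokenize

    toks = list(tokenize.generate_tokens(io.StringIO('pass\n').readline))
    print(tokenize.tok_name[toks[-1].type])   # ENDMARKER: explicit end of stream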