Commit a0e4c1bf, authored by Guido van Rossum

Jeffrey's latest -- seems to solve most problems!

üst 75fce308
......@@ -60,6 +60,7 @@ def valid_identifier(id):
_cache = {}
_MAXCACHE = 20
def _cachecompile(pattern, flags):
key = (pattern, flags)
try:
......@@ -74,16 +75,16 @@ def _cachecompile(pattern, flags):
def match(pattern, string, flags=0):
    """Apply `pattern` to `string` via the compiled object's match().

    The compiled object is obtained from _cachecompile(pattern, flags);
    whatever its match() method returns is handed back unchanged.
    """
    compiled = _cachecompile(pattern, flags)
    return compiled.match(string)
def search(pattern, string, flags=0):
    """Apply `pattern` to `string` via the compiled object's search().

    The compiled object is obtained from _cachecompile(pattern, flags);
    whatever its search() method returns is handed back unchanged.
    """
    compiled = _cachecompile(pattern, flags)
    return compiled.search(string)
def sub(pattern, repl, string, count=0):
    """Delegate to the compiled object's sub(repl, string, count).

    pattern -- regular expression source text
    repl    -- replacement, passed through untouched
    string  -- subject string
    count   -- passed through untouched (defaults to 0)
    """
    # _cachecompile() takes (pattern, flags) with no default for flags, so
    # the flags value must be supplied explicitly; 0 means "no flags",
    # matching the default used by match() and search().
    return _cachecompile(pattern, 0).sub(repl, string, count)
def subn(pattern, repl, string, count=0):
    """Delegate to the compiled object's subn(repl, string, count).

    pattern -- regular expression source text
    repl    -- replacement, passed through untouched
    string  -- subject string
    count   -- passed through untouched (defaults to 0)
    """
    # _cachecompile() takes (pattern, flags) with no default for flags, so
    # the flags value must be supplied explicitly; 0 means "no flags",
    # matching the default used by match() and search().
    return _cachecompile(pattern, 0).subn(repl, string, count)
def split(pattern, string, maxsplit=0):
    """Delegate to the compiled object's split(string, maxsplit).

    pattern  -- regular expression source text
    string   -- subject string
    maxsplit -- passed through untouched (defaults to 0)
    """
    # Two fixes relative to the previous version:
    #  * call split(), not subn() -- subn() expects (repl, string, count),
    #    so the old call handed it the wrong arguments entirely;
    #  * pass flags explicitly, because _cachecompile() has no default for
    #    it (0 means "no flags", as in match() and search()).
    return _cachecompile(pattern, 0).split(string, maxsplit)
......@@ -100,12 +101,16 @@ class RegexObject:
self.groupindex = groupindex
self.callouts = callouts
self.fastmap = build_fastmap(code)
if code[0].name == 'bol':
self.anchor = 1
elif code[0].name == 'begbuf':
self.anchor = 2
else:
self.anchor = 0
self.buffer = assemble(code)
def search(self, string, pos=0):
regs = reop.search(self.buffer,
......@@ -118,10 +123,12 @@ class RegexObject:
pos)
if regs is None:
return None
return MatchObject(self,
string,
pos,
regs)
def match(self, string, pos=0):
regs = reop.match(self.buffer,
self.num_regs,
......@@ -133,14 +140,18 @@ class RegexObject:
pos)
if regs is None:
return None
return MatchObject(self,
string,
pos,
regs)
def sub(self, repl, string, count=0):
    """Placeholder: substitution is not implemented; always returns None."""
    return None
def subn(self, repl, string, count=0):
    """Placeholder: counted substitution is not implemented; always returns None."""
    return None
def split(self, string, maxsplit=0):
    """Placeholder: splitting is not implemented; always returns None."""
    return None
......@@ -150,6 +161,7 @@ class MatchObject:
self.string = string
self.pos = pos
self.regs = regs
def start(self, g):
if type(g) == type(''):
try:
......@@ -157,6 +169,7 @@ class MatchObject:
except (KeyError, TypeError):
raise IndexError, ('group "' + g + '" is undefined')
return self.regs[g][0]
def end(self, g):
if type(g) == type(''):
try:
......@@ -164,6 +177,7 @@ class MatchObject:
except (KeyError, TypeError):
raise IndexError, ('group "' + g + '" is undefined')
return self.regs[g][1]
def span(self, g):
if type(g) == type(''):
try:
......@@ -171,6 +185,7 @@ class MatchObject:
except (KeyError, TypeError):
raise IndexError, ('group "' + g + '" is undefined')
return self.regs[g]
def group(self, *groups):
if len(groups) == 0:
groups = range(1, self.re.num_regs)
......@@ -339,7 +354,7 @@ class UpdateFailureJump(JumpInstruction):
JumpInstruction.__init__(self, chr(12), label)
class DummyFailureJump(JumpInstruction):
    """Jump instruction constructed with opcode character chr(13)."""

    # Instruction name. Other code in this file identifies instructions by
    # comparing their `name` string, so it must be unique to this class and
    # must not repeat the name used by UpdateFailureJump.
    name = 'dummy_failure_jump'

    def __init__(self, label):
        # Hand the opcode character and the jump target label to the base class.
        JumpInstruction.__init__(self, chr(13), label)
......@@ -764,11 +779,34 @@ def expand_escape(pattern, index, context=NORMAL):
def compile(pattern, flags=0):
stack = []
index = 0
label = 0
register = 1
groupindex = {}
callouts = []
# preprocess the pattern looking for embedded pattern modifiers
index = 0
while (index != -1):
index = string.find(pattern, '(?', index)
if index != -1:
index = index + 2
if (index < len(pattern)) and (pattern[index] in 'iImMsSxX'):
while (index < len(pattern)) and (pattern[index] != ')'):
if pattern[index] in 'iI':
flags = flags | IGNORECASE
elif pattern[index] in 'mM':
flags = flags | MULTILINE
elif pattern[index] in 'sS':
flags = flags | DOTALL
elif pattern[index] in 'xX':
flags = flags | VERBOSE
else:
raise error, 'unknown flag'
index = index + 1
index = 0
while (index < len(pattern)):
char = pattern[index]
index = index + 1
......@@ -809,12 +847,6 @@ def compile(pattern, flags=0):
raise error, 'unknown escape type'
elif char == '|':
if len(stack) == 0:
raise error, 'alternate with nothing on the left'
if stack[-1][0].name == '(':
raise error, 'alternate with nothing on the left in the group'
if stack[-1][0].name == '|':
raise error, 'alternates with nothing inbetween them'
expr = []
while (len(stack) != 0) and \
......@@ -915,17 +947,10 @@ def compile(pattern, flags=0):
'assertion is unsupported')
elif pattern[index] in 'iImMsSxX':
# ignore embedded pattern modifiers here, they
# have already been taken care of in the
# preprocessing
while (index < len(pattern)) and (pattern[index] != ')'):
if pattern[index] in 'iI':
flags = flags | IGNORECASE
elif pattern[index] in 'mM':
flags = flags | MULTILINE
elif pattern[index] in 'sS':
flags = flags | DOTALL
elif pattern[index] in 'xX':
flags = flags | VERBOSE
else:
raise error, 'unknown flag'
index = index + 1
index = index + 1
......@@ -947,13 +972,6 @@ def compile(pattern, flags=0):
if len(stack) == 0:
raise error, 'too many close parens'
if len(expr) == 0:
raise error, 'nothing inside parens'
# check to see if alternation used correctly
if (expr[-1].name == '|'):
raise error, 'alternate with nothing on the right'
# remove markers left by alternation
expr = filter(lambda x: x.name != '|', expr)
......@@ -1023,18 +1041,17 @@ def compile(pattern, flags=0):
while min > 0:
expr = expr + stack[-1]
min = min - 1
registers = registers_used(stack[-1])
if minimal:
expr = expr + \
([Jump(label + 1),
Label(label)] + \
stack[-1] + \
[Label(label + 1),
FailureJump(label, registers)])
FailureJump(label)])
else:
expr = expr + \
([Label(label),
FailureJump(label + 1, registers)] +
FailureJump(label + 1)] +
stack[-1] +
[StarJump(label),
Label(label + 1)])
......@@ -1109,7 +1126,7 @@ def compile(pattern, flags=0):
registers = registers_used(stack[-1])
if (index < len(pattern)) and (pattern[index] == '?'):
# non-greedy matching
expr = [JumpInstructions(label + 1),
expr = [Jump(label + 1),
Label(label)] + \
stack[-1] + \
[Label(label + 1),
......@@ -1130,9 +1147,10 @@ def compile(pattern, flags=0):
# positive closure
if len(stack) == 0:
raise error, '+ needs something to repeat'
if (stack[-1][0].name == '(') or (stack[-1][0].name == '|'):
raise error, '+ needs something to repeat'
registers = registers_used(stack[-1])
if (index < len(pattern)) and (pattern[index] == '?'):
# non-greedy
expr = [Label(label)] + \
......@@ -1156,7 +1174,6 @@ def compile(pattern, flags=0):
elif char == '?':
if len(stack) == 0:
raise error, 'need something to be optional'
registers = registers_used(stack[-1])
if (index < len(pattern)) and (pattern[index] == '?'):
# non-greedy matching
expr = [FailureJump(label),
......@@ -1177,7 +1194,7 @@ def compile(pattern, flags=0):
elif char == '.':
if flags & DOTALL:
stack.append(Set(map(chr, range(256))))
stack.append([Set(map(chr, range(256)))])
else:
stack.append([AnyChar()])
......@@ -1337,8 +1354,6 @@ def compile(pattern, flags=0):
del stack[-1]
if len(code) == 0:
raise error, 'no code generated'
if (code[-1].name == '|'):
raise error, 'alternate with nothing on the right'
code = filter(lambda x: x.name != '|', code)
need_label = 0
for i in range(len(code)):
......
This diff is collapsed.
......@@ -262,7 +262,7 @@ tests = [
('(*)b', '-', SYNTAX_ERROR),
('$b', 'b', FAIL),
('a\\', '-', SYNTAX_ERROR),
('a\\(b', 'a(b', SUCCEED, 'found+"-"+g1', 'a(b-'),
('a\\(b', 'a(b', SUCCEED, 'found+"-"+g1', 'a(b-Error'),
('a\\(*b', 'ab', SUCCEED, 'found', 'ab'),
('a\\(*b', 'a((b', SUCCEED, 'found', 'a((b'),
('a\\\\b', 'a\\b', SUCCEED, 'found', 'a\\b'),
......@@ -306,21 +306,22 @@ tests = [
('(ab|a)b*c', 'abc', SUCCEED, 'found+"-"+g1', 'abc-ab'),
('((a)(b)c)(d)', 'abcd', SUCCEED, 'g1+"-"+g2+"-"+g3+"-"+g4', 'abc-a-b-d'),
('[a-zA-Z_][a-zA-Z0-9_]*', 'alpha', SUCCEED, 'found', 'alpha'),
('^a(bc+|b[eh])g|.h$', 'abh', SUCCEED, 'found+"-"+g1', 'bh-'),
('(bc+d$|ef*g.|h?i(j|k))', 'effgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-'),
('^a(bc+|b[eh])g|.h$', 'abh', SUCCEED, 'found+"-"+g1', 'bh-None'),
('(bc+d$|ef*g.|h?i(j|k))', 'effgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'),
('(bc+d$|ef*g.|h?i(j|k))', 'ij', SUCCEED, 'found+"-"+g1+"-"+g2', 'ij-ij-j'),
('(bc+d$|ef*g.|h?i(j|k))', 'effg', FAIL),
('(bc+d$|ef*g.|h?i(j|k))', 'bcdd', FAIL),
('(bc+d$|ef*g.|h?i(j|k))', 'reffgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-'),
('(bc+d$|ef*g.|h?i(j|k))', 'reffgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'),
('((((((((((a))))))))))', 'a', SUCCEED, 'g10', 'a'),
('((((((((((a))))))))))\\10', 'aa', SUCCEED, 'found', 'aa'),
('((((((((((a))))))))))\\41', 'aa', FAIL),
('((((((((((a))))))))))\\41', 'a!', SUCCEED, 'found', 'a!'),
# Python does not have the same rules for \\41 so this is a syntax error
# ('((((((((((a))))))))))\\41', 'aa', FAIL),
# ('((((((((((a))))))))))\\41', 'a!', SUCCEED, 'found', 'a!'),
('(((((((((a)))))))))', 'a', SUCCEED, 'found', 'a'),
('multiple words of text', 'uh-uh', FAIL),
('multiple words', 'multiple words, yeah', SUCCEED, 'found', 'multiple words'),
('(.*)c(.*)', 'abcde', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcde-ab-de'),
('\\((.*), (.*)\\)', '(a, b)', SUCCEED, '(g2, g1)', '(b, a)'),
('\\((.*), (.*)\\)', '(a, b)', SUCCEED, 'g2+"-"+g1', 'b-a'),
('[k]', 'ab', FAIL),
##('abcd', 'abcd', SUCCEED, 'found+"-"+\\found+"-"+\\\\found', 'abcd-$&-\\abcd'),
##('a(bc)d', 'abcd', SUCCEED, 'g1+"-"+\\g1+"-"+\\\\g1', 'bc-$1-\\bc'),
......@@ -389,7 +390,7 @@ tests = [
('(?i)(*)b', '-', SYNTAX_ERROR),
('(?i)$b', 'B', FAIL),
('(?i)a\\', '-', SYNTAX_ERROR),
('(?i)a\\(b', 'A(B', SUCCEED, 'found+"-"+g1', 'A(B-'),
('(?i)a\\(b', 'A(B', SUCCEED, 'found+"-"+g1', 'A(B-Error'),
('(?i)a\\(*b', 'AB', SUCCEED, 'found', 'AB'),
('(?i)a\\(*b', 'A((B', SUCCEED, 'found', 'A((B'),
('(?i)a\\\\b', 'A\\B', SUCCEED, 'found', 'A\\B'),
......@@ -409,7 +410,7 @@ tests = [
('(?i)(a+|b){1,}', 'AB', SUCCEED, 'found+"-"+g1', 'AB-B'),
('(?i)(a+|b)?', 'AB', SUCCEED, 'found+"-"+g1', 'A-A'),
('(?i)(a+|b){0,1}', 'AB', SUCCEED, 'found+"-"+g1', 'A-A'),
('(?i)(a+|b){0,1}?', 'AB', SUCCEED, 'found+"-"+g1', '-'),
('(?i)(a+|b){0,1}?', 'AB', SUCCEED, 'found+"-"+g1', '-None'),
('(?i))(', '-', SYNTAX_ERROR),
('(?i)[^ab]*', 'CDE', SUCCEED, 'found', 'CDE'),
('(?i)abc', '', FAIL),
......@@ -436,35 +437,62 @@ tests = [
('(?i)(ab|a)b*c', 'ABC', SUCCEED, 'found+"-"+g1', 'ABC-AB'),
('(?i)((a)(b)c)(d)', 'ABCD', SUCCEED, 'g1+"-"+g2+"-"+g3+"-"+g4', 'ABC-A-B-D'),
('(?i)[a-zA-Z_][a-zA-Z0-9_]*', 'ALPHA', SUCCEED, 'found', 'ALPHA'),
('(?i)^a(bc+|b[eh])g|.h$', 'ABH', SUCCEED, 'found+"-"+g1', 'BH-'),
('(?i)(bc+d$|ef*g.|h?i(j|k))', 'EFFGZ', SUCCEED, 'found+"-"+g1+"-"+g2', 'EFFGZ-EFFGZ-'),
('(?i)^a(bc+|b[eh])g|.h$', 'ABH', SUCCEED, 'found+"-"+g1', 'BH-None'),
('(?i)(bc+d$|ef*g.|h?i(j|k))', 'EFFGZ', SUCCEED, 'found+"-"+g1+"-"+g2', 'EFFGZ-EFFGZ-None'),
('(?i)(bc+d$|ef*g.|h?i(j|k))', 'IJ', SUCCEED, 'found+"-"+g1+"-"+g2', 'IJ-IJ-J'),
('(?i)(bc+d$|ef*g.|h?i(j|k))', 'EFFG', FAIL),
('(?i)(bc+d$|ef*g.|h?i(j|k))', 'BCDD', FAIL),
('(?i)(bc+d$|ef*g.|h?i(j|k))', 'REFFGZ', SUCCEED, 'found+"-"+g1+"-"+g2', 'EFFGZ-EFFGZ-'),
('(?i)(bc+d$|ef*g.|h?i(j|k))', 'REFFGZ', SUCCEED, 'found+"-"+g1+"-"+g2', 'EFFGZ-EFFGZ-None'),
('(?i)((((((((((a))))))))))', 'A', SUCCEED, 'g10', 'A'),
('(?i)((((((((((a))))))))))\\10', 'AA', SUCCEED, 'found', 'AA'),
('(?i)((((((((((a))))))))))\\41', 'AA', FAIL),
('(?i)((((((((((a))))))))))\\41', 'A!', SUCCEED, 'found', 'A!'),
#('(?i)((((((((((a))))))))))\\41', 'AA', FAIL),
#('(?i)((((((((((a))))))))))\\41', 'A!', SUCCEED, 'found', 'A!'),
('(?i)(((((((((a)))))))))', 'A', SUCCEED, 'found', 'A'),
('(?i)(?:(?:(?:(?:(?:(?:(?:(?:(?:(a))))))))))', 'A', SUCCEED, 'g1', 'A'),
('(?i)(?:(?:(?:(?:(?:(?:(?:(?:(?:(a|b|c))))))))))', 'C', SUCCEED, 'g1', 'C'),
('(?i)multiple words of text', 'UH-UH', FAIL),
('(?i)multiple words', 'MULTIPLE WORDS, YEAH', SUCCEED, 'found', 'MULTIPLE WORDS'),
('(?i)(.*)c(.*)', 'ABCDE', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCDE-AB-DE'),
('(?i)\\((.*), (.*)\\)', '(A, B)', SUCCEED, '(g2, g1)', '(B, A)'),
('(?i)\\((.*), (.*)\\)', '(A, B)', SUCCEED, 'g2+"-"+g1', 'B-A'),
('(?i)[k]', 'AB', FAIL),
##('(?i)abcd', 'ABCD', SUCCEED, 'found+"-"+\\found+"-"+\\\\found', 'ABCD-$&-\\ABCD'),
##('(?i)a(bc)d', 'ABCD', SUCCEED, 'g1+"-"+\\g1+"-"+\\\\g1', 'BC-$1-\\BC'),
('(?i)a[-]?c', 'AC', SUCCEED, 'found', 'AC'),
('(?i)(abc)\\1', 'ABCABC', SUCCEED, 'g1', 'ABC'),
('(?i)([a-c]*)\\1', 'ABCABC', SUCCEED, 'g1', 'ABC'),
('a(?!b).', 'abad', SUCCEED, 'found', 'ad'),
('a(?=d).', 'abad', SUCCEED, 'found', 'ad'),
('a(?=c|d).', 'abad', SUCCEED, 'found', 'ad'),
# these zero-width assertions are not supported
#('a(?!b).', 'abad', SUCCEED, 'found', 'ad'),
#('a(?=d).', 'abad', SUCCEED, 'found', 'ad'),
#('a(?=c|d).', 'abad', SUCCEED, 'found', 'ad'),
('a(?:b|c|d)(.)', 'ace', SUCCEED, 'g1', 'e'),
('a(?:b|c|d)*(.)', 'ace', SUCCEED, 'g1', 'e'),
('a(?:b|c|d)+?(.)', 'ace', SUCCEED, 'g1', 'e'),
('a(?:b|(c|e){1,2}?|d)+?(.)', 'ace', SUCCEED, 'g1+"-"+g2', 'c-e'),
('a(?:b|(c|e){1,2}?|d)+?(.)', 'ace', SUCCEED, 'g1 + g2', 'ce'),
('^(.+)?B', 'AB', SUCCEED, 'g1', 'A'),
# Comments using the (?#...) syntax
('w(?# comment', 'w', SYNTAX_ERROR),
('w(?# comment 1)xy(?# comment 2)z', 'wxyz', SUCCEED, 'found', 'wxyz'),
# Comments using the x embedded pattern modifier (in an unusual place too)
("""w# comment 1
x(?x) y
# comment 2
z""", 'wxyz', SUCCEED, 'found', 'wxyz'),
# using the m embedded pattern modifier
('^abc', """jkl
abc
xyz""", FAIL),
('(?m)^abc', """jkl
abc
xyz""", SUCCEED, 'found', 'abc'),
# using the s embedded pattern modifier
('a.b', 'a\nb', FAIL),
('(?s)a.b', 'a\nb', SUCCEED, 'found', 'a\nb'),
]
#!/usr/local/bin/python
# -*- mode: python -*-
# $Id$
from test_support import verbose
import re
import reop
import sys, os, string, traceback
from re_tests import *
......@@ -7,6 +12,7 @@ if verbose: print 'Running re_tests test suite'
for t in tests:
print t
sys.stdout.flush()
pattern=s=outcome=repl=expected=None
if len(t)==5:
pattern, s, outcome, repl, expected = t
......@@ -21,6 +27,8 @@ for t in tests:
if outcome==SYNTAX_ERROR: pass # Expected a syntax error
else:
print '=== Syntax error:', t
except KeyboardInterrupt:
raise KeyboardInterrupt
except:
print '*** Unexpected error ***'
if verbose:
......@@ -28,7 +36,7 @@ for t in tests:
else:
try:
result=obj.search(s)
except regex.error, msg:
except (re.error, reop.error), msg:
print '=== Unexpected exception', t, repr(msg)
if outcome==SYNTAX_ERROR:
# This should have been a syntax error; forget it.
......@@ -41,22 +49,26 @@ for t in tests:
# Matched, as expected, so now we compute the
# result string and compare it to our expected result.
start, end = result.span(0)
vardict={'found': result.group(0), 'groups': result.group()}
vardict={'found': result.group(0),
'groups': result.group(),
'flags': result.re.flags}
for i in range(1, 100):
try:
gi = result.group(i)
# Special hack because else the string concat fails:
if gi is None: gi = "None"
if gi is None:
gi = "None"
except IndexError:
gi = "Error"
vardict['g%d' % i] = gi
for i in result.re.groupindex.keys():
try:
gi = result.group(i)
if gi is None:
gi = "None"
except IndexError:
pass
else:
vardict[i] = str(gi)
gi = "Error"
vardict[i] = gi
repl=eval(repl, vardict)
if repl!=expected:
print '=== grouping error', t,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment