Kaydet (Commit) 436c3d58 authored tarafından Fredrik Lundh's avatar Fredrik Lundh

towards 1.6b1

üst 102f3ad6
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
# #
import sre_compile import sre_compile
import sre_parse
# flags # flags
I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE
...@@ -20,6 +21,13 @@ M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE ...@@ -20,6 +21,13 @@ M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE
S = DOTALL = sre_compile.SRE_FLAG_DOTALL S = DOTALL = sre_compile.SRE_FLAG_DOTALL
X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE
# sre extensions (may or may not be in 1.6 final)
T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE
U = UNICODE = sre_compile.SRE_FLAG_UNICODE
# sre exception
error = sre_parse.error
# -------------------------------------------------------------------- # --------------------------------------------------------------------
# public interface # public interface
...@@ -46,6 +54,9 @@ def findall(pattern, string, maxsplit=0): ...@@ -46,6 +54,9 @@ def findall(pattern, string, maxsplit=0):
def compile(pattern, flags=0): def compile(pattern, flags=0):
return _compile(pattern, flags) return _compile(pattern, flags)
def template(pattern, flags=0):
return _compile(pattern, flags|T)
def escape(pattern): def escape(pattern):
s = list(pattern) s = list(pattern)
for i in range(len(pattern)): for i in range(len(pattern)):
...@@ -83,18 +94,14 @@ def _sub(pattern, template, string, count=0): ...@@ -83,18 +94,14 @@ def _sub(pattern, template, string, count=0):
# internal: pattern.sub implementation hook # internal: pattern.sub implementation hook
return _subn(pattern, template, string, count)[0] return _subn(pattern, template, string, count)[0]
def _expand(match, template):
# internal: expand template
return template # FIXME
def _subn(pattern, template, string, count=0): def _subn(pattern, template, string, count=0):
# internal: pattern.subn implementation hook # internal: pattern.subn implementation hook
if callable(template): if callable(template):
filter = template filter = template
else: else:
# FIXME: prepare template template = sre_parse.parse_template(template, pattern)
def filter(match, template=template): def filter(match, template=template):
return _expand(match, template) return sre_parse.expand_template(template, match)
n = i = 0 n = i = 0
s = [] s = []
append = s.append append = s.append
...@@ -108,6 +115,8 @@ def _subn(pattern, template, string, count=0): ...@@ -108,6 +115,8 @@ def _subn(pattern, template, string, count=0):
append(string[i:j]) append(string[i:j])
append(filter(m)) append(filter(m))
i = m.end() i = m.end()
if i <= j:
break
n = n + 1 n = n + 1
if i < len(string): if i < len(string):
append(string[i:]) append(string[i:])
...@@ -126,6 +135,8 @@ def _split(pattern, string, maxsplit=0): ...@@ -126,6 +135,8 @@ def _split(pattern, string, maxsplit=0):
j = m.start() j = m.start()
append(string[i:j]) append(string[i:j])
i = m.end() i = m.end()
if i <= j:
break
n = n + 1 n = n + 1
if i < len(string): if i < len(string):
append(string[i:]) append(string[i:])
......
...@@ -48,7 +48,7 @@ class Code: ...@@ -48,7 +48,7 @@ class Code:
print self.data print self.data
raise raise
def _compile(code, pattern, flags, level=0): def _compile(code, pattern, flags):
append = code.append append = code.append
for op, av in pattern: for op, av in pattern:
if op is ANY: if op is ANY:
...@@ -70,23 +70,26 @@ def _compile(code, pattern, flags, level=0): ...@@ -70,23 +70,26 @@ def _compile(code, pattern, flags, level=0):
tail = [] tail = []
for av in av[1]: for av in av[1]:
skip = len(code); append(0) skip = len(code); append(0)
_compile(code, av, flags, level) _compile(code, av, flags)
append(OPCODES[JUMP]) ## append(OPCODES[SUCCESS])
tail.append(len(code)); append(0) append(OPCODES[JUMP])
tail.append(len(code)); append(0)
code[skip] = len(code) - skip code[skip] = len(code) - skip
append(0) # end of branch append(0) # end of branch
for tail in tail: for tail in tail:
code[tail] = len(code) - tail code[tail] = len(code) - tail
elif op is CALL: elif op is CALL:
append(OPCODES[op]) append(OPCODES[op])
skip = len(code); append(0) skip = len(code); append(0)
_compile(code, av, flags, level+1) _compile(code, av, flags)
append(OPCODES[SUCCESS]) append(OPCODES[SUCCESS])
code[skip] = len(code) - skip code[skip] = len(code) - skip
elif op is CATEGORY: # not used by current parser elif op is CATEGORY:
append(OPCODES[op]) append(OPCODES[op])
if flags & SRE_FLAG_LOCALE: if flags & SRE_FLAG_LOCALE:
append(CH_LOCALE[CHCODES[av]]) append(CH_LOCALE[CHCODES[av]])
elif flags & SRE_FLAG_UNICODE:
append(CH_UNICODE[CHCODES[av]])
else: else:
append(CHCODES[av]) append(CHCODES[av])
elif op is GROUP: elif op is GROUP:
...@@ -98,8 +101,8 @@ def _compile(code, pattern, flags, level=0): ...@@ -98,8 +101,8 @@ def _compile(code, pattern, flags, level=0):
elif op is IN: elif op is IN:
if flags & SRE_FLAG_IGNORECASE: if flags & SRE_FLAG_IGNORECASE:
append(OPCODES[OP_IGNORE[op]]) append(OPCODES[OP_IGNORE[op]])
def fixup(literal): def fixup(literal, flags=flags):
return ord(literal.lower()) return _sre.getlower(ord(literal), flags)
else: else:
append(OPCODES[op]) append(OPCODES[op])
fixup = ord fixup = ord
...@@ -116,6 +119,8 @@ def _compile(code, pattern, flags, level=0): ...@@ -116,6 +119,8 @@ def _compile(code, pattern, flags, level=0):
elif op is CATEGORY: elif op is CATEGORY:
if flags & SRE_FLAG_LOCALE: if flags & SRE_FLAG_LOCALE:
append(CH_LOCALE[CHCODES[av]]) append(CH_LOCALE[CHCODES[av]])
elif flags & SRE_FLAG_UNICODE:
append(CH_UNICODE[CHCODES[av]])
else: else:
append(CHCODES[av]) append(CHCODES[av])
else: else:
...@@ -125,42 +130,49 @@ def _compile(code, pattern, flags, level=0): ...@@ -125,42 +130,49 @@ def _compile(code, pattern, flags, level=0):
elif op in (LITERAL, NOT_LITERAL): elif op in (LITERAL, NOT_LITERAL):
if flags & SRE_FLAG_IGNORECASE: if flags & SRE_FLAG_IGNORECASE:
append(OPCODES[OP_IGNORE[op]]) append(OPCODES[OP_IGNORE[op]])
append(ord(av.lower()))
else: else:
append(OPCODES[op]) append(OPCODES[op])
append(ord(av)) append(ord(av))
elif op is MARK: elif op is MARK:
append(OPCODES[op]) append(OPCODES[op])
append(av) append(av)
elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT): elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
lo, hi = av[2].getwidth() if flags & SRE_FLAG_TEMPLATE:
if lo == 0: append(OPCODES[REPEAT])
raise SyntaxError, "cannot repeat zero-width items"
if lo == hi == 1 and op is MAX_REPEAT:
append(OPCODES[MAX_REPEAT_ONE])
skip = len(code); append(0) skip = len(code); append(0)
append(av[0]) append(av[0])
append(av[1]) append(av[1])
_compile(code, av[2], flags, level+1) _compile(code, av[2], flags)
append(OPCODES[SUCCESS]) append(OPCODES[SUCCESS])
code[skip] = len(code) - skip code[skip] = len(code) - skip
else: else:
append(OPCODES[op]) lo, hi = av[2].getwidth()
skip = len(code); append(0) if lo == 0:
append(av[0]) raise error, "nothing to repeat"
append(av[1]) if 0 and lo == hi == 1 and op is MAX_REPEAT:
_compile(code, av[2], flags, level+1) # FIXME: <fl> need a better way to figure out when
if op is MIN_REPEAT: # it's safe to use this one (in the parser, probably)
append(OPCODES[MIN_UNTIL]) append(OPCODES[MAX_REPEAT_ONE])
skip = len(code); append(0)
append(av[0])
append(av[1])
_compile(code, av[2], flags)
append(OPCODES[SUCCESS])
code[skip] = len(code) - skip
else: else:
append(OPCODES[MAX_UNTIL]) append(OPCODES[op])
code[skip] = len(code) - skip skip = len(code); append(0)
append(av[0])
append(av[1])
_compile(code, av[2], flags)
append(OPCODES[SUCCESS])
code[skip] = len(code) - skip
elif op is SUBPATTERN: elif op is SUBPATTERN:
group = av[0] group = av[0]
if group: if group:
append(OPCODES[MARK]) append(OPCODES[MARK])
append((group-1)*2) append((group-1)*2)
_compile(code, av[1], flags, level+1) _compile(code, av[1], flags)
if group: if group:
append(OPCODES[MARK]) append(OPCODES[MARK])
append((group-1)*2+1) append((group-1)*2+1)
......
...@@ -15,6 +15,11 @@ ...@@ -15,6 +15,11 @@
# other compatibility work. # other compatibility work.
# #
# should this really be here?
class error(Exception):
pass
# operators # operators
FAILURE = "failure" FAILURE = "failure"
...@@ -30,20 +35,20 @@ GROUP = "group" ...@@ -30,20 +35,20 @@ GROUP = "group"
GROUP_IGNORE = "group_ignore" GROUP_IGNORE = "group_ignore"
IN = "in" IN = "in"
IN_IGNORE = "in_ignore" IN_IGNORE = "in_ignore"
INFO = "info"
JUMP = "jump" JUMP = "jump"
LITERAL = "literal" LITERAL = "literal"
LITERAL_IGNORE = "literal_ignore" LITERAL_IGNORE = "literal_ignore"
MARK = "mark" MARK = "mark"
MAX_REPEAT = "max_repeat" MAX_REPEAT = "max_repeat"
MAX_REPEAT_ONE = "max_repeat_one" MAX_REPEAT_ONE = "max_repeat_one"
MAX_UNTIL = "max_until"
MIN_REPEAT = "min_repeat" MIN_REPEAT = "min_repeat"
MIN_UNTIL = "min_until"
NEGATE = "negate" NEGATE = "negate"
NOT_LITERAL = "not_literal" NOT_LITERAL = "not_literal"
NOT_LITERAL_IGNORE = "not_literal_ignore" NOT_LITERAL_IGNORE = "not_literal_ignore"
RANGE = "range" RANGE = "range"
REPEAT = "repeat" REPEAT = "repeat"
REPEAT_ONE = "repeat_one"
SUBPATTERN = "subpattern" SUBPATTERN = "subpattern"
# positions # positions
...@@ -63,14 +68,16 @@ CATEGORY_WORD = "category_word" ...@@ -63,14 +68,16 @@ CATEGORY_WORD = "category_word"
CATEGORY_NOT_WORD = "category_not_word" CATEGORY_NOT_WORD = "category_not_word"
CATEGORY_LINEBREAK = "category_linebreak" CATEGORY_LINEBREAK = "category_linebreak"
CATEGORY_NOT_LINEBREAK = "category_not_linebreak" CATEGORY_NOT_LINEBREAK = "category_not_linebreak"
CATEGORY_LOC_DIGIT = "category_loc_digit"
CATEGORY_LOC_NOT_DIGIT = "category_loc_not_digit"
CATEGORY_LOC_SPACE = "category_loc_space"
CATEGORY_LOC_NOT_SPACE = "category_loc_not_space"
CATEGORY_LOC_WORD = "category_loc_word" CATEGORY_LOC_WORD = "category_loc_word"
CATEGORY_LOC_NOT_WORD = "category_loc_not_word" CATEGORY_LOC_NOT_WORD = "category_loc_not_word"
CATEGORY_LOC_LINEBREAK = "category_loc_linebreak" CATEGORY_UNI_DIGIT = "category_uni_digit"
CATEGORY_LOC_NOT_LINEBREAK = "category_loc_not_linebreak" CATEGORY_UNI_NOT_DIGIT = "category_uni_not_digit"
CATEGORY_UNI_SPACE = "category_uni_space"
CATEGORY_UNI_NOT_SPACE = "category_uni_not_space"
CATEGORY_UNI_WORD = "category_uni_word"
CATEGORY_UNI_NOT_WORD = "category_uni_not_word"
CATEGORY_UNI_LINEBREAK = "category_uni_linebreak"
CATEGORY_UNI_NOT_LINEBREAK = "category_uni_not_linebreak"
OPCODES = [ OPCODES = [
...@@ -85,12 +92,13 @@ OPCODES = [ ...@@ -85,12 +92,13 @@ OPCODES = [
CATEGORY, CATEGORY,
GROUP, GROUP_IGNORE, GROUP, GROUP_IGNORE,
IN, IN_IGNORE, IN, IN_IGNORE,
INFO,
JUMP, JUMP,
LITERAL, LITERAL_IGNORE, LITERAL, LITERAL_IGNORE,
MARK, MARK,
MAX_REPEAT, MAX_UNTIL, MAX_REPEAT,
MAX_REPEAT_ONE, MAX_REPEAT_ONE,
MIN_REPEAT, MIN_UNTIL, MIN_REPEAT,
NOT_LITERAL, NOT_LITERAL_IGNORE, NOT_LITERAL, NOT_LITERAL_IGNORE,
NEGATE, NEGATE,
RANGE, RANGE,
...@@ -106,10 +114,11 @@ ATCODES = [ ...@@ -106,10 +114,11 @@ ATCODES = [
CHCODES = [ CHCODES = [
CATEGORY_DIGIT, CATEGORY_NOT_DIGIT, CATEGORY_SPACE, CATEGORY_DIGIT, CATEGORY_NOT_DIGIT, CATEGORY_SPACE,
CATEGORY_NOT_SPACE, CATEGORY_WORD, CATEGORY_NOT_WORD, CATEGORY_NOT_SPACE, CATEGORY_WORD, CATEGORY_NOT_WORD,
CATEGORY_LINEBREAK, CATEGORY_NOT_LINEBREAK, CATEGORY_LOC_DIGIT, CATEGORY_LINEBREAK, CATEGORY_NOT_LINEBREAK, CATEGORY_LOC_WORD,
CATEGORY_LOC_NOT_DIGIT, CATEGORY_LOC_SPACE, CATEGORY_LOC_NOT_WORD, CATEGORY_UNI_DIGIT, CATEGORY_UNI_NOT_DIGIT,
CATEGORY_LOC_NOT_SPACE, CATEGORY_LOC_WORD, CATEGORY_LOC_NOT_WORD, CATEGORY_UNI_SPACE, CATEGORY_UNI_NOT_SPACE, CATEGORY_UNI_WORD,
CATEGORY_LOC_LINEBREAK, CATEGORY_LOC_NOT_LINEBREAK CATEGORY_UNI_NOT_WORD, CATEGORY_UNI_LINEBREAK,
CATEGORY_UNI_NOT_LINEBREAK
] ]
def makedict(list): def makedict(list):
...@@ -138,23 +147,35 @@ AT_MULTILINE = { ...@@ -138,23 +147,35 @@ AT_MULTILINE = {
} }
CH_LOCALE = { CH_LOCALE = {
CATEGORY_DIGIT: CATEGORY_LOC_DIGIT, CATEGORY_DIGIT: CATEGORY_DIGIT,
CATEGORY_NOT_DIGIT: CATEGORY_LOC_NOT_DIGIT, CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT,
CATEGORY_SPACE: CATEGORY_LOC_SPACE, CATEGORY_SPACE: CATEGORY_SPACE,
CATEGORY_NOT_SPACE: CATEGORY_LOC_NOT_SPACE, CATEGORY_NOT_SPACE: CATEGORY_NOT_SPACE,
CATEGORY_WORD: CATEGORY_LOC_WORD, CATEGORY_WORD: CATEGORY_LOC_WORD,
CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD, CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD,
CATEGORY_LINEBREAK: CATEGORY_LOC_LINEBREAK, CATEGORY_LINEBREAK: CATEGORY_LINEBREAK,
CATEGORY_NOT_LINEBREAK: CATEGORY_LOC_NOT_LINEBREAK CATEGORY_NOT_LINEBREAK: CATEGORY_NOT_LINEBREAK
}
CH_UNICODE = {
CATEGORY_DIGIT: CATEGORY_UNI_DIGIT,
CATEGORY_NOT_DIGIT: CATEGORY_UNI_NOT_DIGIT,
CATEGORY_SPACE: CATEGORY_UNI_SPACE,
CATEGORY_NOT_SPACE: CATEGORY_UNI_NOT_SPACE,
CATEGORY_WORD: CATEGORY_UNI_WORD,
CATEGORY_NOT_WORD: CATEGORY_UNI_NOT_WORD,
CATEGORY_LINEBREAK: CATEGORY_UNI_LINEBREAK,
CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK
} }
# flags # flags
SRE_FLAG_TEMPLATE = 1 # NYI SRE_FLAG_TEMPLATE = 1
SRE_FLAG_IGNORECASE = 2 SRE_FLAG_IGNORECASE = 2
SRE_FLAG_LOCALE = 4 SRE_FLAG_LOCALE = 4
SRE_FLAG_MULTILINE = 8 SRE_FLAG_MULTILINE = 8
SRE_FLAG_DOTALL = 16 SRE_FLAG_DOTALL = 16
SRE_FLAG_VERBOSE = 32 SRE_FLAG_UNICODE = 32
SRE_FLAG_VERBOSE = 64
if __name__ == "__main__": if __name__ == "__main__":
import string import string
...@@ -168,5 +189,12 @@ if __name__ == "__main__": ...@@ -168,5 +189,12 @@ if __name__ == "__main__":
dump(f, OPCODES, "SRE_OP") dump(f, OPCODES, "SRE_OP")
dump(f, ATCODES, "SRE") dump(f, ATCODES, "SRE")
dump(f, CHCODES, "SRE") dump(f, CHCODES, "SRE")
f.write("#define SRE_FLAG_TEMPLATE %d\n" % SRE_FLAG_TEMPLATE)
f.write("#define SRE_FLAG_IGNORECASE %d\n" % SRE_FLAG_IGNORECASE)
f.write("#define SRE_FLAG_LOCALE %d\n" % SRE_FLAG_LOCALE)
f.write("#define SRE_FLAG_MULTILINE %d\n" % SRE_FLAG_MULTILINE)
f.write("#define SRE_FLAG_DOTALL %d\n" % SRE_FLAG_DOTALL)
f.write("#define SRE_FLAG_UNICODE %d\n" % SRE_FLAG_UNICODE)
f.write("#define SRE_FLAG_VERBOSE %d\n" % SRE_FLAG_VERBOSE)
f.close() f.close()
print "done" print "done"
...@@ -20,14 +20,15 @@ import _sre ...@@ -20,14 +20,15 @@ import _sre
from sre_constants import * from sre_constants import *
# FIXME: should be 65535, but the array module currently chokes on # FIXME: <fl> should be 65535, but the array module currently chokes
# unsigned integers larger than 32767... # on unsigned integers larger than 32767 [fixed in 1.6b1?]
MAXREPEAT = int(2L**(_sre.getcodesize()*8-1))-1 MAXREPEAT = int(2L**(_sre.getcodesize()*8-1))-1
SPECIAL_CHARS = ".\\[{()*+?^$|" SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{" REPEAT_CHARS = "*+?{"
# FIXME: string in tuple tests may explode with if char is unicode :-( # FIXME: <fl> string in tuple tests may explode with if char is
# unicode [fixed in 1.6b1?]
DIGITS = tuple(string.digits) DIGITS = tuple(string.digits)
OCTDIGITS = tuple("01234567") OCTDIGITS = tuple("01234567")
...@@ -59,12 +60,15 @@ CATEGORIES = { ...@@ -59,12 +60,15 @@ CATEGORIES = {
} }
FLAGS = { FLAGS = {
# standard flags
"i": SRE_FLAG_IGNORECASE, "i": SRE_FLAG_IGNORECASE,
"L": SRE_FLAG_LOCALE, "L": SRE_FLAG_LOCALE,
"m": SRE_FLAG_MULTILINE, "m": SRE_FLAG_MULTILINE,
"s": SRE_FLAG_DOTALL, "s": SRE_FLAG_DOTALL,
"t": SRE_FLAG_TEMPLATE,
"x": SRE_FLAG_VERBOSE, "x": SRE_FLAG_VERBOSE,
# extensions
"t": SRE_FLAG_TEMPLATE,
"u": SRE_FLAG_UNICODE,
} }
class State: class State:
...@@ -151,7 +155,7 @@ class Tokenizer: ...@@ -151,7 +155,7 @@ class Tokenizer:
try: try:
c = self.string[self.index + 1] c = self.string[self.index + 1]
except IndexError: except IndexError:
raise SyntaxError, "bogus escape" raise error, "bogus escape"
char = char + c char = char + c
self.index = self.index + len(char) self.index = self.index + len(char)
return char return char
...@@ -205,7 +209,7 @@ def _class_escape(source, escape): ...@@ -205,7 +209,7 @@ def _class_escape(source, escape):
return LITERAL, escape[1] return LITERAL, escape[1]
except ValueError: except ValueError:
pass pass
raise SyntaxError, "bogus escape: %s" % repr(escape) raise error, "bogus escape: %s" % repr(escape)
def _escape(source, escape, state): def _escape(source, escape, state):
# handle escape code in expression # handle escape code in expression
...@@ -241,13 +245,12 @@ def _escape(source, escape, state): ...@@ -241,13 +245,12 @@ def _escape(source, escape, state):
return LITERAL, escape[1] return LITERAL, escape[1]
except ValueError: except ValueError:
pass pass
raise SyntaxError, "bogus escape: %s" % repr(escape) raise error, "bogus escape: %s" % repr(escape)
def _branch(pattern, items): def _branch(pattern, items):
# form a branch operator from a set of items (FIXME: move this # form a branch operator from a set of items
# optimization to the compiler module!)
subpattern = SubPattern(pattern) subpattern = SubPattern(pattern)
...@@ -332,7 +335,7 @@ def _parse(source, state, flags=0): ...@@ -332,7 +335,7 @@ def _parse(source, state, flags=0):
elif this: elif this:
code1 = LITERAL, this code1 = LITERAL, this
else: else:
raise SyntaxError, "unexpected end of regular expression" raise error, "unexpected end of regular expression"
if source.match("-"): if source.match("-"):
# potential range # potential range
this = source.get() this = source.get()
...@@ -346,9 +349,9 @@ def _parse(source, state, flags=0): ...@@ -346,9 +349,9 @@ def _parse(source, state, flags=0):
else: else:
code2 = LITERAL, this code2 = LITERAL, this
if code1[0] != LITERAL or code2[0] != LITERAL: if code1[0] != LITERAL or code2[0] != LITERAL:
raise SyntaxError, "illegal range" raise error, "illegal range"
if len(code1[1]) != 1 or len(code2[1]) != 1: if len(code1[1]) != 1 or len(code2[1]) != 1:
raise SyntaxError, "illegal range" raise error, "illegal range"
set.append((RANGE, (code1[1], code2[1]))) set.append((RANGE, (code1[1], code2[1])))
else: else:
if code1[0] is IN: if code1[0] is IN:
...@@ -383,19 +386,19 @@ def _parse(source, state, flags=0): ...@@ -383,19 +386,19 @@ def _parse(source, state, flags=0):
else: else:
hi = lo hi = lo
if not source.match("}"): if not source.match("}"):
raise SyntaxError, "bogus range" raise error, "bogus range"
if lo: if lo:
min = int(lo) min = int(lo)
if hi: if hi:
max = int(hi) max = int(hi)
# FIXME: <fl> check that hi >= lo! # FIXME: <fl> check that hi >= lo!
else: else:
raise SyntaxError, "not supported" raise error, "not supported"
# figure out which item to repeat # figure out which item to repeat
if subpattern: if subpattern:
item = subpattern[-1:] item = subpattern[-1:]
else: else:
raise SyntaxError, "nothing to repeat" raise error, "nothing to repeat"
if source.match("?"): if source.match("?"):
subpattern[-1] = (MIN_REPEAT, (min, max, item)) subpattern[-1] = (MIN_REPEAT, (min, max, item))
else: else:
...@@ -418,7 +421,7 @@ def _parse(source, state, flags=0): ...@@ -418,7 +421,7 @@ def _parse(source, state, flags=0):
while 1: while 1:
char = source.get() char = source.get()
if char is None: if char is None:
raise SyntaxError, "unterminated name" raise error, "unterminated name"
if char == ">": if char == ">":
break break
# FIXME: check for valid character # FIXME: check for valid character
...@@ -426,22 +429,21 @@ def _parse(source, state, flags=0): ...@@ -426,22 +429,21 @@ def _parse(source, state, flags=0):
group = 1 group = 1
elif source.match("="): elif source.match("="):
# named backreference # named backreference
raise SyntaxError, "not yet implemented" raise error, "not yet implemented"
else: else:
char = source.get() char = source.get()
if char is None: if char is None:
raise SyntaxError, "unexpected end of pattern" raise error, "unexpected end of pattern"
raise SyntaxError, "unknown specifier: ?P%s" % char raise error, "unknown specifier: ?P%s" % char
elif source.match(":"): elif source.match(":"):
# non-capturing group # non-capturing group
group = 2 group = 2
elif source.match("#"): elif source.match("#"):
# comment # comment
while 1: while 1:
char = source.get() if source.next is None or source.next == ")":
if char is None or char == ")":
break break
source.get()
else: else:
# flags # flags
while FLAGS.has_key(source.next): while FLAGS.has_key(source.next):
...@@ -465,13 +467,13 @@ def _parse(source, state, flags=0): ...@@ -465,13 +467,13 @@ def _parse(source, state, flags=0):
elif source.match("|"): elif source.match("|"):
b.append(p) b.append(p)
else: else:
raise SyntaxError, "group not properly closed" raise error, "group not properly closed"
else: else:
while 1: while 1:
char = source.get() char = source.get()
if char is None or char == ")": if char is None or char == ")":
break break
# FIXME: skip characters? raise error, "unknown extension"
elif this == "^": elif this == "^":
subpattern.append((AT, AT_BEGINNING)) subpattern.append((AT, AT_BEGINNING))
...@@ -484,7 +486,7 @@ def _parse(source, state, flags=0): ...@@ -484,7 +486,7 @@ def _parse(source, state, flags=0):
subpattern.append(code) subpattern.append(code)
else: else:
raise SyntaxError, "parser error" raise error, "parser error"
return subpattern return subpattern
...@@ -499,17 +501,17 @@ def parse(pattern, flags=0): ...@@ -499,17 +501,17 @@ def parse(pattern, flags=0):
if tail == "|": if tail == "|":
b.append(p) b.append(p)
elif tail == ")": elif tail == ")":
raise SyntaxError, "unbalanced parenthesis" raise error, "unbalanced parenthesis"
elif tail is None: elif tail is None:
if b: if b:
b.append(p) b.append(p)
p = _branch(state, b) p = _branch(state, b)
break break
else: else:
raise SyntaxError, "bogus characters at end of regular expression" raise error, "bogus characters at end of regular expression"
return p return p
def parse_replacement(source, pattern): def parse_template(source, pattern):
# parse 're' replacement string into list of literals and # parse 're' replacement string into list of literals and
# group references # group references
s = Tokenizer(source) s = Tokenizer(source)
...@@ -520,15 +522,56 @@ def parse_replacement(source, pattern): ...@@ -520,15 +522,56 @@ def parse_replacement(source, pattern):
if this is None: if this is None:
break # end of replacement string break # end of replacement string
if this and this[0] == "\\": if this and this[0] == "\\":
try: if this == "\\g":
a(LITERAL, ESCAPES[this]) name = ""
except KeyError: if s.match("<"):
for char in this: while 1:
a(LITERAL, char) char = s.get()
if char is None:
raise error, "unterminated index"
if char == ">":
break
# FIXME: check for valid character
name = name + char
if not name:
raise error, "bad index"
try:
index = int(name)
except ValueError:
try:
index = pattern.groupindex[name]
except KeyError:
raise IndexError, "unknown index"
a((MARK, index))
elif len(this) > 1 and this[1] in DIGITS:
while s.next in DIGITS:
this = this + s.get()
a((MARK, int(this[1:])))
else:
try:
a(ESCAPES[this])
except KeyError:
for char in this:
a((LITERAL, char))
else: else:
a(LITERAL, this) a((LITERAL, this))
return p return p
def expand_template(template, match):
# FIXME: <fl> this is sooooo slow. drop in the slicelist
# code instead
p = []
a = p.append
for c, s in template:
if c is LITERAL:
a(s)
elif c is MARK:
s = match.group(s)
if s is None:
raise error, "empty group"
a(s)
return match.string[:0].join(p)
if __name__ == "__main__": if __name__ == "__main__":
from pprint import pprint from pprint import pprint
from testpatterns import PATTERNS from testpatterns import PATTERNS
...@@ -548,7 +591,7 @@ if __name__ == "__main__": ...@@ -548,7 +591,7 @@ if __name__ == "__main__":
except: except:
pass pass
a = a + 1 a = a + 1
except SyntaxError, v: except error, v:
print "**", repr(pattern), v print "**", repr(pattern), v
b = b + 1 b = b + 1
print "-"*68 print "-"*68
......
...@@ -3,19 +3,22 @@ ...@@ -3,19 +3,22 @@
* Secret Labs' Regular Expression Engine * Secret Labs' Regular Expression Engine
* $Id$ * $Id$
* *
* simple regular expression matching engine n * simple regular expression matching engine
* *
* partial history: * partial history:
* 99-10-24 fl created (based on the template matcher) * 99-10-24 fl created (based on existing template matcher code)
* 99-11-13 fl added categories, branching, and more (0.2) * 99-11-13 fl added categories, branching, and more (0.2)
* 99-11-16 fl some tweaks to compile on non-Windows platforms * 99-11-16 fl some tweaks to compile on non-Windows platforms
* 99-12-18 fl non-literals, generic maximizing repeat (0.3) * 99-12-18 fl non-literals, generic maximizing repeat (0.3)
* 99-02-28 fl tons of changes (not all to the better ;-) (0.4) * 00-02-28 fl tons of changes (not all to the better ;-) (0.4)
* 99-03-06 fl first alpha, sort of (0.5) * 00-03-06 fl first alpha, sort of (0.5)
* 99-03-14 fl removed most compatibility stuff (0.6) * 00-03-14 fl removed most compatibility stuff (0.6)
* 99-05-10 fl towards third alpha (0.8.2) * 00-05-10 fl towards third alpha (0.8.2)
* 99-05-13 fl added experimental cursor stuff (0.8.3) * 00-05-13 fl added experimental cursor stuff (0.8.3)
* 99-05-27 fl final bug hunt (0.8.4) * 00-05-27 fl final bug hunt (0.8.4)
* 00-06-21 fl less bugs, more taste (0.8.5)
* 00-06-25 fl major changes to better deal with nested repeats (0.9)
* 00-06-28 fl fixed findall (0.9.1)
* *
* Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
* *
...@@ -27,16 +30,21 @@ ...@@ -27,16 +30,21 @@
* other compatibility work. * other compatibility work.
*/ */
/*
* FIXME: repeated groups don't work (they're usually come out empty)
* FIXME: rename to 're'
* FIXME: enable repeat_one optimization
*/
#ifndef SRE_RECURSIVE #ifndef SRE_RECURSIVE
char copyright[] = " SRE 0.8.4 Copyright (c) 1997-2000 by Secret Labs AB "; static char
copyright[] = " SRE 0.9.1 Copyright (c) 1997-2000 by Secret Labs AB ";
#include "Python.h" #include "Python.h"
#include "sre.h" #include "sre.h"
#include "unicodeobject.h"
#if defined(HAVE_LIMITS_H) #if defined(HAVE_LIMITS_H)
#include <limits.h> #include <limits.h>
#else #else
...@@ -45,10 +53,18 @@ char copyright[] = " SRE 0.8.4 Copyright (c) 1997-2000 by Secret Labs AB "; ...@@ -45,10 +53,18 @@ char copyright[] = " SRE 0.8.4 Copyright (c) 1997-2000 by Secret Labs AB ";
#include <ctype.h> #include <ctype.h>
/* name of this module, minus the leading underscore */
#define MODULE "sre"
/* defining this one enables tracing */ /* defining this one enables tracing */
#undef DEBUG #undef DEBUG
#ifdef WIN32 /* FIXME: <fl> don't assume Windows == MSVC */ #if PY_VERSION_HEX >= 0x01060000
/* defining this enables unicode support (default under 1.6) */
#define HAVE_UNICODE
#endif
#if defined(WIN32) /* FIXME: <fl> don't assume Windows == MSVC */
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */ #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
/* fastest possible local call under MSVC */ /* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall #define LOCAL(type) static __inline type __fastcall
...@@ -60,39 +76,91 @@ char copyright[] = " SRE 0.8.4 Copyright (c) 1997-2000 by Secret Labs AB "; ...@@ -60,39 +76,91 @@ char copyright[] = " SRE 0.8.4 Copyright (c) 1997-2000 by Secret Labs AB ";
#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */ #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
#define SRE_ERROR_MEMORY -9 /* out of memory */ #define SRE_ERROR_MEMORY -9 /* out of memory */
#ifdef DEBUG #if defined(DEBUG)
#define TRACE(v) printf v #define TRACE(v) printf v
#else #else
#define TRACE(v) #define TRACE(v)
#endif #endif
#define PTR(ptr) ((SRE_CHAR*) (ptr) - (SRE_CHAR*) state->beginning)
#define SRE_CODE unsigned short /* unsigned short or larger */ #define PTR(ptr) ((SRE_CHAR*) (ptr) - (SRE_CHAR*) state->beginning)
/* -------------------------------------------------------------------- */ /* -------------------------------------------------------------------- */
/* search engine state */ /* search engine state */
/* unicode character predicates */ /* default character predicates (run sre_chars.py to regenerate tables) */
#define SRE_TO_LOWER(ch) Py_UNICODE_TOLOWER((Py_UNICODE)(ch))
#define SRE_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch)) #define SRE_DIGIT_MASK 1
#define SRE_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch)) #define SRE_SPACE_MASK 2
#define SRE_IS_LINEBREAK(ch) ((ch) == '\n') #define SRE_LINEBREAK_MASK 4
/* #define SRE_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch)) */ #define SRE_ALNUM_MASK 8
#define SRE_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0) #define SRE_WORD_MASK 16
#define SRE_IS_WORD(ch) (SRE_IS_ALNUM((ch)) || (ch) == '_')
static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
static char sre_char_tolower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
120, 121, 122, 123, 124, 125, 126, 127 };
static unsigned int sre_tolower(unsigned int ch)
{
return ((ch) < 128 ? sre_char_tolower[ch] : ch);
}
#define SRE_IS_DIGIT(ch)\
((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
#define SRE_IS_SPACE(ch)\
((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
#define SRE_IS_LINEBREAK(ch)\
((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
#define SRE_IS_ALNUM(ch)\
((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
#define SRE_IS_WORD(ch)\
((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
/* locale-specific character predicates */ /* locale-specific character predicates */
#define SRE_LOC_TO_LOWER(ch) ((ch) < 256 ? tolower((ch)) : ch)
static unsigned int sre_tolower_locale(unsigned int ch)
{
return ((ch) < 256 ? tolower((ch)) : ch);
}
#define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0) #define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
#define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0) #define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n') #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0) #define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_') #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
/* unicode-specific character predicates */
#if defined(HAVE_UNICODE)
static unsigned int sre_tolower_unicode(unsigned int ch)
{
return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
}
#define SRE_UNI_TO_LOWER(ch) Py_UNICODE_TOLOWER((Py_UNICODE)(ch))
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
#define SRE_UNI_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
#define SRE_UNI_IS_WORD(ch) (SRE_IS_ALNUM((ch)) || (ch) == '_')
#endif
LOCAL(int) LOCAL(int)
sre_category(SRE_CODE category, unsigned int ch) sre_category(SRE_CODE category, unsigned int ch)
{ {
switch (category) { switch (category) {
case SRE_CATEGORY_DIGIT: case SRE_CATEGORY_DIGIT:
return SRE_IS_DIGIT(ch); return SRE_IS_DIGIT(ch);
case SRE_CATEGORY_NOT_DIGIT: case SRE_CATEGORY_NOT_DIGIT:
...@@ -109,22 +177,30 @@ sre_category(SRE_CODE category, unsigned int ch) ...@@ -109,22 +177,30 @@ sre_category(SRE_CODE category, unsigned int ch)
return SRE_IS_LINEBREAK(ch); return SRE_IS_LINEBREAK(ch);
case SRE_CATEGORY_NOT_LINEBREAK: case SRE_CATEGORY_NOT_LINEBREAK:
return !SRE_IS_LINEBREAK(ch); return !SRE_IS_LINEBREAK(ch);
case SRE_CATEGORY_LOC_DIGIT:
return SRE_LOC_IS_DIGIT(ch);
case SRE_CATEGORY_LOC_NOT_DIGIT:
return !SRE_LOC_IS_DIGIT(ch);
case SRE_CATEGORY_LOC_SPACE:
return SRE_LOC_IS_SPACE(ch);
case SRE_CATEGORY_LOC_NOT_SPACE:
return !SRE_LOC_IS_SPACE(ch);
case SRE_CATEGORY_LOC_WORD: case SRE_CATEGORY_LOC_WORD:
return SRE_LOC_IS_WORD(ch); return SRE_LOC_IS_WORD(ch);
case SRE_CATEGORY_LOC_NOT_WORD: case SRE_CATEGORY_LOC_NOT_WORD:
return !SRE_LOC_IS_WORD(ch); return !SRE_LOC_IS_WORD(ch);
case SRE_CATEGORY_LOC_LINEBREAK:
return SRE_LOC_IS_LINEBREAK(ch); #if defined(HAVE_UNICODE)
case SRE_CATEGORY_LOC_NOT_LINEBREAK: case SRE_CATEGORY_UNI_DIGIT:
return !SRE_LOC_IS_LINEBREAK(ch); return SRE_UNI_IS_DIGIT(ch);
case SRE_CATEGORY_UNI_NOT_DIGIT:
return !SRE_UNI_IS_DIGIT(ch);
case SRE_CATEGORY_UNI_SPACE:
return SRE_UNI_IS_SPACE(ch);
case SRE_CATEGORY_UNI_NOT_SPACE:
return !SRE_UNI_IS_SPACE(ch);
case SRE_CATEGORY_UNI_WORD:
return SRE_UNI_IS_WORD(ch);
case SRE_CATEGORY_UNI_NOT_WORD:
return !SRE_UNI_IS_WORD(ch);
case SRE_CATEGORY_UNI_LINEBREAK:
return SRE_UNI_IS_LINEBREAK(ch);
case SRE_CATEGORY_UNI_NOT_LINEBREAK:
return !SRE_UNI_IS_LINEBREAK(ch);
#endif
} }
return 0; return 0;
} }
...@@ -146,7 +222,7 @@ _stack_free(SRE_STATE* state) ...@@ -146,7 +222,7 @@ _stack_free(SRE_STATE* state)
static int /* shouldn't be LOCAL */ static int /* shouldn't be LOCAL */
_stack_extend(SRE_STATE* state, int lo, int hi) _stack_extend(SRE_STATE* state, int lo, int hi)
{ {
void** stack; SRE_STACK* stack;
int stacksize; int stacksize;
/* grow the stack to a suitable size; we need at least lo entries, /* grow the stack to a suitable size; we need at least lo entries,
...@@ -163,7 +239,7 @@ _stack_extend(SRE_STATE* state, int lo, int hi) ...@@ -163,7 +239,7 @@ _stack_extend(SRE_STATE* state, int lo, int hi)
else if (stacksize > hi) else if (stacksize > hi)
stacksize = hi; stacksize = hi;
TRACE(("allocate stack %d\n", stacksize)); TRACE(("allocate stack %d\n", stacksize));
stack = malloc(sizeof(void*) * stacksize); stack = malloc(sizeof(SRE_STACK) * stacksize);
} else { } else {
/* grow the stack (typically by a factor of two) */ /* grow the stack (typically by a factor of two) */
while (stacksize < lo) while (stacksize < lo)
...@@ -171,7 +247,7 @@ _stack_extend(SRE_STATE* state, int lo, int hi) ...@@ -171,7 +247,7 @@ _stack_extend(SRE_STATE* state, int lo, int hi)
/* FIXME: <fl> could trim size if it's larger than lo, and /* FIXME: <fl> could trim size if it's larger than lo, and
much larger than hi */ much larger than hi */
TRACE(("grow stack to %d\n", stacksize)); TRACE(("grow stack to %d\n", stacksize));
stack = realloc(state->stack, sizeof(void*) * stacksize); stack = realloc(state->stack, sizeof(SRE_STACK) * stacksize);
} }
if (!stack) { if (!stack) {
...@@ -192,11 +268,13 @@ _stack_extend(SRE_STATE* state, int lo, int hi) ...@@ -192,11 +268,13 @@ _stack_extend(SRE_STATE* state, int lo, int hi)
#define SRE_MEMBER sre_member #define SRE_MEMBER sre_member
#define SRE_MATCH sre_match #define SRE_MATCH sre_match
#define SRE_SEARCH sre_search #define SRE_SEARCH sre_search
#define SRE_RECURSIVE
#include "_sre.c" #if defined(HAVE_UNICODE)
#define SRE_RECURSIVE
#include "_sre.c"
#undef SRE_RECURSIVE #undef SRE_RECURSIVE
#undef SRE_SEARCH #undef SRE_SEARCH
#undef SRE_MATCH #undef SRE_MATCH
#undef SRE_MEMBER #undef SRE_MEMBER
...@@ -210,6 +288,7 @@ _stack_extend(SRE_STATE* state, int lo, int hi) ...@@ -210,6 +288,7 @@ _stack_extend(SRE_STATE* state, int lo, int hi)
#define SRE_MEMBER sre_umember #define SRE_MEMBER sre_umember
#define SRE_MATCH sre_umatch #define SRE_MATCH sre_umatch
#define SRE_SEARCH sre_usearch #define SRE_SEARCH sre_usearch
#endif
#endif /* SRE_RECURSIVE */ #endif /* SRE_RECURSIVE */
...@@ -308,13 +387,21 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -308,13 +387,21 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
SRE_CHAR* end = state->end; SRE_CHAR* end = state->end;
SRE_CHAR* ptr = state->ptr; SRE_CHAR* ptr = state->ptr;
int stacksize; int stack;
int stackbase; int stackbase;
int lastmark;
int i, count; int i, count;
/* FIXME: this is one ugly hack */ /* FIXME: this is a hack! */
void* *mark = NULL; void* mark_copy[64];
void* mark_data[64]; void* mark = NULL;
TRACE(("%8d: enter\n", PTR(ptr)));
stackbase = stack = state->stackbase;
lastmark = state->lastmark;
retry:
for (;;) { for (;;) {
...@@ -334,7 +421,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -334,7 +421,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
case SRE_OP_AT: case SRE_OP_AT:
/* match at given position */ /* match at given position */
/* args: <at> */ /* args: <at> */
TRACE(("%8d: match at \\%c\n", PTR(ptr), *pattern)); TRACE(("%8d: position %d\n", PTR(ptr), *pattern));
if (!SRE_AT(state, ptr, *pattern)) if (!SRE_AT(state, ptr, *pattern))
goto failure; goto failure;
pattern++; pattern++;
...@@ -343,18 +430,20 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -343,18 +430,20 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
case SRE_OP_CATEGORY: case SRE_OP_CATEGORY:
/* match at given category */ /* match at given category */
/* args: <category> */ /* args: <category> */
TRACE(("%8d: category match at \\%c\n", PTR(ptr), *pattern)); TRACE(("%8d: category %d [category %d]\n", PTR(ptr),
*ptr, *pattern));
if (ptr >= end || !sre_category(pattern[0], ptr[0])) if (ptr >= end || !sre_category(pattern[0], ptr[0]))
goto failure; goto failure;
TRACE(("%8d: category ok\n", PTR(ptr)));
pattern++; pattern++;
ptr++; ptr++;
break; break;
case SRE_OP_LITERAL: case SRE_OP_LITERAL:
/* match literal character */ /* match literal string */
/* args: <code> */ /* args: <code> */
TRACE(("%8d: literal %c\n", PTR(ptr), (SRE_CHAR) *pattern)); TRACE(("%8d: literal %c\n", PTR(ptr), (SRE_CHAR) pattern[0]));
if (ptr >= end || *ptr != (SRE_CHAR) *pattern) if (ptr >= end || *ptr != (SRE_CHAR) pattern[0])
goto failure; goto failure;
pattern++; pattern++;
ptr++; ptr++;
...@@ -363,8 +452,8 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -363,8 +452,8 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
case SRE_OP_NOT_LITERAL: case SRE_OP_NOT_LITERAL:
/* match anything that is not literal character */ /* match anything that is not literal character */
/* args: <code> */ /* args: <code> */
TRACE(("%8d: literal not %c\n", PTR(ptr), (SRE_CHAR) *pattern)); TRACE(("%8d: literal not %c\n", PTR(ptr), (SRE_CHAR) pattern[0]));
if (ptr >= end || *ptr == (SRE_CHAR) *pattern) if (ptr >= end || *ptr == (SRE_CHAR) pattern[0])
goto failure; goto failure;
pattern++; pattern++;
ptr++; ptr++;
...@@ -372,7 +461,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -372,7 +461,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
case SRE_OP_ANY: case SRE_OP_ANY:
/* match anything */ /* match anything */
TRACE(("%8d: any\n", PTR(ptr))); TRACE(("%8d: anything\n", PTR(ptr)));
if (ptr >= end) if (ptr >= end)
goto failure; goto failure;
ptr++; ptr++;
...@@ -393,14 +482,11 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -393,14 +482,11 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: group %d\n", PTR(ptr), pattern[0])); TRACE(("%8d: group %d\n", PTR(ptr), pattern[0]));
i = pattern[0]; i = pattern[0];
{ {
/* FIXME: optimize! */
SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i]; SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1]; SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
TRACE(("%8d: group %p %p\n", PTR(ptr), p, e));
if (!p || !e || e < p) if (!p || !e || e < p)
goto failure; goto failure;
while (p < e) { while (p < e) {
TRACE(("%8d: group test %c %c\n", PTR(ptr), *ptr, *p));
if (ptr >= end || *ptr != *p) if (ptr >= end || *ptr != *p)
goto failure; goto failure;
p++; ptr++; p++; ptr++;
...@@ -414,15 +500,13 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -414,15 +500,13 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: group ignore %d\n", PTR(ptr), pattern[0])); TRACE(("%8d: group ignore %d\n", PTR(ptr), pattern[0]));
i = pattern[0]; i = pattern[0];
{ {
/* FIXME: optimize! */
SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i]; SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1]; SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
TRACE(("%8d: group %p %p\n", PTR(ptr), p, e));
if (!p || !e || e < p) if (!p || !e || e < p)
goto failure; goto failure;
while (p < e) { while (p < e) {
TRACE(("%8d: group test %c %c\n", PTR(ptr), *ptr, *p)); if (ptr >= end ||
if (ptr >= end || SRE_TO_LOWER(*ptr) != SRE_TO_LOWER(*p)) state->tolower(*ptr) != state->tolower(*p))
goto failure; goto failure;
p++; ptr++; p++; ptr++;
} }
...@@ -432,7 +516,8 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -432,7 +516,8 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
case SRE_OP_LITERAL_IGNORE: case SRE_OP_LITERAL_IGNORE:
TRACE(("%8d: literal lower(%c)\n", PTR(ptr), (SRE_CHAR) *pattern)); TRACE(("%8d: literal lower(%c)\n", PTR(ptr), (SRE_CHAR) *pattern));
if (ptr >= end || SRE_TO_LOWER(*ptr) != (SRE_CHAR) *pattern) if (ptr >= end ||
state->tolower(*ptr) != state->tolower(*pattern))
goto failure; goto failure;
pattern++; pattern++;
ptr++; ptr++;
...@@ -440,8 +525,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -440,8 +525,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
case SRE_OP_NOT_LITERAL_IGNORE: case SRE_OP_NOT_LITERAL_IGNORE:
TRACE(("%8d: literal not lower(%c)\n", PTR(ptr), TRACE(("%8d: literal not lower(%c)\n", PTR(ptr),
(SRE_CHAR) *pattern)); (SRE_CHAR) *pattern));
if (ptr >= end || SRE_TO_LOWER(*ptr) == (SRE_CHAR) *pattern) if (ptr >= end ||
state->tolower(*ptr) == state->tolower(*pattern))
goto failure; goto failure;
pattern++; pattern++;
ptr++; ptr++;
...@@ -450,7 +536,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -450,7 +536,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
case SRE_OP_IN_IGNORE: case SRE_OP_IN_IGNORE:
TRACE(("%8d: set lower(%c)\n", PTR(ptr), *ptr)); TRACE(("%8d: set lower(%c)\n", PTR(ptr), *ptr));
if (ptr >= end if (ptr >= end
|| !SRE_MEMBER(pattern+1, (SRE_CHAR) SRE_TO_LOWER(*ptr))) || !SRE_MEMBER(pattern+1, (SRE_CHAR) state->tolower(*ptr)))
goto failure; goto failure;
pattern += pattern[0]; pattern += pattern[0];
ptr++; ptr++;
...@@ -459,39 +545,50 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -459,39 +545,50 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
case SRE_OP_MARK: case SRE_OP_MARK:
/* set mark */ /* set mark */
/* args: <mark> */ /* args: <mark> */
TRACE(("%8d: set mark(%d)\n", PTR(ptr), pattern[0])); TRACE(("%8d: set mark %d\n", PTR(ptr), pattern[0]));
if (state->lastmark < pattern[0])
state->lastmark = pattern[0];
if (!mark) { if (!mark) {
mark = mark_data; mark = mark_copy;
memcpy(mark, state->mark, sizeof(state->mark)); memcpy(mark, state->mark, state->lastmark*sizeof(void*));
} }
state->mark[pattern[0]] = ptr; state->mark[pattern[0]] = ptr;
pattern++; pattern++;
break; break;
case SRE_OP_JUMP: case SRE_OP_JUMP:
case SRE_OP_INFO:
/* jump forward */ /* jump forward */
/* args: <skip> */ /* args: <skip> */
TRACE(("%8d: jump +%d\n", PTR(ptr), pattern[0])); TRACE(("%8d: jump +%d\n", PTR(ptr), pattern[0]));
pattern += pattern[0]; pattern += pattern[0];
break; break;
#if 0
case SRE_OP_CALL: case SRE_OP_CALL:
/* match subpattern, without backtracking */ /* match subpattern, without backtracking */
/* args: <skip> <pattern> */ /* args: <skip> <pattern> */
TRACE(("%8d: match subpattern\n", PTR(ptr))); TRACE(("%8d: subpattern\n", PTR(ptr)));
state->ptr = ptr; state->ptr = ptr;
if (!SRE_MATCH(state, pattern + 1)) i = SRE_MATCH(state, pattern + 1);
if (i < 0)
return i;
if (!i)
goto failure; goto failure;
pattern += pattern[0]; pattern += pattern[0];
ptr = state->ptr; ptr = state->ptr;
break; break;
#endif
#if 0
case SRE_OP_MAX_REPEAT_ONE: case SRE_OP_MAX_REPEAT_ONE:
/* match repeated sequence (maximizing regexp) */ /* match repeated sequence (maximizing regexp) */
/* this variant only works if the repeated item is exactly
one character wide, and we're not already collecting /* this operator only works if the repeated item is
backtracking points. for other cases, use the exactly one character wide, and we're not already
MAX_REPEAT operator instead */ collecting backtracking points. for other cases,
use the MAX_REPEAT operator instead */
/* args: <skip> <min> <max> <step> */ /* args: <skip> <min> <max> <step> */
TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr), TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr),
pattern[1], pattern[2])); pattern[1], pattern[2]));
...@@ -523,7 +620,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -523,7 +620,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* repeated literal */ /* repeated literal */
SRE_CHAR chr = (SRE_CHAR) pattern[4]; SRE_CHAR chr = (SRE_CHAR) pattern[4];
while (count < (int) pattern[2]) { while (count < (int) pattern[2]) {
if (ptr >= end || (SRE_CHAR) SRE_TO_LOWER(*ptr) != chr) if (ptr >= end || (SRE_CHAR) state->tolower(*ptr) != chr)
break; break;
ptr++; ptr++;
count++; count++;
...@@ -543,7 +640,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -543,7 +640,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* repeated non-literal */ /* repeated non-literal */
SRE_CHAR chr = (SRE_CHAR) pattern[4]; SRE_CHAR chr = (SRE_CHAR) pattern[4];
while (count < (int) pattern[2]) { while (count < (int) pattern[2]) {
if (ptr >= end || (SRE_CHAR) SRE_TO_LOWER(*ptr) == chr) if (ptr >= end || (SRE_CHAR) state->tolower(*ptr) == chr)
break; break;
ptr++; ptr++;
count++; count++;
...@@ -564,8 +661,8 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -564,8 +661,8 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
while (count < (int) pattern[2]) { while (count < (int) pattern[2]) {
i = SRE_MATCH(state, pattern + 3); i = SRE_MATCH(state, pattern + 3);
if (i < 0) if (i < 0)
goto failure; return i;
if (i == 0) if (!i)
break; break;
count++; count++;
} }
...@@ -621,7 +718,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -621,7 +718,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
while (count >= (int) pattern[1]) { while (count >= (int) pattern[1]) {
state->ptr = ptr; state->ptr = ptr;
i = SRE_MATCH(state, pattern + pattern[0]); i = SRE_MATCH(state, pattern + pattern[0]);
if (i > 0) { if (i < 0)
return i;
if (i) {
TRACE(("%8d: repeat %d picked\n", PTR(ptr), count)); TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
goto success; goto success;
} }
...@@ -631,108 +730,84 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -631,108 +730,84 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
} }
} }
goto failure; goto failure;
#endif
case SRE_OP_MAX_REPEAT: case SRE_OP_MAX_REPEAT:
/* match repeated sequence (maximizing regexp). repeated /* match repeated sequence (maximizing regexp). repeated
group should end with a MAX_UNTIL code */ group should end with a MAX_UNTIL code */
TRACE(("%8d: max repeat %d %d\n", PTR(ptr), /* args: <skip> <min> <max> <item> */
TRACE(("%8d: max repeat (%d %d)\n", PTR(ptr),
pattern[1], pattern[2])); pattern[1], pattern[2]));
count = 0; count = 0;
state->ptr = ptr; state->ptr = ptr;
/* FIXME: <fl> umm. what about matching the minimum /* match minimum number of items */
number of items before starting to collect backtracking while (count < (int) pattern[1]) {
positions? */ i = SRE_MATCH(state, pattern + 3);
if (i < 0)
return i;
if (!i)
goto failure;
if (state->ptr == ptr) {
/* if the match was successful but empty, set the
count to max and terminate the scanning loop */
count = (int) pattern[2];
break;
}
count++;
ptr = state->ptr;
}
stackbase = state->stackbase; TRACE(("%8d: found %d leading items\n", PTR(ptr), count));
while (count < (int) pattern[2]) { if (count < (int) pattern[1])
/* store current position on the stack */ goto failure;
TRACE(("%8d: push mark at index %d\n", PTR(ptr), count));
if (stackbase + count >= state->stacksize) { /* match maximum number of items, pushing alternate end
i = _stack_extend(state, stackbase + count + 1, points to the stack */
stackbase + pattern[2]);
if (i < 0) while (pattern[2] == 32767 || count < (int) pattern[2]) {
goto failure; state->stackbase = stack;
}
state->stack[stackbase + count] = ptr;
/* check if we can match another item */
state->stackbase += count + 1;
i = SRE_MATCH(state, pattern + 3); i = SRE_MATCH(state, pattern + 3);
state->stackbase = stackbase; /* rewind */ state->stackbase = stackbase; /* rewind */
if (i != 2) if (i < 0)
return i;
if (!i)
break; break;
if (state->ptr == ptr) { if (state->ptr == ptr) {
/* if the match was successful but empty, set the
count to max and terminate the scanning loop */
stacksize = count; /* actual size of stack */
count = (int) pattern[2]; count = (int) pattern[2];
goto check_tail; /* FIXME: <fl> eliminate goto */ break;
} }
count++; /* this position was valid; add it to the retry
stack */
if (stack >= state->stacksize) {
i = _stack_extend(state, stack + 1,
stackbase + pattern[2]);
if (i < 0)
return i; /* out of memory */
}
TRACE(("%8d: stack[%d] = %d\n", PTR(ptr), stack, PTR(ptr)));
state->stack[stack].ptr = ptr;
state->stack[stack].pattern = pattern + pattern[0];
stack++;
/* move forward */
ptr = state->ptr; ptr = state->ptr;
count++;
}
stacksize = count; /* actual number of entries on the stack */
check_tail:
/* when we get here, count is the number of matches,
stacksize is the number of match points on the stack
(usually same as count, but it might be smaller) and
ptr points to the tail. */
if (count < (int) pattern[1])
goto failure;
/* make sure that rest of the expression matches. if it
doesn't, backtrack */
TRACE(("%8d: repeat %d found (stack size = %d)\n", PTR(ptr),
count, stacksize + 1));
TRACE(("%8d: tail is pattern\n", PTR(ptr)));
/* hope for the best */
state->ptr = ptr;
state->stackbase += stacksize + 1;
i = SRE_MATCH(state, pattern + pattern[0]);
state->stackbase = stackbase;
if (i > 0) {
TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
goto success;
} }
/* backtrack! */ /* when we get here, count is the number of successful
while (count >= (int) pattern[1]) { matches, and ptr points to the tail. */
ptr = state->stack[stackbase + (count < stacksize ? count : stacksize)];
state->ptr = ptr;
count--;
TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
state->stackbase += stacksize + 1;
i = SRE_MATCH(state, pattern + pattern[0]);
state->stackbase = stackbase;
if (i > 0) {
TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
goto success;
}
}
goto failure;
case SRE_OP_MAX_UNTIL: TRACE(("%8d: skip +%d\n", PTR(ptr), pattern[0]));
/* match repeated sequence (maximizing regexp). repeated
group should end with a MAX_UNTIL code */
TRACE(("%8d: max until\n", PTR(ptr))); pattern += pattern[0];
state->ptr = ptr; break;
goto success; /* always succeeds, for now... */
case SRE_OP_MIN_REPEAT: case SRE_OP_MIN_REPEAT:
/* match repeated sequence (minimizing regexp) */ /* match repeated sequence (minimizing regexp) */
/* FIXME: HERE BE BUGS! */
TRACE(("%8d: min repeat %d %d\n", PTR(ptr), TRACE(("%8d: min repeat %d %d\n", PTR(ptr),
pattern[1], pattern[2])); pattern[1], pattern[2]));
count = 0; count = 0;
...@@ -740,7 +815,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -740,7 +815,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* match minimum number of items */ /* match minimum number of items */
while (count < (int) pattern[1]) { while (count < (int) pattern[1]) {
i = SRE_MATCH(state, pattern + 3); i = SRE_MATCH(state, pattern + 3);
if (i <= 0) if (i < 0)
return i;
if (!i)
goto failure; goto failure;
count++; count++;
} }
...@@ -752,21 +829,16 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -752,21 +829,16 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: repeat %d picked\n", PTR(ptr), count)); TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
goto success; goto success;
} }
TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
state->ptr = ptr; /* backtrack */ state->ptr = ptr; /* backtrack */
i = SRE_MATCH(state, pattern + 3); i = SRE_MATCH(state, pattern + 3);
if (i <= 0) if (i < 0)
return i;
if (!i)
goto failure; goto failure;
count++; count++;
} }
goto failure; goto failure;
case SRE_OP_MIN_UNTIL:
/* end of repeat group */
TRACE(("%8d: min until\n", PTR(ptr)));
state->ptr = ptr;
goto success; /* always succeeds, for now... */
case SRE_OP_BRANCH: case SRE_OP_BRANCH:
/* match one of several subpatterns */ /* match one of several subpatterns */
/* format: <branch> <size> <head> ... <null> <tail> */ /* format: <branch> <size> <head> ... <null> <tail> */
...@@ -777,7 +849,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -777,7 +849,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: branch check\n", PTR(ptr))); TRACE(("%8d: branch check\n", PTR(ptr)));
state->ptr = ptr; state->ptr = ptr;
i = SRE_MATCH(state, pattern + 1); i = SRE_MATCH(state, pattern + 1);
if (i > 0) { if (i < 0)
return i;
if (i) {
TRACE(("%8d: branch succeeded\n", PTR(ptr))); TRACE(("%8d: branch succeeded\n", PTR(ptr)));
goto success; goto success;
} }
...@@ -789,14 +863,20 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -789,14 +863,20 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
case SRE_OP_REPEAT: case SRE_OP_REPEAT:
/* TEMPLATE: match repeated sequence (no backtracking) */ /* TEMPLATE: match repeated sequence (no backtracking) */
/* format: <repeat> <skip> <min> <max> */ /* args: <skip> <min> <max> */
TRACE(("%8d: repeat %d %d\n", PTR(ptr), pattern[1], pattern[2])); TRACE(("%8d: repeat %d %d\n", PTR(ptr), pattern[1], pattern[2]));
count = 0; count = 0;
state->ptr = ptr; state->ptr = ptr;
while (count < (int) pattern[2]) { while (count < (int) pattern[2]) {
i = SRE_MATCH(state, pattern + 3); i = SRE_MATCH(state, pattern + 3);
if (i <= 0) if (i < 0)
return i;
if (!i)
break; break;
if (state->ptr == ptr) {
count = (int) pattern[2];
break;
}
count++; count++;
} }
if (count <= (int) pattern[1]) if (count <= (int) pattern[1])
...@@ -807,16 +887,28 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -807,16 +887,28 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
break; break;
default: default:
TRACE(("%8d: unknown opcode %d\n", PTR(ptr), pattern[-1]));
return SRE_ERROR_ILLEGAL; return SRE_ERROR_ILLEGAL;
} }
} }
failure: failure:
if (stack-- > stackbase) {
ptr = state->stack[stack].ptr;
pattern = state->stack[stack].pattern;
TRACE(("%8d: retry (%d)\n", PTR(ptr), stack));
goto retry;
}
TRACE(("%8d: leave (failure)\n", PTR(ptr)));
state->stackbase = stackbase;
state->lastmark = lastmark;
if (mark) if (mark)
memcpy(state->mark, mark, sizeof(state->mark)); memcpy(state->mark, mark, state->lastmark*sizeof(void*));
return 0; return 0;
success: success:
TRACE(("%8d: leave (success)\n", PTR(ptr)));
state->stackbase = stackbase;
return 1; return 1;
} }
...@@ -827,7 +919,12 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -827,7 +919,12 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
SRE_CHAR* end = state->end; SRE_CHAR* end = state->end;
int status = 0; int status = 0;
/* FIXME: <fl> add IGNORE cases (or implement full ASSERT support? */ if (pattern[0] == SRE_OP_INFO) {
/* don't look too far */
end -= pattern[2];
pattern += pattern[1];
/* FIXME: add support for fast scan */
}
if (pattern[0] == SRE_OP_LITERAL) { if (pattern[0] == SRE_OP_LITERAL) {
/* pattern starts with a literal */ /* pattern starts with a literal */
...@@ -837,7 +934,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -837,7 +934,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
ptr++; ptr++;
if (ptr == end) if (ptr == end)
return 0; return 0;
TRACE(("%8d: search found literal\n", PTR(ptr))); TRACE(("%8d: === SEARCH === literal\n", PTR(ptr)));
state->start = ptr; state->start = ptr;
state->ptr = ++ptr; state->ptr = ++ptr;
status = SRE_MATCH(state, pattern + 2); status = SRE_MATCH(state, pattern + 2);
...@@ -845,25 +942,9 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -845,25 +942,9 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
break; break;
} }
} else if (pattern[0] == SRE_OP_IN) {
/* pattern starts with a set */
for (;;) {
/* format: <in> <skip> <data> */
while (ptr < end && !SRE_MEMBER(pattern + 2, *ptr))
ptr++;
if (ptr == end)
return 0;
TRACE(("%8d: search found set\n", PTR(ptr)));
state->start = ptr;
state->ptr = ++ptr;
status = SRE_MATCH(state, pattern + pattern[1] + 1);
if (status != 0)
break;
}
} else } else
while (ptr <= end) { while (ptr <= end) {
TRACE(("%8d: search\n", PTR(ptr))); TRACE(("%8d: === SEARCH ===\n", PTR(ptr)));
state->start = state->ptr = ptr++; state->start = state->ptr = ptr++;
status = SRE_MATCH(state, pattern); status = SRE_MATCH(state, pattern);
if (status != 0) if (status != 0)
...@@ -873,7 +954,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -873,7 +954,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
return status; return status;
} }
#ifndef SRE_RECURSIVE #if !defined(SRE_RECURSIVE)
/* -------------------------------------------------------------------- */ /* -------------------------------------------------------------------- */
/* factories and destructors */ /* factories and destructors */
...@@ -923,13 +1004,28 @@ _compile(PyObject* self_, PyObject* args) ...@@ -923,13 +1004,28 @@ _compile(PyObject* self_, PyObject* args)
} }
static PyObject * static PyObject *
_getcodesize(PyObject* self_, PyObject* args) sre_codesize(PyObject* self, PyObject* args)
{ {
return Py_BuildValue("i", sizeof(SRE_CODE)); return Py_BuildValue("i", sizeof(SRE_CODE));
} }
static PyObject *
sre_lower(PyObject* self, PyObject* args)
{
int character, flags;
if (!PyArg_ParseTuple(args, "ii", &character, &flags))
return NULL;
if (flags & SRE_FLAG_LOCALE)
return Py_BuildValue("i", sre_tolower_locale(character));
#if defined(HAVE_UNICODE)
if (flags & SRE_FLAG_UNICODE)
return Py_BuildValue("i", sre_tolower_unicode(character));
#endif
return Py_BuildValue("i", sre_tolower(character));
}
LOCAL(PyObject*) LOCAL(PyObject*)
_setup(SRE_STATE* state, PyObject* args) _setup(SRE_STATE* state, PatternObject* pattern, PyObject* args)
{ {
/* prepare state object */ /* prepare state object */
...@@ -960,7 +1056,11 @@ _setup(SRE_STATE* state, PyObject* args) ...@@ -960,7 +1056,11 @@ _setup(SRE_STATE* state, PyObject* args)
} }
/* determine character size */ /* determine character size */
#if defined(HAVE_UNICODE)
state->charsize = (PyUnicode_Check(string) ? sizeof(Py_UNICODE) : 1); state->charsize = (PyUnicode_Check(string) ? sizeof(Py_UNICODE) : 1);
#else
state->charsize = 1;
#endif
count /= state->charsize; count /= state->charsize;
...@@ -980,6 +1080,8 @@ _setup(SRE_STATE* state, PyObject* args) ...@@ -980,6 +1080,8 @@ _setup(SRE_STATE* state, PyObject* args)
state->start = (void*) ((char*) ptr + start * state->charsize); state->start = (void*) ((char*) ptr + start * state->charsize);
state->end = (void*) ((char*) ptr + end * state->charsize); state->end = (void*) ((char*) ptr + end * state->charsize);
state->lastmark = 0;
/* FIXME: dynamic! */ /* FIXME: dynamic! */
for (i = 0; i < 64; i++) for (i = 0; i < 64; i++)
state->mark[i] = NULL; state->mark[i] = NULL;
...@@ -988,6 +1090,15 @@ _setup(SRE_STATE* state, PyObject* args) ...@@ -988,6 +1090,15 @@ _setup(SRE_STATE* state, PyObject* args)
state->stackbase = 0; state->stackbase = 0;
state->stacksize = 0; state->stacksize = 0;
if (pattern->flags & SRE_FLAG_LOCALE)
state->tolower = sre_tolower_locale;
#if defined(HAVE_UNICODE)
else if (pattern->flags & SRE_FLAG_UNICODE)
state->tolower = sre_tolower_unicode;
#endif
else
state->tolower = sre_tolower;
return string; return string;
} }
...@@ -999,8 +1110,8 @@ _pattern_new_match(PatternObject* pattern, SRE_STATE* state, ...@@ -999,8 +1110,8 @@ _pattern_new_match(PatternObject* pattern, SRE_STATE* state,
MatchObject* match; MatchObject* match;
int i, j; int i, j;
char* base;
TRACE(("status = %d\n", status)); int n;
if (status > 0) { if (status > 0) {
...@@ -1017,19 +1128,18 @@ _pattern_new_match(PatternObject* pattern, SRE_STATE* state, ...@@ -1017,19 +1128,18 @@ _pattern_new_match(PatternObject* pattern, SRE_STATE* state,
match->groups = pattern->groups+1; match->groups = pattern->groups+1;
base = (char*) state->beginning;
n = state->charsize;
/* group zero */ /* group zero */
match->mark[0] = ((char*) state->start - match->mark[0] = ((char*) state->start - base) / n;
(char*) state->beginning) / state->charsize; match->mark[1] = ((char*) state->ptr - base) / n;
match->mark[1] = ((char*) state->ptr -
(char*) state->beginning) / state->charsize;
/* fill in the rest of the groups */ /* fill in the rest of the groups */
for (i = j = 0; i < pattern->groups; i++, j+=2) for (i = j = 0; i < pattern->groups; i++, j+=2)
if (state->mark[j] != NULL && state->mark[j+1] != NULL) { if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
match->mark[j+2] = ((char*) state->mark[j] - match->mark[j+2] = ((char*) state->mark[j] - base) / n;
(char*) state->beginning) / state->charsize; match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
match->mark[j+3] = ((char*) state->mark[j+1] -
(char*) state->beginning) / state->charsize;
} else } else
match->mark[j+2] = match->mark[j+3] = -1; /* undefined */ match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
...@@ -1050,7 +1160,7 @@ _pattern_new_match(PatternObject* pattern, SRE_STATE* state, ...@@ -1050,7 +1160,7 @@ _pattern_new_match(PatternObject* pattern, SRE_STATE* state,
} }
static PyObject* static PyObject*
_pattern_cursor(PyObject* pattern, PyObject* args) _pattern_cursor(PatternObject* pattern, PyObject* args)
{ {
/* create search state object */ /* create search state object */
...@@ -1062,14 +1172,14 @@ _pattern_cursor(PyObject* pattern, PyObject* args) ...@@ -1062,14 +1172,14 @@ _pattern_cursor(PyObject* pattern, PyObject* args)
if (self == NULL) if (self == NULL)
return NULL; return NULL;
string = _setup(&self->state, args); string = _setup(&self->state, pattern, args);
if (!string) { if (!string) {
/* FIXME: dealloc cursor object */ PyObject_DEL(self);
return NULL; return NULL;
} }
Py_INCREF(pattern); Py_INCREF(pattern);
self->pattern = pattern; self->pattern = (PyObject*) pattern;
Py_INCREF(string); Py_INCREF(string);
self->string = string; self->string = string;
...@@ -1093,7 +1203,7 @@ _pattern_match(PatternObject* self, PyObject* args) ...@@ -1093,7 +1203,7 @@ _pattern_match(PatternObject* self, PyObject* args)
PyObject* string; PyObject* string;
int status; int status;
string = _setup(&state, args); string = _setup(&state, self, args);
if (!string) if (!string)
return NULL; return NULL;
...@@ -1102,7 +1212,9 @@ _pattern_match(PatternObject* self, PyObject* args) ...@@ -1102,7 +1212,9 @@ _pattern_match(PatternObject* self, PyObject* args)
if (state.charsize == 1) { if (state.charsize == 1) {
status = sre_match(&state, PatternObject_GetCode(self)); status = sre_match(&state, PatternObject_GetCode(self));
} else { } else {
#if defined(HAVE_UNICODE)
status = sre_umatch(&state, PatternObject_GetCode(self)); status = sre_umatch(&state, PatternObject_GetCode(self));
#endif
} }
_stack_free(&state); _stack_free(&state);
...@@ -1117,14 +1229,16 @@ _pattern_search(PatternObject* self, PyObject* args) ...@@ -1117,14 +1229,16 @@ _pattern_search(PatternObject* self, PyObject* args)
PyObject* string; PyObject* string;
int status; int status;
string = _setup(&state, args); string = _setup(&state, self, args);
if (!string) if (!string)
return NULL; return NULL;
if (state.charsize == 1) { if (state.charsize == 1) {
status = sre_search(&state, PatternObject_GetCode(self)); status = sre_search(&state, PatternObject_GetCode(self));
} else { } else {
#if defined(HAVE_UNICODE)
status = sre_usearch(&state, PatternObject_GetCode(self)); status = sre_usearch(&state, PatternObject_GetCode(self));
#endif
} }
_stack_free(&state); _stack_free(&state);
...@@ -1140,7 +1254,7 @@ call(char* function, PyObject* args) ...@@ -1140,7 +1254,7 @@ call(char* function, PyObject* args)
PyObject* func; PyObject* func;
PyObject* result; PyObject* result;
name = PyString_FromString("sre"); name = PyString_FromString(MODULE);
if (!name) if (!name)
return NULL; return NULL;
module = PyImport_Import(name); module = PyImport_Import(name);
...@@ -1203,46 +1317,47 @@ _pattern_findall(PatternObject* self, PyObject* args) ...@@ -1203,46 +1317,47 @@ _pattern_findall(PatternObject* self, PyObject* args)
PyObject* list; PyObject* list;
int status; int status;
string = _setup(&state, args); string = _setup(&state, self, args);
if (!string) if (!string)
return NULL; return NULL;
list = PyList_New(0); list = PyList_New(0);
while (state.start < state.end) { while (state.start <= state.end) {
PyObject* item; PyObject* item;
state.ptr = state.start; state.ptr = state.start;
if (state.charsize == 1) { if (state.charsize == 1) {
status = sre_match(&state, PatternObject_GetCode(self)); status = sre_search(&state, PatternObject_GetCode(self));
} else { } else {
status = sre_umatch(&state, PatternObject_GetCode(self)); #if defined(HAVE_UNICODE)
status = sre_usearch(&state, PatternObject_GetCode(self));
#endif
} }
if (status >= 0) { if (status > 0) {
if (status == 0)
state.ptr = (void*) ((char*) state.start + 1);
/* FIXME: if one group is defined, slice that group
instead. if multiple groups are defined, add tuple
containing all slices */
item = PySequence_GetSlice( item = PySequence_GetSlice(
string, string,
((char*) state.start - (char*) state.beginning), ((char*) state.start - (char*) state.beginning) / state.charsize,
((char*) state.ptr - (char*) state.beginning)); ((char*) state.ptr - (char*) state.beginning) / state.charsize);
if (!item) if (!item)
goto error; goto error;
if (PyList_Append(list, item) < 0) if (PyList_Append(list, item) < 0)
goto error; goto error;
state.start = state.ptr; if (state.ptr == state.start)
state.start = (void*) ((char*) state.ptr + state.charsize);
else
state.start = state.ptr;
} else { } else {
if (status == 0)
break;
/* internal error */ /* internal error */
PyErr_SetString( PyErr_SetString(
PyExc_RuntimeError, PyExc_RuntimeError,
...@@ -1347,20 +1462,26 @@ getslice_by_index(MatchObject* self, int index) ...@@ -1347,20 +1462,26 @@ getslice_by_index(MatchObject* self, int index)
); );
} }
static PyObject* static int
getslice(MatchObject* self, PyObject* index) getindex(MatchObject* self, PyObject* index)
{ {
if (!PyInt_Check(index) && self->pattern->groupindex != NULL) { if (!PyInt_Check(index) && self->pattern->groupindex != NULL) {
/* FIXME: resource leak? */ /* FIXME: resource leak? */
index = PyObject_GetItem(self->pattern->groupindex, index); index = PyObject_GetItem(self->pattern->groupindex, index);
if (!index) if (!index)
return NULL; return -1;
} }
if (PyInt_Check(index)) if (PyInt_Check(index))
return getslice_by_index(self, (int) PyInt_AS_LONG(index)); return (int) PyInt_AS_LONG(index);
return getslice_by_index(self, -1); /* signal error */ return -1;
}
static PyObject*
getslice(MatchObject* self, PyObject* index)
{
return getslice_by_index(self, getindex(self, index));
} }
static PyObject* static PyObject*
...@@ -1441,10 +1562,10 @@ _match_groupdict(MatchObject* self, PyObject* args) ...@@ -1441,10 +1562,10 @@ _match_groupdict(MatchObject* self, PyObject* args)
if (!keys) if (!keys)
return NULL; return NULL;
for (index = 0; index < PySequence_Length(keys); index++) { for (index = 0; index < PyList_GET_SIZE(keys); index++) {
PyObject* key; PyObject* key;
PyObject* item; PyObject* item;
key = PySequence_GetItem(keys, index); key = PyList_GET_ITEM(keys, index);
if (!key) { if (!key) {
Py_DECREF(keys); Py_DECREF(keys);
Py_DECREF(result); Py_DECREF(result);
...@@ -1469,10 +1590,14 @@ _match_groupdict(MatchObject* self, PyObject* args) ...@@ -1469,10 +1590,14 @@ _match_groupdict(MatchObject* self, PyObject* args)
static PyObject* static PyObject*
_match_start(MatchObject* self, PyObject* args) _match_start(MatchObject* self, PyObject* args)
{ {
int index = 0; int index;
if (!PyArg_ParseTuple(args, "|i", &index))
PyObject* index_ = Py_False;
if (!PyArg_ParseTuple(args, "|O", &index_))
return NULL; return NULL;
index = getindex(self, index_);
if (index < 0 || index >= self->groups) { if (index < 0 || index >= self->groups) {
PyErr_SetString( PyErr_SetString(
PyExc_IndexError, PyExc_IndexError,
...@@ -1492,10 +1617,14 @@ _match_start(MatchObject* self, PyObject* args) ...@@ -1492,10 +1617,14 @@ _match_start(MatchObject* self, PyObject* args)
static PyObject* static PyObject*
_match_end(MatchObject* self, PyObject* args) _match_end(MatchObject* self, PyObject* args)
{ {
int index = 0; int index;
if (!PyArg_ParseTuple(args, "|i", &index))
PyObject* index_ = Py_False;
if (!PyArg_ParseTuple(args, "|O", &index_))
return NULL; return NULL;
index = getindex(self, index_);
if (index < 0 || index >= self->groups) { if (index < 0 || index >= self->groups) {
PyErr_SetString( PyErr_SetString(
PyExc_IndexError, PyExc_IndexError,
...@@ -1515,10 +1644,14 @@ _match_end(MatchObject* self, PyObject* args) ...@@ -1515,10 +1644,14 @@ _match_end(MatchObject* self, PyObject* args)
static PyObject* static PyObject*
_match_span(MatchObject* self, PyObject* args) _match_span(MatchObject* self, PyObject* args)
{ {
int index = 0; int index;
if (!PyArg_ParseTuple(args, "|i", &index))
PyObject* index_ = Py_False;
if (!PyArg_ParseTuple(args, "|O", &index_))
return NULL; return NULL;
index = getindex(self, index_);
if (index < 0 || index >= self->groups) { if (index < 0 || index >= self->groups) {
PyErr_SetString( PyErr_SetString(
PyExc_IndexError, PyExc_IndexError,
...@@ -1615,16 +1748,18 @@ _cursor_match(CursorObject* self, PyObject* args) ...@@ -1615,16 +1748,18 @@ _cursor_match(CursorObject* self, PyObject* args)
if (state->charsize == 1) { if (state->charsize == 1) {
status = sre_match(state, PatternObject_GetCode(self->pattern)); status = sre_match(state, PatternObject_GetCode(self->pattern));
} else { } else {
#if defined(HAVE_UNICODE)
status = sre_umatch(state, PatternObject_GetCode(self->pattern)); status = sre_umatch(state, PatternObject_GetCode(self->pattern));
#endif
} }
match = _pattern_new_match((PatternObject*) self->pattern, match = _pattern_new_match((PatternObject*) self->pattern,
state, self->string, status); state, self->string, status);
if (status >= 0) if (status == 0 || state->ptr == state->start)
state->start = state->ptr; state->start = (void*) ((char*) state->ptr + state->charsize);
else else
state->start = (char*) state->ptr + state->charsize; state->start = state->ptr;
return match; return match;
} }
...@@ -1642,7 +1777,9 @@ _cursor_search(CursorObject* self, PyObject* args) ...@@ -1642,7 +1777,9 @@ _cursor_search(CursorObject* self, PyObject* args)
if (state->charsize == 1) { if (state->charsize == 1) {
status = sre_search(state, PatternObject_GetCode(self->pattern)); status = sre_search(state, PatternObject_GetCode(self->pattern));
} else { } else {
#if defined(HAVE_UNICODE)
status = sre_usearch(state, PatternObject_GetCode(self->pattern)); status = sre_usearch(state, PatternObject_GetCode(self->pattern));
#endif
} }
match = _pattern_new_match((PatternObject*) self->pattern, match = _pattern_new_match((PatternObject*) self->pattern,
...@@ -1693,12 +1830,13 @@ statichere PyTypeObject Cursor_Type = { ...@@ -1693,12 +1830,13 @@ statichere PyTypeObject Cursor_Type = {
static PyMethodDef _functions[] = { static PyMethodDef _functions[] = {
{"compile", _compile, 1}, {"compile", _compile, 1},
{"getcodesize", _getcodesize, 1}, {"getcodesize", sre_codesize, 1},
{"getlower", sre_lower, 1},
{NULL, NULL} {NULL, NULL}
}; };
void void
#ifdef WIN32 #if defined(WIN32)
__declspec(dllexport) __declspec(dllexport)
#endif #endif
init_sre() init_sre()
...@@ -1707,7 +1845,7 @@ init_sre() ...@@ -1707,7 +1845,7 @@ init_sre()
Pattern_Type.ob_type = Match_Type.ob_type = Pattern_Type.ob_type = Match_Type.ob_type =
Cursor_Type.ob_type = &PyType_Type; Cursor_Type.ob_type = &PyType_Type;
Py_InitModule("_sre", _functions); Py_InitModule("_" MODULE, _functions);
} }
#endif #endif /* !defined(SRE_RECURSIVE) */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment