Kaydet (Commit) b1aa1951 authored tarafından Jeremy Hylton's avatar Jeremy Hylton

Fredrik Lundh: here's the 96.6% version of SRE

üst 0292d78e
# -*- Mode: Python; tab-width: 4 -*-
# #
# Secret Labs' Regular Expression Engine # Secret Labs' Regular Expression Engine
# $Id$ # $Id$
...@@ -7,39 +6,127 @@ ...@@ -7,39 +6,127 @@
# #
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved. # Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
# #
# This code can only be used for 1.6 alpha testing. All other use
# require explicit permission from Secret Labs AB.
#
# Portions of this engine have been developed in cooperation with # Portions of this engine have been developed in cooperation with
# CNRI. Hewlett-Packard provided funding for 1.6 integration and # CNRI. Hewlett-Packard provided funding for 1.6 integration and
# other compatibility work. # other compatibility work.
# #
"""
this is a long string
"""
import sre_compile import sre_compile
# flags
#
# every flag is published twice: as a one-letter alias and under a long
# name.  both names are bound to the same SRE_FLAG_* value read from
# sre_compile.  the values are handed on unchanged as the `flags`
# argument of compile(), match() and search(); the compiler tests them
# with `flags & SRE_FLAG_...`, so they are meant to be or-ed together.
I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE
L = LOCALE = sre_compile.SRE_FLAG_LOCALE
M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE
S = DOTALL = sre_compile.SRE_FLAG_DOTALL
X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE
# -------------------------------------------------------------------- # --------------------------------------------------------------------
# public interface # public interface
def compile(pattern, flags=0): # FIXME: add docstrings
return sre_compile.compile(pattern, _fixflags(flags))
def match(pattern, string, flags=0):
    # public: compile `pattern` with `flags` (or reuse it as-is when it
    # is not a string, see _compile) and return whatever the compiled
    # object's match() method returns for `string`
    compiled = _compile(pattern, flags)
    return compiled.match(string)
def search(pattern, string, flags=0):
    # public: compile `pattern` with `flags` (or reuse it as-is when it
    # is not a string, see _compile) and return whatever the compiled
    # object's search() method returns for `string`
    compiled = _compile(pattern, flags)
    return compiled.search(string)
def sub(pattern, repl, string, count=0):
    # public: delegate to the sub() method of the compiled pattern.
    # no flags can be given here; the pattern is compiled with the
    # default flags of _compile
    compiled = _compile(pattern)
    return compiled.sub(repl, string, count)
def subn(pattern, repl, string, count=0):
    # public: delegate to the subn() method of the compiled pattern.
    # no flags can be given here; the pattern is compiled with the
    # default flags of _compile
    compiled = _compile(pattern)
    return compiled.subn(repl, string, count)
def split(pattern, string, maxsplit=0):
    # public: delegate to the split() method of the compiled pattern.
    # no flags can be given here; the pattern is compiled with the
    # default flags of _compile
    compiled = _compile(pattern)
    return compiled.split(string, maxsplit)
def findall(pattern, string, maxsplit=0):
    # public: delegate to the findall() method of the compiled pattern.
    # NOTE(review): `maxsplit` looks like a leftover from split(); it is
    # forwarded unchanged as the second findall() argument -- confirm
    # what the compiled pattern object expects there
    compiled = _compile(pattern)
    return compiled.findall(string, maxsplit)
def compile(pattern, flags=0):
    # public: return the compiled form of `pattern`; all the work,
    # including caching, is done by _compile
    compiled = _compile(pattern, flags)
    return compiled
def escape(pattern):
    # public: return a copy of `pattern` in which every character outside
    # a-z, A-Z and 0-9 is preceded by a backslash.  the null character is
    # written as the four characters \000 instead
    pieces = []
    for ch in pattern:
        if "a" <= ch <= "z" or "A" <= ch <= "Z" or "0" <= ch <= "9":
            pieces.append(ch)
        elif ch == "\000":
            pieces.append("\\000")
        else:
            pieces.append("\\" + ch)
    # joining on an empty slice of the input keeps the result the same
    # string type as the argument
    return pattern[:0].join(pieces)
# -------------------------------------------------------------------- # --------------------------------------------------------------------
# helpers # internals
# compiled patterns, keyed by (type(pattern), pattern, flags); read and
# filled by _compile
_cache = {}
# once the cache holds this many entries, _compile empties it completely
# before storing the next one
_MAXCACHE = 100
def _compile(pattern, flags=0):
    # internal: compile pattern
    #
    # anything that is not an 8-bit or unicode string is handed back
    # untouched (flags are ignored in that case), so callers may pass an
    # already compiled object.  strings are compiled via sre_compile and
    # remembered in _cache under (type, pattern, flags).
    kind = type(pattern)
    if kind not in (type(""), type(u"")):
        return pattern
    cache_key = (kind, pattern, flags)
    try:
        return _cache[cache_key]
    except KeyError:
        pass
    compiled = sre_compile.compile(pattern, flags)
    # crude size limit: throw everything away when the cache is full
    if len(_cache) >= _MAXCACHE:
        _cache.clear()
    _cache[cache_key] = compiled
    return compiled
def _sub(pattern, template, string, count=0):
    # internal: pattern.sub implementation hook
    # same work as _subn; only the new string (first item of the
    # result pair) is returned
    result = _subn(pattern, template, string, count)
    return result[0]
def _expand(match, template):
    # internal: expand template
    # FIXME: no expansion is done yet; `match` is ignored and the
    # template comes back exactly as it was passed in
    expanded = template
    return expanded
def _subn(pattern, template, string, count=0):
    # internal: pattern.subn implementation hook
    #
    # walks `string` with a cursor obtained from `pattern`, copies the
    # text between matches and puts a replacement in place of each match.
    #
    # pattern  -- compiled pattern object; must provide cursor(string)
    # template -- either a callable, which is called with each match
    #             object and must return the replacement string, or a
    #             template that is handed to _expand
    # string   -- the text to scan
    # count    -- maximum number of replacements; 0 means no limit
    #
    # returns the pair (new string, number of replacements made)
    if callable(template):
        # use the caller's function itself.  this used to bind the
        # builtin `callable`, which turned every replacement into the
        # truth value of callable(match) and broke the join below
        expand = template
    else:
        # FIXME: prepare template
        # the template is bound as a default argument so the helper
        # does not depend on nested scopes
        def expand(match, template=template):
            return _expand(match, template)
    n = i = 0
    pieces = []
    append = pieces.append
    c = pattern.cursor(string)
    while not count or n < count:
        m = c.search()
        if not m:
            break
        j = m.start()
        if j > i:
            # unmatched text between the previous match and this one
            append(string[i:j])
        append(expand(m))
        i = m.end()
        n = n + 1
    if i < len(string):
        # unmatched text after the last match
        append(string[i:])
    # joining on an empty slice keeps the result the same string type
    # as the input
    return string[:0].join(pieces), n
def _split(pattern, string, maxsplit=0):
    # internal: pattern.split implementation hook
    #
    # cuts `string` at every match found by a cursor obtained from
    # `pattern` and returns the list of pieces between the matches.
    #
    # pattern  -- compiled pattern object; must provide cursor(string)
    # string   -- the text to cut
    # maxsplit -- maximum number of cuts; 0 means no limit
    n = i = 0
    pieces = []
    append = pieces.append
    c = pattern.cursor(string)
    while not maxsplit or n < maxsplit:
        m = c.search()
        if not m:
            break
        j = m.start()
        append(string[i:j])
        i = m.end()
        n = n + 1
    # always keep the remainder, even when it is empty: a string that
    # ends on a match yields a trailing empty piece, and an empty string
    # yields one empty piece (same results as re.split).  the remainder
    # used to be dropped in both cases
    append(string[i:])
    return pieces
...@@ -14,9 +14,6 @@ ...@@ -14,9 +14,6 @@
# other compatibility work. # other compatibility work.
# #
# FIXME: <fl> formalize (objectify?) and document the compiler code
# format, so that other frontends can use this compiler
import array, string, sys import array, string, sys
import _sre import _sre
...@@ -45,64 +42,70 @@ class Code: ...@@ -45,64 +42,70 @@ class Code:
self.data.append(code) self.data.append(code)
def todata(self): def todata(self):
# print self.data # print self.data
return array.array(WORDSIZE, self.data).tostring() try:
return array.array(WORDSIZE, self.data).tostring()
def _lower(literal): except OverflowError:
# return _sre._lower(literal) # FIXME print self.data
return string.lower(literal) raise
def _compile(code, pattern, flags): def _compile(code, pattern, flags, level=0):
append = code.append append = code.append
for op, av in pattern: for op, av in pattern:
if op is ANY: if op is ANY:
if "s" in flags: if flags & SRE_FLAG_DOTALL:
append(CODES[op]) # any character at all! append(OPCODES[op]) # any character at all!
else: else:
append(CODES[NOT_LITERAL]) append(OPCODES[CATEGORY])
append(10) append(CHCODES[CATEGORY_NOT_LINEBREAK])
elif op in (SUCCESS, FAILURE): elif op in (SUCCESS, FAILURE):
append(CODES[op]) append(OPCODES[op])
elif op is AT: elif op is AT:
append(CODES[op]) append(OPCODES[op])
append(POSITIONS[av]) if flags & SRE_FLAG_MULTILINE:
append(ATCODES[AT_MULTILINE[av]])
else:
append(ATCODES[av])
elif op is BRANCH: elif op is BRANCH:
append(CODES[op]) append(OPCODES[op])
tail = [] tail = []
for av in av[1]: for av in av[1]:
skip = len(code); append(0) skip = len(code); append(0)
_compile(code, av, flags) _compile(code, av, flags, level)
append(CODES[JUMP]) append(OPCODES[JUMP])
tail.append(len(code)); append(0) tail.append(len(code)); append(0)
code[skip] = len(code) - skip code[skip] = len(code) - skip
append(0) # end of branch append(0) # end of branch
for tail in tail: for tail in tail:
code[tail] = len(code) - tail code[tail] = len(code) - tail
elif op is CALL: elif op is CALL:
append(CODES[op]) append(OPCODES[op])
skip = len(code); append(0) skip = len(code); append(0)
_compile(code, av, flags) _compile(code, av, flags, level+1)
append(CODES[SUCCESS]) append(OPCODES[SUCCESS])
code[skip] = len(code) - skip code[skip] = len(code) - skip
elif op is CATEGORY: # not used by current parser elif op is CATEGORY: # not used by current parser
append(CODES[op]) append(OPCODES[op])
append(CATEGORIES[av]) if flags & SRE_FLAG_LOCALE:
append(CH_LOCALE[CHCODES[av]])
else:
append(CHCODES[av])
elif op is GROUP: elif op is GROUP:
if "i" in flags: if flags & SRE_FLAG_IGNORECASE:
append(CODES[MAP_IGNORE[op]]) append(OPCODES[OP_IGNORE[op]])
else: else:
append(CODES[op]) append(OPCODES[op])
append(av) append(av-1)
elif op is IN: elif op is IN:
if "i" in flags: if flags & SRE_FLAG_IGNORECASE:
append(CODES[MAP_IGNORE[op]]) append(OPCODES[OP_IGNORE[op]])
def fixup(literal): def fixup(literal):
return ord(_lower(literal)) return ord(literal.lower())
else: else:
append(CODES[op]) append(OPCODES[op])
fixup = ord fixup = ord
skip = len(code); append(0) skip = len(code); append(0)
for op, av in av: for op, av in av:
append(CODES[op]) append(OPCODES[op])
if op is NEGATE: if op is NEGATE:
pass pass
elif op is LITERAL: elif op is LITERAL:
...@@ -111,58 +114,60 @@ def _compile(code, pattern, flags): ...@@ -111,58 +114,60 @@ def _compile(code, pattern, flags):
append(fixup(av[0])) append(fixup(av[0]))
append(fixup(av[1])) append(fixup(av[1]))
elif op is CATEGORY: elif op is CATEGORY:
append(CATEGORIES[av]) if flags & SRE_FLAG_LOCALE:
append(CH_LOCALE[CHCODES[av]])
else:
append(CHCODES[av])
else: else:
raise ValueError, "unsupported set operator" raise ValueError, "unsupported set operator"
append(CODES[FAILURE]) append(OPCODES[FAILURE])
code[skip] = len(code) - skip code[skip] = len(code) - skip
elif op in (LITERAL, NOT_LITERAL): elif op in (LITERAL, NOT_LITERAL):
if "i" in flags: if flags & SRE_FLAG_IGNORECASE:
append(CODES[MAP_IGNORE[op]]) append(OPCODES[OP_IGNORE[op]])
append(ord(_lower(av))) append(ord(av.lower()))
else: else:
append(CODES[op]) append(OPCODES[op])
append(ord(av)) append(ord(av))
elif op is MARK: elif op is MARK:
append(CODES[op]) append(OPCODES[op])
append(av) append(av)
elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT): elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
lo, hi = av[2].getwidth() lo, hi = av[2].getwidth()
if lo == 0: if lo == 0:
raise SyntaxError, "cannot repeat zero-width items" raise SyntaxError, "cannot repeat zero-width items"
if lo == hi == 1 and op is MAX_REPEAT: if lo == hi == 1 and op is MAX_REPEAT:
append(CODES[MAX_REPEAT_ONE]) append(OPCODES[MAX_REPEAT_ONE])
skip = len(code); append(0) skip = len(code); append(0)
append(av[0]) append(av[0])
append(av[1]) append(av[1])
_compile(code, av[2], flags) _compile(code, av[2], flags, level+1)
append(CODES[SUCCESS]) append(OPCODES[SUCCESS])
code[skip] = len(code) - skip code[skip] = len(code) - skip
else: else:
append(CODES[op]) append(OPCODES[op])
skip = len(code); append(0) skip = len(code); append(0)
append(av[0]) append(av[0])
append(av[1]) append(av[1])
_compile(code, av[2], flags) _compile(code, av[2], flags, level+1)
if op is MIN_REPEAT: if op is MIN_REPEAT:
append(CODES[MIN_UNTIL]) append(OPCODES[MIN_UNTIL])
else: else:
# FIXME: MAX_REPEAT PROBABLY DOESN'T WORK (?) append(OPCODES[MAX_UNTIL])
append(CODES[MAX_UNTIL])
code[skip] = len(code) - skip code[skip] = len(code) - skip
elif op is SUBPATTERN: elif op is SUBPATTERN:
## group = av[0] group = av[0]
## if group: if group:
## append(CODES[MARK]) append(OPCODES[MARK])
## append((group-1)*2) append((group-1)*2)
_compile(code, av[1], flags) _compile(code, av[1], flags, level+1)
## if group: if group:
## append(CODES[MARK]) append(OPCODES[MARK])
## append((group-1)*2+1) append((group-1)*2+1)
else: else:
raise ValueError, ("unsupported operand type", op) raise ValueError, ("unsupported operand type", op)
def compile(p, flags=()): def compile(p, flags=0):
# convert pattern list to internal format # convert pattern list to internal format
if type(p) in (type(""), type(u"")): if type(p) in (type(""), type(u"")):
import sre_parse import sre_parse
...@@ -170,12 +175,10 @@ def compile(p, flags=()): ...@@ -170,12 +175,10 @@ def compile(p, flags=()):
p = sre_parse.parse(p) p = sre_parse.parse(p)
else: else:
pattern = None pattern = None
# print p.getwidth() flags = p.pattern.flags | flags
# print p
code = Code() code = Code()
_compile(code, p.data, p.pattern.flags) _compile(code, p.data, flags)
code.append(CODES[SUCCESS]) code.append(OPCODES[SUCCESS])
# print list(code.data)
data = code.todata() data = code.todata()
if 0: # debugging if 0: # debugging
print print
...@@ -183,5 +186,8 @@ def compile(p, flags=()): ...@@ -183,5 +186,8 @@ def compile(p, flags=()):
import sre_disasm import sre_disasm
sre_disasm.disasm(data) sre_disasm.disasm(data)
print "-" * 68 print "-" * 68
# print len(data), p.pattern.groups, len(p.pattern.groupdict) return _sre.compile(
return _sre.compile(pattern, data, p.pattern.groups-1, p.pattern.groupdict) pattern, flags,
data,
p.pattern.groups-1, p.pattern.groupdict
)
...@@ -48,20 +48,31 @@ SUBPATTERN = "subpattern" ...@@ -48,20 +48,31 @@ SUBPATTERN = "subpattern"
# positions # positions
AT_BEGINNING = "at_beginning" AT_BEGINNING = "at_beginning"
AT_BEGINNING_LINE = "at_beginning_line"
AT_BOUNDARY = "at_boundary" AT_BOUNDARY = "at_boundary"
AT_NON_BOUNDARY = "at_non_boundary" AT_NON_BOUNDARY = "at_non_boundary"
AT_END = "at_end" AT_END = "at_end"
AT_END_LINE = "at_end_line"
# categories # categories
CATEGORY_DIGIT = "category_digit" CATEGORY_DIGIT = "category_digit"
CATEGORY_NOT_DIGIT = "category_not_digit" CATEGORY_NOT_DIGIT = "category_not_digit"
CATEGORY_SPACE = "category_space" CATEGORY_SPACE = "category_space"
CATEGORY_NOT_SPACE = "category_not_space" CATEGORY_NOT_SPACE = "category_not_space"
CATEGORY_WORD = "category_word" CATEGORY_WORD = "category_word"
CATEGORY_NOT_WORD = "category_not_word" CATEGORY_NOT_WORD = "category_not_word"
CATEGORY_LINEBREAK = "category_linebreak"
CATEGORY_NOT_LINEBREAK = "category_not_linebreak"
CATEGORY_LOC_DIGIT = "category_loc_digit"
CATEGORY_LOC_NOT_DIGIT = "category_loc_not_digit"
CATEGORY_LOC_SPACE = "category_loc_space"
CATEGORY_LOC_NOT_SPACE = "category_loc_not_space"
CATEGORY_LOC_WORD = "category_loc_word"
CATEGORY_LOC_NOT_WORD = "category_loc_not_word"
CATEGORY_LOC_LINEBREAK = "category_loc_linebreak"
CATEGORY_LOC_NOT_LINEBREAK = "category_loc_not_linebreak"
CODES = [ OPCODES = [
# failure=0 success=1 (just because it looks better that way :-) # failure=0 success=1 (just because it looks better that way :-)
FAILURE, SUCCESS, FAILURE, SUCCESS,
...@@ -87,45 +98,75 @@ CODES = [ ...@@ -87,45 +98,75 @@ CODES = [
] ]
# convert to dictionary ATCODES = [
c = {} AT_BEGINNING, AT_BEGINNING_LINE, AT_BOUNDARY,
i = 0 AT_NON_BOUNDARY, AT_END, AT_END_LINE
for code in CODES: ]
c[code] = i
i = i + 1 CHCODES = [
CODES = c CATEGORY_DIGIT, CATEGORY_NOT_DIGIT, CATEGORY_SPACE,
CATEGORY_NOT_SPACE, CATEGORY_WORD, CATEGORY_NOT_WORD,
CATEGORY_LINEBREAK, CATEGORY_NOT_LINEBREAK, CATEGORY_LOC_DIGIT,
CATEGORY_LOC_NOT_DIGIT, CATEGORY_LOC_SPACE,
CATEGORY_LOC_NOT_SPACE, CATEGORY_LOC_WORD, CATEGORY_LOC_NOT_WORD,
CATEGORY_LOC_LINEBREAK, CATEGORY_LOC_NOT_LINEBREAK
]
def makedict(list):
    # map every item of `list` to its position (counting from zero).
    # an item that occurs more than once ends up with its last position
    table = {}
    position = 0
    for name in list:
        table[name] = position
        position = position + 1
    return table
# replace each ordered list of names by a dictionary that maps the name
# to its position in that list (see makedict)
OPCODES = makedict(OPCODES)
ATCODES = makedict(ATCODES)
CHCODES = makedict(CHCODES)
# replacement operations for "ignore case" mode # replacement operations for "ignore case" mode
MAP_IGNORE = { OP_IGNORE = {
GROUP: GROUP_IGNORE, GROUP: GROUP_IGNORE,
IN: IN_IGNORE, IN: IN_IGNORE,
LITERAL: LITERAL_IGNORE, LITERAL: LITERAL_IGNORE,
NOT_LITERAL: NOT_LITERAL_IGNORE NOT_LITERAL: NOT_LITERAL_IGNORE
} }
POSITIONS = { AT_MULTILINE = {
AT_BEGINNING: ord("a"), AT_BEGINNING: AT_BEGINNING_LINE,
AT_BOUNDARY: ord("b"), AT_END: AT_END_LINE
AT_NON_BOUNDARY: ord("B"),
AT_END: ord("z"),
} }
CATEGORIES = { CH_LOCALE = {
CATEGORY_DIGIT: ord("d"), CATEGORY_DIGIT: CATEGORY_LOC_DIGIT,
CATEGORY_NOT_DIGIT: ord("D"), CATEGORY_NOT_DIGIT: CATEGORY_LOC_NOT_DIGIT,
CATEGORY_SPACE: ord("s"), CATEGORY_SPACE: CATEGORY_LOC_SPACE,
CATEGORY_NOT_SPACE: ord("S"), CATEGORY_NOT_SPACE: CATEGORY_LOC_NOT_SPACE,
CATEGORY_WORD: ord("w"), CATEGORY_WORD: CATEGORY_LOC_WORD,
CATEGORY_NOT_WORD: ord("W"), CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD,
CATEGORY_LINEBREAK: CATEGORY_LOC_LINEBREAK,
CATEGORY_NOT_LINEBREAK: CATEGORY_LOC_NOT_LINEBREAK
} }
# flags
# each flag is a separate bit; the compiler tests them with
# `flags & SRE_FLAG_...`
SRE_FLAG_TEMPLATE = 1 # NYI (presumably "not yet implemented")
SRE_FLAG_IGNORECASE = 2 # compiler picks the *_IGNORE opcodes (OP_IGNORE)
SRE_FLAG_LOCALE = 4 # compiler picks the CATEGORY_LOC_* codes (CH_LOCALE)
SRE_FLAG_MULTILINE = 8 # compiler picks the AT_*_LINE codes (AT_MULTILINE)
SRE_FLAG_DOTALL = 16 # ANY is emitted as-is instead of "not a linebreak"
SRE_FLAG_VERBOSE = 32 # not tested by the compiler code visible here
if __name__ == "__main__": if __name__ == "__main__":
import string import string
items = CODES.items() def dump(f, d, prefix):
items.sort(lambda a, b: cmp(a[1], b[1])) items = d.items()
items.sort(lambda a, b: cmp(a[1], b[1]))
for k, v in items:
f.write("#define %s_%s %s\n" % (prefix, string.upper(k), v))
f = open("sre_constants.h", "w") f = open("sre_constants.h", "w")
f.write("/* generated by sre_constants.py */\n") f.write("/* generated from sre_constants.py */\n")
for k, v in items: dump(f, OPCODES, "SRE_OP")
f.write("#define SRE_OP_" + string.upper(k) + " " + str(v) + "\n") dump(f, ATCODES, "SRE")
dump(f, CHCODES, "SRE")
f.close() f.close()
print "done" print "done"
...@@ -6,13 +6,16 @@ ...@@ -6,13 +6,16 @@
* simple regular expression matching engine * simple regular expression matching engine
* *
* partial history: * partial history:
* 99-10-24 fl created (bits and pieces from the template matcher) * 99-10-24 fl created (based on the template matcher)
* 99-11-13 fl added categories, branching, and more (0.2) * 99-11-13 fl added categories, branching, and more (0.2)
* 99-11-16 fl some tweaks to compile on non-Windows platforms * 99-11-16 fl some tweaks to compile on non-Windows platforms
* 99-12-18 fl non-literals, generic maximizing repeat (0.3) * 99-12-18 fl non-literals, generic maximizing repeat (0.3)
* 99-02-28 fl tons of changes (not all to the better ;-) (0.4) * 99-02-28 fl tons of changes (not all to the better ;-) (0.4)
* 99-03-06 fl first alpha, sort of (0.5) * 99-03-06 fl first alpha, sort of (0.5)
* 99-03-14 fl removed most compatibility stuff (0.6) * 99-03-14 fl removed most compatibility stuff (0.6)
* 99-05-10 fl towards third alpha (0.8.2)
* 99-05-13 fl added experimental cursor stuff (0.8.3)
* 99-05-27 fl final bug hunt (0.8.4)
* *
* Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
* *
...@@ -26,7 +29,7 @@ ...@@ -26,7 +29,7 @@
#ifndef SRE_RECURSIVE #ifndef SRE_RECURSIVE
char copyright[] = " SRE 0.6 Copyright (c) 1997-2000 by Secret Labs AB "; char copyright[] = " SRE 0.8.4 Copyright (c) 1997-2000 by Secret Labs AB ";
#include "Python.h" #include "Python.h"
...@@ -40,7 +43,7 @@ char copyright[] = " SRE 0.6 Copyright (c) 1997-2000 by Secret Labs AB "; ...@@ -40,7 +43,7 @@ char copyright[] = " SRE 0.6 Copyright (c) 1997-2000 by Secret Labs AB ";
#define INT_MAX 2147483647 #define INT_MAX 2147483647
#endif #endif
#include <ctype.h> /* temporary hack */ #include <ctype.h>
/* defining this one enables tracing */ /* defining this one enables tracing */
#undef DEBUG #undef DEBUG
...@@ -59,61 +62,69 @@ char copyright[] = " SRE 0.6 Copyright (c) 1997-2000 by Secret Labs AB "; ...@@ -59,61 +62,69 @@ char copyright[] = " SRE 0.6 Copyright (c) 1997-2000 by Secret Labs AB ";
#ifdef DEBUG #ifdef DEBUG
#define TRACE(v) printf v #define TRACE(v) printf v
#define PTR(ptr) ((SRE_CHAR*) (ptr) - (SRE_CHAR*) state->beginning)
#else #else
#define TRACE(v) #define TRACE(v)
#endif #endif
#define PTR(ptr) ((SRE_CHAR*) (ptr) - (SRE_CHAR*) state->beginning)
#define SRE_CODE unsigned short /* unsigned short or larger */ #define SRE_CODE unsigned short /* unsigned short or larger */
typedef struct { /* -------------------------------------------------------------------- */
/* string pointers */ /* search engine state */
void* ptr; /* current position (also end of current slice) */
void* beginning; /* start of original string */ /* unicode character predicates */
void* start; /* start of current slice */ #define SRE_TO_LOWER(ch) Py_UNICODE_TOLOWER((Py_UNICODE)(ch))
void* end; /* end of original string */ #define SRE_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
/* character size */ #define SRE_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
int charsize; #define SRE_IS_LINEBREAK(ch) ((ch) == '\n')
/* registers */ /* #define SRE_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch)) */
int marks;
void* mark[64]; /* FIXME: <fl> should be dynamically allocated! */
/* FIXME */
/* backtracking stack */
void** stack;
int stacksize;
int stackbase;
} SRE_STATE;
#if 1 /* FIXME: <fl> fix this one! */
#define SRE_TO_LOWER Py_UNICODE_TOLOWER
#define SRE_IS_DIGIT Py_UNICODE_ISDIGIT
#define SRE_IS_SPACE Py_UNICODE_ISSPACE
#define SRE_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
#else
#define SRE_TO_LOWER(ch) ((ch) < 256 ? tolower((ch)) : ch)
#define SRE_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
#define SRE_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
#define SRE_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0) #define SRE_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
#endif
#define SRE_IS_WORD(ch) (SRE_IS_ALNUM((ch)) || (ch) == '_') #define SRE_IS_WORD(ch) (SRE_IS_ALNUM((ch)) || (ch) == '_')
/* locale-specific character predicates */
#define SRE_LOC_TO_LOWER(ch) ((ch) < 256 ? tolower((ch)) : ch)
#define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
#define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
LOCAL(int) LOCAL(int)
sre_category(SRE_CODE category, unsigned int ch) sre_category(SRE_CODE category, unsigned int ch)
{ {
switch (category) { switch (category) {
case 'd': case SRE_CATEGORY_DIGIT:
return SRE_IS_DIGIT(ch); return SRE_IS_DIGIT(ch);
case 'D': case SRE_CATEGORY_NOT_DIGIT:
return !SRE_IS_DIGIT(ch); return !SRE_IS_DIGIT(ch);
case 's': case SRE_CATEGORY_SPACE:
return SRE_IS_SPACE(ch); return SRE_IS_SPACE(ch);
case 'S': case SRE_CATEGORY_NOT_SPACE:
return !SRE_IS_SPACE(ch); return !SRE_IS_SPACE(ch);
case 'w': case SRE_CATEGORY_WORD:
return SRE_IS_WORD(ch); return SRE_IS_WORD(ch);
case 'W': case SRE_CATEGORY_NOT_WORD:
return !SRE_IS_WORD(ch); return !SRE_IS_WORD(ch);
case SRE_CATEGORY_LINEBREAK:
return SRE_IS_LINEBREAK(ch);
case SRE_CATEGORY_NOT_LINEBREAK:
return !SRE_IS_LINEBREAK(ch);
case SRE_CATEGORY_LOC_DIGIT:
return SRE_LOC_IS_DIGIT(ch);
case SRE_CATEGORY_LOC_NOT_DIGIT:
return !SRE_LOC_IS_DIGIT(ch);
case SRE_CATEGORY_LOC_SPACE:
return SRE_LOC_IS_SPACE(ch);
case SRE_CATEGORY_LOC_NOT_SPACE:
return !SRE_LOC_IS_SPACE(ch);
case SRE_CATEGORY_LOC_WORD:
return SRE_LOC_IS_WORD(ch);
case SRE_CATEGORY_LOC_NOT_WORD:
return !SRE_LOC_IS_WORD(ch);
case SRE_CATEGORY_LOC_LINEBREAK:
return SRE_LOC_IS_LINEBREAK(ch);
case SRE_CATEGORY_LOC_NOT_LINEBREAK:
return !SRE_LOC_IS_LINEBREAK(ch);
} }
return 0; return 0;
} }
...@@ -174,7 +185,7 @@ _stack_extend(SRE_STATE* state, int lo, int hi) ...@@ -174,7 +185,7 @@ _stack_extend(SRE_STATE* state, int lo, int hi)
return 0; return 0;
} }
/* set things up for the 8-bit version */ /* generate 8-bit version */
#define SRE_CHAR unsigned char #define SRE_CHAR unsigned char
#define SRE_AT sre_at #define SRE_AT sre_at
...@@ -192,7 +203,7 @@ _stack_extend(SRE_STATE* state, int lo, int hi) ...@@ -192,7 +203,7 @@ _stack_extend(SRE_STATE* state, int lo, int hi)
#undef SRE_AT #undef SRE_AT
#undef SRE_CHAR #undef SRE_CHAR
/* set things up for the 16-bit unicode version */ /* generate 16-bit unicode version */
#define SRE_CHAR Py_UNICODE #define SRE_CHAR Py_UNICODE
#define SRE_AT sre_uat #define SRE_AT sre_uat
...@@ -211,20 +222,22 @@ _stack_extend(SRE_STATE* state, int lo, int hi) ...@@ -211,20 +222,22 @@ _stack_extend(SRE_STATE* state, int lo, int hi)
LOCAL(int) LOCAL(int)
SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
{ {
/* check if pointer is at given position. return 1 if so, 0 /* check if pointer is at given position */
otherwise */
int this, that; int this, that;
switch (at) { switch (at) {
case 'a': case SRE_AT_BEGINNING:
/* beginning */
return ((void*) ptr == state->beginning); return ((void*) ptr == state->beginning);
case 'z': case SRE_AT_BEGINNING_LINE:
/* end */ return ((void*) ptr == state->beginning ||
SRE_IS_LINEBREAK((int) ptr[-1]));
case SRE_AT_END:
return ((void*) ptr == state->end); return ((void*) ptr == state->end);
case 'b': case SRE_AT_END_LINE:
/* word boundary */ return ((void*) ptr == state->end ||
SRE_IS_LINEBREAK((int) ptr[0]));
case SRE_AT_BOUNDARY:
if (state->beginning == state->end) if (state->beginning == state->end)
return 0; return 0;
that = ((void*) ptr > state->beginning) ? that = ((void*) ptr > state->beginning) ?
...@@ -232,8 +245,7 @@ SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) ...@@ -232,8 +245,7 @@ SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
this = ((void*) ptr < state->end) ? this = ((void*) ptr < state->end) ?
SRE_IS_WORD((int) ptr[0]) : 0; SRE_IS_WORD((int) ptr[0]) : 0;
return this != that; return this != that;
case 'B': case SRE_AT_NON_BOUNDARY:
/* word non-boundary */
if (state->beginning == state->end) if (state->beginning == state->end)
return 0; return 0;
that = ((void*) ptr > state->beginning) ? that = ((void*) ptr > state->beginning) ?
...@@ -249,8 +261,7 @@ SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) ...@@ -249,8 +261,7 @@ SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
LOCAL(int) LOCAL(int)
SRE_MEMBER(SRE_CODE* set, SRE_CHAR ch) SRE_MEMBER(SRE_CODE* set, SRE_CHAR ch)
{ {
/* check if character is a member of the given set. return 1 if /* check if character is a member of the given set */
so, 0 otherwise */
int ok = 1; int ok = 1;
...@@ -301,29 +312,42 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -301,29 +312,42 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
int stackbase; int stackbase;
int i, count; int i, count;
for (;;) { /* FIXME: this is one ugly hack */
void* *mark = NULL;
void* mark_data[64];
TRACE(("[%p]\n", pattern)); for (;;) {
switch (*pattern++) { switch (*pattern++) {
case SRE_OP_FAILURE: case SRE_OP_FAILURE:
/* immediate failure */ /* immediate failure */
TRACE(("%8d: failure\n", PTR(ptr))); TRACE(("%8d: failure\n", PTR(ptr)));
return 0; goto failure;
case SRE_OP_SUCCESS: case SRE_OP_SUCCESS:
/* end of pattern */ /* end of pattern */
TRACE(("%8d: success\n", PTR(ptr))); TRACE(("%8d: success\n", PTR(ptr)));
state->ptr = ptr; state->ptr = ptr;
return 1; goto success;
case SRE_OP_AT: case SRE_OP_AT:
/* match at given position */ /* match at given position */
/* args: <at> */
TRACE(("%8d: match at \\%c\n", PTR(ptr), *pattern)); TRACE(("%8d: match at \\%c\n", PTR(ptr), *pattern));
if (!SRE_AT(state, ptr, *pattern)) if (!SRE_AT(state, ptr, *pattern))
return 0; goto failure;
pattern++;
break;
case SRE_OP_CATEGORY:
/* match at given category */
/* args: <category> */
TRACE(("%8d: category match at \\%c\n", PTR(ptr), *pattern));
if (ptr >= end || !sre_category(pattern[0], ptr[0]))
goto failure;
pattern++; pattern++;
ptr++;
break; break;
case SRE_OP_LITERAL: case SRE_OP_LITERAL:
...@@ -331,7 +355,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -331,7 +355,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* args: <code> */ /* args: <code> */
TRACE(("%8d: literal %c\n", PTR(ptr), (SRE_CHAR) *pattern)); TRACE(("%8d: literal %c\n", PTR(ptr), (SRE_CHAR) *pattern));
if (ptr >= end || *ptr != (SRE_CHAR) *pattern) if (ptr >= end || *ptr != (SRE_CHAR) *pattern)
return 0; goto failure;
pattern++; pattern++;
ptr++; ptr++;
break; break;
...@@ -341,7 +365,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -341,7 +365,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* args: <code> */ /* args: <code> */
TRACE(("%8d: literal not %c\n", PTR(ptr), (SRE_CHAR) *pattern)); TRACE(("%8d: literal not %c\n", PTR(ptr), (SRE_CHAR) *pattern));
if (ptr >= end || *ptr == (SRE_CHAR) *pattern) if (ptr >= end || *ptr == (SRE_CHAR) *pattern)
return 0; goto failure;
pattern++; pattern++;
ptr++; ptr++;
break; break;
...@@ -350,7 +374,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -350,7 +374,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* match anything */ /* match anything */
TRACE(("%8d: any\n", PTR(ptr))); TRACE(("%8d: any\n", PTR(ptr)));
if (ptr >= end) if (ptr >= end)
return 0; goto failure;
ptr++; ptr++;
break; break;
...@@ -359,23 +383,47 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -359,23 +383,47 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* args: <skip> <set> */ /* args: <skip> <set> */
TRACE(("%8d: set %c\n", PTR(ptr), *ptr)); TRACE(("%8d: set %c\n", PTR(ptr), *ptr));
if (ptr >= end || !SRE_MEMBER(pattern + 1, *ptr)) if (ptr >= end || !SRE_MEMBER(pattern + 1, *ptr))
return 0; goto failure;
pattern += pattern[0]; pattern += pattern[0];
ptr++; ptr++;
break; break;
case SRE_OP_GROUP: case SRE_OP_GROUP:
/* match backreference */ /* match backreference */
TRACE(("%8d: group %d\n", PTR(ptr), pattern[0]));
i = pattern[0]; i = pattern[0];
{ {
/* FIXME: optimize size! */ /* FIXME: optimize! */
SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i]; SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1]; SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
TRACE(("%8d: group %p %p\n", PTR(ptr), p, e));
if (!p || !e || e < p) if (!p || !e || e < p)
return 0; goto failure;
while (p < e) { while (p < e) {
TRACE(("%8d: group test %c %c\n", PTR(ptr), *ptr, *p));
if (ptr >= end || *ptr != *p) if (ptr >= end || *ptr != *p)
return 0; goto failure;
p++; ptr++;
}
}
pattern++;
break;
case SRE_OP_GROUP_IGNORE:
/* match backreference */
TRACE(("%8d: group ignore %d\n", PTR(ptr), pattern[0]));
i = pattern[0];
{
/* FIXME: optimize! */
SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
TRACE(("%8d: group %p %p\n", PTR(ptr), p, e));
if (!p || !e || e < p)
goto failure;
while (p < e) {
TRACE(("%8d: group test %c %c\n", PTR(ptr), *ptr, *p));
if (ptr >= end || SRE_TO_LOWER(*ptr) != SRE_TO_LOWER(*p))
goto failure;
p++; ptr++; p++; ptr++;
} }
} }
...@@ -385,7 +433,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -385,7 +433,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
case SRE_OP_LITERAL_IGNORE: case SRE_OP_LITERAL_IGNORE:
TRACE(("%8d: literal lower(%c)\n", PTR(ptr), (SRE_CHAR) *pattern)); TRACE(("%8d: literal lower(%c)\n", PTR(ptr), (SRE_CHAR) *pattern));
if (ptr >= end || SRE_TO_LOWER(*ptr) != (SRE_CHAR) *pattern) if (ptr >= end || SRE_TO_LOWER(*ptr) != (SRE_CHAR) *pattern)
return 0; goto failure;
pattern++; pattern++;
ptr++; ptr++;
break; break;
...@@ -394,7 +442,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -394,7 +442,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: literal not lower(%c)\n", PTR(ptr), TRACE(("%8d: literal not lower(%c)\n", PTR(ptr),
(SRE_CHAR) *pattern)); (SRE_CHAR) *pattern));
if (ptr >= end || SRE_TO_LOWER(*ptr) == (SRE_CHAR) *pattern) if (ptr >= end || SRE_TO_LOWER(*ptr) == (SRE_CHAR) *pattern)
return 0; goto failure;
pattern++; pattern++;
ptr++; ptr++;
break; break;
...@@ -403,7 +451,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -403,7 +451,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: set lower(%c)\n", PTR(ptr), *ptr)); TRACE(("%8d: set lower(%c)\n", PTR(ptr), *ptr));
if (ptr >= end if (ptr >= end
|| !SRE_MEMBER(pattern+1, (SRE_CHAR) SRE_TO_LOWER(*ptr))) || !SRE_MEMBER(pattern+1, (SRE_CHAR) SRE_TO_LOWER(*ptr)))
return 0; goto failure;
pattern += pattern[0]; pattern += pattern[0];
ptr++; ptr++;
break; break;
...@@ -412,7 +460,11 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -412,7 +460,11 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* set mark */ /* set mark */
/* args: <mark> */ /* args: <mark> */
TRACE(("%8d: set mark(%d)\n", PTR(ptr), pattern[0])); TRACE(("%8d: set mark(%d)\n", PTR(ptr), pattern[0]));
state->mark[pattern[0]] = ptr; if (!mark) {
mark = mark_data;
memcpy(mark, state->mark, sizeof(state->mark));
}
state->mark[pattern[0]] = ptr;
pattern++; pattern++;
break; break;
...@@ -429,21 +481,18 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -429,21 +481,18 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: match subpattern\n", PTR(ptr))); TRACE(("%8d: match subpattern\n", PTR(ptr)));
state->ptr = ptr; state->ptr = ptr;
if (!SRE_MATCH(state, pattern + 1)) if (!SRE_MATCH(state, pattern + 1))
return 0; goto failure;
pattern += pattern[0]; pattern += pattern[0];
ptr = state->ptr; ptr = state->ptr;
break; break;
case SRE_OP_MAX_REPEAT_ONE: case SRE_OP_MAX_REPEAT_ONE:
/* match repeated sequence (maximizing regexp) */
/* match repeated sequence (maximizing regexp). this /* this variant only works if the repeated item is exactly
variant only works if the repeated item is exactly one one character wide, and we're not already collecting
character wide, and we're not already collecting backtracking points. for other cases, use the
backtracking points. for other cases, use the
MAX_REPEAT operator instead */ MAX_REPEAT operator instead */
/* args: <skip> <min> <max> <step> */ /* args: <skip> <min> <max> <step> */
TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr), TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr),
pattern[1], pattern[2])); pattern[1], pattern[2]));
...@@ -454,7 +503,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -454,7 +503,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
string, and backtrack from there */ string, and backtrack from there */
/* FIXME: must look for line endings */ /* FIXME: must look for line endings */
if (ptr + pattern[1] > end) if (ptr + pattern[1] > end)
return 0; /* cannot match */ goto failure; /* cannot match */
count = pattern[2]; count = pattern[2];
if (count > end - ptr) if (count > end - ptr)
count = end - ptr; count = end - ptr;
...@@ -515,7 +564,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -515,7 +564,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
while (count < (int) pattern[2]) { while (count < (int) pattern[2]) {
i = SRE_MATCH(state, pattern + 3); i = SRE_MATCH(state, pattern + 3);
if (i < 0) if (i < 0)
return i; goto failure;
if (i == 0) if (i == 0)
break; break;
count++; count++;
...@@ -529,23 +578,20 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -529,23 +578,20 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
string. check if the rest of the pattern matches, and string. check if the rest of the pattern matches, and
backtrack if not. */ backtrack if not. */
/* FIXME: <fl> this is a mess. fix it! */
TRACE(("%8d: repeat %d found\n", PTR(ptr), count)); TRACE(("%8d: repeat %d found\n", PTR(ptr), count));
if (count < (int) pattern[1]) if (count < (int) pattern[1])
return 0; goto failure;
if (pattern[pattern[0]] == SRE_OP_SUCCESS) { if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
/* tail is empty. we're finished */ /* tail is empty. we're finished */
TRACE(("%8d: tail is empty\n", PTR(ptr))); TRACE(("%8d: tail is empty\n", PTR(ptr)));
state->ptr = ptr; state->ptr = ptr;
return 1; goto success;
} else if (pattern[pattern[0]] == SRE_OP_LITERAL) { } else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
/* tail starts with a literal. we can speed things up /* tail starts with a literal. skip positions where
by skipping positions where the rest of the pattern the rest of the pattern cannot possibly match */
cannot possibly match */
SRE_CHAR chr = (SRE_CHAR) pattern[pattern[0]+1]; SRE_CHAR chr = (SRE_CHAR) pattern[pattern[0]+1];
TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr)); TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr));
for (;;) { for (;;) {
...@@ -562,7 +608,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -562,7 +608,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
i = SRE_MATCH(state, pattern + pattern[0]); i = SRE_MATCH(state, pattern + pattern[0]);
if (i > 0) { if (i > 0) {
TRACE(("%8d: repeat %d picked\n", PTR(ptr), count)); TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
return 1; goto success;
} }
TRACE(("%8d: BACKTRACK\n", PTR(ptr))); TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
ptr--; ptr--;
...@@ -570,23 +616,21 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -570,23 +616,21 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
} }
} else { } else {
/* general case */
TRACE(("%8d: tail is pattern\n", PTR(ptr))); TRACE(("%8d: tail is pattern\n", PTR(ptr)));
while (count >= (int) pattern[1]) { while (count >= (int) pattern[1]) {
state->ptr = ptr; state->ptr = ptr;
i = SRE_MATCH(state, pattern + pattern[0]); i = SRE_MATCH(state, pattern + pattern[0]);
if (i > 0) { if (i > 0) {
TRACE(("%8d: repeat %d picked\n", PTR(ptr), count)); TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
return 1; goto success;
} }
TRACE(("%8d: BACKTRACK\n", PTR(ptr))); TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
ptr--; ptr--;
count--; count--;
} }
} }
return 0; /* failure! */ goto failure;
/* ----------------------------------------------------------------------- */
/* FIXME: the following section is just plain broken */
case SRE_OP_MAX_REPEAT: case SRE_OP_MAX_REPEAT:
/* match repeated sequence (maximizing regexp). repeated /* match repeated sequence (maximizing regexp). repeated
...@@ -611,7 +655,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -611,7 +655,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
i = _stack_extend(state, stackbase + count + 1, i = _stack_extend(state, stackbase + count + 1,
stackbase + pattern[2]); stackbase + pattern[2]);
if (i < 0) if (i < 0)
return i; goto failure;
} }
state->stack[stackbase + count] = ptr; state->stack[stackbase + count] = ptr;
/* check if we can match another item */ /* check if we can match another item */
...@@ -642,7 +686,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -642,7 +686,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
ptr points to the tail. */ ptr points to the tail. */
if (count < (int) pattern[1]) if (count < (int) pattern[1])
return 0; goto failure;
/* make sure that rest of the expression matches. if it /* make sure that rest of the expression matches. if it
doesn't, backtrack */ doesn't, backtrack */
...@@ -659,7 +703,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -659,7 +703,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
state->stackbase = stackbase; state->stackbase = stackbase;
if (i > 0) { if (i > 0) {
TRACE(("%8d: repeat %d picked\n", PTR(ptr), count)); TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
return 1; goto success;
} }
/* backtrack! */ /* backtrack! */
...@@ -673,10 +717,10 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -673,10 +717,10 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
state->stackbase = stackbase; state->stackbase = stackbase;
if (i > 0) { if (i > 0) {
TRACE(("%8d: repeat %d picked\n", PTR(ptr), count)); TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
return 1; goto success;
} }
} }
return 0; /* failure! */ goto failure;
case SRE_OP_MAX_UNTIL: case SRE_OP_MAX_UNTIL:
/* match repeated sequence (maximizing regexp). repeated /* match repeated sequence (maximizing regexp). repeated
...@@ -684,13 +728,11 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -684,13 +728,11 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: max until\n", PTR(ptr))); TRACE(("%8d: max until\n", PTR(ptr)));
state->ptr = ptr; state->ptr = ptr;
return 2; /* always succeeds, for now... */ goto success; /* always succeeds, for now... */
/* end of totally broken section */
/* ----------------------------------------------------------------------- */
case SRE_OP_MIN_REPEAT: case SRE_OP_MIN_REPEAT:
/* match repeated sequence (minimizing regexp) */ /* match repeated sequence (minimizing regexp) */
/* FIXME: HERE BE BUGS! */
TRACE(("%8d: min repeat %d %d\n", PTR(ptr), TRACE(("%8d: min repeat %d %d\n", PTR(ptr),
pattern[1], pattern[2])); pattern[1], pattern[2]));
count = 0; count = 0;
...@@ -699,7 +741,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -699,7 +741,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
while (count < (int) pattern[1]) { while (count < (int) pattern[1]) {
i = SRE_MATCH(state, pattern + 3); i = SRE_MATCH(state, pattern + 3);
if (i <= 0) if (i <= 0)
return i; goto failure;
count++; count++;
} }
/* move forward until the tail matches. */ /* move forward until the tail matches. */
...@@ -708,22 +750,22 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -708,22 +750,22 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
i = SRE_MATCH(state, pattern + pattern[0]); i = SRE_MATCH(state, pattern + pattern[0]);
if (i > 0) { if (i > 0) {
TRACE(("%8d: repeat %d picked\n", PTR(ptr), count)); TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
return 1; goto success;
} }
TRACE(("%8d: BACKTRACK\n", PTR(ptr))); TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
state->ptr = ptr; /* backtrack */ state->ptr = ptr; /* backtrack */
i = SRE_MATCH(state, pattern + 3); i = SRE_MATCH(state, pattern + 3);
if (i <= 0) if (i <= 0)
return i; goto failure;
count++; count++;
} }
return 0; /* failure! */ goto failure;
case SRE_OP_MIN_UNTIL: case SRE_OP_MIN_UNTIL:
/* end of repeat group */ /* end of repeat group */
TRACE(("%8d: min until\n", PTR(ptr))); TRACE(("%8d: min until\n", PTR(ptr)));
state->ptr = ptr; state->ptr = ptr;
return 2; /* always succeeds, for now... */ goto success; /* always succeeds, for now... */
case SRE_OP_BRANCH: case SRE_OP_BRANCH:
/* match one of several subpatterns */ /* match one of several subpatterns */
...@@ -737,13 +779,13 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -737,13 +779,13 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
i = SRE_MATCH(state, pattern + 1); i = SRE_MATCH(state, pattern + 1);
if (i > 0) { if (i > 0) {
TRACE(("%8d: branch succeeded\n", PTR(ptr))); TRACE(("%8d: branch succeeded\n", PTR(ptr)));
return 1; goto success;
} }
} }
pattern += *pattern; pattern += *pattern;
} }
TRACE(("%8d: branch failed\n", PTR(ptr))); TRACE(("%8d: branch failed\n", PTR(ptr)));
return 0; /* failure! */ goto failure;
case SRE_OP_REPEAT: case SRE_OP_REPEAT:
/* TEMPLATE: match repeated sequence (no backtracking) */ /* TEMPLATE: match repeated sequence (no backtracking) */
...@@ -758,16 +800,24 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -758,16 +800,24 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
count++; count++;
} }
if (count <= (int) pattern[1]) if (count <= (int) pattern[1])
return 0; goto failure;
TRACE(("%8d: repeat %d matches\n", PTR(ptr), count)); TRACE(("%8d: repeat %d matches\n", PTR(ptr), count));
pattern += pattern[0]; pattern += pattern[0];
ptr = state->ptr; ptr = state->ptr;
break; break;
default: default:
return SRE_ERROR_ILLEGAL; return SRE_ERROR_ILLEGAL;
} }
} }
failure:
if (mark)
memcpy(state->mark, mark, sizeof(state->mark));
return 0;
success:
return 1;
} }
LOCAL(int) LOCAL(int)
...@@ -832,6 +882,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -832,6 +882,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
staticforward PyTypeObject Pattern_Type; staticforward PyTypeObject Pattern_Type;
staticforward PyTypeObject Match_Type; staticforward PyTypeObject Match_Type;
staticforward PyTypeObject Cursor_Type;
static PyObject * static PyObject *
_compile(PyObject* self_, PyObject* args) _compile(PyObject* self_, PyObject* args)
...@@ -841,20 +892,25 @@ _compile(PyObject* self_, PyObject* args) ...@@ -841,20 +892,25 @@ _compile(PyObject* self_, PyObject* args)
PatternObject* self; PatternObject* self;
PyObject* pattern; PyObject* pattern;
int flags = 0;
PyObject* code; PyObject* code;
int groups = 0; int groups = 0;
PyObject* groupindex = NULL; PyObject* groupindex = NULL;
if (!PyArg_ParseTuple(args, "OO!|iO", &pattern, if (!PyArg_ParseTuple(args, "OiO!|iO", &pattern, &flags,
&PyString_Type, &code, &groups, &groupindex)) &PyString_Type, &code,
&groups, &groupindex))
return NULL; return NULL;
self = PyObject_New(PatternObject, &Pattern_Type); self = PyObject_NEW(PatternObject, &Pattern_Type);
if (self == NULL) if (self == NULL)
return NULL; return NULL;
Py_INCREF(pattern); Py_INCREF(pattern);
self->pattern = pattern; self->pattern = pattern;
self->flags = flags;
Py_INCREF(code); Py_INCREF(code);
self->code = code; self->code = code;
...@@ -872,6 +928,69 @@ _getcodesize(PyObject* self_, PyObject* args) ...@@ -872,6 +928,69 @@ _getcodesize(PyObject* self_, PyObject* args)
return Py_BuildValue("i", sizeof(SRE_CODE)); return Py_BuildValue("i", sizeof(SRE_CODE));
} }
LOCAL(PyObject*)
_setup(SRE_STATE* state, PyObject* args)
{
    /* prepare state object from an argument tuple of the form
       (string[, start[, end]]).

       on success, fills in the string pointers, the character size,
       the mark registers and the backtracking stack fields of *state,
       and returns the target string.  the returned pointer is the
       reference borrowed from the argument tuple; the caller must
       INCREF it if it wants to keep it.  on failure, sets a Python
       exception and returns NULL.

       note that state->ptr is not touched here; callers set it
       before running the matcher. */

    PyBufferProcs* buffer;
    int i, count;
    void* ptr;

    PyObject* string;
    int start = 0;
    int end = INT_MAX;
    if (!PyArg_ParseTuple(args, "O|ii", &string, &start, &end))
        return NULL;

    /* get pointer to string buffer (single-segment buffers only) */
    buffer = string->ob_type->tp_as_buffer;
    if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
        buffer->bf_getsegcount(string, NULL) != 1) {
        PyErr_SetString(PyExc_TypeError, "expected read-only buffer");
        return NULL;
    }

    /* determine buffer size (in bytes) */
    count = buffer->bf_getreadbuffer(string, 0, &ptr);
    if (count < 0) {
        /* sanity check */
        PyErr_SetString(PyExc_TypeError, "buffer has negative size");
        return NULL;
    }

    /* determine character size */
    state->charsize = (PyUnicode_Check(string) ? sizeof(Py_UNICODE) : 1);

    /* from here on, count is the buffer length in characters */
    count /= state->charsize;

    /* adjust boundaries: clamp both indexes to [0, count] */
    if (start < 0)
        start = 0;
    else if (start > count)
        start = count;

    if (end < 0)
        end = 0;
    else if (end > count)
        end = count;

    state->beginning = ptr;

    state->start = (void*) ((char*) ptr + start * state->charsize);
    state->end = (void*) ((char*) ptr + end * state->charsize);

    /* clear all mark registers.  the register count is taken from the
       array declaration itself, so this loop cannot get out of sync
       with SRE_STATE.  FIXME: the array should be dynamic! */
    for (i = 0; i < (int) (sizeof(state->mark) / sizeof(state->mark[0])); i++)
        state->mark[i] = NULL;

    /* no backtracking stack has been allocated yet */
    state->stack = NULL;
    state->stackbase = 0;
    state->stacksize = 0;

    return string;
}
static PyObject* static PyObject*
_pattern_new_match(PatternObject* pattern, SRE_STATE* state, _pattern_new_match(PatternObject* pattern, SRE_STATE* state,
PyObject* string, int status) PyObject* string, int status)
...@@ -886,7 +1005,7 @@ _pattern_new_match(PatternObject* pattern, SRE_STATE* state, ...@@ -886,7 +1005,7 @@ _pattern_new_match(PatternObject* pattern, SRE_STATE* state,
if (status > 0) { if (status > 0) {
/* create match object (with room for extra group marks) */ /* create match object (with room for extra group marks) */
match = PyObject_NewVar(MatchObject, &Match_Type, 2*pattern->groups); match = PyObject_NEW_VAR(MatchObject, &Match_Type, 2*pattern->groups);
if (match == NULL) if (match == NULL)
return NULL; return NULL;
...@@ -930,70 +1049,32 @@ _pattern_new_match(PatternObject* pattern, SRE_STATE* state, ...@@ -930,70 +1049,32 @@ _pattern_new_match(PatternObject* pattern, SRE_STATE* state,
return Py_None; return Py_None;
} }
/* -------------------------------------------------------------------- */ static PyObject*
/* pattern methods */ _pattern_cursor(PyObject* pattern, PyObject* args)
LOCAL(PyObject*)
_setup(SRE_STATE* state, PyObject* args)
{ {
/* prepare state object */ /* create search state object */
PyBufferProcs *buffer;
int i, count;
void* ptr;
PyObject* string;
int start = 0;
int end = INT_MAX;
if (!PyArg_ParseTuple(args, "O|ii", &string, &start, &end))
return NULL;
/* get pointer to string buffer */ CursorObject* self;
buffer = string->ob_type->tp_as_buffer; PyObject* string;
if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
buffer->bf_getsegcount(string, NULL) != 1) {
PyErr_SetString(PyExc_TypeError, "expected read-only buffer");
return NULL;
}
/* determine buffer size */
count = buffer->bf_getreadbuffer(string, 0, &ptr);
if (count < 0) {
/* sanity check */
PyErr_SetString(PyExc_TypeError, "buffer has negative size");
return NULL;
}
/* determine character size */
state->charsize = (PyUnicode_Check(string) ? sizeof(Py_UNICODE) : 1);
count /= state->charsize; /* create match object (with room for extra group marks) */
self = PyObject_NEW(CursorObject, &Cursor_Type);
if (self == NULL)
return NULL;
/* adjust boundaries */ string = _setup(&self->state, args);
if (start < 0) if (!string) {
start = 0; /* FIXME: dealloc cursor object */
else if (start > count) return NULL;
start = count; }
if (end < 0)
end = 0;
else if (end > count)
end = count;
state->beginning = ptr;
state->start = (void*) ((char*) ptr + start * state->charsize);
state->end = (void*) ((char*) ptr + end * state->charsize);
/* FIXME: dynamic! */ Py_INCREF(pattern);
for (i = 0; i < 64; i++) self->pattern = pattern;
state->mark[i] = NULL;
state->stack = NULL; Py_INCREF(string);
state->stackbase = 0; self->string = string;
state->stacksize = 0;
return string; return (PyObject*) self;
} }
static void static void
...@@ -1002,7 +1083,7 @@ _pattern_dealloc(PatternObject* self) ...@@ -1002,7 +1083,7 @@ _pattern_dealloc(PatternObject* self)
Py_XDECREF(self->code); Py_XDECREF(self->code);
Py_XDECREF(self->pattern); Py_XDECREF(self->pattern);
Py_XDECREF(self->groupindex); Py_XDECREF(self->groupindex);
PyObject_Del(self); PyMem_DEL(self);
} }
static PyObject* static PyObject*
...@@ -1052,11 +1133,71 @@ _pattern_search(PatternObject* self, PyObject* args) ...@@ -1052,11 +1133,71 @@ _pattern_search(PatternObject* self, PyObject* args)
} }
static PyObject* static PyObject*
_pattern_findall(PatternObject* self, PyObject* args) call(char* function, PyObject* args)
{
PyObject* name;
PyObject* module;
PyObject* func;
PyObject* result;
name = PyString_FromString("sre");
if (!name)
return NULL;
module = PyImport_Import(name);
Py_DECREF(name);
if (!module)
return NULL;
func = PyObject_GetAttrString(module, function);
Py_DECREF(module);
if (!func)
return NULL;
result = PyObject_CallObject(func, args);
Py_DECREF(func);
Py_DECREF(args);
return result;
}
static PyObject*
_pattern_sub(PatternObject* self, PyObject* args)
{ {
/* FIXME: not sure about the semantics here. this is good enough PyObject* template;
for SXP, though... */ PyObject* string;
PyObject* count;
if (!PyArg_ParseTuple(args, "OOO", &template, &string, &count))
return NULL;
/* delegate to Python code */
return call("_sub", Py_BuildValue("OOOO", self, template, string, count));
}
static PyObject*
_pattern_subn(PatternObject* self, PyObject* args)
{
    /* pattern.subn(template, string[, count]): the actual work is
       done by the Python-level helper "_subn" in the sre module, which
       is called as _subn(pattern, template, string, count).

       count is optional and defaults to 0, the same default the
       module-level subn() wrapper uses.  returns whatever the helper
       returns, or NULL with an exception set. */

    PyObject* template;
    PyObject* string;
    PyObject* count = NULL;
    PyObject* callargs;

    if (!PyArg_ParseTuple(args, "OO|O", &template, &string, &count))
        return NULL;

    if (count)
        callargs = Py_BuildValue("OOOO", self, template, string, count);
    else
        callargs = Py_BuildValue("OOOi", self, template, string, 0);

    /* call() DECREFs its argument tuple unconditionally after the
       call, so a failed Py_BuildValue must be caught here */
    if (!callargs)
        return NULL;

    /* delegate to Python code */
    return call("_subn", callargs);
}
static PyObject*
_pattern_split(PatternObject* self, PyObject* args)
{
    /* pattern.split(string[, maxsplit]): the actual work is done by
       the Python-level helper "_split" in the sre module, which is
       called as _split(pattern, string, maxsplit).

       maxsplit is optional and defaults to 0.  returns whatever the
       helper returns, or NULL with an exception set. */

    PyObject* string;
    PyObject* maxsplit = NULL;
    PyObject* callargs;

    if (!PyArg_ParseTuple(args, "O|O", &string, &maxsplit))
        return NULL;

    if (maxsplit)
        callargs = Py_BuildValue("OOO", self, string, maxsplit);
    else
        callargs = Py_BuildValue("OOi", self, string, 0);

    /* call() DECREFs its argument tuple unconditionally after the
       call, so a failed Py_BuildValue must be caught here */
    if (!callargs)
        return NULL;

    /* delegate to Python code */
    return call("_split", callargs);
}
static PyObject*
_pattern_findall(PatternObject* self, PyObject* args)
{
SRE_STATE state; SRE_STATE state;
PyObject* string; PyObject* string;
PyObject* list; PyObject* list;
...@@ -1077,7 +1218,7 @@ _pattern_findall(PatternObject* self, PyObject* args) ...@@ -1077,7 +1218,7 @@ _pattern_findall(PatternObject* self, PyObject* args)
if (state.charsize == 1) { if (state.charsize == 1) {
status = sre_match(&state, PatternObject_GetCode(self)); status = sre_match(&state, PatternObject_GetCode(self));
} else { } else {
status = sre_umatch(&state, PatternObject_GetCode(self)); status = sre_umatch(&state, PatternObject_GetCode(self));
} }
if (status >= 0) { if (status >= 0) {
...@@ -1085,6 +1226,10 @@ _pattern_findall(PatternObject* self, PyObject* args) ...@@ -1085,6 +1226,10 @@ _pattern_findall(PatternObject* self, PyObject* args)
if (status == 0) if (status == 0)
state.ptr = (void*) ((char*) state.start + 1); state.ptr = (void*) ((char*) state.start + 1);
/* FIXME: if one group is defined, slice that group
instead. if multiple groups are defined, add tuple
containing all slices */
item = PySequence_GetSlice( item = PySequence_GetSlice(
string, string,
((char*) state.start - (char*) state.beginning), ((char*) state.start - (char*) state.beginning),
...@@ -1121,7 +1266,12 @@ error: ...@@ -1121,7 +1266,12 @@ error:
static PyMethodDef _pattern_methods[] = { static PyMethodDef _pattern_methods[] = {
{"match", (PyCFunction) _pattern_match, 1}, {"match", (PyCFunction) _pattern_match, 1},
{"search", (PyCFunction) _pattern_search, 1}, {"search", (PyCFunction) _pattern_search, 1},
{"sub", (PyCFunction) _pattern_sub, 1},
{"subn", (PyCFunction) _pattern_subn, 1},
{"split", (PyCFunction) _pattern_split, 1},
{"findall", (PyCFunction) _pattern_findall, 1}, {"findall", (PyCFunction) _pattern_findall, 1},
/* experimental */
{"cursor", (PyCFunction) _pattern_cursor, 1},
{NULL, NULL} {NULL, NULL}
}; };
...@@ -1142,7 +1292,15 @@ _pattern_getattr(PatternObject* self, char* name) ...@@ -1142,7 +1292,15 @@ _pattern_getattr(PatternObject* self, char* name)
Py_INCREF(self->pattern); Py_INCREF(self->pattern);
return self->pattern; return self->pattern;
} }
if (!strcmp(name, "flags"))
return Py_BuildValue("i", self->flags);
if (!strcmp(name, "groupindex") && self->groupindex) {
Py_INCREF(self->groupindex);
return self->groupindex;
}
PyErr_SetString(PyExc_AttributeError, name); PyErr_SetString(PyExc_AttributeError, name);
return NULL; return NULL;
} }
...@@ -1163,7 +1321,7 @@ _match_dealloc(MatchObject* self) ...@@ -1163,7 +1321,7 @@ _match_dealloc(MatchObject* self)
{ {
Py_XDECREF(self->string); Py_XDECREF(self->string);
Py_DECREF(self->pattern); Py_DECREF(self->pattern);
PyObject_Del(self); PyMem_DEL(self);
} }
static PyObject* static PyObject*
...@@ -1244,6 +1402,8 @@ _match_groups(MatchObject* self, PyObject* args) ...@@ -1244,6 +1402,8 @@ _match_groups(MatchObject* self, PyObject* args)
PyObject* result; PyObject* result;
int index; int index;
/* FIXME: <fl> handle default value! */
result = PyTuple_New(self->groups-1); result = PyTuple_New(self->groups-1);
if (!result) if (!result)
return NULL; return NULL;
...@@ -1269,6 +1429,8 @@ _match_groupdict(MatchObject* self, PyObject* args) ...@@ -1269,6 +1429,8 @@ _match_groupdict(MatchObject* self, PyObject* args)
PyObject* keys; PyObject* keys;
int index; int index;
/* FIXME: <fl> handle default value! */
result = PyDict_New(); result = PyDict_New();
if (!result) if (!result)
return NULL; return NULL;
...@@ -1367,7 +1529,8 @@ _match_span(MatchObject* self, PyObject* args) ...@@ -1367,7 +1529,8 @@ _match_span(MatchObject* self, PyObject* args)
if (self->mark[index*2] < 0) { if (self->mark[index*2] < 0) {
Py_INCREF(Py_None); Py_INCREF(Py_None);
return Py_None; Py_INCREF(Py_None);
return Py_BuildValue("OO", Py_None, Py_None);
} }
return Py_BuildValue("ii", self->mark[index*2], self->mark[index*2+1]); return Py_BuildValue("ii", self->mark[index*2], self->mark[index*2+1]);
...@@ -1394,24 +1557,20 @@ _match_getattr(MatchObject* self, char* name) ...@@ -1394,24 +1557,20 @@ _match_getattr(MatchObject* self, char* name)
PyErr_Clear(); PyErr_Clear();
/* attributes! */ /* attributes */
if (!strcmp(name, "string")) { if (!strcmp(name, "string")) {
Py_INCREF(self->string); Py_INCREF(self->string);
return self->string; return self->string;
} }
if (!strcmp(name, "regs"))
/* FIXME: should return the whole list! */
return Py_BuildValue("((i,i))", self->mark[0], self->mark[1]);
if (!strcmp(name, "re")) { if (!strcmp(name, "re")) {
Py_INCREF(self->pattern); Py_INCREF(self->pattern);
return (PyObject*) self->pattern; return (PyObject*) self->pattern;
} }
if (!strcmp(name, "groupindex") && self->pattern->groupindex) {
Py_INCREF(self->pattern->groupindex);
return self->pattern->groupindex;
}
if (!strcmp(name, "pos")) if (!strcmp(name, "pos"))
return Py_BuildValue("i", 0); /* FIXME */ return Py_BuildValue("i", 0); /* FIXME */
if (!strcmp(name, "endpos")) if (!strcmp(name, "endpos"))
return Py_BuildValue("i", 0); /* FIXME */ return Py_BuildValue("i", 0); /* FIXME */
...@@ -1432,6 +1591,106 @@ statichere PyTypeObject Match_Type = { ...@@ -1432,6 +1591,106 @@ statichere PyTypeObject Match_Type = {
(getattrfunc)_match_getattr, /*tp_getattr*/ (getattrfunc)_match_getattr, /*tp_getattr*/
}; };
/* -------------------------------------------------------------------- */
/* cursor methods (experimental) */
static void
_cursor_dealloc(CursorObject* self)
{
    /* destructor for cursor objects: release the resources held by
       the embedded matcher state, drop the references to the target
       string and the pattern, then free the object itself */

    SRE_STATE* state = &self->state;

    /* presumably frees the backtracking stack -- see _stack_free */
    _stack_free(state);

    Py_DECREF(self->string);
    Py_DECREF(self->pattern);

    PyMem_DEL(self);
}
static PyObject*
_cursor_match(CursorObject* self, PyObject* args)
{
    /* cursor.match(): run an anchored match at the cursor's current
       start position, wrap the outcome via _pattern_new_match, and
       then move the start position.  args is not used. */

    SRE_STATE* st = &self->state;
    void* code = PatternObject_GetCode(self->pattern);
    PyObject* result;
    int rc;

    st->ptr = st->start;

    /* pick the 8-bit or the wide-character engine */
    rc = (st->charsize == 1) ? sre_match(st, code) : sre_umatch(st, code);

    result = _pattern_new_match((PatternObject*) self->pattern,
                                st, self->string, rc);

    /* NOTE(review): rc == 0 takes the same branch as rc > 0 here, so
       only a negative status skips ahead by one character -- confirm
       that this is the intended cursor behaviour */
    if (rc < 0)
        st->start = (char*) st->ptr + st->charsize;
    else
        st->start = st->ptr;

    return result;
}
static PyObject*
_cursor_search(CursorObject* self, PyObject* args)
{
    /* cursor.search(): search from the cursor's current start
       position, wrap the outcome via _pattern_new_match, and (unless
       the engine reported a negative status) continue the next call
       from where this one stopped.  args is not used. */

    SRE_STATE* st = &self->state;
    void* code = PatternObject_GetCode(self->pattern);
    PyObject* result;
    int rc;

    st->ptr = st->start;

    /* pick the 8-bit or the wide-character engine */
    rc = (st->charsize == 1) ? sre_search(st, code) : sre_usearch(st, code);

    result = _pattern_new_match((PatternObject*) self->pattern,
                                st, self->string, rc);

    /* leave the start position alone on a negative status */
    if (!(rc < 0))
        st->start = st->ptr;

    return result;
}
/* method table for cursor objects; searched by _cursor_getattr via
Py_FindMethod.  the list is terminated by the {NULL, NULL} entry. */
/* NOTE(review): the flags field is 0 here, while _pattern_methods uses
1 for its entries -- both handlers ignore their args parameter, but
confirm that the difference is intentional */
static PyMethodDef _cursor_methods[] = {
{"match", (PyCFunction) _cursor_match, 0},
{"search", (PyCFunction) _cursor_search, 0},
{NULL, NULL}
};
static PyObject*
_cursor_getattr(CursorObject* self, char* name)
{
    /* attribute lookup for cursor objects: methods first, then the
       "pattern" attribute.  anything else raises AttributeError with
       the attribute name as message. */

    PyObject* method = Py_FindMethod(_cursor_methods, (PyObject*) self, name);
    if (method != NULL)
        return method;

    /* not a method; discard the error left by Py_FindMethod */
    PyErr_Clear();

    /* attributes */
    if (strcmp(name, "pattern") != 0) {
        PyErr_SetString(PyExc_AttributeError, name);
        return NULL;
    }

    /* hand out a new reference to the pattern object */
    Py_INCREF(self->pattern);
    return self->pattern;
}
/* type object for cursor objects.  the type pointer is left NULL in
the static initializer and is patched to &PyType_Type in init_sre().
only the deallocator and the getattr hook are provided. */
statichere PyTypeObject Cursor_Type = {
PyObject_HEAD_INIT(NULL)
0, "Cursor", /* ob_size, tp_name */
sizeof(CursorObject), /* size of basic object */
0, /* tp_itemsize */
(destructor)_cursor_dealloc, /*tp_dealloc*/
0, /*tp_print*/
(getattrfunc)_cursor_getattr, /*tp_getattr*/
};
static PyMethodDef _functions[] = { static PyMethodDef _functions[] = {
{"compile", _compile, 1}, {"compile", _compile, 1},
{"getcodesize", _getcodesize, 1}, {"getcodesize", _getcodesize, 1},
...@@ -1445,7 +1704,8 @@ __declspec(dllexport) ...@@ -1445,7 +1704,8 @@ __declspec(dllexport)
init_sre() init_sre()
{ {
/* Patch object types */ /* Patch object types */
Pattern_Type.ob_type = Match_Type.ob_type = &PyType_Type; Pattern_Type.ob_type = Match_Type.ob_type =
Cursor_Type.ob_type = &PyType_Type;
Py_InitModule("_sre", _functions); Py_InitModule("_sre", _functions);
} }
......
...@@ -14,17 +14,18 @@ ...@@ -14,17 +14,18 @@
#include "sre_constants.h" #include "sre_constants.h"
/* Python objects */
typedef struct { typedef struct {
PyObject_HEAD PyObject_HEAD
PyObject* code; /* link to the code string object */ PyObject* code; /* link to the code string object */
PyObject* pattern; /* link to the pattern source (or None) */
int groups; int groups;
PyObject* groupindex; PyObject* groupindex;
/* compatibility */
PyObject* pattern; /* pattern source (or None) */
int flags; /* flags used when compiling pattern source */
} PatternObject; } PatternObject;
#define PatternObject_GetCode(o) ((void*) PyString_AS_STRING((o)->code)) #define PatternObject_GetCode(o)\
((void*) PyString_AS_STRING(((PatternObject*)(o))->code))
typedef struct { typedef struct {
PyObject_HEAD PyObject_HEAD
...@@ -34,5 +35,28 @@ typedef struct { ...@@ -34,5 +35,28 @@ typedef struct {
int mark[2]; int mark[2];
} MatchObject; } MatchObject;
#endif typedef struct {
/* string pointers */
void* ptr; /* current position (also end of current slice) */
void* beginning; /* start of original string */
void* start; /* start of current slice */
void* end; /* end of original string */
/* character size */
int charsize;
/* registers */
int marks;
void* mark[64]; /* FIXME: <fl> should be dynamically allocated! */
/* backtracking stack */
void** stack;
int stacksize;
int stackbase;
} SRE_STATE;
/* cursor object (experimental): a pattern bound to a target string,
together with the matcher state that is carried over between
successive match/search calls */
typedef struct {
PyObject_HEAD
PyObject* pattern; /* pattern object; reference owned by the cursor */
PyObject* string; /* target string; reference owned by the cursor */
SRE_STATE state; /* matcher state, initialized by _setup */
} CursorObject;
#endif
/* generated by sre_constants.py */ /* generated from sre_constants.py */
#define SRE_OP_FAILURE 0 #define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1 #define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2 #define SRE_OP_ANY 2
...@@ -25,3 +25,25 @@ ...@@ -25,3 +25,25 @@
#define SRE_OP_NEGATE 23 #define SRE_OP_NEGATE 23
#define SRE_OP_RANGE 24 #define SRE_OP_RANGE 24
#define SRE_OP_REPEAT 25 #define SRE_OP_REPEAT 25
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BOUNDARY 2
#define SRE_AT_NON_BOUNDARY 3
#define SRE_AT_END 4
#define SRE_AT_END_LINE 5
#define SRE_CATEGORY_DIGIT 0
#define SRE_CATEGORY_NOT_DIGIT 1
#define SRE_CATEGORY_SPACE 2
#define SRE_CATEGORY_NOT_SPACE 3
#define SRE_CATEGORY_WORD 4
#define SRE_CATEGORY_NOT_WORD 5
#define SRE_CATEGORY_LINEBREAK 6
#define SRE_CATEGORY_NOT_LINEBREAK 7
#define SRE_CATEGORY_LOC_DIGIT 8
#define SRE_CATEGORY_LOC_NOT_DIGIT 9
#define SRE_CATEGORY_LOC_SPACE 10
#define SRE_CATEGORY_LOC_NOT_SPACE 11
#define SRE_CATEGORY_LOC_WORD 12
#define SRE_CATEGORY_LOC_NOT_WORD 13
#define SRE_CATEGORY_LOC_LINEBREAK 14
#define SRE_CATEGORY_LOC_NOT_LINEBREAK 15
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment