Commit e1869838, authored by Fredrik Lundh

final 0.9.8 updates:

-- added REPEAT_ONE operator
-- added ANY_ALL operator (used to represent "(?s).")
parent fb06539e
...@@ -98,7 +98,10 @@ def _compile(pattern, flags=0): ...@@ -98,7 +98,10 @@ def _compile(pattern, flags=0):
return _cache[key] return _cache[key]
except KeyError: except KeyError:
pass pass
try:
p = sre_compile.compile(pattern, flags) p = sre_compile.compile(pattern, flags)
except error, v:
raise error, v # invalid expression
if len(_cache) >= _MAXCACHE: if len(_cache) >= _MAXCACHE:
_cache.clear() _cache.clear()
_cache[key] = p _cache[key] = p
......
...@@ -73,6 +73,13 @@ def _charset(charset, fixup=None): ...@@ -73,6 +73,13 @@ def _charset(charset, fixup=None):
return out return out
return charset return charset
def _simple(av):
    # Tell whether the repeat argument tuple `av` wraps a "simple" operand:
    # one whose width is exactly one (minimum == maximum == 1) and whose
    # first item is not a SUBPATTERN.  The operand is av[2].
    operand = av[2]
    min_width, max_width = operand.getwidth()
    # An operand with a minimum width of zero is rejected outright.
    if min_width == 0:
        raise error("nothing to repeat")
    return (
        min_width == max_width == 1
        and operand[0][0] != SUBPATTERN
    )
def _compile(code, pattern, flags): def _compile(code, pattern, flags):
# internal: compile a (sub)pattern # internal: compile a (sub)pattern
emit = code.append emit = code.append
...@@ -116,10 +123,9 @@ def _compile(code, pattern, flags): ...@@ -116,10 +123,9 @@ def _compile(code, pattern, flags):
code[skip] = len(code) - skip code[skip] = len(code) - skip
elif op is ANY: elif op is ANY:
if flags & SRE_FLAG_DOTALL: if flags & SRE_FLAG_DOTALL:
emit(OPCODES[op]) emit(OPCODES[ANY_ALL])
else: else:
emit(OPCODES[CATEGORY]) emit(OPCODES[ANY])
emit(CHCODES[CATEGORY_NOT_LINEBREAK])
elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT): elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
if flags & SRE_FLAG_TEMPLATE: if flags & SRE_FLAG_TEMPLATE:
raise error, "internal: unsupported template operator" raise error, "internal: unsupported template operator"
...@@ -130,12 +136,7 @@ def _compile(code, pattern, flags): ...@@ -130,12 +136,7 @@ def _compile(code, pattern, flags):
_compile(code, av[2], flags) _compile(code, av[2], flags)
emit(OPCODES[SUCCESS]) emit(OPCODES[SUCCESS])
code[skip] = len(code) - skip code[skip] = len(code) - skip
else: elif _simple(av) and op == MAX_REPEAT:
lo, hi = av[2].getwidth()
if lo == 0:
raise error, "nothing to repeat"
if 0 and lo == hi == 1 and op is MAX_REPEAT:
# FIXME: <fl> fast and wrong (but we'll fix that)
emit(OPCODES[REPEAT_ONE]) emit(OPCODES[REPEAT_ONE])
skip = len(code); emit(0) skip = len(code); emit(0)
emit(av[0]) emit(av[0])
......
...@@ -20,6 +20,7 @@ FAILURE = "failure" ...@@ -20,6 +20,7 @@ FAILURE = "failure"
SUCCESS = "success" SUCCESS = "success"
ANY = "any" ANY = "any"
ANY_ALL = "any_all"
ASSERT = "assert" ASSERT = "assert"
ASSERT_NOT = "assert_not" ASSERT_NOT = "assert_not"
AT = "at" AT = "at"
...@@ -81,7 +82,7 @@ OPCODES = [ ...@@ -81,7 +82,7 @@ OPCODES = [
# failure=0 success=1 (just because it looks better that way :-) # failure=0 success=1 (just because it looks better that way :-)
FAILURE, SUCCESS, FAILURE, SUCCESS,
ANY, ANY, ANY_ALL,
ASSERT, ASSERT_NOT, ASSERT, ASSERT_NOT,
AT, AT,
BRANCH, BRANCH,
......
...@@ -142,7 +142,7 @@ class SubPattern: ...@@ -142,7 +142,7 @@ class SubPattern:
for av in av[1]: for av in av[1]:
l, h = av.getwidth() l, h = av.getwidth()
i = min(i, l) i = min(i, l)
j = min(j, h) j = max(j, h)
lo = lo + i lo = lo + i
hi = hi + j hi = hi + j
elif op is CALL: elif op is CALL:
......
...@@ -448,6 +448,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) ...@@ -448,6 +448,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
int i, count; int i, count;
SRE_REPEAT* rp; SRE_REPEAT* rp;
int lastmark; int lastmark;
SRE_CODE chr;
SRE_REPEAT rep; /* FIXME: <fl> allocate in STATE instead */ SRE_REPEAT rep; /* FIXME: <fl> allocate in STATE instead */
...@@ -525,8 +526,17 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) ...@@ -525,8 +526,17 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
break; break;
case SRE_OP_ANY: case SRE_OP_ANY:
/* match anything */ /* match anything (except a newline) */
/* <ANY> */ /* <ANY> */
TRACE(("%8d: anything (except newline)\n", PTR(ptr)));
if (ptr >= end || SRE_IS_LINEBREAK(ptr[0]))
return 0;
ptr++;
break;
case SRE_OP_ANY_ALL:
/* match anything */
/* <ANY_ALL> */
TRACE(("%8d: anything\n", PTR(ptr))); TRACE(("%8d: anything\n", PTR(ptr)));
if (ptr >= end) if (ptr >= end)
return 0; return 0;
...@@ -695,60 +705,79 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) ...@@ -695,60 +705,79 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr), TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr),
pattern[1], pattern[2])); pattern[1], pattern[2]));
if (ptr + pattern[1] > end)
return 0; /* cannot match */
count = 0; count = 0;
if (pattern[3] == SRE_OP_ANY) { switch (pattern[3]) {
case SRE_OP_ANY:
/* repeated wildcard. */
while (count < (int) pattern[2]) {
if (ptr >= end || SRE_IS_LINEBREAK(ptr[0]))
break;
ptr++;
count++;
}
break;
case SRE_OP_ANY_ALL:
/* repeated wildcard. skip to the end of the target /* repeated wildcard. skip to the end of the target
string, and backtrack from there */ string, and backtrack from there */
/* FIXME: must look for line endings */
if (ptr + pattern[1] > end) if (ptr + pattern[1] > end)
return 0; /* cannot match */ return 0; /* cannot match */
count = pattern[2]; count = pattern[2];
if (count > end - ptr) if (count > end - ptr)
count = end - ptr; count = end - ptr;
ptr += count; ptr += count;
break;
} else if (pattern[3] == SRE_OP_LITERAL) { case SRE_OP_LITERAL:
/* repeated literal */ /* repeated literal */
SRE_CODE chr = pattern[4]; chr = pattern[4];
while (count < (int) pattern[2]) { while (count < (int) pattern[2]) {
if (ptr >= end || (SRE_CODE) ptr[0] != chr) if (ptr >= end || (SRE_CODE) ptr[0] != chr)
break; break;
ptr++; ptr++;
count++; count++;
} }
break;
} else if (pattern[3] == SRE_OP_LITERAL_IGNORE) { case SRE_OP_LITERAL_IGNORE:
/* repeated literal */ /* repeated literal */
SRE_CODE chr = pattern[4]; chr = pattern[4];
while (count < (int) pattern[2]) { while (count < (int) pattern[2]) {
if (ptr >= end || (SRE_CODE) state->lower(*ptr) != chr) if (ptr >= end || (SRE_CODE) state->lower(*ptr) != chr)
break; break;
ptr++; ptr++;
count++; count++;
} }
break;
} else if (pattern[3] == SRE_OP_NOT_LITERAL) { case SRE_OP_NOT_LITERAL:
/* repeated non-literal */ /* repeated non-literal */
SRE_CODE chr = pattern[4]; chr = pattern[4];
while (count < (int) pattern[2]) { while (count < (int) pattern[2]) {
if (ptr >= end || (SRE_CODE) ptr[0] == chr) if (ptr >= end || (SRE_CODE) ptr[0] == chr)
break; break;
ptr++; ptr++;
count++; count++;
} }
break;
} else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) { case SRE_OP_NOT_LITERAL_IGNORE:
/* repeated non-literal */ /* repeated non-literal */
SRE_CODE chr = pattern[4]; chr = pattern[4];
while (count < (int) pattern[2]) { while (count < (int) pattern[2]) {
if (ptr >= end || (SRE_CODE) state->lower(ptr[0]) == chr) if (ptr >= end || (SRE_CODE) state->lower(ptr[0]) == chr)
break; break;
ptr++; ptr++;
count++; count++;
} }
break;
} else if (pattern[3] == SRE_OP_IN) { case SRE_OP_IN:
/* repeated set */ /* repeated set */
while (count < (int) pattern[2]) { while (count < (int) pattern[2]) {
if (ptr >= end || !SRE_MEMBER(pattern + 5, *ptr)) if (ptr >= end || !SRE_MEMBER(pattern + 5, *ptr))
...@@ -756,8 +785,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) ...@@ -756,8 +785,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
ptr++; ptr++;
count++; count++;
} }
break;
} else { default:
/* repeated single character pattern */ /* repeated single character pattern */
state->ptr = ptr; state->ptr = ptr;
while (count < (int) pattern[2]) { while (count < (int) pattern[2]) {
...@@ -770,6 +800,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) ...@@ -770,6 +800,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
} }
state->ptr = ptr; state->ptr = ptr;
ptr += count; ptr += count;
break;
} }
/* when we arrive here, count contains the number of /* when we arrive here, count contains the number of
...@@ -791,7 +822,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) ...@@ -791,7 +822,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
} else if (pattern[pattern[0]] == SRE_OP_LITERAL) { } else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
/* tail starts with a literal. skip positions where /* tail starts with a literal. skip positions where
the rest of the pattern cannot possibly match */ the rest of the pattern cannot possibly match */
SRE_CODE chr = pattern[pattern[0]+1]; chr = pattern[pattern[0]+1];
TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr)); TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr));
for (;;) { for (;;) {
TRACE(("%8d: scan for tail match\n", PTR(ptr))); TRACE(("%8d: scan for tail match\n", PTR(ptr)));
......
...@@ -14,31 +14,32 @@ ...@@ -14,31 +14,32 @@
#define SRE_OP_FAILURE 0 #define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1 #define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2 #define SRE_OP_ANY 2
#define SRE_OP_ASSERT 3 #define SRE_OP_ANY_ALL 3
#define SRE_OP_ASSERT_NOT 4 #define SRE_OP_ASSERT 4
#define SRE_OP_AT 5 #define SRE_OP_ASSERT_NOT 5
#define SRE_OP_BRANCH 6 #define SRE_OP_AT 6
#define SRE_OP_CALL 7 #define SRE_OP_BRANCH 7
#define SRE_OP_CATEGORY 8 #define SRE_OP_CALL 8
#define SRE_OP_CHARSET 9 #define SRE_OP_CATEGORY 9
#define SRE_OP_GROUPREF 10 #define SRE_OP_CHARSET 10
#define SRE_OP_GROUPREF_IGNORE 11 #define SRE_OP_GROUPREF 11
#define SRE_OP_IN 12 #define SRE_OP_GROUPREF_IGNORE 12
#define SRE_OP_IN_IGNORE 13 #define SRE_OP_IN 13
#define SRE_OP_INFO 14 #define SRE_OP_IN_IGNORE 14
#define SRE_OP_JUMP 15 #define SRE_OP_INFO 15
#define SRE_OP_LITERAL 16 #define SRE_OP_JUMP 16
#define SRE_OP_LITERAL_IGNORE 17 #define SRE_OP_LITERAL 17
#define SRE_OP_MARK 18 #define SRE_OP_LITERAL_IGNORE 18
#define SRE_OP_MAX_UNTIL 19 #define SRE_OP_MARK 19
#define SRE_OP_MIN_UNTIL 20 #define SRE_OP_MAX_UNTIL 20
#define SRE_OP_NOT_LITERAL 21 #define SRE_OP_MIN_UNTIL 21
#define SRE_OP_NOT_LITERAL_IGNORE 22 #define SRE_OP_NOT_LITERAL 22
#define SRE_OP_NEGATE 23 #define SRE_OP_NOT_LITERAL_IGNORE 23
#define SRE_OP_RANGE 24 #define SRE_OP_NEGATE 24
#define SRE_OP_REPEAT 25 #define SRE_OP_RANGE 25
#define SRE_OP_REPEAT_ONE 26 #define SRE_OP_REPEAT 26
#define SRE_OP_SUBPATTERN 27 #define SRE_OP_REPEAT_ONE 27
#define SRE_OP_SUBPATTERN 28
#define SRE_AT_BEGINNING 0 #define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1 #define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BOUNDARY 2 #define SRE_AT_BOUNDARY 2
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment