Kaydet (Commit) 66dc4648 authored tarafından Serhiy Storchaka's avatar Serhiy Storchaka

Issue #24426: Fast searching optimization in regular expressions now works

for patterns that starts with capturing groups.  Fast searching optimization
now can't be disabled at compile time.
üst 6ee588f1
...@@ -409,57 +409,39 @@ def _generate_overlap_table(prefix): ...@@ -409,57 +409,39 @@ def _generate_overlap_table(prefix):
table[i] = idx + 1 table[i] = idx + 1
return table return table
def _compile_info(code, pattern, flags): def _get_literal_prefix(pattern):
# internal: compile an info block. in the current version, # look for literal prefix
# this contains min/max pattern width, and an optional literal
# prefix or a character map
lo, hi = pattern.getwidth()
if hi > MAXCODE:
hi = MAXCODE
if lo == 0:
code.extend([INFO, 4, 0, lo, hi])
return
# look for a literal prefix
prefix = [] prefix = []
prefixappend = prefix.append prefixappend = prefix.append
prefix_skip = 0 prefix_skip = None
got_all = True
for op, av in pattern.data:
if op is LITERAL:
prefixappend(av)
elif op is SUBPATTERN:
prefix1, prefix_skip1, got_all = _get_literal_prefix(av[1])
if prefix_skip is None:
if av[0] is not None:
prefix_skip = len(prefix)
elif prefix_skip1 is not None:
prefix_skip = len(prefix) + prefix_skip1
prefix.extend(prefix1)
if not got_all:
break
else:
got_all = False
break
return prefix, prefix_skip, got_all
def _get_charset_prefix(pattern):
charset = [] # not used charset = [] # not used
charsetappend = charset.append charsetappend = charset.append
if not (flags & SRE_FLAG_IGNORECASE): if pattern.data:
# look for literal prefix op, av = pattern.data[0]
for op, av in pattern.data: if op is SUBPATTERN and av[1]:
op, av = av[1][0]
if op is LITERAL: if op is LITERAL:
if len(prefix) == prefix_skip: charsetappend((op, av))
prefix_skip = prefix_skip + 1
prefixappend(av)
elif op is SUBPATTERN and len(av[1]) == 1:
op, av = av[1][0]
if op is LITERAL:
prefixappend(av)
else:
break
else:
break
# if no prefix, look for charset prefix
if not prefix and pattern.data:
op, av = pattern.data[0]
if op is SUBPATTERN and av[1]:
op, av = av[1][0]
if op is LITERAL:
charsetappend((op, av))
elif op is BRANCH:
c = []
cappend = c.append
for p in av[1]:
if not p:
break
op, av = p[0]
if op is LITERAL:
cappend((op, av))
else:
break
else:
charset = c
elif op is BRANCH: elif op is BRANCH:
c = [] c = []
cappend = c.append cappend = c.append
...@@ -473,8 +455,43 @@ def _compile_info(code, pattern, flags): ...@@ -473,8 +455,43 @@ def _compile_info(code, pattern, flags):
break break
else: else:
charset = c charset = c
elif op is IN: elif op is BRANCH:
charset = av c = []
cappend = c.append
for p in av[1]:
if not p:
break
op, av = p[0]
if op is LITERAL:
cappend((op, av))
else:
break
else:
charset = c
elif op is IN:
charset = av
return charset
def _compile_info(code, pattern, flags):
# internal: compile an info block. in the current version,
# this contains min/max pattern width, and an optional literal
# prefix or a character map
lo, hi = pattern.getwidth()
if hi > MAXCODE:
hi = MAXCODE
if lo == 0:
code.extend([INFO, 4, 0, lo, hi])
return
# look for a literal prefix
prefix = []
prefix_skip = 0
charset = [] # not used
if not (flags & SRE_FLAG_IGNORECASE):
# look for literal prefix
prefix, prefix_skip, got_all = _get_literal_prefix(pattern)
# if no prefix, look for charset prefix
if not prefix:
charset = _get_charset_prefix(pattern)
## if prefix: ## if prefix:
## print("*** PREFIX", prefix, prefix_skip) ## print("*** PREFIX", prefix, prefix_skip)
## if charset: ## if charset:
...@@ -487,7 +504,7 @@ def _compile_info(code, pattern, flags): ...@@ -487,7 +504,7 @@ def _compile_info(code, pattern, flags):
mask = 0 mask = 0
if prefix: if prefix:
mask = SRE_INFO_PREFIX mask = SRE_INFO_PREFIX
if len(prefix) == prefix_skip == len(pattern.data): if prefix_skip is None and got_all:
mask = mask | SRE_INFO_LITERAL mask = mask | SRE_INFO_LITERAL
elif charset: elif charset:
mask = mask | SRE_INFO_CHARSET mask = mask | SRE_INFO_CHARSET
...@@ -502,6 +519,8 @@ def _compile_info(code, pattern, flags): ...@@ -502,6 +519,8 @@ def _compile_info(code, pattern, flags):
# add literal prefix # add literal prefix
if prefix: if prefix:
emit(len(prefix)) # length emit(len(prefix)) # length
if prefix_skip is None:
prefix_skip = len(prefix)
emit(prefix_skip) # skip emit(prefix_skip) # skip
code.extend(prefix) code.extend(prefix)
# generate overlap table # generate overlap table
......
...@@ -13,6 +13,10 @@ Core and Builtins ...@@ -13,6 +13,10 @@ Core and Builtins
Library Library
------- -------
- Issue #24426: Fast searching optimization in regular expressions now works
for patterns that starts with capturing groups. Fast searching optimization
now can't be disabled at compile time.
Documentation Documentation
------------- -------------
......
...@@ -62,9 +62,6 @@ static char copyright[] = ...@@ -62,9 +62,6 @@ static char copyright[] =
/* -------------------------------------------------------------------- */ /* -------------------------------------------------------------------- */
/* optional features */ /* optional features */
/* enables fast searching */
#define USE_FAST_SEARCH
/* enables copy/deepcopy handling (work in progress) */ /* enables copy/deepcopy handling (work in progress) */
#undef USE_BUILTIN_COPY #undef USE_BUILTIN_COPY
......
...@@ -1248,7 +1248,32 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) ...@@ -1248,7 +1248,32 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
prefix, prefix_len, prefix_skip)); prefix, prefix_len, prefix_skip));
TRACE(("charset = %p\n", charset)); TRACE(("charset = %p\n", charset));
#if defined(USE_FAST_SEARCH) if (prefix_len == 1) {
/* pattern starts with a literal character */
SRE_CHAR c = (SRE_CHAR) prefix[0];
#if SIZEOF_SRE_CHAR < 4
if ((SRE_CODE) c != prefix[0])
return 0; /* literal can't match: doesn't fit in char width */
#endif
end = (SRE_CHAR *)state->end;
while (ptr < end) {
while (*ptr != c) {
if (++ptr >= end)
return 0;
}
TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
state->start = ptr;
state->ptr = ptr + prefix_skip;
if (flags & SRE_INFO_LITERAL)
return 1; /* we got all of it */
status = SRE(match)(state, pattern + 2*prefix_skip, 0);
if (status != 0)
return status;
++ptr;
}
return 0;
}
if (prefix_len > 1) { if (prefix_len > 1) {
/* pattern starts with a known prefix. use the overlap /* pattern starts with a known prefix. use the overlap
table to skip forward as fast as we possibly can */ table to skip forward as fast as we possibly can */
...@@ -1297,32 +1322,8 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) ...@@ -1297,32 +1322,8 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
} }
return 0; return 0;
} }
#endif
if (pattern[0] == SRE_OP_LITERAL) { if (charset) {
/* pattern starts with a literal character. this is used
for short prefixes, and if fast search is disabled */
SRE_CHAR c = (SRE_CHAR) pattern[1];
#if SIZEOF_SRE_CHAR < 4
if ((SRE_CODE) c != pattern[1])
return 0; /* literal can't match: doesn't fit in char width */
#endif
end = (SRE_CHAR *)state->end;
while (ptr < end) {
while (*ptr != c) {
if (++ptr >= end)
return 0;
}
TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
state->start = ptr;
state->ptr = ++ptr;
if (flags & SRE_INFO_LITERAL)
return 1; /* we got all of it */
status = SRE(match)(state, pattern + 2, 0);
if (status != 0)
break;
}
} else if (charset) {
/* pattern starts with a character from a known set */ /* pattern starts with a character from a known set */
end = (SRE_CHAR *)state->end; end = (SRE_CHAR *)state->end;
for (;;) { for (;;) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment