Commit 6f013982 authored by Fredrik Lundh

- added lookbehind support (?<=pattern), (?<!pattern).

  the pattern must have a fixed width.

- got rid of array-module dependencies; the match pro-
  gram is now stored inside the pattern object, rather
  than in an extra string buffer.

- cleaned up a variety of potential leaks, api abuses,
  and other minor issues in the engine module.

- use mal's new isalnum macro, rather than my own work-
  around.

- untabified test_sre.py.  seems like I removed a couple
  of trailing spaces in the process...
parent 40c48685
...@@ -10,18 +10,10 @@ ...@@ -10,18 +10,10 @@
# other compatibility work. # other compatibility work.
# #
import array
import _sre import _sre
from sre_constants import * from sre_constants import *
# find an array type code that matches the engine's code size
for WORDSIZE in "Hil":
if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize():
break
else:
raise RuntimeError, "cannot find a useable array type"
MAXCODE = 65535 MAXCODE = 65535
def _charset(charset, fixup): def _charset(charset, fixup):
...@@ -170,7 +162,20 @@ def _compile(code, pattern, flags): ...@@ -170,7 +162,20 @@ def _compile(code, pattern, flags):
emit((group-1)*2+1) emit((group-1)*2+1)
elif op in (SUCCESS, FAILURE): elif op in (SUCCESS, FAILURE):
emit(OPCODES[op]) emit(OPCODES[op])
elif op in (ASSERT, ASSERT_NOT, CALL): elif op in (ASSERT, ASSERT_NOT):
emit(OPCODES[op])
skip = len(code); emit(0)
if av[0] >= 0:
emit(0) # look ahead
else:
lo, hi = av[1].getwidth()
if lo != hi:
raise error, "look-behind requires fixed-width pattern"
emit(lo) # look behind
_compile(code, av[1], flags)
emit(OPCODES[SUCCESS])
code[skip] = len(code) - skip
elif op is CALL:
emit(OPCODES[op]) emit(OPCODES[op])
skip = len(code); emit(0) skip = len(code); emit(0)
_compile(code, av, flags) _compile(code, av, flags)
...@@ -305,7 +310,7 @@ def compile(p, flags=0): ...@@ -305,7 +310,7 @@ def compile(p, flags=0):
indexgroup[i] = k indexgroup[i] = k
return _sre.compile( return _sre.compile(
pattern, flags, pattern, flags, code,
array.array(WORDSIZE, code).tostring(), p.pattern.groups-1,
p.pattern.groups-1, groupindex, indexgroup groupindex, indexgroup
) )
...@@ -482,9 +482,15 @@ def _parse(source, state): ...@@ -482,9 +482,15 @@ def _parse(source, state):
if source.next is None or source.next == ")": if source.next is None or source.next == ")":
break break
source.get() source.get()
elif source.next in ("=", "!"): elif source.next in ("=", "!", "<"):
# lookahead assertions # lookahead assertions
char = source.get() char = source.get()
dir = 1
if char == "<":
if source.next not in ("=", "!"):
raise error, "syntax error"
dir = -1 # lookbehind
char = source.get()
b = [] b = []
while 1: while 1:
p = _parse(source, state) p = _parse(source, state)
...@@ -493,9 +499,9 @@ def _parse(source, state): ...@@ -493,9 +499,9 @@ def _parse(source, state):
b.append(p) b.append(p)
p = _branch(state, b) p = _branch(state, b)
if char == "=": if char == "=":
subpattern.append((ASSERT, p)) subpattern.append((ASSERT, (dir, p)))
else: else:
subpattern.append((ASSERT_NOT, p)) subpattern.append((ASSERT_NOT, (dir, p)))
break break
elif source.match("|"): elif source.match("|"):
b.append(p) b.append(p)
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
* 00-06-30 fl added fast search optimization (0.9.3) * 00-06-30 fl added fast search optimization (0.9.3)
* 00-06-30 fl added assert (lookahead) primitives, etc (0.9.4) * 00-06-30 fl added assert (lookahead) primitives, etc (0.9.4)
* 00-07-02 fl added charset optimizations, etc (0.9.5) * 00-07-02 fl added charset optimizations, etc (0.9.5)
* 00-07-03 fl store code in pattern object, lookbehind, etc
* *
* Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
* *
...@@ -144,14 +145,6 @@ static unsigned int sre_lower_unicode(unsigned int ch) ...@@ -144,14 +145,6 @@ static unsigned int sre_lower_unicode(unsigned int ch)
{ {
return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch)); return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
} }
#if !defined(Py_UNICODE_ISALNUM)
/* FIXME: workaround. should be fixed in unicodectype.c */
#define Py_UNICODE_ISALNUM(ch)\
(Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISUPPER(ch) ||\
Py_UNICODE_ISTITLE(ch) || Py_UNICODE_ISDIGIT(ch))
#endif
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch)) #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch)) #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch)) #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
...@@ -592,7 +585,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -592,7 +585,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* set index */ /* set index */
/* args: <index> */ /* args: <index> */
TRACE(("%8d: set index %d\n", PTR(ptr), pattern[0])); TRACE(("%8d: set index %d\n", PTR(ptr), pattern[0]));
state->index = pattern[0]; state->lastindex = pattern[0];
pattern++; pattern++;
break; break;
...@@ -606,10 +599,12 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -606,10 +599,12 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
case SRE_OP_ASSERT: case SRE_OP_ASSERT:
/* assert subpattern */ /* assert subpattern */
/* args: <skip> <pattern> */ /* args: <skip> <back> <pattern> */
TRACE(("%8d: assert subpattern\n", PTR(ptr))); TRACE(("%8d: assert subpattern %d\n", PTR(ptr), pattern[1]));
state->ptr = ptr; state->ptr = ptr - pattern[1];
i = SRE_MATCH(state, pattern + 1); if (state->ptr < state->beginning)
goto failure;
i = SRE_MATCH(state, pattern + 2);
if (i < 0) if (i < 0)
return i; return i;
if (!i) if (!i)
...@@ -620,9 +615,11 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) ...@@ -620,9 +615,11 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
case SRE_OP_ASSERT_NOT: case SRE_OP_ASSERT_NOT:
/* assert not subpattern */ /* assert not subpattern */
/* args: <skip> <pattern> */ /* args: <skip> <pattern> */
TRACE(("%8d: assert not subpattern\n", PTR(ptr))); TRACE(("%8d: assert not subpattern %d\n", PTR(ptr), pattern[1]));
state->ptr = ptr; state->ptr = ptr - pattern[1];
i = SRE_MATCH(state, pattern + 1); if (state->ptr < state->beginning)
goto failure;
i = SRE_MATCH(state, pattern + 2);
if (i < 0) if (i < 0)
return i; return i;
if (i) if (i)
...@@ -1098,6 +1095,7 @@ _compile(PyObject* self_, PyObject* args) ...@@ -1098,6 +1095,7 @@ _compile(PyObject* self_, PyObject* args)
/* "compile" pattern descriptor to pattern object */ /* "compile" pattern descriptor to pattern object */
PatternObject* self; PatternObject* self;
int i, n;
PyObject* pattern; PyObject* pattern;
int flags = 0; int flags = 0;
...@@ -1105,14 +1103,30 @@ _compile(PyObject* self_, PyObject* args) ...@@ -1105,14 +1103,30 @@ _compile(PyObject* self_, PyObject* args)
int groups = 0; int groups = 0;
PyObject* groupindex = NULL; PyObject* groupindex = NULL;
PyObject* indexgroup = NULL; PyObject* indexgroup = NULL;
if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags, if (!PyArg_ParseTuple(args, "OiO|iOO", &pattern, &flags, &code,
&PyString_Type, &code,
&groups, &groupindex, &indexgroup)) &groups, &groupindex, &indexgroup))
return NULL; return NULL;
self = PyObject_NEW(PatternObject, &Pattern_Type); code = PySequence_Fast(code, "code argument must be a sequence");
if (self == NULL) if (!code)
return NULL;
n = PySequence_Length(code);
self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, 100*n);
if (!self) {
Py_DECREF(code);
return NULL;
}
for (i = 0; i < n; i++) {
PyObject *o = PySequence_Fast_GET_ITEM(code, i);
self->code[i] = (SRE_CODE) PyInt_AsLong(o);
}
Py_DECREF(code);
if (PyErr_Occurred())
return NULL; return NULL;
Py_INCREF(pattern); Py_INCREF(pattern);
...@@ -1120,9 +1134,6 @@ _compile(PyObject* self_, PyObject* args) ...@@ -1120,9 +1134,6 @@ _compile(PyObject* self_, PyObject* args)
self->flags = flags; self->flags = flags;
Py_INCREF(code);
self->code = code;
self->groups = groups; self->groups = groups;
Py_XINCREF(groupindex); Py_XINCREF(groupindex);
...@@ -1217,7 +1228,7 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* args) ...@@ -1217,7 +1228,7 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* args)
for (i = 0; i < SRE_MARK_SIZE; i++) for (i = 0; i < SRE_MARK_SIZE; i++)
state->mark[i] = NULL; state->mark[i] = NULL;
state->index = -1; state->lastindex = -1;
state->stack = NULL; state->stack = NULL;
state->stackbase = 0; state->stackbase = 0;
...@@ -1274,8 +1285,9 @@ pattern_new_match(PatternObject* pattern, SRE_STATE* state, ...@@ -1274,8 +1285,9 @@ pattern_new_match(PatternObject* pattern, SRE_STATE* state,
if (status > 0) { if (status > 0) {
/* create match object (with room for extra group marks) */ /* create match object (with room for extra group marks) */
match = PyObject_NEW_VAR(MatchObject, &Match_Type, 2*pattern->groups); match = PyObject_NEW_VAR(MatchObject, &Match_Type,
if (match == NULL) 2*(pattern->groups+1));
if (!match)
return NULL; return NULL;
Py_INCREF(pattern); Py_INCREF(pattern);
...@@ -1301,7 +1313,10 @@ pattern_new_match(PatternObject* pattern, SRE_STATE* state, ...@@ -1301,7 +1313,10 @@ pattern_new_match(PatternObject* pattern, SRE_STATE* state,
} else } else
match->mark[j+2] = match->mark[j+3] = -1; /* undefined */ match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
match->index = state->index; match->lastindex = state->lastindex;
match->pos = ((char*) state->start - base) / n;
match->endpos = ((char*) state->end - base) / n;
return (PyObject*) match; return (PyObject*) match;
...@@ -1329,12 +1344,12 @@ pattern_scanner(PatternObject* pattern, PyObject* args) ...@@ -1329,12 +1344,12 @@ pattern_scanner(PatternObject* pattern, PyObject* args)
/* create match object (with room for extra group marks) */ /* create match object (with room for extra group marks) */
self = PyObject_NEW(ScannerObject, &Scanner_Type); self = PyObject_NEW(ScannerObject, &Scanner_Type);
if (self == NULL) if (!self)
return NULL; return NULL;
string = state_init(&self->state, pattern, args); string = state_init(&self->state, pattern, args);
if (!string) { if (!string) {
PyObject_DEL(self); PyObject_Del(self);
return NULL; return NULL;
} }
...@@ -1350,10 +1365,9 @@ pattern_scanner(PatternObject* pattern, PyObject* args) ...@@ -1350,10 +1365,9 @@ pattern_scanner(PatternObject* pattern, PyObject* args)
static void static void
pattern_dealloc(PatternObject* self) pattern_dealloc(PatternObject* self)
{ {
Py_XDECREF(self->code);
Py_XDECREF(self->pattern); Py_XDECREF(self->pattern);
Py_XDECREF(self->groupindex); Py_XDECREF(self->groupindex);
PyMem_DEL(self); PyObject_DEL(self);
} }
static PyObject* static PyObject*
...@@ -1614,10 +1628,11 @@ pattern_getattr(PatternObject* self, char* name) ...@@ -1614,10 +1628,11 @@ pattern_getattr(PatternObject* self, char* name)
statichere PyTypeObject Pattern_Type = { statichere PyTypeObject Pattern_Type = {
PyObject_HEAD_INIT(NULL) PyObject_HEAD_INIT(NULL)
0, "SRE_Pattern", sizeof(PatternObject), 0, 0, "SRE_Pattern",
sizeof(PatternObject), sizeof(SRE_CODE),
(destructor)pattern_dealloc, /*tp_dealloc*/ (destructor)pattern_dealloc, /*tp_dealloc*/
0, /*tp_print*/ 0, /*tp_print*/
(getattrfunc)pattern_getattr, /*tp_getattr*/ (getattrfunc)pattern_getattr /*tp_getattr*/
}; };
/* -------------------------------------------------------------------- */ /* -------------------------------------------------------------------- */
...@@ -1628,7 +1643,7 @@ match_dealloc(MatchObject* self) ...@@ -1628,7 +1643,7 @@ match_dealloc(MatchObject* self)
{ {
Py_XDECREF(self->string); Py_XDECREF(self->string);
Py_DECREF(self->pattern); Py_DECREF(self->pattern);
PyMem_DEL(self); PyObject_DEL(self);
} }
static PyObject* static PyObject*
...@@ -1643,31 +1658,40 @@ match_getslice_by_index(MatchObject* self, int index, PyObject* def) ...@@ -1643,31 +1658,40 @@ match_getslice_by_index(MatchObject* self, int index, PyObject* def)
return NULL; return NULL;
} }
if (self->string == Py_None || self->mark[index+index] < 0) { index *= 2;
if (self->string == Py_None || self->mark[index] < 0) {
/* return default value if the string or group is undefined */ /* return default value if the string or group is undefined */
Py_INCREF(def); Py_INCREF(def);
return def; return def;
} }
return PySequence_GetSlice( return PySequence_GetSlice(
self->string, self->mark[index+index], self->mark[index+index+1] self->string, self->mark[index], self->mark[index+1]
); );
} }
static int static int
match_getindex(MatchObject* self, PyObject* index) match_getindex(MatchObject* self, PyObject* index)
{ {
if (!PyInt_Check(index) && self->pattern->groupindex != NULL) { int i;
/* FIXME: resource leak? */
index = PyObject_GetItem(self->pattern->groupindex, index);
if (!index)
return -1;
}
if (PyInt_Check(index)) if (PyInt_Check(index))
return (int) PyInt_AS_LONG(index); return (int) PyInt_AS_LONG(index);
return -1; i = -1;
if (self->pattern->groupindex) {
index = PyObject_GetItem(self->pattern->groupindex, index);
if (index) {
if (PyInt_Check(index))
i = (int) PyInt_AS_LONG(index);
Py_DECREF(index);
} else
PyErr_Clear();
}
return i;
} }
static PyObject* static PyObject*
...@@ -1889,17 +1913,17 @@ match_getattr(MatchObject* self, char* name) ...@@ -1889,17 +1913,17 @@ match_getattr(MatchObject* self, char* name)
if (!strcmp(name, "lastindex")) { if (!strcmp(name, "lastindex")) {
/* experimental */ /* experimental */
if (self->index >= 0) if (self->lastindex >= 0)
return Py_BuildValue("i", self->index); return Py_BuildValue("i", self->lastindex);
Py_INCREF(Py_None); Py_INCREF(Py_None);
return Py_None; return Py_None;
} }
if (!strcmp(name, "lastgroup")) { if (!strcmp(name, "lastgroup")) {
/* experimental */ /* experimental */
if (self->pattern->indexgroup) { if (self->pattern->indexgroup && self->lastindex >= 0) {
PyObject* result = PySequence_GetItem( PyObject* result = PySequence_GetItem(
self->pattern->indexgroup, self->index self->pattern->indexgroup, self->lastindex
); );
if (result) if (result)
return result; return result;
...@@ -1920,10 +1944,10 @@ match_getattr(MatchObject* self, char* name) ...@@ -1920,10 +1944,10 @@ match_getattr(MatchObject* self, char* name)
} }
if (!strcmp(name, "pos")) if (!strcmp(name, "pos"))
return Py_BuildValue("i", 0); /* FIXME */ return Py_BuildValue("i", self->pos);
if (!strcmp(name, "endpos")) if (!strcmp(name, "endpos"))
return Py_BuildValue("i", 0); /* FIXME */ return Py_BuildValue("i", self->endpos);
PyErr_SetString(PyExc_AttributeError, name); PyErr_SetString(PyExc_AttributeError, name);
return NULL; return NULL;
...@@ -1935,11 +1959,10 @@ match_getattr(MatchObject* self, char* name) ...@@ -1935,11 +1959,10 @@ match_getattr(MatchObject* self, char* name)
statichere PyTypeObject Match_Type = { statichere PyTypeObject Match_Type = {
PyObject_HEAD_INIT(NULL) PyObject_HEAD_INIT(NULL)
0, "SRE_Match", 0, "SRE_Match",
sizeof(MatchObject), /* size of basic object */ sizeof(MatchObject), sizeof(int),
sizeof(int), /* space for group item */
(destructor)match_dealloc, /*tp_dealloc*/ (destructor)match_dealloc, /*tp_dealloc*/
0, /*tp_print*/ 0, /*tp_print*/
(getattrfunc)match_getattr, /*tp_getattr*/ (getattrfunc)match_getattr /*tp_getattr*/
}; };
/* -------------------------------------------------------------------- */ /* -------------------------------------------------------------------- */
...@@ -1951,7 +1974,7 @@ scanner_dealloc(ScannerObject* self) ...@@ -1951,7 +1974,7 @@ scanner_dealloc(ScannerObject* self)
state_fini(&self->state); state_fini(&self->state);
Py_DECREF(self->string); Py_DECREF(self->string);
Py_DECREF(self->pattern); Py_DECREF(self->pattern);
PyMem_DEL(self); PyObject_DEL(self);
} }
static PyObject* static PyObject*
...@@ -2041,8 +2064,7 @@ scanner_getattr(ScannerObject* self, char* name) ...@@ -2041,8 +2064,7 @@ scanner_getattr(ScannerObject* self, char* name)
statichere PyTypeObject Scanner_Type = { statichere PyTypeObject Scanner_Type = {
PyObject_HEAD_INIT(NULL) PyObject_HEAD_INIT(NULL)
0, "SRE_Scanner", 0, "SRE_Scanner",
sizeof(ScannerObject), /* size of basic object */ sizeof(ScannerObject), 0,
0,
(destructor)scanner_dealloc, /*tp_dealloc*/ (destructor)scanner_dealloc, /*tp_dealloc*/
0, /*tp_print*/ 0, /*tp_print*/
(getattrfunc)scanner_getattr, /*tp_getattr*/ (getattrfunc)scanner_getattr, /*tp_getattr*/
......
...@@ -17,26 +17,27 @@ ...@@ -17,26 +17,27 @@
#define SRE_CODE unsigned short #define SRE_CODE unsigned short
typedef struct { typedef struct {
PyObject_HEAD PyObject_VAR_HEAD
PyObject* code; /* link to the code string object */
int groups; int groups;
PyObject* groupindex; PyObject* groupindex;
PyObject* indexgroup; PyObject* indexgroup;
/* compatibility */ /* compatibility */
PyObject* pattern; /* pattern source (or None) */ PyObject* pattern; /* pattern source (or None) */
int flags; /* flags used when compiling pattern source */ int flags; /* flags used when compiling pattern source */
/* pattern code */
SRE_CODE code[1];
} PatternObject; } PatternObject;
#define PatternObject_GetCode(o)\ #define PatternObject_GetCode(o) (((PatternObject*)(o))->code)
((void*) PyString_AS_STRING(((PatternObject*)(o))->code))
typedef struct { typedef struct {
PyObject_HEAD PyObject_VAR_HEAD
PyObject* string; /* link to the target string */ PyObject* string; /* link to the target string */
PatternObject* pattern; /* link to the regex (pattern) object */ PatternObject* pattern; /* link to the regex (pattern) object */
int index; /* last index marker seen by the engine (-1 if none) */ int pos, endpos; /* current target slice */
int lastindex; /* last index marker seen by the engine (-1 if none) */
int groups; /* number of groups (start/end marks) */ int groups; /* number of groups (start/end marks) */
int mark[2]; int mark[1];
} MatchObject; } MatchObject;
typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch); typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch);
...@@ -59,7 +60,7 @@ typedef struct { ...@@ -59,7 +60,7 @@ typedef struct {
/* character size */ /* character size */
int charsize; int charsize;
/* registers */ /* registers */
int index; int lastindex;
int lastmark; int lastmark;
void* mark[SRE_MARK_SIZE]; void* mark[SRE_MARK_SIZE];
/* backtracking stack */ /* backtracking stack */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment