Skip to content
Projeler
Gruplar
Parçacıklar
Yardım
Yükleniyor...
Oturum aç / Kaydol
Gezinmeyi değiştir
C
cpython
Proje
Proje
Ayrıntılar
Etkinlik
Cycle Analytics
Depo (repository)
Depo (repository)
Dosyalar
Kayıtlar (commit)
Dallar (branch)
Etiketler
Katkıda bulunanlar
Grafik
Karşılaştır
Grafikler
Konular (issue)
0
Konular (issue)
0
Liste
Pano
Etiketler
Kilometre Taşları
Birleştirme (merge) Talepleri
0
Birleştirme (merge) Talepleri
0
CI / CD
CI / CD
İş akışları (pipeline)
İşler
Zamanlamalar
Grafikler
Paketler
Paketler
Wiki
Wiki
Parçacıklar
Parçacıklar
Üyeler
Üyeler
Collapse sidebar
Close sidebar
Etkinlik
Grafik
Grafikler
Yeni bir konu (issue) oluştur
İşler
Kayıtlar (commit)
Konu (issue) Panoları
Kenar çubuğunu aç
Batuhan Osman TASKAYA
cpython
Commits
e2ccf560
Kaydet (Commit)
e2ccf560
authored
Eki 10, 2014
tarafından
Serhiy Storchaka
Dosyalara gözat
Seçenekler
Dosyalara Gözat
İndir
Eposta Yamaları
Sade Fark
Issue #19380: Optimized parsing of regular expressions.
üst
5aa47443
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
122 additions
and
150 deletions
+122
-150
sre_parse.py
Lib/sre_parse.py
+119
-149
NEWS
Misc/NEWS
+3
-1
No files found.
Lib/sre_parse.py
Dosyayı görüntüle @
e2ccf560
...
...
@@ -18,12 +18,15 @@ from _sre import MAXREPEAT
SPECIAL_CHARS
=
".
\\
[{()*+?^$|"
REPEAT_CHARS
=
"*+?{"
DIGITS
=
set
(
"0123456789"
)
DIGITS
=
frozen
set
(
"0123456789"
)
OCTDIGITS
=
set
(
"01234567"
)
HEXDIGITS
=
set
(
"0123456789abcdefABCDEF"
)
OCTDIGITS
=
frozen
set
(
"01234567"
)
HEXDIGITS
=
frozen
set
(
"0123456789abcdefABCDEF"
)
WHITESPACE
=
set
(
"
\t\n\r\v\f
"
)
WHITESPACE
=
frozenset
(
"
\t\n\r\v\f
"
)
_REPEATCODES
=
frozenset
((
MIN_REPEAT
,
MAX_REPEAT
))
_UNITCODES
=
frozenset
((
ANY
,
RANGE
,
IN
,
LITERAL
,
NOT_LITERAL
,
CATEGORY
))
ESCAPES
=
{
r"\a"
:
(
LITERAL
,
ord
(
"
\a
"
)),
...
...
@@ -153,11 +156,9 @@ class SubPattern:
self
.
data
.
append
(
code
)
def
getwidth
(
self
):
# determine the width (min, max) for this subpattern
if
self
.
width
:
if
self
.
width
is
not
None
:
return
self
.
width
lo
=
hi
=
0
UNITCODES
=
(
ANY
,
RANGE
,
IN
,
LITERAL
,
NOT_LITERAL
,
CATEGORY
)
REPEATCODES
=
(
MIN_REPEAT
,
MAX_REPEAT
)
for
op
,
av
in
self
.
data
:
if
op
is
BRANCH
:
i
=
MAXREPEAT
-
1
...
...
@@ -176,11 +177,11 @@ class SubPattern:
i
,
j
=
av
[
1
]
.
getwidth
()
lo
=
lo
+
i
hi
=
hi
+
j
elif
op
in
REPEATCODES
:
elif
op
in
_
REPEATCODES
:
i
,
j
=
av
[
2
]
.
getwidth
()
lo
=
lo
+
i
*
av
[
0
]
hi
=
hi
+
j
*
av
[
1
]
elif
op
in
UNITCODES
:
elif
op
in
_
UNITCODES
:
lo
=
lo
+
1
hi
=
hi
+
1
elif
op
==
SUCCESS
:
...
...
@@ -191,34 +192,31 @@ class SubPattern:
class
Tokenizer
:
def
__init__
(
self
,
string
):
self
.
istext
=
isinstance
(
string
,
str
)
if
not
self
.
istext
:
string
=
str
(
string
,
'latin1'
)
self
.
string
=
string
self
.
index
=
0
self
.
__next
()
def
__next
(
self
):
if
self
.
index
>=
len
(
self
.
string
):
index
=
self
.
index
try
:
char
=
self
.
string
[
index
]
except
IndexError
:
self
.
next
=
None
return
char
=
self
.
string
[
self
.
index
:
self
.
index
+
1
]
# Special case for the str8, since indexing returns a integer
# XXX This is only needed for test_bug_926075 in test_re.py
if
char
and
not
self
.
istext
:
char
=
chr
(
char
[
0
])
if
char
==
"
\\
"
:
index
+=
1
try
:
c
=
self
.
string
[
self
.
index
+
1
]
c
har
+=
self
.
string
[
index
]
except
IndexError
:
raise
error
(
"bogus escape (end of line)"
)
if
not
self
.
istext
:
c
=
chr
(
c
)
char
=
char
+
c
self
.
index
=
self
.
index
+
len
(
char
)
self
.
index
=
index
+
1
self
.
next
=
char
def
match
(
self
,
char
,
skip
=
1
):
def
match
(
self
,
char
):
if
char
==
self
.
next
:
if
skip
:
self
.
__next
()
return
1
return
0
self
.
__next
()
return
True
return
False
def
get
(
self
):
this
=
self
.
next
self
.
__next
()
...
...
@@ -232,6 +230,17 @@ class Tokenizer:
result
+=
c
self
.
__next
()
return
result
def
getuntil
(
self
,
terminator
):
result
=
''
while
True
:
c
=
self
.
next
self
.
__next
()
if
c
is
None
:
raise
error
(
"unterminated name"
)
if
c
==
terminator
:
break
result
+=
c
return
result
def
tell
(
self
):
return
self
.
index
,
self
.
next
def
seek
(
self
,
index
):
...
...
@@ -270,7 +279,7 @@ def _class_escape(source, escape):
if
code
:
return
code
code
=
CATEGORIES
.
get
(
escape
)
if
code
and
code
[
0
]
==
IN
:
if
code
and
code
[
0
]
is
IN
:
return
code
try
:
c
=
escape
[
1
:
2
]
...
...
@@ -279,7 +288,7 @@ def _class_escape(source, escape):
escape
+=
source
.
getwhile
(
2
,
HEXDIGITS
)
if
len
(
escape
)
!=
4
:
raise
ValueError
return
LITERAL
,
int
(
escape
[
2
:],
16
)
&
0xff
return
LITERAL
,
int
(
escape
[
2
:],
16
)
elif
c
==
"u"
and
source
.
istext
:
# unicode escape (exactly four digits)
escape
+=
source
.
getwhile
(
4
,
HEXDIGITS
)
...
...
@@ -325,7 +334,7 @@ def _escape(source, escape, state):
escape
+=
source
.
getwhile
(
2
,
HEXDIGITS
)
if
len
(
escape
)
!=
4
:
raise
ValueError
return
LITERAL
,
int
(
escape
[
2
:],
16
)
&
0xff
return
LITERAL
,
int
(
escape
[
2
:],
16
)
elif
c
==
"u"
and
source
.
istext
:
# unicode escape (exactly four digits)
escape
+=
source
.
getwhile
(
4
,
HEXDIGITS
)
...
...
@@ -347,11 +356,11 @@ def _escape(source, escape, state):
elif
c
in
DIGITS
:
# octal escape *or* decimal group reference (sigh)
if
source
.
next
in
DIGITS
:
escape
=
escape
+
source
.
get
()
escape
+=
source
.
get
()
if
(
escape
[
1
]
in
OCTDIGITS
and
escape
[
2
]
in
OCTDIGITS
and
source
.
next
in
OCTDIGITS
):
# got three octal digits; this is an octal escape
escape
=
escape
+
source
.
get
()
escape
+=
source
.
get
()
c
=
int
(
escape
[
1
:],
8
)
if
c
>
0
o377
:
raise
error
(
'octal escape value
%
r outside of '
...
...
@@ -370,22 +379,18 @@ def _escape(source, escape, state):
pass
raise
error
(
"bogus escape:
%
s"
%
repr
(
escape
))
def
_parse_sub
(
source
,
state
,
nested
=
1
):
def
_parse_sub
(
source
,
state
,
nested
=
True
):
# parse an alternation: a|b|c
items
=
[]
itemsappend
=
items
.
append
sourcematch
=
source
.
match
while
1
:
while
True
:
itemsappend
(
_parse
(
source
,
state
))
if
sourcematch
(
"|"
):
continue
if
not
nested
:
if
not
sourcematch
(
"|"
):
break
if
not
source
.
next
or
sourcematch
(
")"
,
0
):
break
else
:
raise
error
(
"pattern not properly closed"
)
if
nested
and
source
.
next
is
not
None
and
source
.
next
!=
")"
:
raise
error
(
"pattern not properly closed"
)
if
len
(
items
)
==
1
:
return
items
[
0
]
...
...
@@ -394,7 +399,7 @@ def _parse_sub(source, state, nested=1):
subpatternappend
=
subpattern
.
append
# check if all items share a common prefix
while
1
:
while
True
:
prefix
=
None
for
item
in
items
:
if
not
item
:
...
...
@@ -414,16 +419,12 @@ def _parse_sub(source, state, nested=1):
# check if the branch can be replaced by a character set
for
item
in
items
:
if
len
(
item
)
!=
1
or
item
[
0
][
0
]
!=
LITERAL
:
if
len
(
item
)
!=
1
or
item
[
0
][
0
]
is
not
LITERAL
:
break
else
:
# we can store this as a character set instead of a
# branch (the compiler may optimize this even more)
set
=
[]
setappend
=
set
.
append
for
item
in
items
:
setappend
(
item
[
0
])
subpatternappend
((
IN
,
set
))
subpatternappend
((
IN
,
[
item
[
0
]
for
item
in
items
]))
return
subpattern
subpattern
.
append
((
BRANCH
,
(
None
,
items
)))
...
...
@@ -433,21 +434,16 @@ def _parse_sub_cond(source, state, condgroup):
item_yes
=
_parse
(
source
,
state
)
if
source
.
match
(
"|"
):
item_no
=
_parse
(
source
,
state
)
if
source
.
match
(
"|"
)
:
if
source
.
next
==
"|"
:
raise
error
(
"conditional backref with more than two branches"
)
else
:
item_no
=
None
if
source
.
next
and
not
source
.
match
(
")"
,
0
)
:
if
source
.
next
is
not
None
and
source
.
next
!=
")"
:
raise
error
(
"pattern not properly closed"
)
subpattern
=
SubPattern
(
state
)
subpattern
.
append
((
GROUPREF_EXISTS
,
(
condgroup
,
item_yes
,
item_no
)))
return
subpattern
_PATTERNENDERS
=
set
(
"|)"
)
_ASSERTCHARS
=
set
(
"=!<"
)
_LOOKBEHINDASSERTCHARS
=
set
(
"=!"
)
_REPEATCODES
=
set
([
MIN_REPEAT
,
MAX_REPEAT
])
def
_parse
(
source
,
state
):
# parse a simple pattern
subpattern
=
SubPattern
(
state
)
...
...
@@ -457,32 +453,35 @@ def _parse(source, state):
sourceget
=
source
.
get
sourcematch
=
source
.
match
_len
=
len
PATTERNENDERS
=
_PATTERNENDERS
ASSERTCHARS
=
_ASSERTCHARS
LOOKBEHINDASSERTCHARS
=
_LOOKBEHINDASSERTCHARS
REPEATCODES
=
_REPEATCODES
_ord
=
ord
verbose
=
state
.
flags
&
SRE_FLAG_VERBOSE
while
1
:
while
True
:
if
source
.
next
in
PATTERNENDERS
:
break
# end of subpattern
this
=
sourceget
()
this
=
source
.
next
if
this
is
None
:
break
# end of pattern
if
this
in
"|)"
:
break
# end of subpattern
sourceget
()
if
state
.
flags
&
SRE_FLAG_VERBOSE
:
if
verbose
:
# skip whitespace and comments
if
this
in
WHITESPACE
:
continue
if
this
==
"#"
:
while
1
:
while
True
:
this
=
sourceget
()
if
this
i
n
(
None
,
"
\n
"
)
:
if
this
i
s
None
or
this
==
"
\n
"
:
break
continue
if
this
and
this
[
0
]
not
in
SPECIAL_CHARS
:
subpatternappend
((
LITERAL
,
ord
(
this
)))
if
this
[
0
]
==
"
\\
"
:
code
=
_escape
(
source
,
this
,
state
)
subpatternappend
(
code
)
elif
this
not
in
SPECIAL_CHARS
:
subpatternappend
((
LITERAL
,
_ord
(
this
)))
elif
this
==
"["
:
# character set
...
...
@@ -494,39 +493,38 @@ def _parse(source, state):
setappend
((
NEGATE
,
None
))
# check remaining characters
start
=
set
[:]
while
1
:
while
True
:
this
=
sourceget
()
if
this
is
None
:
raise
error
(
"unexpected end of regular expression"
)
if
this
==
"]"
and
set
!=
start
:
break
elif
this
and
this
[
0
]
==
"
\\
"
:
elif
this
[
0
]
==
"
\\
"
:
code1
=
_class_escape
(
source
,
this
)
elif
this
:
code1
=
LITERAL
,
ord
(
this
)
else
:
raise
error
(
"unexpected end of regular expression"
)
code1
=
LITERAL
,
_ord
(
this
)
if
sourcematch
(
"-"
):
# potential range
this
=
sourceget
()
if
this
is
None
:
raise
error
(
"unexpected end of regular expression"
)
if
this
==
"]"
:
if
code1
[
0
]
is
IN
:
code1
=
code1
[
1
][
0
]
setappend
(
code1
)
setappend
((
LITERAL
,
ord
(
"-"
)))
setappend
((
LITERAL
,
_
ord
(
"-"
)))
break
elif
this
:
if
this
[
0
]
==
"
\\
"
:
code2
=
_class_escape
(
source
,
this
)
else
:
code2
=
LITERAL
,
ord
(
this
)
if
code1
[
0
]
!=
LITERAL
or
code2
[
0
]
!=
LITERAL
:
raise
error
(
"bad character range"
)
lo
=
code1
[
1
]
hi
=
code2
[
1
]
if
hi
<
lo
:
raise
error
(
"bad character range"
)
setappend
((
RANGE
,
(
lo
,
hi
)))
if
this
[
0
]
==
"
\\
"
:
code2
=
_class_escape
(
source
,
this
)
else
:
raise
error
(
"unexpected end of regular expression"
)
code2
=
LITERAL
,
_ord
(
this
)
if
code1
[
0
]
!=
LITERAL
or
code2
[
0
]
!=
LITERAL
:
raise
error
(
"bad character range"
)
lo
=
code1
[
1
]
hi
=
code2
[
1
]
if
hi
<
lo
:
raise
error
(
"bad character range"
)
setappend
((
RANGE
,
(
lo
,
hi
)))
else
:
if
code1
[
0
]
is
IN
:
code1
=
code1
[
1
][
0
]
...
...
@@ -541,7 +539,7 @@ def _parse(source, state):
# XXX: <fl> should add charmap optimization here
subpatternappend
((
IN
,
set
))
elif
this
and
this
[
0
]
in
REPEAT_CHARS
:
elif
this
in
REPEAT_CHARS
:
# repeat previous item
if
this
==
"?"
:
min
,
max
=
0
,
1
...
...
@@ -552,20 +550,20 @@ def _parse(source, state):
min
,
max
=
1
,
MAXREPEAT
elif
this
==
"{"
:
if
source
.
next
==
"}"
:
subpatternappend
((
LITERAL
,
ord
(
this
)))
subpatternappend
((
LITERAL
,
_
ord
(
this
)))
continue
here
=
source
.
tell
()
min
,
max
=
0
,
MAXREPEAT
lo
=
hi
=
""
while
source
.
next
in
DIGITS
:
lo
=
lo
+
source
.
get
()
lo
+=
source
get
()
if
sourcematch
(
","
):
while
source
.
next
in
DIGITS
:
hi
=
hi
+
sourceget
()
hi
+=
sourceget
()
else
:
hi
=
lo
if
not
sourcematch
(
"}"
):
subpatternappend
((
LITERAL
,
ord
(
this
)))
subpatternappend
((
LITERAL
,
_
ord
(
this
)))
source
.
seek
(
here
)
continue
if
lo
:
...
...
@@ -587,7 +585,7 @@ def _parse(source, state):
item
=
None
if
not
item
or
(
_len
(
item
)
==
1
and
item
[
0
][
0
]
==
AT
):
raise
error
(
"nothing to repeat"
)
if
item
[
0
][
0
]
in
REPEATCODES
:
if
item
[
0
][
0
]
in
_
REPEATCODES
:
raise
error
(
"multiple repeat"
)
if
sourcematch
(
"?"
):
subpattern
[
-
1
]
=
(
MIN_REPEAT
,
(
min
,
max
,
item
))
...
...
@@ -604,18 +602,14 @@ def _parse(source, state):
if
sourcematch
(
"?"
):
group
=
0
# options
if
sourcematch
(
"P"
):
char
=
sourceget
()
if
char
is
None
:
raise
error
(
"unexpected end of pattern"
)
if
char
==
"P"
:
# python extensions
if
sourcematch
(
"<"
):
# named group: skip forward to end of name
name
=
""
while
1
:
char
=
sourceget
()
if
char
is
None
:
raise
error
(
"unterminated name"
)
if
char
==
">"
:
break
name
=
name
+
char
name
=
source
.
getuntil
(
">"
)
group
=
1
if
not
name
:
raise
error
(
"missing group name"
)
...
...
@@ -623,14 +617,7 @@ def _parse(source, state):
raise
error
(
"bad character in group name
%
r"
%
name
)
elif
sourcematch
(
"="
):
# named backreference
name
=
""
while
1
:
char
=
sourceget
()
if
char
is
None
:
raise
error
(
"unterminated name"
)
if
char
==
")"
:
break
name
=
name
+
char
name
=
source
.
getuntil
(
")"
)
if
not
name
:
raise
error
(
"missing group name"
)
if
not
name
.
isidentifier
():
...
...
@@ -647,27 +634,25 @@ def _parse(source, state):
if
char
is
None
:
raise
error
(
"unexpected end of pattern"
)
raise
error
(
"unknown specifier: ?P
%
s"
%
char
)
elif
sourcematch
(
":"
)
:
elif
char
==
":"
:
# non-capturing group
group
=
2
elif
sourcematch
(
"#"
)
:
elif
char
==
"#"
:
# comment
while
1
:
if
source
.
next
is
None
or
source
.
next
==
")"
:
while
True
:
if
source
.
next
is
None
:
raise
error
(
"unbalanced parenthesis"
)
if
sourceget
()
==
")"
:
break
sourceget
()
if
not
sourcematch
(
")"
):
raise
error
(
"unbalanced parenthesis"
)
continue
elif
source
.
next
in
ASSERTCHARS
:
elif
char
in
"=!<"
:
# lookahead assertions
char
=
sourceget
()
dir
=
1
if
char
==
"<"
:
if
source
.
next
not
in
LOOKBEHINDASSERTCHARS
:
char
=
sourceget
()
if
char
is
None
or
char
not
in
"=!"
:
raise
error
(
"syntax error"
)
dir
=
-
1
# lookbehind
char
=
sourceget
()
p
=
_parse_sub
(
source
,
state
)
if
not
sourcematch
(
")"
):
raise
error
(
"unbalanced parenthesis"
)
...
...
@@ -676,16 +661,9 @@ def _parse(source, state):
else
:
subpatternappend
((
ASSERT_NOT
,
(
dir
,
p
)))
continue
elif
sourcematch
(
"("
)
:
elif
char
==
"("
:
# conditional backreference group
condname
=
""
while
1
:
char
=
sourceget
()
if
char
is
None
:
raise
error
(
"unterminated name"
)
if
char
==
")"
:
break
condname
=
condname
+
char
condname
=
source
.
getuntil
(
")"
)
group
=
2
if
not
condname
:
raise
error
(
"missing group name"
)
...
...
@@ -705,12 +683,14 @@ def _parse(source, state):
raise
error
(
"bad group number"
)
if
condgroup
>=
MAXGROUPS
:
raise
error
(
"the group number is too large"
)
el
se
:
el
if
char
in
FLAGS
:
# flags
if
not
source
.
next
in
FLAGS
:
raise
error
(
"unexpected end of pattern"
)
state
.
flags
|=
FLAGS
[
char
]
while
source
.
next
in
FLAGS
:
state
.
flags
=
state
.
flags
|
FLAGS
[
sourceget
()]
state
.
flags
|=
FLAGS
[
sourceget
()]
verbose
=
state
.
flags
&
SRE_FLAG_VERBOSE
else
:
raise
error
(
"unexpected end of pattern "
+
char
)
if
group
:
# parse group contents
if
group
==
2
:
...
...
@@ -728,7 +708,7 @@ def _parse(source, state):
state
.
closegroup
(
group
)
subpatternappend
((
SUBPATTERN
,
(
group
,
p
)))
else
:
while
1
:
while
True
:
char
=
sourceget
()
if
char
is
None
:
raise
error
(
"unexpected end of pattern"
)
...
...
@@ -742,10 +722,6 @@ def _parse(source, state):
elif
this
==
"$"
:
subpattern
.
append
((
AT
,
AT_END
))
elif
this
and
this
[
0
]
==
"
\\
"
:
code
=
_escape
(
source
,
this
,
state
)
subpatternappend
(
code
)
else
:
raise
error
(
"parser error"
)
...
...
@@ -776,11 +752,11 @@ def parse(str, flags=0, pattern=None):
p
=
_parse_sub
(
source
,
pattern
,
0
)
p
.
pattern
.
flags
=
fix_flags
(
str
,
p
.
pattern
.
flags
)
tail
=
source
.
get
()
if
tail
==
")"
:
raise
error
(
"unbalanced parenthesis"
)
elif
tail
:
raise
error
(
"bogus characters at end of regular expression"
)
if
source
.
next
is
not
None
:
if
source
.
next
==
")"
:
raise
error
(
"unbalanced parenthesis"
)
else
:
raise
error
(
"bogus characters at end of regular expression"
)
if
flags
&
SRE_FLAG_DEBUG
:
p
.
dump
()
...
...
@@ -817,13 +793,7 @@ def parse_template(source, pattern):
if
c
==
"g"
:
name
=
""
if
s
.
match
(
"<"
):
while
True
:
char
=
sget
()
if
char
is
None
:
raise
error
(
"unterminated group name"
)
if
char
==
">"
:
break
name
+=
char
name
=
s
.
getuntil
(
">"
)
if
not
name
:
raise
error
(
"missing group name"
)
try
:
...
...
Misc/NEWS
Dosyayı görüntüle @
e2ccf560
...
...
@@ -166,7 +166,9 @@ Core and Builtins
Library
-------
-
Issue
1519638
:
Now
unmatched
groups
are
replaced
with
empty
strings
in
re
.
sub
()
-
Issue
#
19380
:
Optimized
parsing
of
regular
expressions
.
-
Issue
#
1519638
:
Now
unmatched
groups
are
replaced
with
empty
strings
in
re
.
sub
()
and
re
.
subn
().
-
Issue
#
18615
:
sndhdr
.
what
/
whathdr
now
return
a
namedtuple
.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment