Skip to content
Projeler
Gruplar
Parçacıklar
Yardım
Yükleniyor...
Oturum aç / Kaydol
Gezinmeyi değiştir
C
cpython
Proje
Proje
Ayrıntılar
Etkinlik
Cycle Analytics
Depo (repository)
Depo (repository)
Dosyalar
Kayıtlar (commit)
Dallar (branch)
Etiketler
Katkıda bulunanlar
Grafik
Karşılaştır
Grafikler
Konular (issue)
0
Konular (issue)
0
Liste
Pano
Etiketler
Kilometre Taşları
Birleştirme (merge) Talepleri
0
Birleştirme (merge) Talepleri
0
CI / CD
CI / CD
İş akışları (pipeline)
İşler
Zamanlamalar
Grafikler
Paketler
Paketler
Wiki
Wiki
Parçacıklar
Parçacıklar
Üyeler
Üyeler
Collapse sidebar
Close sidebar
Etkinlik
Grafik
Grafikler
Yeni bir konu (issue) oluştur
İşler
Kayıtlar (commit)
Konu (issue) Panoları
Kenar çubuğunu aç
Batuhan Osman TASKAYA
cpython
Commits
e2ccf560
Kaydet (Commit)
e2ccf560
authored
Eki 10, 2014
tarafından
Serhiy Storchaka
Dosyalara gözat
Seçenekler
Dosyalara Gözat
İndir
Eposta Yamaları
Sade Fark
Issue #19380: Optimized parsing of regular expressions.
üst
5aa47443
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
122 additions
and
150 deletions
+122
-150
sre_parse.py
Lib/sre_parse.py
+119
-149
NEWS
Misc/NEWS
+3
-1
No files found.
Lib/sre_parse.py
Dosyayı görüntüle @
e2ccf560
...
@@ -18,12 +18,15 @@ from _sre import MAXREPEAT
...
@@ -18,12 +18,15 @@ from _sre import MAXREPEAT
SPECIAL_CHARS
=
".
\\
[{()*+?^$|"
SPECIAL_CHARS
=
".
\\
[{()*+?^$|"
REPEAT_CHARS
=
"*+?{"
REPEAT_CHARS
=
"*+?{"
DIGITS
=
set
(
"0123456789"
)
DIGITS
=
frozen
set
(
"0123456789"
)
OCTDIGITS
=
set
(
"01234567"
)
OCTDIGITS
=
frozen
set
(
"01234567"
)
HEXDIGITS
=
set
(
"0123456789abcdefABCDEF"
)
HEXDIGITS
=
frozen
set
(
"0123456789abcdefABCDEF"
)
WHITESPACE
=
set
(
"
\t\n\r\v\f
"
)
WHITESPACE
=
frozenset
(
"
\t\n\r\v\f
"
)
_REPEATCODES
=
frozenset
((
MIN_REPEAT
,
MAX_REPEAT
))
_UNITCODES
=
frozenset
((
ANY
,
RANGE
,
IN
,
LITERAL
,
NOT_LITERAL
,
CATEGORY
))
ESCAPES
=
{
ESCAPES
=
{
r"\a"
:
(
LITERAL
,
ord
(
"
\a
"
)),
r"\a"
:
(
LITERAL
,
ord
(
"
\a
"
)),
...
@@ -153,11 +156,9 @@ class SubPattern:
...
@@ -153,11 +156,9 @@ class SubPattern:
self
.
data
.
append
(
code
)
self
.
data
.
append
(
code
)
def
getwidth
(
self
):
def
getwidth
(
self
):
# determine the width (min, max) for this subpattern
# determine the width (min, max) for this subpattern
if
self
.
width
:
if
self
.
width
is
not
None
:
return
self
.
width
return
self
.
width
lo
=
hi
=
0
lo
=
hi
=
0
UNITCODES
=
(
ANY
,
RANGE
,
IN
,
LITERAL
,
NOT_LITERAL
,
CATEGORY
)
REPEATCODES
=
(
MIN_REPEAT
,
MAX_REPEAT
)
for
op
,
av
in
self
.
data
:
for
op
,
av
in
self
.
data
:
if
op
is
BRANCH
:
if
op
is
BRANCH
:
i
=
MAXREPEAT
-
1
i
=
MAXREPEAT
-
1
...
@@ -176,11 +177,11 @@ class SubPattern:
...
@@ -176,11 +177,11 @@ class SubPattern:
i
,
j
=
av
[
1
]
.
getwidth
()
i
,
j
=
av
[
1
]
.
getwidth
()
lo
=
lo
+
i
lo
=
lo
+
i
hi
=
hi
+
j
hi
=
hi
+
j
elif
op
in
REPEATCODES
:
elif
op
in
_
REPEATCODES
:
i
,
j
=
av
[
2
]
.
getwidth
()
i
,
j
=
av
[
2
]
.
getwidth
()
lo
=
lo
+
i
*
av
[
0
]
lo
=
lo
+
i
*
av
[
0
]
hi
=
hi
+
j
*
av
[
1
]
hi
=
hi
+
j
*
av
[
1
]
elif
op
in
UNITCODES
:
elif
op
in
_
UNITCODES
:
lo
=
lo
+
1
lo
=
lo
+
1
hi
=
hi
+
1
hi
=
hi
+
1
elif
op
==
SUCCESS
:
elif
op
==
SUCCESS
:
...
@@ -191,34 +192,31 @@ class SubPattern:
...
@@ -191,34 +192,31 @@ class SubPattern:
class
Tokenizer
:
class
Tokenizer
:
def
__init__
(
self
,
string
):
def
__init__
(
self
,
string
):
self
.
istext
=
isinstance
(
string
,
str
)
self
.
istext
=
isinstance
(
string
,
str
)
if
not
self
.
istext
:
string
=
str
(
string
,
'latin1'
)
self
.
string
=
string
self
.
string
=
string
self
.
index
=
0
self
.
index
=
0
self
.
__next
()
self
.
__next
()
def
__next
(
self
):
def
__next
(
self
):
if
self
.
index
>=
len
(
self
.
string
):
index
=
self
.
index
try
:
char
=
self
.
string
[
index
]
except
IndexError
:
self
.
next
=
None
self
.
next
=
None
return
return
char
=
self
.
string
[
self
.
index
:
self
.
index
+
1
]
# Special case for the str8, since indexing returns a integer
# XXX This is only needed for test_bug_926075 in test_re.py
if
char
and
not
self
.
istext
:
char
=
chr
(
char
[
0
])
if
char
==
"
\\
"
:
if
char
==
"
\\
"
:
index
+=
1
try
:
try
:
c
=
self
.
string
[
self
.
index
+
1
]
c
har
+=
self
.
string
[
index
]
except
IndexError
:
except
IndexError
:
raise
error
(
"bogus escape (end of line)"
)
raise
error
(
"bogus escape (end of line)"
)
if
not
self
.
istext
:
self
.
index
=
index
+
1
c
=
chr
(
c
)
char
=
char
+
c
self
.
index
=
self
.
index
+
len
(
char
)
self
.
next
=
char
self
.
next
=
char
def
match
(
self
,
char
,
skip
=
1
):
def
match
(
self
,
char
):
if
char
==
self
.
next
:
if
char
==
self
.
next
:
if
skip
:
self
.
__next
()
self
.
__next
()
return
True
return
1
return
False
return
0
def
get
(
self
):
def
get
(
self
):
this
=
self
.
next
this
=
self
.
next
self
.
__next
()
self
.
__next
()
...
@@ -232,6 +230,17 @@ class Tokenizer:
...
@@ -232,6 +230,17 @@ class Tokenizer:
result
+=
c
result
+=
c
self
.
__next
()
self
.
__next
()
return
result
return
result
def
getuntil
(
self
,
terminator
):
result
=
''
while
True
:
c
=
self
.
next
self
.
__next
()
if
c
is
None
:
raise
error
(
"unterminated name"
)
if
c
==
terminator
:
break
result
+=
c
return
result
def
tell
(
self
):
def
tell
(
self
):
return
self
.
index
,
self
.
next
return
self
.
index
,
self
.
next
def
seek
(
self
,
index
):
def
seek
(
self
,
index
):
...
@@ -270,7 +279,7 @@ def _class_escape(source, escape):
...
@@ -270,7 +279,7 @@ def _class_escape(source, escape):
if
code
:
if
code
:
return
code
return
code
code
=
CATEGORIES
.
get
(
escape
)
code
=
CATEGORIES
.
get
(
escape
)
if
code
and
code
[
0
]
==
IN
:
if
code
and
code
[
0
]
is
IN
:
return
code
return
code
try
:
try
:
c
=
escape
[
1
:
2
]
c
=
escape
[
1
:
2
]
...
@@ -279,7 +288,7 @@ def _class_escape(source, escape):
...
@@ -279,7 +288,7 @@ def _class_escape(source, escape):
escape
+=
source
.
getwhile
(
2
,
HEXDIGITS
)
escape
+=
source
.
getwhile
(
2
,
HEXDIGITS
)
if
len
(
escape
)
!=
4
:
if
len
(
escape
)
!=
4
:
raise
ValueError
raise
ValueError
return
LITERAL
,
int
(
escape
[
2
:],
16
)
&
0xff
return
LITERAL
,
int
(
escape
[
2
:],
16
)
elif
c
==
"u"
and
source
.
istext
:
elif
c
==
"u"
and
source
.
istext
:
# unicode escape (exactly four digits)
# unicode escape (exactly four digits)
escape
+=
source
.
getwhile
(
4
,
HEXDIGITS
)
escape
+=
source
.
getwhile
(
4
,
HEXDIGITS
)
...
@@ -325,7 +334,7 @@ def _escape(source, escape, state):
...
@@ -325,7 +334,7 @@ def _escape(source, escape, state):
escape
+=
source
.
getwhile
(
2
,
HEXDIGITS
)
escape
+=
source
.
getwhile
(
2
,
HEXDIGITS
)
if
len
(
escape
)
!=
4
:
if
len
(
escape
)
!=
4
:
raise
ValueError
raise
ValueError
return
LITERAL
,
int
(
escape
[
2
:],
16
)
&
0xff
return
LITERAL
,
int
(
escape
[
2
:],
16
)
elif
c
==
"u"
and
source
.
istext
:
elif
c
==
"u"
and
source
.
istext
:
# unicode escape (exactly four digits)
# unicode escape (exactly four digits)
escape
+=
source
.
getwhile
(
4
,
HEXDIGITS
)
escape
+=
source
.
getwhile
(
4
,
HEXDIGITS
)
...
@@ -347,11 +356,11 @@ def _escape(source, escape, state):
...
@@ -347,11 +356,11 @@ def _escape(source, escape, state):
elif
c
in
DIGITS
:
elif
c
in
DIGITS
:
# octal escape *or* decimal group reference (sigh)
# octal escape *or* decimal group reference (sigh)
if
source
.
next
in
DIGITS
:
if
source
.
next
in
DIGITS
:
escape
=
escape
+
source
.
get
()
escape
+=
source
.
get
()
if
(
escape
[
1
]
in
OCTDIGITS
and
escape
[
2
]
in
OCTDIGITS
and
if
(
escape
[
1
]
in
OCTDIGITS
and
escape
[
2
]
in
OCTDIGITS
and
source
.
next
in
OCTDIGITS
):
source
.
next
in
OCTDIGITS
):
# got three octal digits; this is an octal escape
# got three octal digits; this is an octal escape
escape
=
escape
+
source
.
get
()
escape
+=
source
.
get
()
c
=
int
(
escape
[
1
:],
8
)
c
=
int
(
escape
[
1
:],
8
)
if
c
>
0
o377
:
if
c
>
0
o377
:
raise
error
(
'octal escape value
%
r outside of '
raise
error
(
'octal escape value
%
r outside of '
...
@@ -370,22 +379,18 @@ def _escape(source, escape, state):
...
@@ -370,22 +379,18 @@ def _escape(source, escape, state):
pass
pass
raise
error
(
"bogus escape:
%
s"
%
repr
(
escape
))
raise
error
(
"bogus escape:
%
s"
%
repr
(
escape
))
def
_parse_sub
(
source
,
state
,
nested
=
1
):
def
_parse_sub
(
source
,
state
,
nested
=
True
):
# parse an alternation: a|b|c
# parse an alternation: a|b|c
items
=
[]
items
=
[]
itemsappend
=
items
.
append
itemsappend
=
items
.
append
sourcematch
=
source
.
match
sourcematch
=
source
.
match
while
1
:
while
True
:
itemsappend
(
_parse
(
source
,
state
))
itemsappend
(
_parse
(
source
,
state
))
if
sourcematch
(
"|"
):
if
not
sourcematch
(
"|"
):
continue
if
not
nested
:
break
break
if
not
source
.
next
or
sourcematch
(
")"
,
0
):
if
nested
and
source
.
next
is
not
None
and
source
.
next
!=
")"
:
break
raise
error
(
"pattern not properly closed"
)
else
:
raise
error
(
"pattern not properly closed"
)
if
len
(
items
)
==
1
:
if
len
(
items
)
==
1
:
return
items
[
0
]
return
items
[
0
]
...
@@ -394,7 +399,7 @@ def _parse_sub(source, state, nested=1):
...
@@ -394,7 +399,7 @@ def _parse_sub(source, state, nested=1):
subpatternappend
=
subpattern
.
append
subpatternappend
=
subpattern
.
append
# check if all items share a common prefix
# check if all items share a common prefix
while
1
:
while
True
:
prefix
=
None
prefix
=
None
for
item
in
items
:
for
item
in
items
:
if
not
item
:
if
not
item
:
...
@@ -414,16 +419,12 @@ def _parse_sub(source, state, nested=1):
...
@@ -414,16 +419,12 @@ def _parse_sub(source, state, nested=1):
# check if the branch can be replaced by a character set
# check if the branch can be replaced by a character set
for
item
in
items
:
for
item
in
items
:
if
len
(
item
)
!=
1
or
item
[
0
][
0
]
!=
LITERAL
:
if
len
(
item
)
!=
1
or
item
[
0
][
0
]
is
not
LITERAL
:
break
break
else
:
else
:
# we can store this as a character set instead of a
# we can store this as a character set instead of a
# branch (the compiler may optimize this even more)
# branch (the compiler may optimize this even more)
set
=
[]
subpatternappend
((
IN
,
[
item
[
0
]
for
item
in
items
]))
setappend
=
set
.
append
for
item
in
items
:
setappend
(
item
[
0
])
subpatternappend
((
IN
,
set
))
return
subpattern
return
subpattern
subpattern
.
append
((
BRANCH
,
(
None
,
items
)))
subpattern
.
append
((
BRANCH
,
(
None
,
items
)))
...
@@ -433,21 +434,16 @@ def _parse_sub_cond(source, state, condgroup):
...
@@ -433,21 +434,16 @@ def _parse_sub_cond(source, state, condgroup):
item_yes
=
_parse
(
source
,
state
)
item_yes
=
_parse
(
source
,
state
)
if
source
.
match
(
"|"
):
if
source
.
match
(
"|"
):
item_no
=
_parse
(
source
,
state
)
item_no
=
_parse
(
source
,
state
)
if
source
.
match
(
"|"
)
:
if
source
.
next
==
"|"
:
raise
error
(
"conditional backref with more than two branches"
)
raise
error
(
"conditional backref with more than two branches"
)
else
:
else
:
item_no
=
None
item_no
=
None
if
source
.
next
and
not
source
.
match
(
")"
,
0
)
:
if
source
.
next
is
not
None
and
source
.
next
!=
")"
:
raise
error
(
"pattern not properly closed"
)
raise
error
(
"pattern not properly closed"
)
subpattern
=
SubPattern
(
state
)
subpattern
=
SubPattern
(
state
)
subpattern
.
append
((
GROUPREF_EXISTS
,
(
condgroup
,
item_yes
,
item_no
)))
subpattern
.
append
((
GROUPREF_EXISTS
,
(
condgroup
,
item_yes
,
item_no
)))
return
subpattern
return
subpattern
_PATTERNENDERS
=
set
(
"|)"
)
_ASSERTCHARS
=
set
(
"=!<"
)
_LOOKBEHINDASSERTCHARS
=
set
(
"=!"
)
_REPEATCODES
=
set
([
MIN_REPEAT
,
MAX_REPEAT
])
def
_parse
(
source
,
state
):
def
_parse
(
source
,
state
):
# parse a simple pattern
# parse a simple pattern
subpattern
=
SubPattern
(
state
)
subpattern
=
SubPattern
(
state
)
...
@@ -457,32 +453,35 @@ def _parse(source, state):
...
@@ -457,32 +453,35 @@ def _parse(source, state):
sourceget
=
source
.
get
sourceget
=
source
.
get
sourcematch
=
source
.
match
sourcematch
=
source
.
match
_len
=
len
_len
=
len
PATTERNENDERS
=
_PATTERNENDERS
_ord
=
ord
ASSERTCHARS
=
_ASSERTCHARS
verbose
=
state
.
flags
&
SRE_FLAG_VERBOSE
LOOKBEHINDASSERTCHARS
=
_LOOKBEHINDASSERTCHARS
REPEATCODES
=
_REPEATCODES
while
1
:
while
True
:
if
source
.
next
in
PATTERNENDERS
:
this
=
source
.
next
break
# end of subpattern
this
=
sourceget
()
if
this
is
None
:
if
this
is
None
:
break
# end of pattern
break
# end of pattern
if
this
in
"|)"
:
break
# end of subpattern
sourceget
()
if
state
.
flags
&
SRE_FLAG_VERBOSE
:
if
verbose
:
# skip whitespace and comments
# skip whitespace and comments
if
this
in
WHITESPACE
:
if
this
in
WHITESPACE
:
continue
continue
if
this
==
"#"
:
if
this
==
"#"
:
while
1
:
while
True
:
this
=
sourceget
()
this
=
sourceget
()
if
this
i
n
(
None
,
"
\n
"
)
:
if
this
i
s
None
or
this
==
"
\n
"
:
break
break
continue
continue
if
this
and
this
[
0
]
not
in
SPECIAL_CHARS
:
if
this
[
0
]
==
"
\\
"
:
subpatternappend
((
LITERAL
,
ord
(
this
)))
code
=
_escape
(
source
,
this
,
state
)
subpatternappend
(
code
)
elif
this
not
in
SPECIAL_CHARS
:
subpatternappend
((
LITERAL
,
_ord
(
this
)))
elif
this
==
"["
:
elif
this
==
"["
:
# character set
# character set
...
@@ -494,39 +493,38 @@ def _parse(source, state):
...
@@ -494,39 +493,38 @@ def _parse(source, state):
setappend
((
NEGATE
,
None
))
setappend
((
NEGATE
,
None
))
# check remaining characters
# check remaining characters
start
=
set
[:]
start
=
set
[:]
while
1
:
while
True
:
this
=
sourceget
()
this
=
sourceget
()
if
this
is
None
:
raise
error
(
"unexpected end of regular expression"
)
if
this
==
"]"
and
set
!=
start
:
if
this
==
"]"
and
set
!=
start
:
break
break
elif
this
and
this
[
0
]
==
"
\\
"
:
elif
this
[
0
]
==
"
\\
"
:
code1
=
_class_escape
(
source
,
this
)
code1
=
_class_escape
(
source
,
this
)
elif
this
:
code1
=
LITERAL
,
ord
(
this
)
else
:
else
:
raise
error
(
"unexpected end of regular expression"
)
code1
=
LITERAL
,
_ord
(
this
)
if
sourcematch
(
"-"
):
if
sourcematch
(
"-"
):
# potential range
# potential range
this
=
sourceget
()
this
=
sourceget
()
if
this
is
None
:
raise
error
(
"unexpected end of regular expression"
)
if
this
==
"]"
:
if
this
==
"]"
:
if
code1
[
0
]
is
IN
:
if
code1
[
0
]
is
IN
:
code1
=
code1
[
1
][
0
]
code1
=
code1
[
1
][
0
]
setappend
(
code1
)
setappend
(
code1
)
setappend
((
LITERAL
,
ord
(
"-"
)))
setappend
((
LITERAL
,
_
ord
(
"-"
)))
break
break
elif
this
:
if
this
[
0
]
==
"
\\
"
:
if
this
[
0
]
==
"
\\
"
:
code2
=
_class_escape
(
source
,
this
)
code2
=
_class_escape
(
source
,
this
)
else
:
code2
=
LITERAL
,
ord
(
this
)
if
code1
[
0
]
!=
LITERAL
or
code2
[
0
]
!=
LITERAL
:
raise
error
(
"bad character range"
)
lo
=
code1
[
1
]
hi
=
code2
[
1
]
if
hi
<
lo
:
raise
error
(
"bad character range"
)
setappend
((
RANGE
,
(
lo
,
hi
)))
else
:
else
:
raise
error
(
"unexpected end of regular expression"
)
code2
=
LITERAL
,
_ord
(
this
)
if
code1
[
0
]
!=
LITERAL
or
code2
[
0
]
!=
LITERAL
:
raise
error
(
"bad character range"
)
lo
=
code1
[
1
]
hi
=
code2
[
1
]
if
hi
<
lo
:
raise
error
(
"bad character range"
)
setappend
((
RANGE
,
(
lo
,
hi
)))
else
:
else
:
if
code1
[
0
]
is
IN
:
if
code1
[
0
]
is
IN
:
code1
=
code1
[
1
][
0
]
code1
=
code1
[
1
][
0
]
...
@@ -541,7 +539,7 @@ def _parse(source, state):
...
@@ -541,7 +539,7 @@ def _parse(source, state):
# XXX: <fl> should add charmap optimization here
# XXX: <fl> should add charmap optimization here
subpatternappend
((
IN
,
set
))
subpatternappend
((
IN
,
set
))
elif
this
and
this
[
0
]
in
REPEAT_CHARS
:
elif
this
in
REPEAT_CHARS
:
# repeat previous item
# repeat previous item
if
this
==
"?"
:
if
this
==
"?"
:
min
,
max
=
0
,
1
min
,
max
=
0
,
1
...
@@ -552,20 +550,20 @@ def _parse(source, state):
...
@@ -552,20 +550,20 @@ def _parse(source, state):
min
,
max
=
1
,
MAXREPEAT
min
,
max
=
1
,
MAXREPEAT
elif
this
==
"{"
:
elif
this
==
"{"
:
if
source
.
next
==
"}"
:
if
source
.
next
==
"}"
:
subpatternappend
((
LITERAL
,
ord
(
this
)))
subpatternappend
((
LITERAL
,
_
ord
(
this
)))
continue
continue
here
=
source
.
tell
()
here
=
source
.
tell
()
min
,
max
=
0
,
MAXREPEAT
min
,
max
=
0
,
MAXREPEAT
lo
=
hi
=
""
lo
=
hi
=
""
while
source
.
next
in
DIGITS
:
while
source
.
next
in
DIGITS
:
lo
=
lo
+
source
.
get
()
lo
+=
source
get
()
if
sourcematch
(
","
):
if
sourcematch
(
","
):
while
source
.
next
in
DIGITS
:
while
source
.
next
in
DIGITS
:
hi
=
hi
+
sourceget
()
hi
+=
sourceget
()
else
:
else
:
hi
=
lo
hi
=
lo
if
not
sourcematch
(
"}"
):
if
not
sourcematch
(
"}"
):
subpatternappend
((
LITERAL
,
ord
(
this
)))
subpatternappend
((
LITERAL
,
_
ord
(
this
)))
source
.
seek
(
here
)
source
.
seek
(
here
)
continue
continue
if
lo
:
if
lo
:
...
@@ -587,7 +585,7 @@ def _parse(source, state):
...
@@ -587,7 +585,7 @@ def _parse(source, state):
item
=
None
item
=
None
if
not
item
or
(
_len
(
item
)
==
1
and
item
[
0
][
0
]
==
AT
):
if
not
item
or
(
_len
(
item
)
==
1
and
item
[
0
][
0
]
==
AT
):
raise
error
(
"nothing to repeat"
)
raise
error
(
"nothing to repeat"
)
if
item
[
0
][
0
]
in
REPEATCODES
:
if
item
[
0
][
0
]
in
_
REPEATCODES
:
raise
error
(
"multiple repeat"
)
raise
error
(
"multiple repeat"
)
if
sourcematch
(
"?"
):
if
sourcematch
(
"?"
):
subpattern
[
-
1
]
=
(
MIN_REPEAT
,
(
min
,
max
,
item
))
subpattern
[
-
1
]
=
(
MIN_REPEAT
,
(
min
,
max
,
item
))
...
@@ -604,18 +602,14 @@ def _parse(source, state):
...
@@ -604,18 +602,14 @@ def _parse(source, state):
if
sourcematch
(
"?"
):
if
sourcematch
(
"?"
):
group
=
0
group
=
0
# options
# options
if
sourcematch
(
"P"
):
char
=
sourceget
()
if
char
is
None
:
raise
error
(
"unexpected end of pattern"
)
if
char
==
"P"
:
# python extensions
# python extensions
if
sourcematch
(
"<"
):
if
sourcematch
(
"<"
):
# named group: skip forward to end of name
# named group: skip forward to end of name
name
=
""
name
=
source
.
getuntil
(
">"
)
while
1
:
char
=
sourceget
()
if
char
is
None
:
raise
error
(
"unterminated name"
)
if
char
==
">"
:
break
name
=
name
+
char
group
=
1
group
=
1
if
not
name
:
if
not
name
:
raise
error
(
"missing group name"
)
raise
error
(
"missing group name"
)
...
@@ -623,14 +617,7 @@ def _parse(source, state):
...
@@ -623,14 +617,7 @@ def _parse(source, state):
raise
error
(
"bad character in group name
%
r"
%
name
)
raise
error
(
"bad character in group name
%
r"
%
name
)
elif
sourcematch
(
"="
):
elif
sourcematch
(
"="
):
# named backreference
# named backreference
name
=
""
name
=
source
.
getuntil
(
")"
)
while
1
:
char
=
sourceget
()
if
char
is
None
:
raise
error
(
"unterminated name"
)
if
char
==
")"
:
break
name
=
name
+
char
if
not
name
:
if
not
name
:
raise
error
(
"missing group name"
)
raise
error
(
"missing group name"
)
if
not
name
.
isidentifier
():
if
not
name
.
isidentifier
():
...
@@ -647,27 +634,25 @@ def _parse(source, state):
...
@@ -647,27 +634,25 @@ def _parse(source, state):
if
char
is
None
:
if
char
is
None
:
raise
error
(
"unexpected end of pattern"
)
raise
error
(
"unexpected end of pattern"
)
raise
error
(
"unknown specifier: ?P
%
s"
%
char
)
raise
error
(
"unknown specifier: ?P
%
s"
%
char
)
elif
sourcematch
(
":"
)
:
elif
char
==
":"
:
# non-capturing group
# non-capturing group
group
=
2
group
=
2
elif
sourcematch
(
"#"
)
:
elif
char
==
"#"
:
# comment
# comment
while
1
:
while
True
:
if
source
.
next
is
None
or
source
.
next
==
")"
:
if
source
.
next
is
None
:
raise
error
(
"unbalanced parenthesis"
)
if
sourceget
()
==
")"
:
break
break
sourceget
()
if
not
sourcematch
(
")"
):
raise
error
(
"unbalanced parenthesis"
)
continue
continue
elif
source
.
next
in
ASSERTCHARS
:
elif
char
in
"=!<"
:
# lookahead assertions
# lookahead assertions
char
=
sourceget
()
dir
=
1
dir
=
1
if
char
==
"<"
:
if
char
==
"<"
:
if
source
.
next
not
in
LOOKBEHINDASSERTCHARS
:
char
=
sourceget
()
if
char
is
None
or
char
not
in
"=!"
:
raise
error
(
"syntax error"
)
raise
error
(
"syntax error"
)
dir
=
-
1
# lookbehind
dir
=
-
1
# lookbehind
char
=
sourceget
()
p
=
_parse_sub
(
source
,
state
)
p
=
_parse_sub
(
source
,
state
)
if
not
sourcematch
(
")"
):
if
not
sourcematch
(
")"
):
raise
error
(
"unbalanced parenthesis"
)
raise
error
(
"unbalanced parenthesis"
)
...
@@ -676,16 +661,9 @@ def _parse(source, state):
...
@@ -676,16 +661,9 @@ def _parse(source, state):
else
:
else
:
subpatternappend
((
ASSERT_NOT
,
(
dir
,
p
)))
subpatternappend
((
ASSERT_NOT
,
(
dir
,
p
)))
continue
continue
elif
sourcematch
(
"("
)
:
elif
char
==
"("
:
# conditional backreference group
# conditional backreference group
condname
=
""
condname
=
source
.
getuntil
(
")"
)
while
1
:
char
=
sourceget
()
if
char
is
None
:
raise
error
(
"unterminated name"
)
if
char
==
")"
:
break
condname
=
condname
+
char
group
=
2
group
=
2
if
not
condname
:
if
not
condname
:
raise
error
(
"missing group name"
)
raise
error
(
"missing group name"
)
...
@@ -705,12 +683,14 @@ def _parse(source, state):
...
@@ -705,12 +683,14 @@ def _parse(source, state):
raise
error
(
"bad group number"
)
raise
error
(
"bad group number"
)
if
condgroup
>=
MAXGROUPS
:
if
condgroup
>=
MAXGROUPS
:
raise
error
(
"the group number is too large"
)
raise
error
(
"the group number is too large"
)
el
se
:
el
if
char
in
FLAGS
:
# flags
# flags
if
not
source
.
next
in
FLAGS
:
state
.
flags
|=
FLAGS
[
char
]
raise
error
(
"unexpected end of pattern"
)
while
source
.
next
in
FLAGS
:
while
source
.
next
in
FLAGS
:
state
.
flags
=
state
.
flags
|
FLAGS
[
sourceget
()]
state
.
flags
|=
FLAGS
[
sourceget
()]
verbose
=
state
.
flags
&
SRE_FLAG_VERBOSE
else
:
raise
error
(
"unexpected end of pattern "
+
char
)
if
group
:
if
group
:
# parse group contents
# parse group contents
if
group
==
2
:
if
group
==
2
:
...
@@ -728,7 +708,7 @@ def _parse(source, state):
...
@@ -728,7 +708,7 @@ def _parse(source, state):
state
.
closegroup
(
group
)
state
.
closegroup
(
group
)
subpatternappend
((
SUBPATTERN
,
(
group
,
p
)))
subpatternappend
((
SUBPATTERN
,
(
group
,
p
)))
else
:
else
:
while
1
:
while
True
:
char
=
sourceget
()
char
=
sourceget
()
if
char
is
None
:
if
char
is
None
:
raise
error
(
"unexpected end of pattern"
)
raise
error
(
"unexpected end of pattern"
)
...
@@ -742,10 +722,6 @@ def _parse(source, state):
...
@@ -742,10 +722,6 @@ def _parse(source, state):
elif
this
==
"$"
:
elif
this
==
"$"
:
subpattern
.
append
((
AT
,
AT_END
))
subpattern
.
append
((
AT
,
AT_END
))
elif
this
and
this
[
0
]
==
"
\\
"
:
code
=
_escape
(
source
,
this
,
state
)
subpatternappend
(
code
)
else
:
else
:
raise
error
(
"parser error"
)
raise
error
(
"parser error"
)
...
@@ -776,11 +752,11 @@ def parse(str, flags=0, pattern=None):
...
@@ -776,11 +752,11 @@ def parse(str, flags=0, pattern=None):
p
=
_parse_sub
(
source
,
pattern
,
0
)
p
=
_parse_sub
(
source
,
pattern
,
0
)
p
.
pattern
.
flags
=
fix_flags
(
str
,
p
.
pattern
.
flags
)
p
.
pattern
.
flags
=
fix_flags
(
str
,
p
.
pattern
.
flags
)
tail
=
source
.
get
()
if
source
.
next
is
not
None
:
if
tail
==
")"
:
if
source
.
next
==
")"
:
raise
error
(
"unbalanced parenthesis"
)
raise
error
(
"unbalanced parenthesis"
)
elif
tail
:
else
:
raise
error
(
"bogus characters at end of regular expression"
)
raise
error
(
"bogus characters at end of regular expression"
)
if
flags
&
SRE_FLAG_DEBUG
:
if
flags
&
SRE_FLAG_DEBUG
:
p
.
dump
()
p
.
dump
()
...
@@ -817,13 +793,7 @@ def parse_template(source, pattern):
...
@@ -817,13 +793,7 @@ def parse_template(source, pattern):
if
c
==
"g"
:
if
c
==
"g"
:
name
=
""
name
=
""
if
s
.
match
(
"<"
):
if
s
.
match
(
"<"
):
while
True
:
name
=
s
.
getuntil
(
">"
)
char
=
sget
()
if
char
is
None
:
raise
error
(
"unterminated group name"
)
if
char
==
">"
:
break
name
+=
char
if
not
name
:
if
not
name
:
raise
error
(
"missing group name"
)
raise
error
(
"missing group name"
)
try
:
try
:
...
...
Misc/NEWS
Dosyayı görüntüle @
e2ccf560
...
@@ -166,7 +166,9 @@ Core and Builtins
...
@@ -166,7 +166,9 @@ Core and Builtins
Library
Library
-------
-------
-
Issue
1519638
:
Now
unmatched
groups
are
replaced
with
empty
strings
in
re
.
sub
()
-
Issue
#
19380
:
Optimized
parsing
of
regular
expressions
.
-
Issue
#
1519638
:
Now
unmatched
groups
are
replaced
with
empty
strings
in
re
.
sub
()
and
re
.
subn
().
and
re
.
subn
().
-
Issue
#
18615
:
sndhdr
.
what
/
whathdr
now
return
a
namedtuple
.
-
Issue
#
18615
:
sndhdr
.
what
/
whathdr
now
return
a
namedtuple
.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment