Skip to content
Projeler
Gruplar
Parçacıklar
Yardım
Yükleniyor...
Oturum aç / Kaydol
Gezinmeyi değiştir
C
cpython
Proje
Proje
Ayrıntılar
Etkinlik
Cycle Analytics
Depo (repository)
Depo (repository)
Dosyalar
Kayıtlar (commit)
Dallar (branch)
Etiketler
Katkıda bulunanlar
Grafik
Karşılaştır
Grafikler
Konular (issue)
0
Konular (issue)
0
Liste
Pano
Etiketler
Kilometre Taşları
Birleştirme (merge) Talepleri
0
Birleştirme (merge) Talepleri
0
CI / CD
CI / CD
İş akışları (pipeline)
İşler
Zamanlamalar
Grafikler
Paketler
Paketler
Wiki
Wiki
Parçacıklar
Parçacıklar
Üyeler
Üyeler
Collapse sidebar
Close sidebar
Etkinlik
Grafik
Grafikler
Yeni bir konu (issue) oluştur
İşler
Kayıtlar (commit)
Konu (issue) Panoları
Kenar çubuğunu aç
Batuhan Osman TASKAYA
cpython
Commits
6d336a02
Kaydet (Commit)
6d336a02
authored
May 09, 2017
tarafından
Serhiy Storchaka
Kaydeden (comit)
GitHub
May 09, 2017
Dosyalara gözat
Seçenekler
Dosyalara Gözat
İndir
Eposta Yamaları
Sade Fark
bpo-30285: Optimize case-insensitive matching and searching (#1482)
of regular expressions.
üst
f93234bb
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
215 additions
and
70 deletions
+215
-70
3.7.rst
Doc/whatsnew/3.7.rst
+4
-0
sre_compile.py
Lib/sre_compile.py
+102
-69
test_re.py
Lib/test/test_re.py
+9
-0
NEWS
Misc/NEWS
+3
-0
_sre.c
Modules/_sre.c
+34
-0
_sre.c.h
Modules/clinic/_sre.c.h
+63
-1
No files found.
Doc/whatsnew/3.7.rst
Dosyayı görüntüle @
6d336a02
...
...
@@ -208,6 +208,10 @@ Optimizations
using the :func:`os.scandir` function.
(Contributed by Serhiy Storchaka in :issue:`25996`.)
* Optimized case-insensitive matching and searching of :mod:`regular
expressions <re>`. Searching some patterns can now be up to 20 times faster.
(Contributed by Serhiy Storchaka in :issue:`30285`.)
Build and C API Changes
=======================
...
...
Lib/sre_compile.py
Dosyayı görüntüle @
6d336a02
...
...
@@ -69,13 +69,16 @@ def _compile(code, pattern, flags):
REPEATING_CODES
=
_REPEATING_CODES
SUCCESS_CODES
=
_SUCCESS_CODES
ASSERT_CODES
=
_ASSERT_CODES
iscased
=
None
tolower
=
None
fixes
=
None
if
flags
&
SRE_FLAG_IGNORECASE
and
not
flags
&
SRE_FLAG_LOCALE
:
if
flags
&
SRE_FLAG_UNICODE
and
not
flags
&
SRE_FLAG_ASCII
:
iscased
=
_sre
.
unicode_iscased
tolower
=
_sre
.
unicode_tolower
fixes
=
_ignorecase_fixes
else
:
iscased
=
_sre
.
ascii_iscased
tolower
=
_sre
.
ascii_tolower
for
op
,
av
in
pattern
:
if
op
in
LITERAL_CODES
:
...
...
@@ -85,6 +88,9 @@ def _compile(code, pattern, flags):
elif
flags
&
SRE_FLAG_LOCALE
:
emit
(
OP_LOC_IGNORE
[
op
])
emit
(
av
)
elif
not
iscased
(
av
):
emit
(
op
)
emit
(
av
)
else
:
lo
=
tolower
(
av
)
if
fixes
and
lo
in
fixes
:
...
...
@@ -101,14 +107,15 @@ def _compile(code, pattern, flags):
emit
(
OP_IGNORE
[
op
])
emit
(
lo
)
elif
op
is
IN
:
if
not
flags
&
SRE_FLAG_IGNORECASE
:
emit
(
op
)
elif
flags
&
SRE_FLAG_LOCALE
:
charset
,
hascased
=
_optimize_charset
(
av
,
iscased
,
tolower
,
fixes
)
if
flags
&
SRE_FLAG_IGNORECASE
and
flags
&
SRE_FLAG_LOCALE
:
emit
(
IN_LOC_IGNORE
)
el
se
:
el
if
hascased
:
emit
(
IN_IGNORE
)
else
:
emit
(
IN
)
skip
=
_len
(
code
);
emit
(
0
)
_compile_charset
(
av
,
flags
,
code
,
tolower
,
fixes
)
_compile_charset
(
charset
,
flags
,
code
)
code
[
skip
]
=
_len
(
code
)
-
skip
elif
op
is
ANY
:
if
flags
&
SRE_FLAG_DOTALL
:
...
...
@@ -223,10 +230,10 @@ def _compile(code, pattern, flags):
else
:
raise
error
(
"internal: unsupported operand type
%
r"
%
(
op
,))
def
_compile_charset
(
charset
,
flags
,
code
,
fixup
=
None
,
fixes
=
None
):
def
_compile_charset
(
charset
,
flags
,
code
):
# compile charset subprogram
emit
=
code
.
append
for
op
,
av
in
_optimize_charset
(
charset
,
fixup
,
fixes
)
:
for
op
,
av
in
charset
:
emit
(
op
)
if
op
is
NEGATE
:
pass
...
...
@@ -250,11 +257,12 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None):
raise
error
(
"internal: unsupported set operator
%
r"
%
(
op
,))
emit
(
FAILURE
)
def
_optimize_charset
(
charset
,
fixup
,
fixes
):
def
_optimize_charset
(
charset
,
iscased
=
None
,
fixup
=
None
,
fixes
=
None
):
# internal: optimize character set
out
=
[]
tail
=
[]
charmap
=
bytearray
(
256
)
hascased
=
False
for
op
,
av
in
charset
:
while
True
:
try
:
...
...
@@ -265,18 +273,24 @@ def _optimize_charset(charset, fixup, fixes):
if
fixes
and
lo
in
fixes
:
for
k
in
fixes
[
lo
]:
charmap
[
k
]
=
1
if
not
hascased
and
iscased
(
av
):
hascased
=
True
else
:
charmap
[
av
]
=
1
elif
op
is
RANGE
:
r
=
range
(
av
[
0
],
av
[
1
]
+
1
)
if
fixup
:
r
=
map
(
fixup
,
r
)
if
fixup
and
fixes
:
for
i
in
r
:
charmap
[
i
]
=
1
if
i
in
fixes
:
for
k
in
fixes
[
i
]:
charmap
[
k
]
=
1
if
fixes
:
for
i
in
map
(
fixup
,
r
):
charmap
[
i
]
=
1
if
i
in
fixes
:
for
k
in
fixes
[
i
]:
charmap
[
k
]
=
1
else
:
for
i
in
map
(
fixup
,
r
):
charmap
[
i
]
=
1
if
not
hascased
:
hascased
=
any
(
map
(
iscased
,
r
))
else
:
for
i
in
r
:
charmap
[
i
]
=
1
...
...
@@ -290,11 +304,13 @@ def _optimize_charset(charset, fixup, fixes):
charmap
+=
b
'
\0
'
*
0xff00
continue
# Character set contains non-BMP character codes.
# There are only two ranges of cased non-BMP characters:
# 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
# and for both ranges RANGE_IGNORE works.
if
fixup
and
op
is
RANGE
:
op
=
RANGE_IGNORE
if
fixup
:
hascased
=
True
# There are only two ranges of cased non-BMP characters:
# 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
# and for both ranges RANGE_IGNORE works.
if
op
is
RANGE
:
op
=
RANGE_IGNORE
tail
.
append
((
op
,
av
))
break
...
...
@@ -322,17 +338,17 @@ def _optimize_charset(charset, fixup, fixes):
out
.
append
((
RANGE
,
(
p
,
q
-
1
)))
out
+=
tail
# if the case was changed or new representation is more compact
if
fixup
or
len
(
out
)
<
len
(
charset
):
return
out
if
hascased
or
len
(
out
)
<
len
(
charset
):
return
out
,
hascased
# else original character set is good enough
return
charset
return
charset
,
hascased
# use bitmap
if
len
(
charmap
)
==
256
:
data
=
_mk_bitmap
(
charmap
)
out
.
append
((
CHARSET
,
data
))
out
+=
tail
return
out
return
out
,
hascased
# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
...
...
@@ -371,7 +387,7 @@ def _optimize_charset(charset, fixup, fixes):
data
[
0
:
0
]
=
[
block
]
+
_bytes_to_codes
(
mapping
)
out
.
append
((
BIGCHARSET
,
data
))
out
+=
tail
return
out
return
out
,
hascased
_CODEBITS
=
_sre
.
CODESIZE
*
8
MAXCODE
=
(
1
<<
_CODEBITS
)
-
1
...
...
@@ -414,19 +430,31 @@ def _generate_overlap_table(prefix):
table
[
i
]
=
idx
+
1
return
table
def
_get_literal_prefix
(
pattern
):
def
_get_iscased
(
flags
):
if
not
flags
&
SRE_FLAG_IGNORECASE
:
return
None
elif
flags
&
SRE_FLAG_UNICODE
and
not
flags
&
SRE_FLAG_ASCII
:
return
_sre
.
unicode_iscased
else
:
return
_sre
.
ascii_iscased
def
_get_literal_prefix
(
pattern
,
flags
):
# look for literal prefix
prefix
=
[]
prefixappend
=
prefix
.
append
prefix_skip
=
None
iscased
=
_get_iscased
(
flags
)
for
op
,
av
in
pattern
.
data
:
if
op
is
LITERAL
:
if
iscased
and
iscased
(
av
):
break
prefixappend
(
av
)
elif
op
is
SUBPATTERN
:
group
,
add_flags
,
del_flags
,
p
=
av
if
add_flags
&
SRE_FLAG_IGNORECASE
:
flags1
=
(
flags
|
add_flags
)
&
~
del_flags
if
flags1
&
SRE_FLAG_IGNORECASE
and
flags1
&
SRE_FLAG_LOCALE
:
break
prefix1
,
prefix_skip1
,
got_all
=
_get_literal_prefix
(
p
)
prefix1
,
prefix_skip1
,
got_all
=
_get_literal_prefix
(
p
,
flags1
)
if
prefix_skip
is
None
:
if
group
is
not
None
:
prefix_skip
=
len
(
prefix
)
...
...
@@ -441,46 +469,49 @@ def _get_literal_prefix(pattern):
return
prefix
,
prefix_skip
,
True
return
prefix
,
prefix_skip
,
False
def
_get_charset_prefix
(
pattern
):
charset
=
[]
# not used
charsetappend
=
charset
.
append
if
pattern
.
data
:
def
_get_charset_prefix
(
pattern
,
flags
):
while
True
:
if
not
pattern
.
data
:
return
None
op
,
av
=
pattern
.
data
[
0
]
if
op
is
SUBPATTERN
:
group
,
add_flags
,
del_flags
,
p
=
av
if
p
and
not
(
add_flags
&
SRE_FLAG_IGNORECASE
):
op
,
av
=
p
[
0
]
if
op
is
LITERAL
:
charsetappend
((
op
,
av
))
elif
op
is
BRANCH
:
c
=
[]
cappend
=
c
.
append
for
p
in
av
[
1
]:
if
not
p
:
break
op
,
av
=
p
[
0
]
if
op
is
LITERAL
:
cappend
((
op
,
av
))
else
:
break
else
:
charset
=
c
elif
op
is
BRANCH
:
c
=
[]
cappend
=
c
.
append
for
p
in
av
[
1
]:
if
not
p
:
break
op
,
av
=
p
[
0
]
if
op
is
LITERAL
:
cappend
((
op
,
av
))
else
:
break
if
op
is
not
SUBPATTERN
:
break
group
,
add_flags
,
del_flags
,
pattern
=
av
flags
=
(
flags
|
add_flags
)
&
~
del_flags
if
flags
&
SRE_FLAG_IGNORECASE
and
flags
&
SRE_FLAG_LOCALE
:
return
None
iscased
=
_get_iscased
(
flags
)
if
op
is
LITERAL
:
if
iscased
and
iscased
(
av
):
return
None
return
[(
op
,
av
)]
elif
op
is
BRANCH
:
charset
=
[]
charsetappend
=
charset
.
append
for
p
in
av
[
1
]:
if
not
p
:
return
None
op
,
av
=
p
[
0
]
if
op
is
LITERAL
and
not
(
iscased
and
iscased
(
av
)):
charsetappend
((
op
,
av
))
else
:
charset
=
c
elif
op
is
IN
:
charset
=
av
return
charset
return
None
return
charset
elif
op
is
IN
:
charset
=
av
if
iscased
:
for
op
,
av
in
charset
:
if
op
is
LITERAL
:
if
iscased
(
av
):
return
None
elif
op
is
RANGE
:
if
av
[
1
]
>
0xffff
:
return
None
if
any
(
map
(
iscased
,
range
(
av
[
0
],
av
[
1
]
+
1
))):
return
None
return
charset
return
None
def
_compile_info
(
code
,
pattern
,
flags
):
# internal: compile an info block. in the current version,
...
...
@@ -496,12 +527,12 @@ def _compile_info(code, pattern, flags):
prefix
=
[]
prefix_skip
=
0
charset
=
[]
# not used
if
not
(
flags
&
SRE_FLAG_IGNORECASE
):
if
not
(
flags
&
SRE_FLAG_IGNORECASE
and
flags
&
SRE_FLAG_LOCALE
):
# look for literal prefix
prefix
,
prefix_skip
,
got_all
=
_get_literal_prefix
(
pattern
)
prefix
,
prefix_skip
,
got_all
=
_get_literal_prefix
(
pattern
,
flags
)
# if no prefix, look for charset prefix
if
not
prefix
:
charset
=
_get_charset_prefix
(
pattern
)
charset
=
_get_charset_prefix
(
pattern
,
flags
)
## if prefix:
## print("*** PREFIX", prefix, prefix_skip)
## if charset:
...
...
@@ -536,6 +567,8 @@ def _compile_info(code, pattern, flags):
# generate overlap table
code
.
extend
(
_generate_overlap_table
(
prefix
))
elif
charset
:
charset
,
hascased
=
_optimize_charset
(
charset
)
assert
not
hascased
_compile_charset
(
charset
,
flags
,
code
)
code
[
skip
]
=
len
(
code
)
-
skip
...
...
Lib/test/test_re.py
Dosyayı görüntüle @
6d336a02
...
...
@@ -891,15 +891,24 @@ class ReTests(unittest.TestCase):
lo
=
ord
(
c
.
lower
())
self
.
assertEqual
(
_sre
.
ascii_tolower
(
i
),
lo
)
self
.
assertEqual
(
_sre
.
unicode_tolower
(
i
),
lo
)
iscased
=
c
in
string
.
ascii_letters
self
.
assertEqual
(
_sre
.
ascii_iscased
(
i
),
iscased
)
self
.
assertEqual
(
_sre
.
unicode_iscased
(
i
),
iscased
)
for
i
in
list
(
range
(
128
,
0x1000
))
+
[
0x10400
,
0x10428
]:
c
=
chr
(
i
)
self
.
assertEqual
(
_sre
.
ascii_tolower
(
i
),
i
)
if
i
!=
0x0130
:
self
.
assertEqual
(
_sre
.
unicode_tolower
(
i
),
ord
(
c
.
lower
()))
iscased
=
c
!=
c
.
lower
()
or
c
!=
c
.
upper
()
self
.
assertFalse
(
_sre
.
ascii_iscased
(
i
))
self
.
assertEqual
(
_sre
.
unicode_iscased
(
i
),
c
!=
c
.
lower
()
or
c
!=
c
.
upper
())
self
.
assertEqual
(
_sre
.
ascii_tolower
(
0x0130
),
0x0130
)
self
.
assertEqual
(
_sre
.
unicode_tolower
(
0x0130
),
ord
(
'i'
))
self
.
assertFalse
(
_sre
.
ascii_iscased
(
0x0130
))
self
.
assertTrue
(
_sre
.
unicode_iscased
(
0x0130
))
def
test_not_literal
(
self
):
self
.
assertEqual
(
re
.
search
(
r"\s([^a])"
,
" b"
)
.
group
(
1
),
"b"
)
...
...
Misc/NEWS
Dosyayı görüntüle @
6d336a02
...
...
@@ -320,6 +320,9 @@ Extension Modules
Library
-------
- bpo-30285: Optimized case-insensitive matching and searching of regular
expressions.
- bpo-29990: Fix range checking in GB18030 decoder. Original patch by Ma Lin.
- bpo-29979: rewrite cgi.parse_multipart, reusing the FieldStorage class and
...
...
Modules/_sre.c
Dosyayı görüntüle @
6d336a02
...
...
@@ -273,6 +273,38 @@ _sre_getcodesize_impl(PyObject *module)
return
sizeof
(
SRE_CODE
);
}
/*[clinic input]
_sre.ascii_iscased -> bool
character: int
/
[clinic start generated code]*/
static
int
_sre_ascii_iscased_impl
(
PyObject
*
module
,
int
character
)
/*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
{
unsigned
int
ch
=
(
unsigned
int
)
character
;
return
ch
!=
sre_lower
(
ch
)
||
ch
!=
sre_upper
(
ch
);
}
/*[clinic input]
_sre.unicode_iscased -> bool
character: int
/
[clinic start generated code]*/
static
int
_sre_unicode_iscased_impl
(
PyObject
*
module
,
int
character
)
/*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/
{
unsigned
int
ch
=
(
unsigned
int
)
character
;
return
ch
!=
sre_lower_unicode
(
ch
)
||
ch
!=
sre_upper_unicode
(
ch
);
}
/*[clinic input]
_sre.ascii_tolower -> int
...
...
@@ -2750,6 +2782,8 @@ static PyTypeObject Scanner_Type = {
static
PyMethodDef
_functions
[]
=
{
_SRE_COMPILE_METHODDEF
_SRE_GETCODESIZE_METHODDEF
_SRE_ASCII_ISCASED_METHODDEF
_SRE_UNICODE_ISCASED_METHODDEF
_SRE_ASCII_TOLOWER_METHODDEF
_SRE_UNICODE_TOLOWER_METHODDEF
{
NULL
,
NULL
}
...
...
Modules/clinic/_sre.c.h
Dosyayı görüntüle @
6d336a02
...
...
@@ -29,6 +29,68 @@ exit:
return
return_value
;
}
PyDoc_STRVAR
(
_sre_ascii_iscased__doc__
,
"ascii_iscased($module, character, /)
\n
"
"--
\n
"
"
\n
"
);
#define _SRE_ASCII_ISCASED_METHODDEF \
{"ascii_iscased", (PyCFunction)_sre_ascii_iscased, METH_O, _sre_ascii_iscased__doc__},
static
int
_sre_ascii_iscased_impl
(
PyObject
*
module
,
int
character
);
static
PyObject
*
_sre_ascii_iscased
(
PyObject
*
module
,
PyObject
*
arg
)
{
PyObject
*
return_value
=
NULL
;
int
character
;
int
_return_value
;
if
(
!
PyArg_Parse
(
arg
,
"i:ascii_iscased"
,
&
character
))
{
goto
exit
;
}
_return_value
=
_sre_ascii_iscased_impl
(
module
,
character
);
if
((
_return_value
==
-
1
)
&&
PyErr_Occurred
())
{
goto
exit
;
}
return_value
=
PyBool_FromLong
((
long
)
_return_value
);
exit:
return
return_value
;
}
PyDoc_STRVAR
(
_sre_unicode_iscased__doc__
,
"unicode_iscased($module, character, /)
\n
"
"--
\n
"
"
\n
"
);
#define _SRE_UNICODE_ISCASED_METHODDEF \
{"unicode_iscased", (PyCFunction)_sre_unicode_iscased, METH_O, _sre_unicode_iscased__doc__},
static
int
_sre_unicode_iscased_impl
(
PyObject
*
module
,
int
character
);
static
PyObject
*
_sre_unicode_iscased
(
PyObject
*
module
,
PyObject
*
arg
)
{
PyObject
*
return_value
=
NULL
;
int
character
;
int
_return_value
;
if
(
!
PyArg_Parse
(
arg
,
"i:unicode_iscased"
,
&
character
))
{
goto
exit
;
}
_return_value
=
_sre_unicode_iscased_impl
(
module
,
character
);
if
((
_return_value
==
-
1
)
&&
PyErr_Occurred
())
{
goto
exit
;
}
return_value
=
PyBool_FromLong
((
long
)
_return_value
);
exit:
return
return_value
;
}
PyDoc_STRVAR
(
_sre_ascii_tolower__doc__
,
"ascii_tolower($module, character, /)
\n
"
"--
\n
"
...
...
@@ -715,4 +777,4 @@ _sre_SRE_Scanner_search(ScannerObject *self, PyObject *Py_UNUSED(ignored))
{
return
_sre_SRE_Scanner_search_impl
(
self
);
}
/*[clinic end generated code: output=
811e67d7f8f5052e
input=a9049054013a1b77]*/
/*[clinic end generated code: output=
5fe47c49e475cccb
input=a9049054013a1b77]*/
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment