Skip to content
Projeler
Gruplar
Parçacıklar
Yardım
Yükleniyor...
Oturum aç / Kaydol
Gezinmeyi değiştir
C
cpython
Proje
Proje
Ayrıntılar
Etkinlik
Cycle Analytics
Depo (repository)
Depo (repository)
Dosyalar
Kayıtlar (commit)
Dallar (branch)
Etiketler
Katkıda bulunanlar
Grafik
Karşılaştır
Grafikler
Konular (issue)
0
Konular (issue)
0
Liste
Pano
Etiketler
Kilometre Taşları
Birleştirme (merge) Talepleri
0
Birleştirme (merge) Talepleri
0
CI / CD
CI / CD
İş akışları (pipeline)
İşler
Zamanlamalar
Grafikler
Paketler
Paketler
Wiki
Wiki
Parçacıklar
Parçacıklar
Üyeler
Üyeler
Collapse sidebar
Close sidebar
Etkinlik
Grafik
Grafikler
Yeni bir konu (issue) oluştur
İşler
Kayıtlar (commit)
Konu (issue) Panoları
Kenar çubuğunu aç
Batuhan Osman TASKAYA
cpython
Commits
19af43d7
Kaydet (Commit)
19af43d7
authored
Tem 02, 2001
tarafından
Fredrik Lundh
Dosyalara gözat
Seçenekler
Dosyalara Gözat
İndir
Eposta Yamaları
Sade Fark
added martin's BIGCHARSET patch to SRE 2.1.1. martin reports 2x
speedups for certain unicode character ranges.
üst
1fb5ce03
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
107 additions
and
31 deletions
+107
-31
sre_compile.py
Lib/sre_compile.py
+71
-10
sre_constants.py
Lib/sre_constants.py
+3
-2
_sre.c
Modules/_sre.c
+13
-0
sre_constants.h
Modules/sre_constants.h
+20
-19
No files found.
Lib/sre_compile.py
Dosyayı görüntüle @
19af43d7
...
@@ -156,6 +156,8 @@ def _compile_charset(charset, flags, code, fixup=None):
...
@@ -156,6 +156,8 @@ def _compile_charset(charset, flags, code, fixup=None):
emit
(
fixup
(
av
[
1
]))
emit
(
fixup
(
av
[
1
]))
elif
op
is
CHARSET
:
elif
op
is
CHARSET
:
code
.
extend
(
av
)
code
.
extend
(
av
)
elif
op
is
BIGCHARSET
:
code
.
extend
(
av
)
elif
op
is
CATEGORY
:
elif
op
is
CATEGORY
:
if
flags
&
SRE_FLAG_LOCALE
:
if
flags
&
SRE_FLAG_LOCALE
:
emit
(
CHCODES
[
CH_LOCALE
[
av
]])
emit
(
CHCODES
[
CH_LOCALE
[
av
]])
...
@@ -185,7 +187,7 @@ def _optimize_charset(charset, fixup):
...
@@ -185,7 +187,7 @@ def _optimize_charset(charset, fixup):
return
charset
# cannot compress
return
charset
# cannot compress
except
IndexError
:
except
IndexError
:
# character set contains unicode characters
# character set contains unicode characters
return
charset
return
_optimize_unicode
(
charset
,
fixup
)
# compress character map
# compress character map
i
=
p
=
n
=
0
i
=
p
=
n
=
0
runs
=
[]
runs
=
[]
...
@@ -211,19 +213,78 @@ def _optimize_charset(charset, fixup):
...
@@ -211,19 +213,78 @@ def _optimize_charset(charset, fixup):
return
out
return
out
else
:
else
:
# use bitmap
# use bitmap
data
=
[]
data
=
_mk_bitmap
(
charmap
)
m
=
1
;
v
=
0
for
c
in
charmap
:
if
c
:
v
=
v
+
m
m
=
m
<<
1
if
m
>
MAXCODE
:
data
.
append
(
v
)
m
=
1
;
v
=
0
out
.
append
((
CHARSET
,
data
))
out
.
append
((
CHARSET
,
data
))
return
out
return
out
return
charset
return
charset
def _mk_bitmap(bits):
    """Pack a sequence of truth values into a list of code words.

    Bits are consumed in order; the first bit of each group lands in the
    least significant position of its word.  A word is emitted each time
    the running bit mask grows past MAXCODE, so any trailing bits that do
    not fill a whole word are not emitted.
    """
    words = []
    mask = 1
    word = 0
    for bit in bits:
        if bit:
            # mask has exactly one bit set and that bit is still clear in
            # word, so OR-ing is the same as adding here
            word |= mask
        mask <<= 1
        if mask > MAXCODE:
            # current word is full: store it and start a fresh one
            words.append(word)
            mask = 1
            word = 0
    return words
# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminated, and each chunk is
# given a number. In the compiled expression, the charset is
# represented by a 16-bit word sequence, consisting of one word for
# the number of different chunks, a sequence of 256 bytes (128 words)
# of chunk numbers indexed by their original chunk position, and a
# sequence of chunks (16 words each).
# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.
# Matching can be also done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).
def _optimize_unicode(charset, fixup):
    """Compress a charset containing 16-bit characters into a BIGCHARSET.

    charset is a sequence of (op, av) pairs and fixup maps a character
    code to the code actually stored in the set.  A 65536-entry membership
    map is built from the LITERAL and RANGE items (inverted when a NEGATE
    item is present), sliced into 256 chunks of 256 characters, and
    identical chunks are stored only once.

    Returns [(BIGCHARSET, data)] where data is: the number of distinct
    chunks, 128 words holding the 256 one-byte chunk numbers, then the
    bitmap of each distinct chunk.  If the set contains a CATEGORY item
    it cannot be compressed and charset is returned unchanged.
    """
    # local import so this block does not depend on the file's import list
    import sys

    charmap = [0] * 65536
    negate = 0
    for op, av in charset:
        if op is NEGATE:
            negate = 1
        elif op is LITERAL:
            charmap[fixup(av)] = 1
        elif op is RANGE:
            for i in range(fixup(av[0]), fixup(av[1]) + 1):
                charmap[i] = 1
        elif op is CATEGORY:
            # XXX: could expand category
            return charset  # cannot compress
    if negate:
        for i in range(65536):
            charmap[i] = not charmap[i]

    comps = {}            # chunk contents -> chunk number
    mapping = [0] * 256   # chunk position -> chunk number
    block = 0             # number of distinct chunks seen so far
    data = []
    for i in range(256):
        chunk = tuple(charmap[i * 256:(i + 1) * 256])
        new = comps.setdefault(chunk, block)
        mapping[i] = new
        if new == block:
            # first occurrence of this chunk: emit its bitmap
            block += 1
            data += _mk_bitmap(chunk)

    header = [block]
    # two one-byte chunk numbers are packed into each code word below
    assert MAXCODE == 65535
    # The matcher indexes the chunk-number table byte by byte in native
    # memory order, so the two bytes of each word must be laid out
    # according to the host byte order.
    big_endian = sys.byteorder == "big"
    for i in range(128):
        first = mapping[2 * i]
        second = mapping[2 * i + 1]
        if big_endian:
            header.append(256 * first + second)
        else:
            header.append(first + 256 * second)
    data[0:0] = header
    return [(BIGCHARSET, data)]
def
_simple
(
av
):
def
_simple
(
av
):
# check if av is a "simple" operator
# check if av is a "simple" operator
lo
,
hi
=
av
[
2
]
.
getwidth
()
lo
,
hi
=
av
[
2
]
.
getwidth
()
...
...
Lib/sre_constants.py
Dosyayı görüntüle @
19af43d7
...
@@ -11,7 +11,7 @@
...
@@ -11,7 +11,7 @@
# update when constants are added or removed
# update when constants are added or removed
MAGIC
=
20010
320
MAGIC
=
20010
701
# max code word in this release
# max code word in this release
...
@@ -33,6 +33,7 @@ ANY_ALL = "any_all"
...
@@ -33,6 +33,7 @@ ANY_ALL = "any_all"
ASSERT
=
"assert"
ASSERT
=
"assert"
ASSERT_NOT
=
"assert_not"
ASSERT_NOT
=
"assert_not"
AT
=
"at"
AT
=
"at"
BIGCHARSET
=
"bigcharset"
BRANCH
=
"branch"
BRANCH
=
"branch"
CALL
=
"call"
CALL
=
"call"
CATEGORY
=
"category"
CATEGORY
=
"category"
...
@@ -103,7 +104,7 @@ OPCODES = [
...
@@ -103,7 +104,7 @@ OPCODES = [
BRANCH
,
BRANCH
,
CALL
,
CALL
,
CATEGORY
,
CATEGORY
,
CHARSET
,
CHARSET
,
BIGCHARSET
,
GROUPREF
,
GROUPREF_IGNORE
,
GROUPREF
,
GROUPREF_IGNORE
,
IN
,
IN_IGNORE
,
IN
,
IN_IGNORE
,
INFO
,
INFO
,
...
...
Modules/_sre.c
Dosyayı görüntüle @
19af43d7
...
@@ -506,6 +506,19 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
...
@@ -506,6 +506,19 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
set
+=
16
;
set
+=
16
;
break
;
break
;
case
SRE_OP_BIGCHARSET
:
/* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
{
int
count
,
block
;
count
=
*
(
set
++
);
block
=
((
unsigned
char
*
)
set
)[
ch
>>
8
];
set
+=
128
;
if
(
set
[
block
*
16
+
((
ch
&
255
)
>>
4
)]
&
(
1
<<
(
ch
&
15
)))
return
ok
;
set
+=
count
*
16
;
break
;
}
case
SRE_OP_CATEGORY
:
case
SRE_OP_CATEGORY
:
/* <CATEGORY> <code> */
/* <CATEGORY> <code> */
if
(
sre_category
(
set
[
0
],
(
int
)
ch
))
if
(
sre_category
(
set
[
0
],
(
int
)
ch
))
...
...
Modules/sre_constants.h
Dosyayı görüntüle @
19af43d7
...
@@ -11,7 +11,7 @@
...
@@ -11,7 +11,7 @@
* See the _sre.c file for information on usage and redistribution.
* See the _sre.c file for information on usage and redistribution.
*/
*/
#define SRE_MAGIC 20010
320
#define SRE_MAGIC 20010
701
#define SRE_OP_FAILURE 0
#define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1
#define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2
#define SRE_OP_ANY 2
...
@@ -23,24 +23,25 @@
...
@@ -23,24 +23,25 @@
#define SRE_OP_CALL 8
#define SRE_OP_CALL 8
#define SRE_OP_CATEGORY 9
#define SRE_OP_CATEGORY 9
#define SRE_OP_CHARSET 10
#define SRE_OP_CHARSET 10
#define SRE_OP_GROUPREF 11
#define SRE_OP_BIGCHARSET 11
#define SRE_OP_GROUPREF_IGNORE 12
#define SRE_OP_GROUPREF 12
#define SRE_OP_IN 13
#define SRE_OP_GROUPREF_IGNORE 13
#define SRE_OP_IN_IGNORE 14
#define SRE_OP_IN 14
#define SRE_OP_INFO 15
#define SRE_OP_IN_IGNORE 15
#define SRE_OP_JUMP 16
#define SRE_OP_INFO 16
#define SRE_OP_LITERAL 17
#define SRE_OP_JUMP 17
#define SRE_OP_LITERAL_IGNORE 18
#define SRE_OP_LITERAL 18
#define SRE_OP_MARK 19
#define SRE_OP_LITERAL_IGNORE 19
#define SRE_OP_MAX_UNTIL 20
#define SRE_OP_MARK 20
#define SRE_OP_MIN_UNTIL 21
#define SRE_OP_MAX_UNTIL 21
#define SRE_OP_NOT_LITERAL 22
#define SRE_OP_MIN_UNTIL 22
#define SRE_OP_NOT_LITERAL_IGNORE 23
#define SRE_OP_NOT_LITERAL 23
#define SRE_OP_NEGATE 24
#define SRE_OP_NOT_LITERAL_IGNORE 24
#define SRE_OP_RANGE 25
#define SRE_OP_NEGATE 25
#define SRE_OP_REPEAT 26
#define SRE_OP_RANGE 26
#define SRE_OP_REPEAT_ONE 27
#define SRE_OP_REPEAT 27
#define SRE_OP_SUBPATTERN 28
#define SRE_OP_REPEAT_ONE 28
#define SRE_OP_SUBPATTERN 29
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BEGINNING_STRING 2
#define SRE_AT_BEGINNING_STRING 2
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment