Skip to content
Projeler
Gruplar
Parçacıklar
Yardım
Yükleniyor...
Oturum aç / Kaydol
Gezinmeyi değiştir
C
cpython
Proje
Proje
Ayrıntılar
Etkinlik
Cycle Analytics
Depo (repository)
Depo (repository)
Dosyalar
Kayıtlar (commit)
Dallar (branch)
Etiketler
Katkıda bulunanlar
Grafik
Karşılaştır
Grafikler
Konular (issue)
0
Konular (issue)
0
Liste
Pano
Etiketler
Kilometre Taşları
Birleştirme (merge) Talepleri
0
Birleştirme (merge) Talepleri
0
CI / CD
CI / CD
İş akışları (pipeline)
İşler
Zamanlamalar
Grafikler
Paketler
Paketler
Wiki
Wiki
Parçacıklar
Parçacıklar
Üyeler
Üyeler
Collapse sidebar
Close sidebar
Etkinlik
Grafik
Grafikler
Yeni bir konu (issue) oluştur
İşler
Kayıtlar (commit)
Konu (issue) Panoları
Kenar çubuğunu aç
Batuhan Osman TASKAYA
cpython
Commits
ca5f91b8
Kaydet (Commit)
ca5f91b8
authored
May 10, 2012
tarafından
Antoine Pitrou
Dosyalara gözat
Seçenekler
Dosyalara Gözat
İndir
Eposta Yamaları
Sade Fark
Issue #14738: Speed-up UTF-8 decoding on non-ASCII data. Patch by Serhiy Storchaka.
üst
fda08b08
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
151 additions
and
78 deletions
+151
-78
NEWS
Misc/NEWS
+3
-0
asciilib.h
Objects/stringlib/asciilib.h
+1
-0
codecs.h
Objects/stringlib/codecs.h
+143
-78
ucs1lib.h
Objects/stringlib/ucs1lib.h
+1
-0
ucs2lib.h
Objects/stringlib/ucs2lib.h
+1
-0
ucs4lib.h
Objects/stringlib/ucs4lib.h
+1
-0
undef.h
Objects/stringlib/undef.h
+1
-0
unicodeobject.c
Objects/unicodeobject.c
+0
-0
No files found.
Misc/NEWS
Dosyayı görüntüle @
ca5f91b8
...
@@ -10,6 +10,9 @@ What's New in Python 3.3.0 Alpha 4?
...
@@ -10,6 +10,9 @@ What's New in Python 3.3.0 Alpha 4?
Core and Builtins
Core and Builtins
-----------------
-----------------
- Issue #14738: Speed-up UTF-8 decoding on non-ASCII data. Patch by Serhiy
Storchaka.
- Issue #14700: Fix two broken and undefined-behaviour-inducing overflow checks
- Issue #14700: Fix two broken and undefined-behaviour-inducing overflow checks
in old-style string formatting.
in old-style string formatting.
...
...
Objects/stringlib/asciilib.h
Dosyayı görüntüle @
ca5f91b8
...
@@ -7,6 +7,7 @@
...
@@ -7,6 +7,7 @@
#define STRINGLIB(F) asciilib_##F
#define STRINGLIB(F) asciilib_##F
#define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_SIZEOF_CHAR 1
#define STRINGLIB_SIZEOF_CHAR 1
#define STRINGLIB_MAX_CHAR 0x7Fu
#define STRINGLIB_CHAR Py_UCS1
#define STRINGLIB_CHAR Py_UCS1
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"
#define STRINGLIB_PARSE_CODE "U"
...
...
Objects/stringlib/codecs.h
Dosyayı görüntüle @
ca5f91b8
...
@@ -15,19 +15,18 @@
...
@@ -15,19 +15,18 @@
# error C 'long' size should be either 4 or 8!
# error C 'long' size should be either 4 or 8!
#endif
#endif
Py_LOCAL_INLINE
(
int
)
Py_LOCAL_INLINE
(
Py_UCS4
)
STRINGLIB
(
utf8_
try_decode
)(
const
char
*
start
,
const
char
*
end
,
STRINGLIB
(
utf8_
decode
)(
const
char
**
inptr
,
const
char
*
end
,
STRINGLIB_CHAR
*
dest
,
STRINGLIB_CHAR
*
dest
,
const
char
**
src_pos
,
Py_ssize_t
*
dest_index
)
Py_ssize_t
*
outpos
)
{
{
int
ret
;
Py_UCS4
ch
;
Py_ssize_t
n
;
const
char
*
s
=
*
inptr
;
const
char
*
s
=
start
;
const
char
*
aligned_end
=
(
const
char
*
)
((
size_t
)
end
&
~
LONG_PTR_MASK
);
const
char
*
aligned_end
=
(
const
char
*
)
((
size_t
)
end
&
~
LONG_PTR_MASK
);
STRINGLIB_CHAR
*
p
=
dest
;
STRINGLIB_CHAR
*
p
=
dest
+
*
outpos
;
while
(
s
<
end
)
{
while
(
s
<
end
)
{
Py_UCS4
ch
=
(
unsigned
char
)
*
s
;
ch
=
(
unsigned
char
)
*
s
;
if
(
ch
<
0x80
)
{
if
(
ch
<
0x80
)
{
/* Fast path for runs of ASCII characters. Given that common UTF-8
/* Fast path for runs of ASCII characters. Given that common UTF-8
...
@@ -48,15 +47,33 @@ STRINGLIB(utf8_try_decode)(const char *start, const char *end,
...
@@ -48,15 +47,33 @@ STRINGLIB(utf8_try_decode)(const char *start, const char *end,
unsigned
long
value
=
*
(
unsigned
long
*
)
_s
;
unsigned
long
value
=
*
(
unsigned
long
*
)
_s
;
if
(
value
&
ASCII_CHAR_MASK
)
if
(
value
&
ASCII_CHAR_MASK
)
break
;
break
;
_p
[
0
]
=
_s
[
0
];
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
_p
[
1
]
=
_s
[
1
];
_p
[
0
]
=
(
STRINGLIB_CHAR
)(
value
&
0xFFu
);
_p
[
2
]
=
_s
[
2
];
_p
[
1
]
=
(
STRINGLIB_CHAR
)((
value
>>
8
)
&
0xFFu
);
_p
[
3
]
=
_s
[
3
];
_p
[
2
]
=
(
STRINGLIB_CHAR
)((
value
>>
16
)
&
0xFFu
);
#if (SIZEOF_LONG == 8)
_p
[
3
]
=
(
STRINGLIB_CHAR
)((
value
>>
24
)
&
0xFFu
);
_p
[
4
]
=
_s
[
4
];
# if SIZEOF_LONG == 8
_p
[
5
]
=
_s
[
5
];
_p
[
4
]
=
(
STRINGLIB_CHAR
)((
value
>>
32
)
&
0xFFu
);
_p
[
6
]
=
_s
[
6
];
_p
[
5
]
=
(
STRINGLIB_CHAR
)((
value
>>
40
)
&
0xFFu
);
_p
[
7
]
=
_s
[
7
];
_p
[
6
]
=
(
STRINGLIB_CHAR
)((
value
>>
48
)
&
0xFFu
);
_p
[
7
]
=
(
STRINGLIB_CHAR
)((
value
>>
56
)
&
0xFFu
);
# endif
#else
# if SIZEOF_LONG == 8
_p
[
0
]
=
(
STRINGLIB_CHAR
)((
value
>>
56
)
&
0xFFu
);
_p
[
1
]
=
(
STRINGLIB_CHAR
)((
value
>>
48
)
&
0xFFu
);
_p
[
2
]
=
(
STRINGLIB_CHAR
)((
value
>>
40
)
&
0xFFu
);
_p
[
3
]
=
(
STRINGLIB_CHAR
)((
value
>>
32
)
&
0xFFu
);
_p
[
4
]
=
(
STRINGLIB_CHAR
)((
value
>>
24
)
&
0xFFu
);
_p
[
5
]
=
(
STRINGLIB_CHAR
)((
value
>>
16
)
&
0xFFu
);
_p
[
6
]
=
(
STRINGLIB_CHAR
)((
value
>>
8
)
&
0xFFu
);
_p
[
7
]
=
(
STRINGLIB_CHAR
)(
value
&
0xFFu
);
# else
_p
[
0
]
=
(
STRINGLIB_CHAR
)((
value
>>
24
)
&
0xFFu
);
_p
[
1
]
=
(
STRINGLIB_CHAR
)((
value
>>
16
)
&
0xFFu
);
_p
[
2
]
=
(
STRINGLIB_CHAR
)((
value
>>
8
)
&
0xFFu
);
_p
[
3
]
=
(
STRINGLIB_CHAR
)(
value
&
0xFFu
);
# endif
#endif
#endif
_s
+=
SIZEOF_LONG
;
_s
+=
SIZEOF_LONG
;
_p
+=
SIZEOF_LONG
;
_p
+=
SIZEOF_LONG
;
...
@@ -67,87 +84,135 @@ STRINGLIB(utf8_try_decode)(const char *start, const char *end,
...
@@ -67,87 +84,135 @@ STRINGLIB(utf8_try_decode)(const char *start, const char *end,
break
;
break
;
ch
=
(
unsigned
char
)
*
s
;
ch
=
(
unsigned
char
)
*
s
;
}
}
if
(
ch
<
0x80
)
{
s
++
;
*
p
++
=
ch
;
continue
;
}
}
}
if
(
ch
<
0x80
)
{
if
(
ch
<
0xC2
)
{
s
++
;
/* invalid sequence
*
p
++
=
ch
;
\x80-\xBF -- continuation byte
continue
;
\xC0-\xC1 -- fake 0000-007F */
}
goto
InvalidStart
;
n
=
utf8_code_length
[
ch
];
if
(
s
+
n
>
end
)
{
/* unexpected end of data: the caller will decide whether
it's an error or not */
goto
_error
;
}
}
switch
(
n
)
{
if
(
ch
<
0xE0
)
{
case
0
:
/* \xC2\x80-\xDF\xBF -- 0080-07FF */
/* invalid start byte */
Py_UCS4
ch2
;
goto
_error
;
if
(
end
-
s
<
2
)
{
case
1
:
/* unexpected end of data: the caller will decide whether
/* internal error */
it's an error or not */
goto
_error
;
break
;
case
2
:
}
if
((
s
[
1
]
&
0xc0
)
!=
0x80
)
ch2
=
(
unsigned
char
)
s
[
1
];
if
((
ch2
&
0xC0
)
!=
0x80
)
/* invalid continuation byte */
/* invalid continuation byte */
goto
_error
;
goto
InvalidContinuation
;
ch
=
((
s
[
0
]
&
0x1f
)
<<
6
)
+
(
s
[
1
]
&
0x3f
);
ch
=
(
ch
<<
6
)
+
ch2
-
((
0xC0
<<
6
)
+
0x80
);
assert
((
ch
>
0x007F
)
&&
(
ch
<=
0x07FF
));
assert
((
ch
>
0x007F
)
&&
(
ch
<=
0x07FF
));
s
+=
2
;
s
+=
2
;
if
(
STRINGLIB_MAX_CHAR
<=
0x007F
||
(
STRINGLIB_MAX_CHAR
<
0x07FF
&&
ch
>
STRINGLIB_MAX_CHAR
))
goto
Overflow
;
*
p
++
=
ch
;
*
p
++
=
ch
;
break
;
continue
;
}
case
3
:
if
(
ch
<
0xF0
)
{
/*
Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
/*
\xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
will result in surrogates in range d800-dfff. Surrogates are
Py_UCS4
ch2
,
ch3
;
not valid UTF-8 so they are rejected.
if
(
end
-
s
<
3
)
{
See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
/* unexpected end of data: the caller will decide whether
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
it's an error or not */
if
((
s
[
1
]
&
0xc0
)
!=
0x80
||
break
;
(
s
[
2
]
&
0xc0
)
!=
0x80
||
}
((
unsigned
char
)
s
[
0
]
==
0xE0
&&
ch2
=
(
unsigned
char
)
s
[
1
];
(
unsigned
char
)
s
[
1
]
<
0xA0
)
||
ch3
=
(
unsigned
char
)
s
[
2
];
((
unsigned
char
)
s
[
0
]
==
0xED
&&
if
((
ch2
&
0xC0
)
!=
0x80
||
(
unsigned
char
)
s
[
1
]
>
0x9F
)
)
{
(
ch3
&
0xC0
)
!=
0x80
)
{
/* invalid continuation byte */
/* invalid continuation byte */
goto
_error
;
goto
InvalidContinuation
;
}
if
(
ch
==
0xE0
)
{
if
(
ch2
<
0xA0
)
/* invalid sequence
\xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
goto
InvalidContinuation
;
}
}
ch
=
((
s
[
0
]
&
0x0f
)
<<
12
)
+
((
s
[
1
]
&
0x3f
)
<<
6
)
+
(
s
[
2
]
&
0x3f
);
else
if
(
ch
==
0xED
&&
ch2
>
0x9F
)
{
/* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
will result in surrogates in range D800-DFFF. Surrogates are
not valid UTF-8 so they are rejected.
See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
goto
InvalidContinuation
;
}
ch
=
(
ch
<<
12
)
+
(
ch2
<<
6
)
+
ch3
-
((
0xE0
<<
12
)
+
(
0x80
<<
6
)
+
0x80
);
assert
((
ch
>
0x07FF
)
&&
(
ch
<=
0xFFFF
));
assert
((
ch
>
0x07FF
)
&&
(
ch
<=
0xFFFF
));
s
+=
3
;
s
+=
3
;
if
(
STRINGLIB_MAX_CHAR
<=
0x07FF
||
(
STRINGLIB_MAX_CHAR
<
0xFFFF
&&
ch
>
STRINGLIB_MAX_CHAR
))
goto
Overflow
;
*
p
++
=
ch
;
*
p
++
=
ch
;
break
;
continue
;
}
case
4
:
if
(
ch
<
0xF5
)
{
if
((
s
[
1
]
&
0xc0
)
!=
0x80
||
/* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
(
s
[
2
]
&
0xc0
)
!=
0x80
||
Py_UCS4
ch2
,
ch3
,
ch4
;
(
s
[
3
]
&
0xc0
)
!=
0x80
||
if
(
end
-
s
<
4
)
{
((
unsigned
char
)
s
[
0
]
==
0xF0
&&
/* unexpected end of data: the caller will decide whether
(
unsigned
char
)
s
[
1
]
<
0x90
)
||
it's an error or not */
((
unsigned
char
)
s
[
0
]
==
0xF4
&&
break
;
(
unsigned
char
)
s
[
1
]
>
0x8F
))
{
}
ch2
=
(
unsigned
char
)
s
[
1
];
ch3
=
(
unsigned
char
)
s
[
2
];
ch4
=
(
unsigned
char
)
s
[
3
];
if
((
ch2
&
0xC0
)
!=
0x80
||
(
ch3
&
0xC0
)
!=
0x80
||
(
ch4
&
0xC0
)
!=
0x80
)
{
/* invalid continuation byte */
/* invalid continuation byte */
goto
_error
;
goto
InvalidContinuation
;
}
if
(
ch
==
0xF0
)
{
if
(
ch2
<
0x90
)
/* invalid sequence
\xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
goto
InvalidContinuation
;
}
}
ch
=
((
s
[
0
]
&
0x7
)
<<
18
)
+
((
s
[
1
]
&
0x3f
)
<<
12
)
+
else
if
(
ch
==
0xF4
&&
ch2
>
0x8F
)
{
((
s
[
2
]
&
0x3f
)
<<
6
)
+
(
s
[
3
]
&
0x3f
);
/* invalid sequence
assert
((
ch
>
0xFFFF
)
&&
(
ch
<=
0x10ffff
));
\xF4\x90\x80\x80- -- 110000- overflow */
goto
InvalidContinuation
;
}
ch
=
(
ch
<<
18
)
+
(
ch2
<<
12
)
+
(
ch3
<<
6
)
+
ch4
-
((
0xF0
<<
18
)
+
(
0x80
<<
12
)
+
(
0x80
<<
6
)
+
0x80
);
assert
((
ch
>
0xFFFF
)
&&
(
ch
<=
0x10FFFF
));
s
+=
4
;
s
+=
4
;
if
(
STRINGLIB_MAX_CHAR
<=
0xFFFF
||
(
STRINGLIB_MAX_CHAR
<
0x10FFFF
&&
ch
>
STRINGLIB_MAX_CHAR
))
goto
Overflow
;
*
p
++
=
ch
;
*
p
++
=
ch
;
break
;
continue
;
}
}
goto
InvalidStart
;
}
}
ret
=
0
;
ch
=
0
;
goto
_ok
;
Overflow
:
_error
:
Return
:
ret
=
-
1
;
*
inptr
=
s
;
_ok
:
*
outpos
=
p
-
dest
;
*
src_pos
=
s
;
return
ch
;
*
dest_index
=
p
-
dest
;
InvalidStart
:
return
ret
;
ch
=
1
;
goto
Return
;
InvalidContinuation
:
ch
=
2
;
goto
Return
;
}
}
#undef LONG_PTR_MASK
#undef LONG_PTR_MASK
...
...
Objects/stringlib/ucs1lib.h
Dosyayı görüntüle @
ca5f91b8
...
@@ -7,6 +7,7 @@
...
@@ -7,6 +7,7 @@
#define STRINGLIB(F) ucs1lib_##F
#define STRINGLIB(F) ucs1lib_##F
#define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_SIZEOF_CHAR 1
#define STRINGLIB_SIZEOF_CHAR 1
#define STRINGLIB_MAX_CHAR 0xFFu
#define STRINGLIB_CHAR Py_UCS1
#define STRINGLIB_CHAR Py_UCS1
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"
#define STRINGLIB_PARSE_CODE "U"
...
...
Objects/stringlib/ucs2lib.h
Dosyayı görüntüle @
ca5f91b8
...
@@ -7,6 +7,7 @@
...
@@ -7,6 +7,7 @@
#define STRINGLIB(F) ucs2lib_##F
#define STRINGLIB(F) ucs2lib_##F
#define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_SIZEOF_CHAR 2
#define STRINGLIB_SIZEOF_CHAR 2
#define STRINGLIB_MAX_CHAR 0xFFFFu
#define STRINGLIB_CHAR Py_UCS2
#define STRINGLIB_CHAR Py_UCS2
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"
#define STRINGLIB_PARSE_CODE "U"
...
...
Objects/stringlib/ucs4lib.h
Dosyayı görüntüle @
ca5f91b8
...
@@ -7,6 +7,7 @@
...
@@ -7,6 +7,7 @@
#define STRINGLIB(F) ucs4lib_##F
#define STRINGLIB(F) ucs4lib_##F
#define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_SIZEOF_CHAR 4
#define STRINGLIB_SIZEOF_CHAR 4
#define STRINGLIB_MAX_CHAR 0x10FFFFu
#define STRINGLIB_CHAR Py_UCS4
#define STRINGLIB_CHAR Py_UCS4
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"
#define STRINGLIB_PARSE_CODE "U"
...
...
Objects/stringlib/undef.h
Dosyayı görüntüle @
ca5f91b8
#undef FASTSEARCH
#undef FASTSEARCH
#undef STRINGLIB
#undef STRINGLIB
#undef STRINGLIB_SIZEOF_CHAR
#undef STRINGLIB_SIZEOF_CHAR
#undef STRINGLIB_MAX_CHAR
#undef STRINGLIB_CHAR
#undef STRINGLIB_CHAR
#undef STRINGLIB_STR
#undef STRINGLIB_STR
#undef STRINGLIB_LEN
#undef STRINGLIB_LEN
...
...
Objects/unicodeobject.c
Dosyayı görüntüle @
ca5f91b8
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment