Skip to content
Projeler
Gruplar
Parçacıklar
Yardım
Yükleniyor...
Oturum aç / Kaydol
Gezinmeyi değiştir
C
cpython
Proje
Proje
Ayrıntılar
Etkinlik
Cycle Analytics
Depo (repository)
Depo (repository)
Dosyalar
Kayıtlar (commit)
Dallar (branch)
Etiketler
Katkıda bulunanlar
Grafik
Karşılaştır
Grafikler
Konular (issue)
0
Konular (issue)
0
Liste
Pano
Etiketler
Kilometre Taşları
Birleştirme (merge) Talepleri
0
Birleştirme (merge) Talepleri
0
CI / CD
CI / CD
İş akışları (pipeline)
İşler
Zamanlamalar
Grafikler
Paketler
Paketler
Wiki
Wiki
Parçacıklar
Parçacıklar
Üyeler
Üyeler
Collapse sidebar
Close sidebar
Etkinlik
Grafik
Grafikler
Yeni bir konu (issue) oluştur
İşler
Kayıtlar (commit)
Konu (issue) Panoları
Kenar çubuğunu aç
Batuhan Osman TASKAYA
cpython
Commits
6099a032
Kaydet (Commit)
6099a032
authored
Ara 18, 2011
tarafından
Victor Stinner
Dosyalara gözat
Seçenekler
Dosyalara Gözat
İndir
Eposta Yamaları
Sade Fark
Issue #13624: Write a specialized UTF-8 encoder to allow more optimization
The main bottleneck was the PyUnicode_READ() macro.
üst
b66dcb66
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
212 additions
and
150 deletions
+212
-150
3.3.rst
Doc/whatsnew/3.3.rst
+3
-1
codecs.h
Objects/stringlib/codecs.h
+197
-0
unicodeobject.c
Objects/unicodeobject.c
+12
-149
No files found.
Doc/whatsnew/3.3.rst
Dosyayı görüntüle @
6099a032
...
...
@@ -712,7 +712,9 @@ Major performance enhancements have been added:
* the memory footprint is divided by 2 to 4 depending on the text
* encoding an ASCII string to UTF-8 no longer needs to encode any characters:
the UTF-8 representation is shared with the ASCII representation
* getting a substring of a latin1 string is 4 times faster
* the UTF-8 encoder has been optimized
* repeating a single ASCII letter and getting a substring of an ASCII string
are 4 times faster
Build and C API Changes
...
...
Objects/stringlib/codecs.h
Dosyayı görüntüle @
6099a032
...
...
@@ -153,4 +153,201 @@ _ok:
#undef LONG_PTR_MASK
#undef ASCII_CHAR_MASK
/* UTF-8 encoder specialized for a Unicode kind to avoid the slow
   PyUnicode_READ() macro.  Parts of the code are compiled out depending on
   the kind: UCS-1 strings don't need to handle surrogates, for example.

   unicode  string object being encoded; only handed to the error handler
            and to raise_encode_exception()
   data     the characters to encode, as an array of STRINGLIB_CHAR
   size     number of characters in data (must be >= 0)
   errors   error handler name, forwarded to
            unicode_encode_call_errorhandler() when a surrogate is found

   Returns a new bytes object holding the encoded data, or NULL on failure. */
Py_LOCAL_INLINE(PyObject *)
STRINGLIB(utf8_encoder)(PyObject *unicode,
                        STRINGLIB_CHAR *data,
                        Py_ssize_t size,
                        const char *errors)
{
#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */

    Py_ssize_t i;                /* index into data of next input character */
    PyObject *result;            /* result bytes object; NULL while the
                                    output still lives in stackbuf */
    char *p;                     /* next free byte in output buffer */
    Py_ssize_t nallocated;       /* number of result bytes allocated */
    Py_ssize_t nneeded;          /* number of result bytes needed */
#if STRINGLIB_SIZEOF_CHAR > 1
    /* Only kinds wider than one byte can contain surrogates, so only they
       need the error handling state. */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;        /* replacement returned by the handler */
#endif
    /* max_char_size is the number of output bytes reserved for each input
       character of this kind; stackbuf is sized accordingly. */
#if STRINGLIB_SIZEOF_CHAR == 1
    const Py_ssize_t max_char_size = 2;
    char stackbuf[MAX_SHORT_UNICHARS * 2];
#elif STRINGLIB_SIZEOF_CHAR == 2
    const Py_ssize_t max_char_size = 3;
    char stackbuf[MAX_SHORT_UNICHARS * 3];
#else /* STRINGLIB_SIZEOF_CHAR == 4 */
    const Py_ssize_t max_char_size = 4;
    char stackbuf[MAX_SHORT_UNICHARS * 4];
#endif

    assert(size >= 0);

    if (size <= MAX_SHORT_UNICHARS) {
        /* Write into the stack buffer; nallocated can't overflow.
         * At the end, we'll allocate exactly as much heap space as it
         * turns out we need.
         */
        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
        result = NULL;   /* will allocate after we're done */
        p = stackbuf;
    }
    else {
        if (size > PY_SSIZE_T_MAX / max_char_size) {
            /* integer overflow */
            return PyErr_NoMemory();
        }
        /* Overallocate on the heap, and give the excess back at the end. */
        nallocated = size * max_char_size;
        result = PyBytes_FromStringAndSize(NULL, nallocated);
        if (result == NULL)
            return NULL;
        p = PyBytes_AS_STRING(result);
    }

    for (i = 0; i < size;) {
        Py_UCS4 ch = data[i++];

        if (ch < 0x80) {
            /* Encode ASCII: one byte */
            *p++ = (char) ch;
        }
        else
#if STRINGLIB_SIZEOF_CHAR > 1
        if (ch < 0x0800)
#endif
        {
            /* Two-byte sequence: U+0080..U+07FF.  For UCS-1 data the test
               above is compiled out, so every non-ASCII character lands
               here. */
            *p++ = (char)(0xc0 | (ch >> 6));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
#if STRINGLIB_SIZEOF_CHAR > 1
        else if (Py_UNICODE_IS_SURROGATE(ch)) {
            Py_ssize_t newpos;
            Py_ssize_t repsize, k, startpos;

            /* Let the error handler supply a replacement for this single
               surrogate.  NOTE(review): newpos is filled in by the handler
               but not used afterwards; encoding always resumes at i. */
            startpos = i - 1;
            rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, "utf-8", "surrogates not allowed",
                  unicode, &exc, startpos, startpos + 1, &newpos);
            if (!rep)
                goto error;

            if (PyBytes_Check(rep)) {
                repsize = PyBytes_GET_SIZE(rep);
            }
            else {
                /* rep is treated as unicode from here on.  The string must
                   be in its ready (canonical) form before any access macro,
                   including PyUnicode_GET_LENGTH(), is applied to it, so
                   ready it before reading the length. */
                if (PyUnicode_READY(rep) < 0)
                    goto error;
                repsize = PyUnicode_GET_LENGTH(rep);
            }

            if (repsize > max_char_size) {
                /* The replacement does not fit in the max_char_size bytes
                   reserved for this character: grow the output buffer by
                   the difference. */
                Py_ssize_t offset;

                if (result == NULL)
                    offset = p - stackbuf;
                else
                    offset = p - PyBytes_AS_STRING(result);

                if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) {
                    /* integer overflow */
                    PyErr_NoMemory();
                    goto error;
                }
                nallocated += repsize - max_char_size;
                if (result != NULL) {
                    if (_PyBytes_Resize(&result, nallocated) < 0)
                        goto error;
                }
                else {
                    /* Leave the stack buffer: move what has been written
                       so far into a heap-allocated bytes object. */
                    result = PyBytes_FromStringAndSize(NULL, nallocated);
                    if (result == NULL)
                        goto error;
                    Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
                }
                /* The buffer may have moved; re-derive the write pointer. */
                p = PyBytes_AS_STRING(result) + offset;
            }

            if (PyBytes_Check(rep)) {
                /* A bytes replacement is copied to the output verbatim. */
                Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
                p += repsize;
            }
            else /* rep is unicode */ {
                enum PyUnicode_Kind repkind;
                void *repdata;

                repkind = PyUnicode_KIND(rep);
                repdata = PyUnicode_DATA(rep);

                for (k = 0; k < repsize; k++) {
                    Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
                    if (0x80 <= c) {
                        /* Only ASCII replacement characters are accepted;
                           anything else is reported as an encode error on
                           the original surrogate. */
                        raise_encode_exception(&exc, "utf-8",
                                               unicode,
                                               i - 1, i,
                                               "surrogates not allowed");
                        goto error;
                    }
                    *p++ = (char)c;
                }
            }
            Py_CLEAR(rep);
        }
        else
#if STRINGLIB_SIZEOF_CHAR > 2
        if (ch < 0x10000)
#endif
        {
            /* Three-byte sequence: remaining characters below 0x10000 */
            *p++ = (char)(0xe0 | (ch >> 12));
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
#if STRINGLIB_SIZEOF_CHAR > 2
        else /* ch >= 0x10000 */
        {
            assert(ch <= MAX_UNICODE);
            /* Encode UCS4 Unicode ordinals: four-byte sequence */
            *p++ = (char)(0xf0 | (ch >> 18));
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
#endif /* STRINGLIB_SIZEOF_CHAR > 2 */
#endif /* STRINGLIB_SIZEOF_CHAR > 1 */
    }

    if (result == NULL) {
        /* This was stack allocated. */
        nneeded = p - stackbuf;
        assert(nneeded <= nallocated);
        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
    }
    else {
        /* Cut back to size actually needed. */
        nneeded = p - PyBytes_AS_STRING(result);
        assert(nneeded <= nallocated);
        /* NOTE(review): the return value is deliberately not checked here;
           presumably a failed resize leaves result set to NULL, which is
           then returned below -- confirm against the _PyBytes_Resize
           documentation. */
        _PyBytes_Resize(&result, nneeded);
    }

#if STRINGLIB_SIZEOF_CHAR > 1
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
#endif
    return result;

#if STRINGLIB_SIZEOF_CHAR > 1
 error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(result);
    return NULL;
#endif

#undef MAX_SHORT_UNICHARS
}
#endif
/* STRINGLIB_IS_UNICODE */
Objects/unicodeobject.c
Dosyayı görüntüle @
6099a032
...
...
@@ -4987,20 +4987,9 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
PyObject *
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
{
#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Py_ssize_t i; /* index into s of next input byte */
PyObject *result; /* result string object */
char *p; /* next free byte in output buffer */
Py_ssize_t nallocated; /* number of result bytes allocated */
Py_ssize_t nneeded; /* number of result bytes needed */
char stackbuf[MAX_SHORT_UNICHARS * 4];
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
int kind;
enum PyUnicode_Kind kind;
void *data;
Py_ssize_t size;
PyObject *rep = NULL;
if (!PyUnicode_Check(unicode)) {
PyErr_BadArgument();
...
...
@@ -5018,144 +5007,18 @@ _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
data = PyUnicode_DATA(unicode);
size = PyUnicode_GET_LENGTH(unicode);
assert(size >= 0);
if (size <= MAX_SHORT_UNICHARS) {
/* Write into the stack buffer; nallocated can't overflow.
* At the end, we'll allocate exactly as much heap space as it
* turns out we need.
*/
nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
result = NULL; /* will allocate after we're done */
p = stackbuf;
}
else {
/* Overallocate on the heap, and give the excess back at the end. */
nallocated = size * 4;
if (nallocated / 4 != size) /* overflow! */
return PyErr_NoMemory();
result = PyBytes_FromStringAndSize(NULL, nallocated);
if (result == NULL)
return NULL;
p = PyBytes_AS_STRING(result);
}
for (i = 0; i < size;) {
Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
if (ch < 0x80)
/* Encode ASCII */
*p++ = (char) ch;
else if (ch < 0x0800) {
/* Encode Latin-1 */
*p++ = (char)(0xc0 | (ch >> 6));
*p++ = (char)(0x80 | (ch & 0x3f));
} else if (Py_UNICODE_IS_SURROGATE(ch)) {
Py_ssize_t newpos;
Py_ssize_t repsize, k, startpos;
startpos = i-1;
rep = unicode_encode_call_errorhandler(
errors, &errorHandler, "utf-8", "surrogates not allowed",
unicode, &exc, startpos, startpos+1, &newpos);
if (!rep)
goto error;
if (PyBytes_Check(rep))
repsize = PyBytes_GET_SIZE(rep);
else
repsize = PyUnicode_GET_LENGTH(rep);
if (repsize > 4) {
Py_ssize_t offset;
if (result == NULL)
offset = p - stackbuf;
else
offset = p - PyBytes_AS_STRING(result);
if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
/* integer overflow */
PyErr_NoMemory();
goto error;
}
nallocated += repsize - 4;
if (result != NULL) {
if (_PyBytes_Resize(&result, nallocated) < 0)
goto error;
} else {
result = PyBytes_FromStringAndSize(NULL, nallocated);
if (result == NULL)
goto error;
Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
}
p = PyBytes_AS_STRING(result) + offset;
}
if (PyBytes_Check(rep)) {
char *prep = PyBytes_AS_STRING(rep);
for(k = repsize; k > 0; k--)
*p++ = *prep++;
} else /* rep is unicode */ {
enum PyUnicode_Kind repkind;
void *repdata;
if (PyUnicode_READY(rep) < 0)
goto error;
repkind = PyUnicode_KIND(rep);
repdata = PyUnicode_DATA(rep);
for(k=0; k<repsize; k++) {
Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
if (0x80 <= c) {
raise_encode_exception(&exc, "utf-8",
unicode,
i-1, i,
"surrogates not allowed");
goto error;
}
*p++ = (char)c;
}
}
Py_CLEAR(rep);
} else if (ch < 0x10000) {
*p++ = (char)(0xe0 | (ch >> 12));
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
*p++ = (char)(0x80 | (ch & 0x3f));
} else /* ch >= 0x10000 */ {
assert(ch <= MAX_UNICODE);
/* Encode UCS4 Unicode ordinals */
*p++ = (char)(0xf0 | (ch >> 18));
*p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
*p++ = (char)(0x80 | (ch & 0x3f));
}
}
if (result == NULL) {
/* This was stack allocated. */
nneeded = p - stackbuf;
assert(nneeded <= nallocated);
result = PyBytes_FromStringAndSize(stackbuf, nneeded);
}
else {
/* Cut back to size actually needed. */
nneeded = p - PyBytes_AS_STRING(result);
assert(nneeded <= nallocated);
_PyBytes_Resize(&result, nneeded);
switch(kind) {
default:
assert(0);
case PyUnicode_1BYTE_KIND:
/* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
assert(!PyUnicode_IS_ASCII(unicode));
return ucs1lib_utf8_encoder(unicode, data, size, errors);
case PyUnicode_2BYTE_KIND:
return ucs2lib_utf8_encoder(unicode, data, size, errors);
case PyUnicode_4BYTE_KIND:
return ucs4lib_utf8_encoder(unicode, data, size, errors);
}
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
return result;
error:
Py_XDECREF(rep);
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
Py_XDECREF(result);
return NULL;
#undef MAX_SHORT_UNICHARS
}
PyObject *
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment