Skip to content
Projeler
Gruplar
Parçacıklar
Yardım
Yükleniyor...
Oturum aç / Kaydol
Gezinmeyi değiştir
C
cpython
Proje
Proje
Ayrıntılar
Etkinlik
Cycle Analytics
Depo (repository)
Depo (repository)
Dosyalar
Kayıtlar (commit)
Dallar (branch)
Etiketler
Katkıda bulunanlar
Grafik
Karşılaştır
Grafikler
Konular (issue)
0
Konular (issue)
0
Liste
Pano
Etiketler
Kilometre Taşları
Birleştirme (merge) Talepleri
0
Birleştirme (merge) Talepleri
0
CI / CD
CI / CD
İş akışları (pipeline)
İşler
Zamanlamalar
Grafikler
Paketler
Paketler
Wiki
Wiki
Parçacıklar
Parçacıklar
Üyeler
Üyeler
Collapse sidebar
Close sidebar
Etkinlik
Grafik
Grafikler
Yeni bir konu (issue) oluştur
İşler
Kayıtlar (commit)
Konu (issue) Panoları
Kenar çubuğunu aç
Batuhan Osman TASKAYA
cpython
Commits
e64322e0
Kaydet (Commit)
e64322e0
authored
Eki 30, 2012
tarafından
Victor Stinner
Dosyalara gözat
Seçenekler
Dosyalara Gözat
İndir
Eposta Yamaları
Sade Fark
Close #14625: Rewrite the UTF-32 decoder. It is now 3x to 4x faster
Patch written by Serhiy Storchaka.
üst
d4156c16
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
73 additions
and
74 deletions
+73
-74
3.4.rst
Doc/whatsnew/3.4.rst
+1
-1
NEWS
Misc/NEWS
+3
-0
unicodeobject.c
Objects/unicodeobject.c
+69
-73
No files found.
Doc/whatsnew/3.4.rst
Dosyayı görüntüle @
e64322e0
...
@@ -157,7 +157,7 @@ Optimizations
...
@@ -157,7 +157,7 @@ Optimizations
Major performance enhancements have been added:
Major performance enhancements have been added:
*
None yet
.
*
The UTF-32 decoder is now 3x to 4x faster
.
Build and C API Changes
Build and C API Changes
...
...
Misc/NEWS
Dosyayı görüntüle @
e64322e0
...
@@ -10,6 +10,9 @@ What's New in Python 3.4.0 Alpha 1?
...
@@ -10,6 +10,9 @@ What's New in Python 3.4.0 Alpha 1?
Core and Builtins
Core and Builtins
-----------------
-----------------
- Issue #14625: Rewrite the UTF-32 decoder. It is now 3x to 4x faster. Patch
written by Serhiy Storchaka.
- Issue #16197: Update winreg docstrings and documentation to match code.
- Issue #16197: Update winreg docstrings and documentation to match code.
Patch by Zachary Ware.
Patch by Zachary Ware.
...
...
Objects/unicodeobject.c
Dosyayı görüntüle @
e64322e0
...
@@ -4804,14 +4804,8 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
...
@@ -4804,14 +4804,8 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
Py_ssize_t outpos;
Py_ssize_t outpos;
PyObject *unicode;
PyObject *unicode;
const unsigned char *q, *e;
const unsigned char *q, *e;
int bo = 0; /* assume native ordering by default */
int
le,
bo = 0; /* assume native ordering by default */
const char *errmsg = "";
const char *errmsg = "";
/* Offsets from q for retrieving bytes in the right order. */
#if PY_LITTLE_ENDIAN
int iorder[] = {0, 1, 2, 3};
#else
int iorder[] = {3, 2, 1, 0};
#endif
PyObject *errorHandler = NULL;
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
PyObject *exc = NULL;
...
@@ -4825,83 +4819,88 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
...
@@ -4825,83 +4819,88 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
byte order setting accordingly. In native mode, the leading BOM
byte order setting accordingly. In native mode, the leading BOM
mark is skipped, in all other modes, it is copied to the output
mark is skipped, in all other modes, it is copied to the output
stream as-is (giving a ZWNBSP character). */
stream as-is (giving a ZWNBSP character). */
if (bo == 0) {
if (bo == 0 && size >= 4) {
if (size >= 4) {
Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
if (bom == 0x0000FEFF) {
(q[iorder[1]] << 8) | q[iorder[0]];
bo = -1;
#if PY_LITTLE_ENDIAN
q += 4;
if (bom == 0x0000FEFF) {
q += 4;
bo = -1;
}
else if (bom == 0xFFFE0000) {
q += 4;
bo = 1;
}
#else
if (bom == 0x0000FEFF) {
q += 4;
bo = 1;
}
else if (bom == 0xFFFE0000) {
q += 4;
bo = -1;
}
#endif
}
}
else if (bom == 0xFFFE0000) {
bo = 1;
q += 4;
}
if (byteorder)
*byteorder = bo;
}
}
if (bo == -1) {
if (q == e) {
/* force LE */
if (consumed)
iorder[0] = 0;
*consumed = size;
iorder[1] = 1;
Py_INCREF(unicode_empty);
iorder[2] = 2;
return unicode_empty;
iorder[3] = 3;
}
else if (bo == 1) {
/* force BE */
iorder[0] = 3;
iorder[1] = 2;
iorder[2] = 1;
iorder[3] = 0;
}
}
/* This might be one too much, because of a BOM */
#ifdef WORDS_BIGENDIAN
unicode = PyUnicode_New((size+3)/4, 127);
le = bo < 0;
#else
le = bo <= 0;
#endif
unicode = PyUnicode_New((e - q + 3) / 4, 127);
if (!unicode)
if (!unicode)
return NULL;
return NULL;
if (size == 0)
return unicode;
outpos = 0;
outpos = 0;
while (1) {
Py_UCS4 ch = 0;
Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(unicode);
if (e - q >= 4) {
enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
void *data = PyUnicode_DATA(unicode);
const unsigned char *last = e - 4;
if (le) {
do {
ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
if (ch > maxch)
break;
PyUnicode_WRITE(kind, data, outpos++, ch);
q += 4;
} while (q <= last);
}
else {
do {
ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
if (ch > maxch)
break;
PyUnicode_WRITE(kind, data, outpos++, ch);
q += 4;
} while (q <= last);
}
}
while (q < e) {
if (ch <= maxch) {
Py_UCS4 ch;
if (q == e || consumed)
/* remaining bytes at the end? (size should be divisible by 4) */
if (e-q<4) {
if (consumed)
break;
break;
/* remaining bytes at the end? (size should be divisible by 4) */
errmsg = "truncated data";
errmsg = "truncated data";
startinpos = ((const char *)q)-starts;
startinpos = ((const char *)q) - starts;
endinpos = ((const char *)e)-starts;
endinpos = ((const char *)e) - starts;
goto utf32Error;
/* The remaining input chars are ignored if the callback
chooses to skip the input */
}
}
ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
else {
(q[iorder[1]] << 8) | q[iorder[0]];
if (ch < 0x110000) {
if (unicode_putchar(&unicode, &outpos, ch) < 0)
if (ch >= 0x110000)
goto onError;
{
q += 4;
continue;
}
errmsg = "codepoint not in range(0x110000)";
errmsg = "codepoint not in range(0x110000)";
startinpos = ((const char *)q)-starts;
startinpos = ((const char *)q) - starts;
endinpos = startinpos+4;
endinpos = startinpos + 4;
goto utf32Error;
}
}
if (unicode_putchar(&unicode, &outpos, ch) < 0)
goto onError;
/* The remaining input chars are ignored if the callback
q += 4;
chooses to skip the input */
continue;
utf32Error:
if (unicode_decode_call_errorhandler(
if (unicode_decode_call_errorhandler(
errors, &errorHandler,
errors, &errorHandler,
"utf32", errmsg,
"utf32", errmsg,
...
@@ -4910,9 +4909,6 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
...
@@ -4910,9 +4909,6 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
goto onError;
goto onError;
}
}
if (byteorder)
*byteorder = bo;
if (consumed)
if (consumed)
*consumed = (const char *)q-starts;
*consumed = (const char *)q-starts;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment