Commit 942889aa, authored by Victor Stinner

Issue #27938: Add a fast-path for us-ascii encoding

Other changes:

* Rewrite _Py_normalize_encoding() as a C implementation of
  encodings.normalize_encoding(). For example, " utf-8 " is now normalized to
  "utf_8". So the fast path is now used for more name variants of the same
  encoding.
* Avoid strcpy() when encoding is NULL: call the UTF-8 codec directly
parent a9ab165c
...@@ -3100,9 +3100,9 @@ PyUnicode_FromEncodedObject(PyObject *obj, ...@@ -3100,9 +3100,9 @@ PyUnicode_FromEncodedObject(PyObject *obj,
return v; return v;
} }
/* Convert encoding to lower case and replace '_' with '-' in order to /* Normalize an encoding name: C implementation of
catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), encodings.normalize_encoding(). Return 1 on success, or 0 on error (encoding
1 on success. */ is longer than lower_len-1). */
int int
_Py_normalize_encoding(const char *encoding, _Py_normalize_encoding(const char *encoding,
char *lower, char *lower,
...@@ -3111,30 +3111,39 @@ _Py_normalize_encoding(const char *encoding, ...@@ -3111,30 +3111,39 @@ _Py_normalize_encoding(const char *encoding,
const char *e; const char *e;
char *l; char *l;
char *l_end; char *l_end;
int punct;
assert(encoding != NULL);
if (encoding == NULL) {
/* 6 == strlen("utf-8") + 1 */
if (lower_len < 6)
return 0;
strcpy(lower, "utf-8");
return 1;
}
e = encoding; e = encoding;
l = lower; l = lower;
l_end = &lower[lower_len - 1]; l_end = &lower[lower_len - 1];
while (*e) { punct = 0;
if (l == l_end) while (1) {
return 0; char c = *e;
if (Py_ISUPPER(*e)) { if (c == 0) {
*l++ = Py_TOLOWER(*e++); break;
} }
else if (*e == '_') {
*l++ = '-'; if (Py_ISALNUM(c) || c == '.') {
e++; if (punct && l != lower) {
if (l == l_end) {
return 0;
}
*l++ = '_';
}
punct = 0;
if (l == l_end) {
return 0;
}
*l++ = Py_TOLOWER(c);
} }
else { else {
*l++ = *e++; punct = 1;
} }
e++;
} }
*l = '\0'; *l = '\0';
return 1; return 1;
...@@ -3148,28 +3157,51 @@ PyUnicode_Decode(const char *s, ...@@ -3148,28 +3157,51 @@ PyUnicode_Decode(const char *s,
{ {
PyObject *buffer = NULL, *unicode; PyObject *buffer = NULL, *unicode;
Py_buffer info; Py_buffer info;
char lower[11]; /* Enough for any encoding shortcut */ char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
if (encoding == NULL) {
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}
/* Shortcuts for common default encodings */ /* Shortcuts for common default encodings */
if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) { if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
if ((strcmp(lower, "utf-8") == 0) || char *lower = buflower;
(strcmp(lower, "utf8") == 0))
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); /* Fast paths */
else if ((strcmp(lower, "latin-1") == 0) || if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
(strcmp(lower, "latin1") == 0) || lower += 3;
(strcmp(lower, "iso-8859-1") == 0) || if (*lower == '_') {
(strcmp(lower, "iso8859-1") == 0)) /* Match "utf8" and "utf_8" */
return PyUnicode_DecodeLatin1(s, size, errors); lower++;
#ifdef HAVE_MBCS }
else if (strcmp(lower, "mbcs") == 0)
return PyUnicode_DecodeMBCS(s, size, errors); if (lower[0] == '8' && lower[1] == 0) {
#endif return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
else if (strcmp(lower, "ascii") == 0) }
return PyUnicode_DecodeASCII(s, size, errors); else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
else if (strcmp(lower, "utf-16") == 0) return PyUnicode_DecodeUTF16(s, size, errors, 0);
return PyUnicode_DecodeUTF16(s, size, errors, 0); }
else if (strcmp(lower, "utf-32") == 0) else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
return PyUnicode_DecodeUTF32(s, size, errors, 0); return PyUnicode_DecodeUTF32(s, size, errors, 0);
}
}
else {
if (strcmp(lower, "ascii") == 0
|| strcmp(lower, "us_ascii") == 0) {
return PyUnicode_DecodeASCII(s, size, errors);
}
#ifdef HAVE_MBCS
else if (strcmp(lower, "mbcs") == 0) {
return PyUnicode_DecodeMBCS(s, size, errors);
}
#endif
else if (strcmp(lower, "latin1") == 0
|| strcmp(lower, "latin_1") == 0
|| strcmp(lower, "iso_8859_1") == 0
|| strcmp(lower, "iso8859_1") == 0) {
return PyUnicode_DecodeLatin1(s, size, errors);
}
}
} }
/* Decode via the codec registry */ /* Decode via the codec registry */
...@@ -3512,34 +3544,56 @@ PyUnicode_AsEncodedString(PyObject *unicode, ...@@ -3512,34 +3544,56 @@ PyUnicode_AsEncodedString(PyObject *unicode,
const char *errors) const char *errors)
{ {
PyObject *v; PyObject *v;
char lower[11]; /* Enough for any encoding shortcut */ char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
if (!PyUnicode_Check(unicode)) { if (!PyUnicode_Check(unicode)) {
PyErr_BadArgument(); PyErr_BadArgument();
return NULL; return NULL;
} }
if (encoding == NULL) {
return _PyUnicode_AsUTF8String(unicode, errors);
}
/* Shortcuts for common default encodings */ /* Shortcuts for common default encodings */
if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) { if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
if ((strcmp(lower, "utf-8") == 0) || char *lower = buflower;
(strcmp(lower, "utf8") == 0))
{ /* Fast paths */
if (errors == NULL || strcmp(errors, "strict") == 0) if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
return _PyUnicode_AsUTF8String(unicode, NULL); lower += 3;
else if (*lower == '_') {
/* Match "utf8" and "utf_8" */
lower++;
}
if (lower[0] == '8' && lower[1] == 0) {
return _PyUnicode_AsUTF8String(unicode, errors); return _PyUnicode_AsUTF8String(unicode, errors);
}
else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
return _PyUnicode_EncodeUTF16(unicode, errors, 0);
}
else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
return _PyUnicode_EncodeUTF32(unicode, errors, 0);
}
} }
else if ((strcmp(lower, "latin-1") == 0) || else {
(strcmp(lower, "latin1") == 0) || if (strcmp(lower, "ascii") == 0
(strcmp(lower, "iso-8859-1") == 0) || || strcmp(lower, "us_ascii") == 0) {
(strcmp(lower, "iso8859-1") == 0)) return _PyUnicode_AsASCIIString(unicode, errors);
return _PyUnicode_AsLatin1String(unicode, errors); }
#ifdef HAVE_MBCS #ifdef HAVE_MBCS
else if (strcmp(lower, "mbcs") == 0) else if (strcmp(lower, "mbcs") == 0) {
return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors); return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
}
#endif #endif
else if (strcmp(lower, "ascii") == 0) else if (strcmp(lower, "latin1") == 0 ||
return _PyUnicode_AsASCIIString(unicode, errors); strcmp(lower, "latin_1") == 0 ||
strcmp(lower, "iso_8859_1") == 0 ||
strcmp(lower, "iso8859_1") == 0) {
return _PyUnicode_AsLatin1String(unicode, errors);
}
}
} }
/* Encode via the codec registry */ /* Encode via the codec registry */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment