Kaydet (Commit) 6bd525b6 authored tarafından Victor Stinner's avatar Victor Stinner

Optimize error handlers of ASCII and Latin1 encoders when the replacement
string is pure ASCII: use _PyBytesWriter_WriteBytes(), don't check individual
characters.

Cleanup unicode_encode_ucs1():

* Rename repunicode to rep
* Clear rep object on error
* Factor out the code shared between the bytes and unicode paths
üst ce179bf6
...@@ -311,7 +311,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode, ...@@ -311,7 +311,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
#if STRINGLIB_SIZEOF_CHAR > 1 #if STRINGLIB_SIZEOF_CHAR > 1
else if (Py_UNICODE_IS_SURROGATE(ch)) { else if (Py_UNICODE_IS_SURROGATE(ch)) {
Py_ssize_t startpos, endpos, newpos; Py_ssize_t startpos, endpos, newpos;
Py_ssize_t repsize, k; Py_ssize_t k;
if (error_handler == _Py_ERROR_UNKNOWN) if (error_handler == _Py_ERROR_UNKNOWN)
error_handler = get_error_handler(errors); error_handler = get_error_handler(errors);
...@@ -392,20 +392,12 @@ STRINGLIB(utf8_encoder)(PyObject *unicode, ...@@ -392,20 +392,12 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
p = _PyBytesWriter_WriteBytes(&writer, p, p = _PyBytesWriter_WriteBytes(&writer, p,
PyBytes_AS_STRING(rep), PyBytes_AS_STRING(rep),
PyBytes_GET_SIZE(rep)); PyBytes_GET_SIZE(rep));
if (p == NULL)
goto error;
} }
else { else {
/* rep is unicode */ /* rep is unicode */
if (PyUnicode_READY(rep) < 0) if (PyUnicode_READY(rep) < 0)
goto error; goto error;
repsize = PyUnicode_GET_LENGTH(rep);
p = _PyBytesWriter_Prepare(&writer, p, repsize);
if (p == NULL)
goto error;
if (!PyUnicode_IS_ASCII(rep)) { if (!PyUnicode_IS_ASCII(rep)) {
raise_encode_exception(&exc, "utf-8", raise_encode_exception(&exc, "utf-8",
unicode, unicode,
...@@ -415,9 +407,13 @@ STRINGLIB(utf8_encoder)(PyObject *unicode, ...@@ -415,9 +407,13 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
} }
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
memcpy(p, PyUnicode_DATA(rep), repsize); p = _PyBytesWriter_WriteBytes(&writer, p,
p += repsize; PyUnicode_DATA(rep),
PyUnicode_GET_LENGTH(rep));
} }
if (p == NULL)
goto error;
Py_CLEAR(rep); Py_CLEAR(rep);
i = newpos; i = newpos;
......
...@@ -6599,6 +6599,7 @@ unicode_encode_ucs1(PyObject *unicode, ...@@ -6599,6 +6599,7 @@ unicode_encode_ucs1(PyObject *unicode,
PyObject *error_handler_obj = NULL; PyObject *error_handler_obj = NULL;
PyObject *exc = NULL; PyObject *exc = NULL;
_Py_error_handler error_handler = _Py_ERROR_UNKNOWN; _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
PyObject *rep = NULL;
/* output object */ /* output object */
_PyBytesWriter writer; _PyBytesWriter writer;
...@@ -6627,8 +6628,7 @@ unicode_encode_ucs1(PyObject *unicode, ...@@ -6627,8 +6628,7 @@ unicode_encode_ucs1(PyObject *unicode,
++pos; ++pos;
} }
else { else {
PyObject *repunicode; Py_ssize_t newpos, i;
Py_ssize_t repsize, newpos, i;
/* startpos for collecting unencodable chars */ /* startpos for collecting unencodable chars */
Py_ssize_t collstart = pos; Py_ssize_t collstart = pos;
Py_ssize_t collend = collstart + 1; Py_ssize_t collend = collstart + 1;
...@@ -6694,52 +6694,59 @@ unicode_encode_ucs1(PyObject *unicode, ...@@ -6694,52 +6694,59 @@ unicode_encode_ucs1(PyObject *unicode,
/* fallback to general error handling */ /* fallback to general error handling */
default: default:
repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj, rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
encoding, reason, unicode, &exc, encoding, reason, unicode, &exc,
collstart, collend, &newpos); collstart, collend, &newpos);
if (repunicode == NULL || (PyUnicode_Check(repunicode) && if (rep == NULL)
PyUnicode_READY(repunicode) == -1))
goto onError; goto onError;
/* substract preallocated bytes */ /* substract preallocated bytes */
writer.min_size -= 1; writer.min_size -= 1;
if (PyBytes_Check(repunicode)) { if (PyBytes_Check(rep)) {
/* Directly copy bytes result to output. */ /* Directly copy bytes result to output. */
str = _PyBytesWriter_WriteBytes(&writer, str, str = _PyBytesWriter_WriteBytes(&writer, str,
PyBytes_AS_STRING(repunicode), PyBytes_AS_STRING(rep),
PyBytes_GET_SIZE(repunicode)); PyBytes_GET_SIZE(rep));
if (str == NULL) if (str == NULL)
goto onError; goto onError;
pos = newpos;
Py_DECREF(repunicode);
break;
} }
else {
assert(PyUnicode_Check(rep));
/* need more space? (at least enough for what we if (PyUnicode_READY(rep) < 0)
have+the replacement+the rest of the string, so goto onError;
we won't have to check space for encodable characters) */
repsize = PyUnicode_GET_LENGTH(repunicode); if (PyUnicode_IS_ASCII(rep)) {
/* Fast path: all characters are smaller than limit */
assert(limit >= 128);
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
str = _PyBytesWriter_WriteBytes(&writer, str,
PyUnicode_DATA(rep),
PyUnicode_GET_LENGTH(rep));
}
else {
Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
str = _PyBytesWriter_Prepare(&writer, str, repsize); str = _PyBytesWriter_Prepare(&writer, str, repsize);
if (str == NULL) if (str == NULL)
goto onError; goto onError;
/* check if there is anything unencodable in the replacement /* check if there is anything unencodable in the
and copy it to the output */ replacement and copy it to the output */
for (i = 0; repsize-->0; ++i, ++str) { for (i = 0; repsize-->0; ++i, ++str) {
ch = PyUnicode_READ_CHAR(repunicode, i); ch = PyUnicode_READ_CHAR(rep, i);
if (ch >= limit) { if (ch >= limit) {
raise_encode_exception(&exc, encoding, unicode, raise_encode_exception(&exc, encoding, unicode,
pos, pos+1, reason); pos, pos+1, reason);
Py_DECREF(repunicode);
goto onError; goto onError;
} }
*str = (char)ch; *str = (char)ch;
} }
}
}
pos = newpos; pos = newpos;
Py_DECREF(repunicode); Py_CLEAR(rep);
} }
/* If overallocation was disabled, ensure that it was the last /* If overallocation was disabled, ensure that it was the last
...@@ -6753,6 +6760,7 @@ unicode_encode_ucs1(PyObject *unicode, ...@@ -6753,6 +6760,7 @@ unicode_encode_ucs1(PyObject *unicode,
return _PyBytesWriter_Finish(&writer, str); return _PyBytesWriter_Finish(&writer, str);
onError: onError:
Py_XDECREF(rep);
_PyBytesWriter_Dealloc(&writer); _PyBytesWriter_Dealloc(&writer);
Py_XDECREF(error_handler_obj); Py_XDECREF(error_handler_obj);
Py_XDECREF(exc); Py_XDECREF(exc);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment