Kaydet (Commit) 3d325191 authored tarafından Martin v. Löwis's avatar Martin v. Löwis

Port code page codec to Unicode API.

üst 8ba79306
...@@ -577,22 +577,18 @@ class CodecCallbackTest(unittest.TestCase): ...@@ -577,22 +577,18 @@ class CodecCallbackTest(unittest.TestCase):
UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")), UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")),
("\\uffff", 1) ("\\uffff", 1)
) )
if SIZEOF_WCHAR_T == 2:
len_wide = 2
else:
len_wide = 1
if SIZEOF_WCHAR_T > 0: if SIZEOF_WCHAR_T > 0:
self.assertEqual( self.assertEqual(
codecs.backslashreplace_errors( codecs.backslashreplace_errors(
UnicodeEncodeError("ascii", "\U00010000", UnicodeEncodeError("ascii", "\U00010000",
0, len_wide, "ouch")), 0, 1, "ouch")),
("\\U00010000", len_wide) ("\\U00010000", 1)
) )
self.assertEqual( self.assertEqual(
codecs.backslashreplace_errors( codecs.backslashreplace_errors(
UnicodeEncodeError("ascii", "\U0010ffff", UnicodeEncodeError("ascii", "\U0010ffff",
0, len_wide, "ouch")), 0, 1, "ouch")),
("\\U0010ffff", len_wide) ("\\U0010ffff", 1)
) )
# Lone surrogates (regardless of unicode width) # Lone surrogates (regardless of unicode width)
self.assertEqual( self.assertEqual(
......
...@@ -4680,9 +4680,6 @@ _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) ...@@ -4680,9 +4680,6 @@ _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
int kind; int kind;
void *data; void *data;
Py_ssize_t size; Py_ssize_t size;
#if SIZEOF_WCHAR_T == 2
Py_ssize_t wchar_offset = 0;
#endif
if (!PyUnicode_Check(unicode)) { if (!PyUnicode_Check(unicode)) {
PyErr_BadArgument(); PyErr_BadArgument();
...@@ -4738,9 +4735,6 @@ _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) ...@@ -4738,9 +4735,6 @@ _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
PyObject *rep; PyObject *rep;
Py_ssize_t repsize, k, startpos; Py_ssize_t repsize, k, startpos;
startpos = i-1; startpos = i-1;
#if SIZEOF_WCHAR_T == 2
startpos += wchar_offset;
#endif
rep = unicode_encode_call_errorhandler( rep = unicode_encode_call_errorhandler(
errors, &errorHandler, "utf-8", "surrogates not allowed", errors, &errorHandler, "utf-8", "surrogates not allowed",
unicode, &exc, startpos, startpos+1, &newpos); unicode, &exc, startpos, startpos+1, &newpos);
...@@ -4809,9 +4803,6 @@ _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) ...@@ -4809,9 +4803,6 @@ _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
*p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
*p++ = (char)(0x80 | (ch & 0x3f)); *p++ = (char)(0x80 | (ch & 0x3f));
#if SIZEOF_WCHAR_T == 2
wchar_offset++;
#endif
} }
} }
...@@ -7315,23 +7306,37 @@ encode_code_page_flags(UINT code_page, const char *errors) ...@@ -7315,23 +7306,37 @@ encode_code_page_flags(UINT code_page, const char *errors)
*/ */
static int static int
encode_code_page_strict(UINT code_page, PyObject **outbytes, encode_code_page_strict(UINT code_page, PyObject **outbytes,
const Py_UNICODE *p, const int size, PyObject *unicode, Py_ssize_t offset, int len,
const char* errors) const char* errors)
{ {
BOOL usedDefaultChar = FALSE; BOOL usedDefaultChar = FALSE;
BOOL *pusedDefaultChar = &usedDefaultChar; BOOL *pusedDefaultChar = &usedDefaultChar;
int outsize; int outsize;
PyObject *exc = NULL; PyObject *exc = NULL;
Py_UNICODE *p;
Py_ssize_t size;
const DWORD flags = encode_code_page_flags(code_page, NULL); const DWORD flags = encode_code_page_flags(code_page, NULL);
char *out; char *out;
/* Create a substring so that we can get the UTF-16 representation
of just the slice under consideration. */
PyObject *substring;
assert(size > 0); assert(len > 0);
if (code_page != CP_UTF8 && code_page != CP_UTF7) if (code_page != CP_UTF8 && code_page != CP_UTF7)
pusedDefaultChar = &usedDefaultChar; pusedDefaultChar = &usedDefaultChar;
else else
pusedDefaultChar = NULL; pusedDefaultChar = NULL;
substring = PyUnicode_Substring(unicode, offset, offset+len);
if (substring == NULL)
return -1;
p = PyUnicode_AsUnicodeAndSize(substring, &size);
if (p == NULL) {
Py_DECREF(substring);
return -1;
}
/* First get the size of the result */ /* First get the size of the result */
outsize = WideCharToMultiByte(code_page, flags, outsize = WideCharToMultiByte(code_page, flags,
p, size, p, size,
...@@ -7340,14 +7345,18 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes, ...@@ -7340,14 +7345,18 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes,
if (outsize <= 0) if (outsize <= 0)
goto error; goto error;
/* If we used a default char, then we failed! */ /* If we used a default char, then we failed! */
if (pusedDefaultChar && *pusedDefaultChar) if (pusedDefaultChar && *pusedDefaultChar) {
Py_DECREF(substring);
return -2; return -2;
}
if (*outbytes == NULL) { if (*outbytes == NULL) {
/* Create string object */ /* Create string object */
*outbytes = PyBytes_FromStringAndSize(NULL, outsize); *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
if (*outbytes == NULL) if (*outbytes == NULL) {
Py_DECREF(substring);
return -1; return -1;
}
out = PyBytes_AS_STRING(*outbytes); out = PyBytes_AS_STRING(*outbytes);
} }
else { else {
...@@ -7355,10 +7364,13 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes, ...@@ -7355,10 +7364,13 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes,
const Py_ssize_t n = PyBytes_Size(*outbytes); const Py_ssize_t n = PyBytes_Size(*outbytes);
if (outsize > PY_SSIZE_T_MAX - n) { if (outsize > PY_SSIZE_T_MAX - n) {
PyErr_NoMemory(); PyErr_NoMemory();
Py_DECREF(substring);
return -1; return -1;
} }
if (_PyBytes_Resize(outbytes, n + outsize) < 0) if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
Py_DECREF(substring);
return -1; return -1;
}
out = PyBytes_AS_STRING(*outbytes) + n; out = PyBytes_AS_STRING(*outbytes) + n;
} }
...@@ -7367,6 +7379,7 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes, ...@@ -7367,6 +7379,7 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes,
p, size, p, size,
out, outsize, out, outsize,
NULL, pusedDefaultChar); NULL, pusedDefaultChar);
Py_CLEAR(substring);
if (outsize <= 0) if (outsize <= 0)
goto error; goto error;
if (pusedDefaultChar && *pusedDefaultChar) if (pusedDefaultChar && *pusedDefaultChar)
...@@ -7374,6 +7387,7 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes, ...@@ -7374,6 +7387,7 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes,
return 0; return 0;
error: error:
Py_XDECREF(substring);
if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
return -2; return -2;
PyErr_SetFromWindowsErr(0); PyErr_SetFromWindowsErr(0);
...@@ -7390,12 +7404,11 @@ error: ...@@ -7390,12 +7404,11 @@ error:
static int static int
encode_code_page_errors(UINT code_page, PyObject **outbytes, encode_code_page_errors(UINT code_page, PyObject **outbytes,
PyObject *unicode, Py_ssize_t unicode_offset, PyObject *unicode, Py_ssize_t unicode_offset,
const Py_UNICODE *in, const int insize, Py_ssize_t insize, const char* errors)
const char* errors)
{ {
const DWORD flags = encode_code_page_flags(code_page, errors); const DWORD flags = encode_code_page_flags(code_page, errors);
const Py_UNICODE *startin = in; Py_ssize_t pos = unicode_offset;
const Py_UNICODE *endin = in + insize; Py_ssize_t endin = unicode_offset + insize;
/* Ideally, we should get reason from FormatMessage. This is the Windows /* Ideally, we should get reason from FormatMessage. This is the Windows
2000 English version of the message. */ 2000 English version of the message. */
const char *reason = "invalid character"; const char *reason = "invalid character";
...@@ -7404,12 +7417,11 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes, ...@@ -7404,12 +7417,11 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
BOOL usedDefaultChar = FALSE, *pusedDefaultChar; BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
Py_ssize_t outsize; Py_ssize_t outsize;
char *out; char *out;
int charsize;
PyObject *errorHandler = NULL; PyObject *errorHandler = NULL;
PyObject *exc = NULL; PyObject *exc = NULL;
PyObject *encoding_obj = NULL; PyObject *encoding_obj = NULL;
char *encoding; char *encoding;
Py_ssize_t startpos, newpos, newoutsize; Py_ssize_t newpos, newoutsize;
PyObject *rep; PyObject *rep;
int ret = -1; int ret = -1;
...@@ -7422,7 +7434,7 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes, ...@@ -7422,7 +7434,7 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
if (errors == NULL || strcmp(errors, "strict") == 0) { if (errors == NULL || strcmp(errors, "strict") == 0) {
/* The last error was ERROR_NO_UNICODE_TRANSLATION, /* The last error was ERROR_NO_UNICODE_TRANSLATION,
then we raise a UnicodeEncodeError. */ then we raise a UnicodeEncodeError. */
make_encode_exception(&exc, encoding, in, insize, 0, 0, reason); make_encode_exception_obj(&exc, encoding, unicode, 0, 0, reason);
if (exc != NULL) { if (exc != NULL) {
PyCodec_StrictErrors(exc); PyCodec_StrictErrors(exc);
Py_DECREF(exc); Py_DECREF(exc);
...@@ -7462,23 +7474,30 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes, ...@@ -7462,23 +7474,30 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
} }
/* Encode the string character per character */ /* Encode the string character per character */
while (in < endin) while (pos < endin)
{ {
if ((in + 2) <= endin Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
&& 0xD800 <= in[0] && in[0] <= 0xDBFF wchar_t chars[2];
&& 0xDC00 <= in[1] && in[1] <= 0xDFFF) int charsize;
charsize = 2; if (ch < 0x10000) {
else chars[0] = (wchar_t)ch;
charsize = 1; charsize = 1;
}
else {
ch -= 0x10000;
chars[0] = 0xd800 + (ch >> 10);
chars[1] = 0xdc00 + (ch & 0x3ff);
charsize = 2;
}
outsize = WideCharToMultiByte(code_page, flags, outsize = WideCharToMultiByte(code_page, flags,
in, charsize, chars, charsize,
buffer, Py_ARRAY_LENGTH(buffer), buffer, Py_ARRAY_LENGTH(buffer),
NULL, pusedDefaultChar); NULL, pusedDefaultChar);
if (outsize > 0) { if (outsize > 0) {
if (pusedDefaultChar == NULL || !(*pusedDefaultChar)) if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
{ {
in += charsize; pos++;
memcpy(out, buffer, outsize); memcpy(out, buffer, outsize);
out += outsize; out += outsize;
continue; continue;
...@@ -7489,15 +7508,13 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes, ...@@ -7489,15 +7508,13 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
goto error; goto error;
} }
charsize = Py_MAX(charsize - 1, 1);
startpos = unicode_offset + in - startin;
rep = unicode_encode_call_errorhandler( rep = unicode_encode_call_errorhandler(
errors, &errorHandler, encoding, reason, errors, &errorHandler, encoding, reason,
unicode, &exc, unicode, &exc,
startpos, startpos + charsize, &newpos); pos, pos + 1, &newpos);
if (rep == NULL) if (rep == NULL)
goto error; goto error;
in += (newpos - startpos); pos = newpos;
if (PyBytes_Check(rep)) { if (PyBytes_Check(rep)) {
outsize = PyBytes_GET_SIZE(rep); outsize = PyBytes_GET_SIZE(rep);
...@@ -7538,10 +7555,9 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes, ...@@ -7538,10 +7555,9 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
for (i=0; i < outsize; i++) { for (i=0; i < outsize; i++) {
Py_UCS4 ch = PyUnicode_READ(kind, data, i); Py_UCS4 ch = PyUnicode_READ(kind, data, i);
if (ch > 127) { if (ch > 127) {
raise_encode_exception(&exc, raise_encode_exception_obj(&exc,
encoding, encoding, unicode,
startin, insize, pos, pos + 1,
startpos, startpos + charsize,
"unable to encode error handler result to ASCII"); "unable to encode error handler result to ASCII");
Py_DECREF(rep); Py_DECREF(rep);
goto error; goto error;
...@@ -7572,55 +7588,54 @@ encode_code_page(int code_page, ...@@ -7572,55 +7588,54 @@ encode_code_page(int code_page,
PyObject *unicode, PyObject *unicode,
const char *errors) const char *errors)
{ {
const Py_UNICODE *p; Py_ssize_t len;
Py_ssize_t size;
PyObject *outbytes = NULL; PyObject *outbytes = NULL;
Py_ssize_t offset; Py_ssize_t offset;
int chunk_len, ret, done; int chunk_len, ret, done;
p = PyUnicode_AsUnicodeAndSize(unicode, &size); if (PyUnicode_READY(unicode) < 0)
if (p == NULL) return NULL;
return NULL; len = PyUnicode_GET_LENGTH(unicode);
if (code_page < 0) { if (code_page < 0) {
PyErr_SetString(PyExc_ValueError, "invalid code page number"); PyErr_SetString(PyExc_ValueError, "invalid code page number");
return NULL; return NULL;
} }
if (size == 0) if (len == 0)
return PyBytes_FromStringAndSize(NULL, 0); return PyBytes_FromStringAndSize(NULL, 0);
offset = 0; offset = 0;
do do
{ {
#ifdef NEED_RETRY #ifdef NEED_RETRY
if (size > INT_MAX) { /* UTF-16 encoding may double the size, so use only INT_MAX/2
chunk_len = INT_MAX; chunks. */
if (len > INT_MAX/2) {
chunk_len = INT_MAX/2;
done = 0; done = 0;
} }
else else
#endif #endif
{ {
chunk_len = (int)size; chunk_len = (int)len;
done = 1; done = 1;
} }
ret = encode_code_page_strict(code_page, &outbytes, ret = encode_code_page_strict(code_page, &outbytes,
p, chunk_len, unicode, offset, chunk_len,
errors); errors);
if (ret == -2) if (ret == -2)
ret = encode_code_page_errors(code_page, &outbytes, ret = encode_code_page_errors(code_page, &outbytes,
unicode, offset, unicode, offset,
p, chunk_len, chunk_len, errors);
errors);
if (ret < 0) { if (ret < 0) {
Py_XDECREF(outbytes); Py_XDECREF(outbytes);
return NULL; return NULL;
} }
p += chunk_len;
offset += chunk_len; offset += chunk_len;
size -= chunk_len; len -= chunk_len;
} while (!done); } while (!done);
return outbytes; return outbytes;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment