Kaydet (Commit) 58cf607d authored tarafından Serhiy Storchaka's avatar Serhiy Storchaka

Issue #12892: The utf-16* and utf-32* codecs now reject (lone) surrogates.

The utf-16* and utf-32* encoders no longer allow surrogate code points
(U+D800-U+DFFF) to be encoded.
The utf-32* decoders no longer decode byte sequences that correspond to
surrogate code points.
The surrogatepass error handler now works with the utf-16* and utf-32* codecs.

Based on patches by Victor Stinner and Kang-Hao (Kenny) Lu.
üst a938bcfe
...@@ -365,18 +365,23 @@ and implemented by all standard Python codecs: ...@@ -365,18 +365,23 @@ and implemented by all standard Python codecs:
| | in :pep:`383`. | | | in :pep:`383`. |
+-------------------------+-----------------------------------------------+ +-------------------------+-----------------------------------------------+
In addition, the following error handlers are specific to a single codec: In addition, the following error handlers are specific to Unicode encoding
schemes:
+-------------------+---------+-------------------------------------------+ +-------------------+------------------------+-------------------------------------------+
| Value | Codec | Meaning | | Value | Codec | Meaning |
+===================+=========+===========================================+ +===================+========================+===========================================+
|``'surrogatepass'``| utf-8 | Allow encoding and decoding of surrogate | |``'surrogatepass'``| utf-8, utf-16, utf-32, | Allow encoding and decoding of surrogate |
| | | codes in UTF-8. | | | utf-16-be, utf-16-le, | codes in all the Unicode encoding schemes.|
+-------------------+---------+-------------------------------------------+ | | utf-32-be, utf-32-le | |
+-------------------+------------------------+-------------------------------------------+
.. versionadded:: 3.1 .. versionadded:: 3.1
The ``'surrogateescape'`` and ``'surrogatepass'`` error handlers. The ``'surrogateescape'`` and ``'surrogatepass'`` error handlers.
.. versionchanged:: 3.4
The ``'surrogatepass'`` error handler now works with the utf-16\* and utf-32\* codecs.
The set of allowed values can be extended via :meth:`register_error`. The set of allowed values can be extended via :meth:`register_error`.
...@@ -1167,6 +1172,12 @@ particular, the following variants typically exist: ...@@ -1167,6 +1172,12 @@ particular, the following variants typically exist:
| utf_8_sig | | all languages | | utf_8_sig | | all languages |
+-----------------+--------------------------------+--------------------------------+ +-----------------+--------------------------------+--------------------------------+
.. versionchanged:: 3.4
The utf-16\* and utf-32\* encoders no longer allow surrogate code points
(U+D800--U+DFFF) to be encoded. The utf-32\* decoders no longer decode
byte sequences that correspond to surrogate code points.
Python Specific Encodings Python Specific Encodings
------------------------- -------------------------
......
...@@ -253,6 +253,13 @@ Some smaller changes made to the core Python language are: ...@@ -253,6 +253,13 @@ Some smaller changes made to the core Python language are:
``__main__.__file__`` when a script has been executed directly using ``__main__.__file__`` when a script has been executed directly using
a relative path (Contributed by Brett Cannon in :issue:`18416`). a relative path (Contributed by Brett Cannon in :issue:`18416`).
* Now all the UTF-\* codecs (except UTF-7) reject surrogates during both
encoding and decoding unless the ``surrogatepass`` error handler is used,
with the exception of the UTF-16 decoder that accepts valid surrogate pairs,
and the UTF-16 encoder that produces them while encoding non-BMP characters.
Contributed by Victor Stinner, Kang-Hao (Kenny) Lu and Serhiy Storchaka in
:issue:`12892`.
New Modules New Modules
=========== ===========
......
...@@ -300,8 +300,46 @@ class ReadTest(MixInCheckStateHandling): ...@@ -300,8 +300,46 @@ class ReadTest(MixInCheckStateHandling):
self.assertEqual(reader.readline(), s5) self.assertEqual(reader.readline(), s5)
self.assertEqual(reader.readline(), "") self.assertEqual(reader.readline(), "")
ill_formed_sequence_replace = "\ufffd"
def test_lone_surrogates(self):
self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
"[\\udc80]".encode(self.encoding))
self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
"[&#56448;]".encode(self.encoding))
self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
"[]".encode(self.encoding))
self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
"[?]".encode(self.encoding))
bom = "".encode(self.encoding)
for before, after in [("\U00010fff", "A"), ("[", "]"),
("A", "\U00010fff")]:
before_sequence = before.encode(self.encoding)[len(bom):]
after_sequence = after.encode(self.encoding)[len(bom):]
test_string = before + "\uDC80" + after
test_sequence = (bom + before_sequence +
self.ill_formed_sequence + after_sequence)
self.assertRaises(UnicodeDecodeError, test_sequence.decode,
self.encoding)
self.assertEqual(test_string.encode(self.encoding,
"surrogatepass"),
test_sequence)
self.assertEqual(test_sequence.decode(self.encoding,
"surrogatepass"),
test_string)
self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
before + after)
self.assertEqual(test_sequence.decode(self.encoding, "replace"),
before + self.ill_formed_sequence_replace + after)
class UTF32Test(ReadTest, unittest.TestCase): class UTF32Test(ReadTest, unittest.TestCase):
encoding = "utf-32" encoding = "utf-32"
if sys.byteorder == 'little':
ill_formed_sequence = b"\x80\xdc\x00\x00"
else:
ill_formed_sequence = b"\x00\x00\xdc\x80"
spamle = (b'\xff\xfe\x00\x00' spamle = (b'\xff\xfe\x00\x00'
b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00' b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
...@@ -393,6 +431,7 @@ class UTF32Test(ReadTest, unittest.TestCase): ...@@ -393,6 +431,7 @@ class UTF32Test(ReadTest, unittest.TestCase):
class UTF32LETest(ReadTest, unittest.TestCase): class UTF32LETest(ReadTest, unittest.TestCase):
encoding = "utf-32-le" encoding = "utf-32-le"
ill_formed_sequence = b"\x80\xdc\x00\x00"
def test_partial(self): def test_partial(self):
self.check_partial( self.check_partial(
...@@ -437,6 +476,7 @@ class UTF32LETest(ReadTest, unittest.TestCase): ...@@ -437,6 +476,7 @@ class UTF32LETest(ReadTest, unittest.TestCase):
class UTF32BETest(ReadTest, unittest.TestCase): class UTF32BETest(ReadTest, unittest.TestCase):
encoding = "utf-32-be" encoding = "utf-32-be"
ill_formed_sequence = b"\x00\x00\xdc\x80"
def test_partial(self): def test_partial(self):
self.check_partial( self.check_partial(
...@@ -482,6 +522,10 @@ class UTF32BETest(ReadTest, unittest.TestCase): ...@@ -482,6 +522,10 @@ class UTF32BETest(ReadTest, unittest.TestCase):
class UTF16Test(ReadTest, unittest.TestCase): class UTF16Test(ReadTest, unittest.TestCase):
encoding = "utf-16" encoding = "utf-16"
if sys.byteorder == 'little':
ill_formed_sequence = b"\x80\xdc"
else:
ill_formed_sequence = b"\xdc\x80"
spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00' spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m' spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'
...@@ -562,6 +606,7 @@ class UTF16Test(ReadTest, unittest.TestCase): ...@@ -562,6 +606,7 @@ class UTF16Test(ReadTest, unittest.TestCase):
class UTF16LETest(ReadTest, unittest.TestCase): class UTF16LETest(ReadTest, unittest.TestCase):
encoding = "utf-16-le" encoding = "utf-16-le"
ill_formed_sequence = b"\x80\xdc"
def test_partial(self): def test_partial(self):
self.check_partial( self.check_partial(
...@@ -605,6 +650,7 @@ class UTF16LETest(ReadTest, unittest.TestCase): ...@@ -605,6 +650,7 @@ class UTF16LETest(ReadTest, unittest.TestCase):
class UTF16BETest(ReadTest, unittest.TestCase): class UTF16BETest(ReadTest, unittest.TestCase):
encoding = "utf-16-be" encoding = "utf-16-be"
ill_formed_sequence = b"\xdc\x80"
def test_partial(self): def test_partial(self):
self.check_partial( self.check_partial(
...@@ -648,6 +694,8 @@ class UTF16BETest(ReadTest, unittest.TestCase): ...@@ -648,6 +694,8 @@ class UTF16BETest(ReadTest, unittest.TestCase):
class UTF8Test(ReadTest, unittest.TestCase): class UTF8Test(ReadTest, unittest.TestCase):
encoding = "utf-8" encoding = "utf-8"
ill_formed_sequence = b"\xed\xb2\x80"
ill_formed_sequence_replace = "\ufffd" * 3
def test_partial(self): def test_partial(self):
self.check_partial( self.check_partial(
...@@ -677,18 +725,11 @@ class UTF8Test(ReadTest, unittest.TestCase): ...@@ -677,18 +725,11 @@ class UTF8Test(ReadTest, unittest.TestCase):
u, u.encode(self.encoding)) u, u.encode(self.encoding))
def test_lone_surrogates(self): def test_lone_surrogates(self):
self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8") super().test_lone_surrogates()
self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8") # not sure if this is making sense for
self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"), # UTF-16 and UTF-32
b'[\\udc80]') self.assertEqual("[\uDC80]".encode('utf-8', "surrogateescape"),
self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
b'[&#56448;]')
self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
b'[\x80]') b'[\x80]')
self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
b'[]')
self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
b'[?]')
def test_surrogatepass_handler(self): def test_surrogatepass_handler(self):
self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"), self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
...@@ -851,6 +892,9 @@ class UTF7Test(ReadTest, unittest.TestCase): ...@@ -851,6 +892,9 @@ class UTF7Test(ReadTest, unittest.TestCase):
self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-') self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0') self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
test_lone_surrogates = None
class UTF16ExTest(unittest.TestCase): class UTF16ExTest(unittest.TestCase):
def test_errors(self): def test_errors(self):
...@@ -875,7 +919,7 @@ class ReadBufferTest(unittest.TestCase): ...@@ -875,7 +919,7 @@ class ReadBufferTest(unittest.TestCase):
self.assertRaises(TypeError, codecs.readbuffer_encode) self.assertRaises(TypeError, codecs.readbuffer_encode)
self.assertRaises(TypeError, codecs.readbuffer_encode, 42) self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
class UTF8SigTest(ReadTest, unittest.TestCase): class UTF8SigTest(UTF8Test, unittest.TestCase):
encoding = "utf-8-sig" encoding = "utf-8-sig"
def test_partial(self): def test_partial(self):
......
...@@ -783,6 +783,7 @@ Ned Jackson Lovely ...@@ -783,6 +783,7 @@ Ned Jackson Lovely
Jason Lowe Jason Lowe
Tony Lownds Tony Lownds
Ray Loyzaga Ray Loyzaga
Kang-Hao (Kenny) Lu
Lukas Lueg Lukas Lueg
Loren Luke Loren Luke
Fredrik Lundh Fredrik Lundh
......
...@@ -10,6 +10,12 @@ Projected release date: 2013-11-24 ...@@ -10,6 +10,12 @@ Projected release date: 2013-11-24
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #12892: The utf-16* and utf-32* encoders no longer allow surrogate code
points (U+D800-U+DFFF) to be encoded. The utf-32* decoders no longer decode
byte sequences that correspond to surrogate code points. The surrogatepass
error handler now works with the utf-16* and utf-32* codecs. Based on
patches by Victor Stinner and Kang-Hao (Kenny) Lu.
- Issue #17806: Added keyword-argument support for "tabsize" to - Issue #17806: Added keyword-argument support for "tabsize" to
str/bytes.expandtabs(). str/bytes.expandtabs().
......
...@@ -596,66 +596,232 @@ IllegalSurrogate: ...@@ -596,66 +596,232 @@ IllegalSurrogate:
#undef SWAB #undef SWAB
Py_LOCAL_INLINE(void) #if STRINGLIB_MAX_CHAR >= 0x80
STRINGLIB(utf16_encode)(unsigned short *out, Py_LOCAL_INLINE(Py_ssize_t)
const STRINGLIB_CHAR *in, STRINGLIB(utf16_encode_)(const STRINGLIB_CHAR *in,
Py_ssize_t len, Py_ssize_t len,
unsigned short **outptr,
int native_ordering) int native_ordering)
{ {
unsigned short *out = *outptr;
const STRINGLIB_CHAR *end = in + len; const STRINGLIB_CHAR *end = in + len;
#if STRINGLIB_SIZEOF_CHAR == 1 #if STRINGLIB_SIZEOF_CHAR == 1
# define SWAB2(CH) ((CH) << 8) # define SWAB2(CH) ((CH) << 8)
#else #else
# define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) # define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
#endif #endif
#if STRINGLIB_MAX_CHAR < 0x10000
if (native_ordering) { if (native_ordering) {
# if STRINGLIB_SIZEOF_CHAR == 2 #if STRINGLIB_MAX_CHAR < 0x10000
Py_MEMCPY(out, in, 2 * len); const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
# else while (in < unrolled_end) {
_PyUnicode_CONVERT_BYTES(STRINGLIB_CHAR, unsigned short, in, end, out); # if STRINGLIB_MAX_CHAR >= 0xd800
if (((in[0] ^ 0xd800) &
(in[1] ^ 0xd800) &
(in[2] ^ 0xd800) &
(in[3] ^ 0xd800) & 0xf800) == 0)
break;
# endif # endif
out[0] = in[0];
out[1] = in[1];
out[2] = in[2];
out[3] = in[3];
in += 4; out += 4;
}
#endif
while (in < end) {
Py_UCS4 ch;
ch = *in++;
#if STRINGLIB_MAX_CHAR >= 0xd800
if (ch < 0xd800)
*out++ = ch;
else if (ch < 0xe000)
/* reject surrogate characters (U+D800-U+DFFF) */
goto fail;
# if STRINGLIB_MAX_CHAR >= 0x10000
else if (ch >= 0x10000) {
out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
out[1] = Py_UNICODE_LOW_SURROGATE(ch);
out += 2;
}
# endif
else
#endif
*out++ = ch;
}
} else { } else {
#if STRINGLIB_MAX_CHAR < 0x10000
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
while (in < unrolled_end) { while (in < unrolled_end) {
# if STRINGLIB_MAX_CHAR >= 0xd800
if (((in[0] ^ 0xd800) &
(in[1] ^ 0xd800) &
(in[2] ^ 0xd800) &
(in[3] ^ 0xd800) & 0xf800) == 0)
break;
# endif
out[0] = SWAB2(in[0]); out[0] = SWAB2(in[0]);
out[1] = SWAB2(in[1]); out[1] = SWAB2(in[1]);
out[2] = SWAB2(in[2]); out[2] = SWAB2(in[2]);
out[3] = SWAB2(in[3]); out[3] = SWAB2(in[3]);
in += 4; out += 4; in += 4; out += 4;
} }
#endif
while (in < end) { while (in < end) {
*out++ = SWAB2(*in); Py_UCS4 ch = *in++;
++in; #if STRINGLIB_MAX_CHAR >= 0xd800
if (ch < 0xd800)
*out++ = SWAB2((Py_UCS2)ch);
else if (ch < 0xe000)
/* reject surrogate characters (U+D800-U+DFFF) */
goto fail;
# if STRINGLIB_MAX_CHAR >= 0x10000
else if (ch >= 0x10000) {
Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
out[0] = SWAB2(ch1);
out[1] = SWAB2(ch2);
out += 2;
}
# endif
else
#endif
*out++ = SWAB2((Py_UCS2)ch);
} }
} }
#else *outptr = out;
return len;
#if STRINGLIB_MAX_CHAR >= 0xd800
fail:
#endif
*outptr = out;
return len - (end - in + 1);
}
#endif
#undef SWAB2
#if STRINGLIB_MAX_CHAR >= 0x80
Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
Py_ssize_t len,
unsigned short **outptr,
int native_ordering)
{
unsigned short *out = *outptr;
const STRINGLIB_CHAR *end = in + len;
#if STRINGLIB_SIZEOF_CHAR == 1
if (native_ordering) { if (native_ordering) {
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
while (in < unrolled_end) {
out[0] = in[0];
out[1] = in[1];
out[2] = in[2];
out[3] = in[3];
in += 4; out += 4;
}
while (in < end) {
*out++ = *in++;
}
} else {
# define SWAB2(CH) ((CH) << 8) /* high byte is zero */
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
while (in < unrolled_end) {
out[0] = SWAB2(in[0]);
out[1] = SWAB2(in[1]);
out[2] = SWAB2(in[2]);
out[3] = SWAB2(in[3]);
in += 4; out += 4;
}
while (in < end) { while (in < end) {
Py_UCS4 ch = *in++; Py_UCS4 ch = *in++;
if (ch < 0x10000) *out++ = SWAB2((Py_UCS2)ch);
}
#undef SWAB2
}
*outptr = out;
return len;
#else
if (native_ordering) {
#if STRINGLIB_MAX_CHAR < 0x10000
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
while (in < unrolled_end) {
/* check if any character is a surrogate character */
if (((in[0] ^ 0xd800) &
(in[1] ^ 0xd800) &
(in[2] ^ 0xd800) &
(in[3] ^ 0xd800) & 0xf800) == 0)
break;
out[0] = in[0];
out[1] = in[1];
out[2] = in[2];
out[3] = in[3];
in += 4; out += 4;
}
#endif
while (in < end) {
Py_UCS4 ch;
ch = *in++;
if (ch < 0xd800)
*out++ = ch; *out++ = ch;
else { else if (ch < 0xe000)
/* reject surrogate characters (U+D800-U+DFFF) */
goto fail;
#if STRINGLIB_MAX_CHAR >= 0x10000
else if (ch >= 0x10000) {
out[0] = Py_UNICODE_HIGH_SURROGATE(ch); out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
out[1] = Py_UNICODE_LOW_SURROGATE(ch); out[1] = Py_UNICODE_LOW_SURROGATE(ch);
out += 2; out += 2;
} }
#endif
else
*out++ = ch;
} }
} else { } else {
#define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
#if STRINGLIB_MAX_CHAR < 0x10000
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
while (in < unrolled_end) {
/* check if any character is a surrogate character */
if (((in[0] ^ 0xd800) &
(in[1] ^ 0xd800) &
(in[2] ^ 0xd800) &
(in[3] ^ 0xd800) & 0xf800) == 0)
break;
out[0] = SWAB2(in[0]);
out[1] = SWAB2(in[1]);
out[2] = SWAB2(in[2]);
out[3] = SWAB2(in[3]);
in += 4; out += 4;
}
#endif
while (in < end) { while (in < end) {
Py_UCS4 ch = *in++; Py_UCS4 ch = *in++;
if (ch < 0x10000) if (ch < 0xd800)
*out++ = SWAB2((Py_UCS2)ch); *out++ = SWAB2((Py_UCS2)ch);
else { else if (ch < 0xe000)
/* reject surrogate characters (U+D800-U+DFFF) */
goto fail;
#if STRINGLIB_MAX_CHAR >= 0x10000
else if (ch >= 0x10000) {
Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
out[0] = SWAB2(ch1); out[0] = SWAB2(ch1);
out[1] = SWAB2(ch2); out[1] = SWAB2(ch2);
out += 2; out += 2;
} }
#endif
else
*out++ = SWAB2((Py_UCS2)ch);
} }
#undef SWAB2
} }
*outptr = out;
return len;
fail:
*outptr = out;
return len - (end - in + 1);
#endif #endif
#undef SWAB2
} }
#endif
#endif /* STRINGLIB_IS_UNICODE */ #endif /* STRINGLIB_IS_UNICODE */
This diff is collapsed.
...@@ -753,6 +753,65 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) ...@@ -753,6 +753,65 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
} }
} }
/* Codes returned by get_standard_encoding() to identify the encoding
   scheme an error handler has to produce or consume. */
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Map an encoding name to one of the ENC_* codes.

   Recognised spellings are "utf", an optional '-' or '_', then "16" or
   "32", then an optional byte-order suffix "be" or "le" (itself optionally
   preceded by '-' or '_').  The "utf" prefix and the suffix are matched
   case-insensitively.  Without a suffix the native byte order is chosen
   according to WORDS_BIGENDIAN.

   encoding must be a NUL-terminated string.  *bytelength is set to the
   number of bytes one surrogate code point occupies in the returned
   encoding: 2 for UTF-16, 4 for UTF-32 and 3 for UTF-8.

   Every name that is not a recognised UTF-16/UTF-32 spelling is reported
   as ENC_UTF8. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_')
            encoding++;
        if (encoding[0] == '1' && encoding[1] == '6') {
            encoding += 2;
            *bytelength = 2;
            if (*encoding == '\0') {
                /* plain "utf-16": native byte order */
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_')
                encoding++;
            /* Test encoding[0] first: after skipping a trailing separator
               (e.g. "utf-16-") it can be the terminator, in which case
               encoding[1] lies past the end of the string. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF16BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF16LE;
            }
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            encoding += 2;
            *bytelength = 4;
            if (*encoding == '\0') {
                /* plain "utf-32": native byte order */
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_')
                encoding++;
            /* Same guard as for UTF-16: never read past the terminator. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF32BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF32LE;
            }
        }
    }
    /* Fallback: utf-8.  This also overrides a bytelength set above for a
       name that started like UTF-16/32 but had an unrecognised suffix. */
    *bytelength = 3;
    return ENC_UTF8;
}
/* This handler is declared static until someone demonstrates /* This handler is declared static until someone demonstrates
a need to call it directly. */ a need to call it directly. */
static PyObject * static PyObject *
...@@ -760,24 +819,40 @@ PyCodec_SurrogatePassErrors(PyObject *exc) ...@@ -760,24 +819,40 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
{ {
PyObject *restuple; PyObject *restuple;
PyObject *object; PyObject *object;
PyObject *encode;
char *encoding;
int code;
int bytelength;
Py_ssize_t i; Py_ssize_t i;
Py_ssize_t start; Py_ssize_t start;
Py_ssize_t end; Py_ssize_t end;
PyObject *res; PyObject *res;
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
char *outp; unsigned char *outp;
if (PyUnicodeEncodeError_GetStart(exc, &start)) if (PyUnicodeEncodeError_GetStart(exc, &start))
return NULL; return NULL;
if (PyUnicodeEncodeError_GetEnd(exc, &end)) if (PyUnicodeEncodeError_GetEnd(exc, &end))
return NULL; return NULL;
if (!(object = PyUnicodeEncodeError_GetObject(exc))) if (!(object = PyUnicodeEncodeError_GetObject(exc)))
return NULL; return NULL;
res = PyBytes_FromStringAndSize(NULL, 3*(end-start)); if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
Py_DECREF(object);
return NULL;
}
if (!(encoding = PyUnicode_AsUTF8(encode))) {
Py_DECREF(object);
Py_DECREF(encode);
return NULL;
}
code = get_standard_encoding(encoding, &bytelength);
Py_DECREF(encode);
res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
if (!res) { if (!res) {
Py_DECREF(object); Py_DECREF(object);
return NULL; return NULL;
} }
outp = PyBytes_AsString(res); outp = (unsigned char*)PyBytes_AsString(res);
for (i = start; i < end; i++) { for (i = start; i < end; i++) {
/* object is guaranteed to be "ready" */ /* object is guaranteed to be "ready" */
Py_UCS4 ch = PyUnicode_READ_CHAR(object, i); Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
...@@ -788,9 +863,33 @@ PyCodec_SurrogatePassErrors(PyObject *exc) ...@@ -788,9 +863,33 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
Py_DECREF(object); Py_DECREF(object);
return NULL; return NULL;
} }
*outp++ = (char)(0xe0 | (ch >> 12)); switch (code) {
*outp++ = (char)(0x80 | ((ch >> 6) & 0x3f)); case ENC_UTF8:
*outp++ = (char)(0x80 | (ch & 0x3f)); *outp++ = (unsigned char)(0xe0 | (ch >> 12));
*outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
*outp++ = (unsigned char)(0x80 | (ch & 0x3f));
break;
case ENC_UTF16LE:
*outp++ = (unsigned char) ch;
*outp++ = (unsigned char)(ch >> 8);
break;
case ENC_UTF16BE:
*outp++ = (unsigned char)(ch >> 8);
*outp++ = (unsigned char) ch;
break;
case ENC_UTF32LE:
*outp++ = (unsigned char) ch;
*outp++ = (unsigned char)(ch >> 8);
*outp++ = (unsigned char)(ch >> 16);
*outp++ = (unsigned char)(ch >> 24);
break;
case ENC_UTF32BE:
*outp++ = (unsigned char)(ch >> 24);
*outp++ = (unsigned char)(ch >> 16);
*outp++ = (unsigned char)(ch >> 8);
*outp++ = (unsigned char) ch;
break;
}
} }
restuple = Py_BuildValue("(On)", res, end); restuple = Py_BuildValue("(On)", res, end);
Py_DECREF(res); Py_DECREF(res);
...@@ -802,34 +901,64 @@ PyCodec_SurrogatePassErrors(PyObject *exc) ...@@ -802,34 +901,64 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
Py_UCS4 ch = 0; Py_UCS4 ch = 0;
if (PyUnicodeDecodeError_GetStart(exc, &start)) if (PyUnicodeDecodeError_GetStart(exc, &start))
return NULL; return NULL;
if (PyUnicodeDecodeError_GetEnd(exc, &end))
return NULL;
if (!(object = PyUnicodeDecodeError_GetObject(exc))) if (!(object = PyUnicodeDecodeError_GetObject(exc)))
return NULL; return NULL;
if (!(p = (unsigned char*)PyBytes_AsString(object))) { if (!(p = (unsigned char*)PyBytes_AsString(object))) {
Py_DECREF(object); Py_DECREF(object);
return NULL; return NULL;
} }
if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
Py_DECREF(object);
return NULL;
}
if (!(encoding = PyUnicode_AsUTF8(encode))) {
Py_DECREF(object);
Py_DECREF(encode);
return NULL;
}
code = get_standard_encoding(encoding, &bytelength);
Py_DECREF(encode);
/* Try decoding a single surrogate character. If /* Try decoding a single surrogate character. If
there are more, let the codec call us again. */ there are more, let the codec call us again. */
p += start; p += start;
if (PyBytes_GET_SIZE(object) - start >= 3 && if (PyBytes_GET_SIZE(object) - start >= bytelength) {
(p[0] & 0xf0) == 0xe0 && switch (code) {
(p[1] & 0xc0) == 0x80 && case ENC_UTF8:
(p[2] & 0xc0) == 0x80) { if ((p[0] & 0xf0) == 0xe0 &&
/* it's a three-byte code */ (p[1] & 0xc0) == 0x80 &&
ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f); (p[2] & 0xc0) == 0x80) {
if (!Py_UNICODE_IS_SURROGATE(ch)) /* it's a three-byte code */
/* it's not a surrogate - fail */ ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
ch = 0; }
break;
case ENC_UTF16LE:
ch = p[1] << 8 | p[0];
break;
case ENC_UTF16BE:
ch = p[0] << 8 | p[1];
break;
case ENC_UTF32LE:
ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
break;
case ENC_UTF32BE:
ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
break;
}
} }
Py_DECREF(object); Py_DECREF(object);
if (ch == 0) { if (!Py_UNICODE_IS_SURROGATE(ch)) {
/* it's not a surrogate - fail */
PyErr_SetObject(PyExceptionInstance_Class(exc), exc); PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
return NULL; return NULL;
} }
res = PyUnicode_FromOrdinal(ch); res = PyUnicode_FromOrdinal(ch);
if (res == NULL) if (res == NULL)
return NULL; return NULL;
return Py_BuildValue("(Nn)", res, start+3); return Py_BuildValue("(Nn)", res, start + bytelength);
} }
else { else {
wrong_exception_type(exc); wrong_exception_type(exc);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment