Commit b2e796aa, authored by Benjamin Peterson

In wide builds, avoid storing high Unicode characters from source code as surrogate pairs.

This is accomplished by decoding with utf-32 instead of utf-16 on all builds.
The patch is by Adam Olsen.
Parent: 7b1b094f
...@@ -36,6 +36,14 @@ class PEP263Test(unittest.TestCase): ...@@ -36,6 +36,14 @@ class PEP263Test(unittest.TestCase):
exec(c, d) exec(c, d)
self.assertEquals(d['\xc6'], '\xc6') self.assertEquals(d['\xc6'], '\xc6')
def test_issue3297(self):
c = compile("a, b = '\U0001010F', '\\U0001010F'", "dummy", "exec")
d = {}
exec(c, d)
self.assertEqual(d['a'], d['b'])
self.assertEqual(len(d['a']), len(d['b']))
self.assertEqual(ascii(d['a']), ascii(d['b']))
def test_main(): def test_main():
support.run_unittest(PEP263Test) support.run_unittest(PEP263Test)
......
...@@ -12,6 +12,9 @@ What's New in Python 3.2 Alpha 1? ...@@ -12,6 +12,9 @@ What's New in Python 3.2 Alpha 1?
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #3297: On wide unicode builds, do not split unicode characters into
surrogates.
- Remove length limitation when constructing a complex number from a string. - Remove length limitation when constructing a complex number from a string.
- Issue #1087418: Boost performance of bitwise operations for longs. - Issue #1087418: Boost performance of bitwise operations for longs.
......
...@@ -3246,10 +3246,11 @@ decode_unicode(struct compiling *c, const char *s, size_t len, int rawmode, cons ...@@ -3246,10 +3246,11 @@ decode_unicode(struct compiling *c, const char *s, size_t len, int rawmode, cons
u = NULL; u = NULL;
} else { } else {
/* check for integer overflow */ /* check for integer overflow */
if (len > PY_SIZE_MAX / 4) if (len > PY_SIZE_MAX / 6)
return NULL; return NULL;
/* "\XX" may become "\u005c\uHHLL" (12 bytes) */ /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
u = PyBytes_FromStringAndSize((char *)NULL, len * 4); "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
if (u == NULL) if (u == NULL)
return NULL; return NULL;
p = buf = PyBytes_AsString(u); p = buf = PyBytes_AsString(u);
...@@ -3266,20 +3267,24 @@ decode_unicode(struct compiling *c, const char *s, size_t len, int rawmode, cons ...@@ -3266,20 +3267,24 @@ decode_unicode(struct compiling *c, const char *s, size_t len, int rawmode, cons
PyObject *w; PyObject *w;
char *r; char *r;
Py_ssize_t rn, i; Py_ssize_t rn, i;
w = decode_utf8(c, &s, end, "utf-16-be"); w = decode_utf8(c, &s, end, "utf-32-be");
if (w == NULL) { if (w == NULL) {
Py_DECREF(u); Py_DECREF(u);
return NULL; return NULL;
} }
r = PyBytes_AS_STRING(w); r = PyBytes_AS_STRING(w);
rn = Py_SIZE(w); rn = Py_SIZE(w);
assert(rn % 2 == 0); assert(rn % 4 == 0);
for (i = 0; i < rn; i += 2) { for (i = 0; i < rn; i += 4) {
sprintf(p, "\\u%02x%02x", sprintf(p, "\\U%02x%02x%02x%02x",
r[i + 0] & 0xFF, r[i + 0] & 0xFF,
r[i + 1] & 0xFF); r[i + 1] & 0xFF,
p += 6; r[i + 2] & 0xFF,
r[i + 3] & 0xFF);
p += 10;
} }
/* Should be impossible to overflow */
assert(p - buf <= Py_SIZE(u));
Py_DECREF(w); Py_DECREF(w);
} else { } else {
*p++ = *s++; *p++ = *s++;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.