Commit 687ff0ec authored by Serhiy Storchaka

Issue #11489: JSON decoder now accepts lone surrogates.

...@@ -58,6 +58,16 @@ BACKSLASH = { ...@@ -58,6 +58,16 @@ BACKSLASH = {
'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t', 'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t',
} }
def _decode_uXXXX(s, pos):
esc = s[pos + 1:pos + 5]
if len(esc) == 4 and esc[1] not in 'xX':
try:
return int(esc, 16)
except ValueError:
pass
msg = "Invalid \\uXXXX escape"
raise ValueError(errmsg(msg, s, pos))
def py_scanstring(s, end, strict=True, def py_scanstring(s, end, strict=True,
_b=BACKSLASH, _m=STRINGCHUNK.match): _b=BACKSLASH, _m=STRINGCHUNK.match):
"""Scan the string s for a JSON string. End is the index of the """Scan the string s for a JSON string. End is the index of the
...@@ -107,25 +117,14 @@ def py_scanstring(s, end, strict=True, ...@@ -107,25 +117,14 @@ def py_scanstring(s, end, strict=True,
raise ValueError(errmsg(msg, s, end)) raise ValueError(errmsg(msg, s, end))
end += 1 end += 1
else: else:
esc = s[end + 1:end + 5] uni = _decode_uXXXX(s, end)
next_end = end + 5 end += 5
if len(esc) != 4: if 0xd800 <= uni <= 0xdbff and s[end:end + 2] == '\\u':
msg = "Invalid \\uXXXX escape" uni2 = _decode_uXXXX(s, end + 1)
raise ValueError(errmsg(msg, s, end)) if 0xdc00 <= uni2 <= 0xdfff:
uni = int(esc, 16) uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
if 0xd800 <= uni <= 0xdbff: end += 6
msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
if not s[end + 5:end + 7] == '\\u':
raise ValueError(errmsg(msg, s, end))
esc2 = s[end + 7:end + 11]
if len(esc2) != 4:
raise ValueError(errmsg(msg, s, end))
uni2 = int(esc2, 16)
uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
next_end += 6
char = chr(uni) char = chr(uni)
end = next_end
_append(char) _append(char)
return ''.join(chunks), end return ''.join(chunks), end
......
...@@ -5,10 +5,6 @@ from test.test_json import PyTest, CTest ...@@ -5,10 +5,6 @@ from test.test_json import PyTest, CTest
class TestScanstring: class TestScanstring:
def test_scanstring(self): def test_scanstring(self):
scanstring = self.json.decoder.scanstring scanstring = self.json.decoder.scanstring
self.assertEqual(
scanstring('"z\\ud834\\udd20x"', 1, True),
('z\U0001d120x', 16))
self.assertEqual( self.assertEqual(
scanstring('"z\U0001d120x"', 1, True), scanstring('"z\U0001d120x"', 1, True),
('z\U0001d120x', 5)) ('z\U0001d120x', 5))
...@@ -89,6 +85,53 @@ class TestScanstring: ...@@ -89,6 +85,53 @@ class TestScanstring:
scanstring('["Bad value", truth]', 2, True), scanstring('["Bad value", truth]', 2, True),
('Bad value', 12)) ('Bad value', 12))
# Verify that scanstring keeps lone surrogates and only merges a high
# surrogate escape with an immediately following low surrogate escape.
def test_surrogates(self):
scanstring = self.json.decoder.scanstring
# Helper: scan `given` starting at index 1 (just past the opening quote)
# in strict mode and expect `expect` plus an end index equal to
# len(given), i.e. just past the closing quote.
def assertScan(given, expect):
self.assertEqual(scanstring(given, 1, True),
(expect, len(given)))
# Escaped high surrogate followed by an escaped non-surrogate: kept lone.
assertScan('"z\\ud834\\u0079x"', 'z\ud834yx')
# Escaped high + escaped low surrogate: joined into one astral character.
assertScan('"z\\ud834\\udd20x"', 'z\U0001d120x')
# Two escaped high surrogates in a row: the first stays lone, the second
# pairs with the low surrogate that follows it.
assertScan('"z\\ud834\\ud834\\udd20x"', 'z\ud834\U0001d120x')
# Escaped high surrogate followed by an ordinary character.
assertScan('"z\\ud834x"', 'z\ud834x')
# Escaped high surrogate followed by a literal (unescaped) low surrogate:
# the two are not joined.
assertScan('"z\\ud834\udd20x12345"', 'z\ud834\udd20x12345')
# Lone escaped low surrogate.
assertScan('"z\\udd20x"', 'z\udd20x')
# Literal (unescaped) surrogates are passed through unchanged.
assertScan('"z\ud834\udd20x"', 'z\ud834\udd20x')
# Literal high surrogate followed by an escaped low surrogate: not joined.
assertScan('"z\ud834\\udd20x"', 'z\ud834\udd20x')
# Literal lone high surrogate.
assertScan('"z\ud834x"', 'z\ud834x')
# Verify that scanstring raises ValueError for every malformed escape below.
def test_bad_escapes(self):
scanstring = self.json.decoder.scanstring
bad_escapes = [
# Backslash with nothing after it inside the string, and an unknown escape letter.
'"\\"',
'"\\x"',
# \u followed by fewer than four characters before the closing quote.
'"\\u"',
'"\\u0"',
'"\\u01"',
'"\\u012"',
# A non-hex character in each of the four digit positions.
'"\\uz012"',
'"\\u0z12"',
'"\\u01z2"',
'"\\u012z"',
# "0x"/"0X" prefix forms must not be treated as hex digits.
'"\\u0x12"',
'"\\u0X12"',
# The same malformed forms, this time following a valid high surrogate escape.
'"\\ud834\\"',
'"\\ud834\\u"',
'"\\ud834\\ud"',
'"\\ud834\\udd"',
'"\\ud834\\udd2"',
'"\\ud834\\uzdd2"',
'"\\ud834\\udzd2"',
'"\\ud834\\uddz2"',
'"\\ud834\\udd2z"',
'"\\ud834\\u0x20"',
'"\\ud834\\u0X20"',
]
for s in bad_escapes:
# msg=s makes a failure report name the offending input.
with self.assertRaises(ValueError, msg=s):
scanstring(s, 1, True)
def test_overflow(self): def test_overflow(self):
with self.assertRaises(OverflowError): with self.assertRaises(OverflowError):
self.json.decoder.scanstring(b"xxx", sys.maxsize+1) self.json.decoder.scanstring(b"xxx", sys.maxsize+1)
......
...@@ -16,6 +16,8 @@ Core and Builtins ...@@ -16,6 +16,8 @@ Core and Builtins
Library Library
------- -------
- Issue #11489: JSON decoder now accepts lone surrogates.
- Issue #19545: Avoid chained exceptions while passing stray % to - Issue #19545: Avoid chained exceptions while passing stray % to
time.strptime(). Initial patch by Claudiu Popa. time.strptime(). Initial patch by Claudiu Popa.
......
...@@ -409,17 +409,10 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next ...@@ -409,17 +409,10 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
} }
} }
/* Surrogate pair */ /* Surrogate pair */
if (Py_UNICODE_IS_HIGH_SURROGATE(c)) { if (Py_UNICODE_IS_HIGH_SURROGATE(c) && end + 6 < len &&
PyUnicode_READ(kind, buf, next++) == '\\' &&
PyUnicode_READ(kind, buf, next++) == 'u') {
Py_UCS4 c2 = 0; Py_UCS4 c2 = 0;
if (end + 6 >= len) {
raise_errmsg("Unpaired high surrogate", pystr, end - 5);
goto bail;
}
if (PyUnicode_READ(kind, buf, next++) != '\\' ||
PyUnicode_READ(kind, buf, next++) != 'u') {
raise_errmsg("Unpaired high surrogate", pystr, end - 5);
goto bail;
}
end += 6; end += 6;
/* Decode 4 hex digits */ /* Decode 4 hex digits */
for (; next < end; next++) { for (; next < end; next++) {
...@@ -440,15 +433,10 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next ...@@ -440,15 +433,10 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
goto bail; goto bail;
} }
} }
if (!Py_UNICODE_IS_LOW_SURROGATE(c2)) { if (Py_UNICODE_IS_LOW_SURROGATE(c2))
raise_errmsg("Unpaired high surrogate", pystr, end - 5); c = Py_UNICODE_JOIN_SURROGATES(c, c2);
goto bail; else
} end -= 6;
c = Py_UNICODE_JOIN_SURROGATES(c, c2);
}
else if (Py_UNICODE_IS_LOW_SURROGATE(c)) {
raise_errmsg("Unpaired low surrogate", pystr, end - 5);
goto bail;
} }
} }
APPEND_OLD_CHUNK APPEND_OLD_CHUNK
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment