Commit 7d6e076f, authored by Antoine Pitrou

Issue #7451: Improve decoding performance of JSON objects, and reduce
the memory consumption of said decoded objects when they use the same
strings as keys.
parent d9107aad
...@@ -147,10 +147,14 @@ WHITESPACE_STR = ' \t\n\r' ...@@ -147,10 +147,14 @@ WHITESPACE_STR = ' \t\n\r'
def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook, def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
_w=WHITESPACE.match, _ws=WHITESPACE_STR): memo=None, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
s, end = s_and_end s, end = s_and_end
pairs = [] pairs = []
pairs_append = pairs.append pairs_append = pairs.append
# Backwards compatibility
if memo is None:
memo = {}
memo_get = memo.setdefault
# Use a slice to prevent IndexError from being raised, the following # Use a slice to prevent IndexError from being raised, the following
# check will raise a more specific ValueError if the string is empty # check will raise a more specific ValueError if the string is empty
nextchar = s[end:end + 1] nextchar = s[end:end + 1]
...@@ -167,6 +171,7 @@ def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook, ...@@ -167,6 +171,7 @@ def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
end += 1 end += 1
while True: while True:
key, end = scanstring(s, end, strict) key, end = scanstring(s, end, strict)
key = memo_get(key, key)
# To skip some function call overhead we optimize the fast paths where # To skip some function call overhead we optimize the fast paths where
# the JSON key separator is ": " or just ":". # the JSON key separator is ": " or just ":".
if s[end:end + 1] != ':': if s[end:end + 1] != ':':
...@@ -214,7 +219,7 @@ def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook, ...@@ -214,7 +219,7 @@ def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
pairs = object_hook(pairs) pairs = object_hook(pairs)
return pairs, end return pairs, end
def JSONArray(s_and_end, scan_once, context, _w=WHITESPACE.match): def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
s, end = s_and_end s, end = s_and_end
values = [] values = []
nextchar = s[end:end + 1] nextchar = s[end:end + 1]
...@@ -314,6 +319,7 @@ class JSONDecoder(object): ...@@ -314,6 +319,7 @@ class JSONDecoder(object):
self.parse_object = JSONObject self.parse_object = JSONObject
self.parse_array = JSONArray self.parse_array = JSONArray
self.parse_string = scanstring self.parse_string = scanstring
self.memo = {}
self.scan_once = make_scanner(self) self.scan_once = make_scanner(self)
......
...@@ -22,6 +22,8 @@ def py_make_scanner(context): ...@@ -22,6 +22,8 @@ def py_make_scanner(context):
parse_int = context.parse_int parse_int = context.parse_int
parse_constant = context.parse_constant parse_constant = context.parse_constant
object_hook = context.object_hook object_hook = context.object_hook
object_pairs_hook = context.object_pairs_hook
memo = context.memo
def _scan_once(string, idx): def _scan_once(string, idx):
try: try:
...@@ -33,7 +35,7 @@ def py_make_scanner(context): ...@@ -33,7 +35,7 @@ def py_make_scanner(context):
return parse_string(string, idx + 1, strict) return parse_string(string, idx + 1, strict)
elif nextchar == '{': elif nextchar == '{':
return parse_object((string, idx + 1), strict, return parse_object((string, idx + 1), strict,
_scan_once, object_hook, object_pairs_hook) _scan_once, object_hook, object_pairs_hook, memo)
elif nextchar == '[': elif nextchar == '[':
return parse_array((string, idx + 1), _scan_once) return parse_array((string, idx + 1), _scan_once)
elif nextchar == 'n' and string[idx:idx + 4] == 'null': elif nextchar == 'n' and string[idx:idx + 4] == 'null':
...@@ -60,6 +62,12 @@ def py_make_scanner(context): ...@@ -60,6 +62,12 @@ def py_make_scanner(context):
else: else:
raise StopIteration raise StopIteration
def scan_once(string, idx):
try:
return _scan_once(string, idx)
finally:
memo.clear()
return _scan_once return _scan_once
make_scanner = c_make_scanner or py_make_scanner make_scanner = c_make_scanner or py_make_scanner
import decimal import decimal
from unittest import TestCase from unittest import TestCase
from io import StringIO from io import StringIO
from contextlib import contextmanager
import json import json
import json.decoder
import json.scanner
from collections import OrderedDict from collections import OrderedDict
@contextmanager
def use_python_scanner():
    """Temporarily point ``json.decoder.make_scanner`` at the Python scanner.

    On entry ``json.decoder.make_scanner`` is replaced with
    ``json.scanner.py_make_scanner``.  On exit the previous value is put
    back, including when the body of the ``with`` block raises.
    """
    py_scanner = json.scanner.py_make_scanner
    old_scanner = json.decoder.make_scanner
    json.decoder.make_scanner = py_scanner
    try:
        yield
    finally:
        # Always restore, so a failing test cannot leak the override.
        json.decoder.make_scanner = old_scanner
class TestDecode(TestCase): class TestDecode(TestCase):
def test_decimal(self): def test_decimal(self):
rval = json.loads('1.1', parse_float=decimal.Decimal) rval = json.loads('1.1', parse_float=decimal.Decimal)
...@@ -39,3 +54,16 @@ class TestDecode(TestCase): ...@@ -39,3 +54,16 @@ class TestDecode(TestCase):
# exercise the uncommon cases. The array cases are already covered. # exercise the uncommon cases. The array cases are already covered.
rval = json.loads('{ "key" : "value" , "k":"v" }') rval = json.loads('{ "key" : "value" , "k":"v" }')
self.assertEquals(rval, {"key":"value", "k":"v"}) self.assertEquals(rval, {"key":"value", "k":"v"})
def check_keys_reuse(self, source, loads):
    """Assert that two decoded objects share the very same key objects.

    ``loads`` is called with ``source``; the result must be a sequence whose
    first two items each iterate over exactly two keys.  The keys of each
    item are sorted and compared pairwise by identity (``assertIs``), not
    merely by equality.
    """
    rval = loads(source)
    (a, b), (c, d) = sorted(rval[0]), sorted(rval[1])
    self.assertIs(a, c)
    self.assertIs(b, d)
def test_keys_reuse(self):
    """Equal keys in sibling JSON objects must decode to one shared string."""
    s = '[{"a_key": 1, "b_\xe9": 2}, {"a_key": 3, "b_\xe9": 4}]'
    # First through the module-level json.loads entry point.
    self.check_keys_reuse(s, json.loads)
    # Then with json.decoder.make_scanner swapped to the pure-Python scanner
    # for the duration of the block (this check is active, not disabled).
    with use_python_scanner():
        self.check_keys_reuse(s, json.decoder.JSONDecoder().decode)
...@@ -165,6 +165,10 @@ Extensions ...@@ -165,6 +165,10 @@ Extensions
Library Library
------- -------
- Issue #7451: Improve decoding performance of JSON objects, and reduce
the memory consumption of said decoded objects when they use the same
strings as keys.
- Issue #1100562: Fix deep-copying of objects derived from the list and - Issue #1100562: Fix deep-copying of objects derived from the list and
dict types. Patch by Michele Orrù and Björn Lindqvist. dict types. Patch by Michele Orrù and Björn Lindqvist.
......
...@@ -36,6 +36,7 @@ typedef struct _PyScannerObject { ...@@ -36,6 +36,7 @@ typedef struct _PyScannerObject {
PyObject *parse_float; PyObject *parse_float;
PyObject *parse_int; PyObject *parse_int;
PyObject *parse_constant; PyObject *parse_constant;
PyObject *memo;
} PyScannerObject; } PyScannerObject;
static PyMemberDef scanner_members[] = { static PyMemberDef scanner_members[] = {
...@@ -305,6 +306,21 @@ _build_rval_index_tuple(PyObject *rval, Py_ssize_t idx) { ...@@ -305,6 +306,21 @@ _build_rval_index_tuple(PyObject *rval, Py_ssize_t idx) {
return tpl; return tpl;
} }
/* Move the pending `chunk`, if any, onto the `chunks` list, creating the list
 * on first use.  Expands to a bare `if` block (call sites use it without a
 * trailing semicolon) and relies on the expanding function providing
 * `PyObject *chunk`, `PyObject *chunks` and a `bail:` label.
 *
 * When PyList_Append fails, `chunk` is released with Py_CLEAR so that it is
 * NULL by the time control reaches `bail`; a `Py_XDECREF(chunk)` there then
 * cannot drop the same reference a second time.  If creating the list fails,
 * `chunk` is left untouched and still owned, so `bail` must release it. */
#define APPEND_OLD_CHUNK \
    if (chunk != NULL) { \
        if (chunks == NULL) { \
            chunks = PyList_New(0); \
            if (chunks == NULL) { \
                goto bail; \
            } \
        } \
        if (PyList_Append(chunks, chunk)) { \
            Py_CLEAR(chunk); \
            goto bail; \
        } \
        Py_CLEAR(chunk); \
    }
static PyObject * static PyObject *
scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next_end_ptr) scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next_end_ptr)
{ {
...@@ -316,15 +332,14 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next ...@@ -316,15 +332,14 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
Return value is a new PyUnicode Return value is a new PyUnicode
*/ */
PyObject *rval; PyObject *rval = NULL;
Py_ssize_t len = PyUnicode_GET_SIZE(pystr); Py_ssize_t len = PyUnicode_GET_SIZE(pystr);
Py_ssize_t begin = end - 1; Py_ssize_t begin = end - 1;
Py_ssize_t next = begin; Py_ssize_t next = begin;
const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr); const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr);
PyObject *chunks = PyList_New(0); PyObject *chunks = NULL;
if (chunks == NULL) { PyObject *chunk = NULL;
goto bail;
}
if (end < 0 || len <= end) { if (end < 0 || len <= end) {
PyErr_SetString(PyExc_ValueError, "end is out of bounds"); PyErr_SetString(PyExc_ValueError, "end is out of bounds");
goto bail; goto bail;
...@@ -332,7 +347,6 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next ...@@ -332,7 +347,6 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
while (1) { while (1) {
/* Find the end of the string or the next escape */ /* Find the end of the string or the next escape */
Py_UNICODE c = 0; Py_UNICODE c = 0;
PyObject *chunk = NULL;
for (next = end; next < len; next++) { for (next = end; next < len; next++) {
c = buf[next]; c = buf[next];
if (c == '"' || c == '\\') { if (c == '"' || c == '\\') {
...@@ -349,15 +363,11 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next ...@@ -349,15 +363,11 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
} }
/* Pick up this chunk if it's not zero length */ /* Pick up this chunk if it's not zero length */
if (next != end) { if (next != end) {
APPEND_OLD_CHUNK
chunk = PyUnicode_FromUnicode(&buf[end], next - end); chunk = PyUnicode_FromUnicode(&buf[end], next - end);
if (chunk == NULL) { if (chunk == NULL) {
goto bail; goto bail;
} }
if (PyList_Append(chunks, chunk)) {
Py_DECREF(chunk);
goto bail;
}
Py_DECREF(chunk);
} }
next++; next++;
if (c == '"') { if (c == '"') {
...@@ -459,27 +469,34 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next ...@@ -459,27 +469,34 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
} }
#endif #endif
} }
APPEND_OLD_CHUNK
chunk = PyUnicode_FromUnicode(&c, 1); chunk = PyUnicode_FromUnicode(&c, 1);
if (chunk == NULL) { if (chunk == NULL) {
goto bail; goto bail;
} }
if (PyList_Append(chunks, chunk)) { }
Py_DECREF(chunk);
if (chunks == NULL) {
if (chunk != NULL)
rval = chunk;
else
rval = PyUnicode_FromStringAndSize("", 0);
}
else {
APPEND_OLD_CHUNK
rval = join_list_unicode(chunks);
if (rval == NULL) {
goto bail; goto bail;
} }
Py_DECREF(chunk); Py_CLEAR(chunks);
} }
rval = join_list_unicode(chunks);
if (rval == NULL) {
goto bail;
}
Py_DECREF(chunks);
*next_end_ptr = end; *next_end_ptr = end;
return rval; return rval;
bail: bail:
*next_end_ptr = -1; *next_end_ptr = -1;
Py_XDECREF(chunks); Py_XDECREF(chunks);
Py_XDECREF(chunk);
return NULL; return NULL;
} }
...@@ -578,6 +595,7 @@ scanner_clear(PyObject *self) ...@@ -578,6 +595,7 @@ scanner_clear(PyObject *self)
Py_CLEAR(s->parse_float); Py_CLEAR(s->parse_float);
Py_CLEAR(s->parse_int); Py_CLEAR(s->parse_int);
Py_CLEAR(s->parse_constant); Py_CLEAR(s->parse_constant);
Py_CLEAR(s->memo);
return 0; return 0;
} }
...@@ -593,10 +611,16 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss ...@@ -593,10 +611,16 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr); Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr);
Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1; Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1;
PyObject *val = NULL; PyObject *val = NULL;
PyObject *rval = PyList_New(0); PyObject *rval = NULL;
PyObject *key = NULL; PyObject *key = NULL;
int strict = PyObject_IsTrue(s->strict); int strict = PyObject_IsTrue(s->strict);
int has_pairs_hook = (s->object_pairs_hook != Py_None);
Py_ssize_t next_idx; Py_ssize_t next_idx;
if (has_pairs_hook)
rval = PyList_New(0);
else
rval = PyDict_New();
if (rval == NULL) if (rval == NULL)
return NULL; return NULL;
...@@ -606,6 +630,8 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss ...@@ -606,6 +630,8 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
/* only loop if the object is non-empty */ /* only loop if the object is non-empty */
if (idx <= end_idx && str[idx] != '}') { if (idx <= end_idx && str[idx] != '}') {
while (idx <= end_idx) { while (idx <= end_idx) {
PyObject *memokey;
/* read key */ /* read key */
if (str[idx] != '"') { if (str[idx] != '"') {
raise_errmsg("Expecting property name", pystr, idx); raise_errmsg("Expecting property name", pystr, idx);
...@@ -614,6 +640,16 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss ...@@ -614,6 +640,16 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
key = scanstring_unicode(pystr, idx + 1, strict, &next_idx); key = scanstring_unicode(pystr, idx + 1, strict, &next_idx);
if (key == NULL) if (key == NULL)
goto bail; goto bail;
memokey = PyDict_GetItem(s->memo, key);
if (memokey != NULL) {
Py_INCREF(memokey);
Py_DECREF(key);
key = memokey;
}
else {
if (PyDict_SetItem(s->memo, key, key) < 0)
goto bail;
}
idx = next_idx; idx = next_idx;
/* skip whitespace between key and : delimiter, read :, skip whitespace */ /* skip whitespace between key and : delimiter, read :, skip whitespace */
...@@ -630,19 +666,24 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss ...@@ -630,19 +666,24 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
if (val == NULL) if (val == NULL)
goto bail; goto bail;
{ if (has_pairs_hook) {
PyObject *tuple = PyTuple_Pack(2, key, val); PyObject *item = PyTuple_Pack(2, key, val);
if (tuple == NULL) if (item == NULL)
goto bail; goto bail;
if (PyList_Append(rval, tuple) == -1) { Py_CLEAR(key);
Py_DECREF(tuple); Py_CLEAR(val);
if (PyList_Append(rval, item) == -1) {
Py_DECREF(item);
goto bail; goto bail;
} }
Py_DECREF(tuple); Py_DECREF(item);
}
else {
if (PyDict_SetItem(rval, key, val) < 0)
goto bail;
Py_CLEAR(key);
Py_CLEAR(val);
} }
Py_CLEAR(key);
Py_CLEAR(val);
idx = next_idx; idx = next_idx;
/* skip whitespace before } or , */ /* skip whitespace before } or , */
...@@ -672,36 +713,23 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss ...@@ -672,36 +713,23 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
*next_idx_ptr = idx + 1; *next_idx_ptr = idx + 1;
if (s->object_pairs_hook != Py_None) { if (has_pairs_hook) {
val = PyObject_CallFunctionObjArgs(s->object_pairs_hook, rval, NULL); val = PyObject_CallFunctionObjArgs(s->object_pairs_hook, rval, NULL);
if (val == NULL)
goto bail;
Py_DECREF(rval); Py_DECREF(rval);
return val; return val;
} }
val = PyDict_New();
if (val == NULL)
goto bail;
if (PyDict_MergeFromSeq2(val, rval, 1) == -1)
goto bail;
Py_DECREF(rval);
rval = val;
/* if object_hook is not None: rval = object_hook(rval) */ /* if object_hook is not None: rval = object_hook(rval) */
if (s->object_hook != Py_None) { if (s->object_hook != Py_None) {
val = PyObject_CallFunctionObjArgs(s->object_hook, rval, NULL); val = PyObject_CallFunctionObjArgs(s->object_hook, rval, NULL);
if (val == NULL)
goto bail;
Py_DECREF(rval); Py_DECREF(rval);
rval = val; return val;
val = NULL;
} }
return rval; return rval;
bail: bail:
Py_XDECREF(key); Py_XDECREF(key);
Py_XDECREF(val); Py_XDECREF(val);
Py_DECREF(rval); Py_XDECREF(rval);
return NULL; return NULL;
} }
...@@ -988,6 +1016,9 @@ scanner_call(PyObject *self, PyObject *args, PyObject *kwds) ...@@ -988,6 +1016,9 @@ scanner_call(PyObject *self, PyObject *args, PyObject *kwds)
Py_TYPE(pystr)->tp_name); Py_TYPE(pystr)->tp_name);
return NULL; return NULL;
} }
PyDict_Clear(s->memo);
if (rval == NULL)
return NULL;
return _build_rval_index_tuple(rval, next_idx); return _build_rval_index_tuple(rval, next_idx);
} }
...@@ -1021,6 +1052,12 @@ scanner_init(PyObject *self, PyObject *args, PyObject *kwds) ...@@ -1021,6 +1052,12 @@ scanner_init(PyObject *self, PyObject *args, PyObject *kwds)
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O:make_scanner", kwlist, &ctx)) if (!PyArg_ParseTupleAndKeywords(args, kwds, "O:make_scanner", kwlist, &ctx))
return -1; return -1;
if (s->memo == NULL) {
s->memo = PyDict_New();
if (s->memo == NULL)
goto bail;
}
/* All of these will fail "gracefully" so we don't need to verify them */ /* All of these will fail "gracefully" so we don't need to verify them */
s->strict = PyObject_GetAttrString(ctx, "strict"); s->strict = PyObject_GetAttrString(ctx, "strict");
if (s->strict == NULL) if (s->strict == NULL)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment