Kaydet (Commit) 211b81dd authored tarafından Antoine Pitrou's avatar Antoine Pitrou

Issue #11114: Fix catastrophic performance of tell() on text files (up

to 1000x faster in some cases).  It is still one to two order of magnitudes
slower than binary tell().
üst d2751fb4
...@@ -1488,6 +1488,7 @@ class TextIOWrapper(TextIOBase): ...@@ -1488,6 +1488,7 @@ class TextIOWrapper(TextIOBase):
self._decoded_chars_used = 0 # offset into _decoded_chars for read() self._decoded_chars_used = 0 # offset into _decoded_chars for read()
self._snapshot = None # info for reconstructing decoder state self._snapshot = None # info for reconstructing decoder state
self._seekable = self._telling = self.buffer.seekable() self._seekable = self._telling = self.buffer.seekable()
self._b2cratio = 0.0
if self._seekable and self.writable(): if self._seekable and self.writable():
position = self.buffer.tell() position = self.buffer.tell()
...@@ -1655,7 +1656,12 @@ class TextIOWrapper(TextIOBase): ...@@ -1655,7 +1656,12 @@ class TextIOWrapper(TextIOBase):
# Read a chunk, decode it, and put the result in self._decoded_chars. # Read a chunk, decode it, and put the result in self._decoded_chars.
input_chunk = self.buffer.read1(self._CHUNK_SIZE) input_chunk = self.buffer.read1(self._CHUNK_SIZE)
eof = not input_chunk eof = not input_chunk
self._set_decoded_chars(self._decoder.decode(input_chunk, eof)) decoded_chars = self._decoder.decode(input_chunk, eof)
self._set_decoded_chars(decoded_chars)
if decoded_chars:
self._b2cratio = len(input_chunk) / len(self._decoded_chars)
else:
self._b2cratio = 0.0
if self._telling: if self._telling:
# At the snapshot point, len(dec_buffer) bytes before the read, # At the snapshot point, len(dec_buffer) bytes before the read,
...@@ -1709,20 +1715,56 @@ class TextIOWrapper(TextIOBase): ...@@ -1709,20 +1715,56 @@ class TextIOWrapper(TextIOBase):
# forward until it gives us enough decoded characters. # forward until it gives us enough decoded characters.
saved_state = decoder.getstate() saved_state = decoder.getstate()
try: try:
# Fast search for an acceptable start point, close to our
# current pos.
# Rationale: calling decoder.decode() has a large overhead
# regardless of chunk size; we want the number of such calls to
# be O(1) in most situations (common decoders, non-crazy input).
# Actually, it will be exactly 1 for fixed-size codecs (all
# 8-bit codecs, also UTF-16 and UTF-32).
skip_bytes = int(self._b2cratio * chars_to_skip)
skip_back = 1
assert skip_bytes <= len(next_input)
while skip_bytes > 0:
decoder.setstate((b'', dec_flags))
# Decode up to temptative start point
n = len(decoder.decode(next_input[:skip_bytes]))
if n <= chars_to_skip:
b, d = decoder.getstate()
if not b:
# Before pos and no bytes buffered in decoder => OK
dec_flags = d
chars_to_skip -= n
break
# Skip back by buffered amount and reset heuristic
skip_bytes -= len(b)
skip_back = 1
else:
# We're too far ahead, skip back a bit
skip_bytes -= skip_back
skip_back = skip_back * 2
else:
skip_bytes = 0
decoder.setstate((b'', dec_flags))
# Note our initial start point. # Note our initial start point.
decoder.setstate((b'', dec_flags)) start_pos = position + skip_bytes
start_pos = position start_flags = dec_flags
start_flags, bytes_fed, chars_decoded = dec_flags, 0, 0 if chars_to_skip == 0:
need_eof = 0 # We haven't moved from the start point.
return self._pack_cookie(start_pos, start_flags)
# Feed the decoder one byte at a time. As we go, note the # Feed the decoder one byte at a time. As we go, note the
# nearest "safe start point" before the current location # nearest "safe start point" before the current location
# (a point where the decoder has nothing buffered, so seek() # (a point where the decoder has nothing buffered, so seek()
# can safely start from there and advance to this location). # can safely start from there and advance to this location).
next_byte = bytearray(1) bytes_fed = 0
for next_byte[0] in next_input: need_eof = 0
# Chars decoded since `start_pos`
chars_decoded = 0
for i in range(skip_bytes, len(next_input)):
bytes_fed += 1 bytes_fed += 1
chars_decoded += len(decoder.decode(next_byte)) chars_decoded += len(decoder.decode(next_input[i:i+1]))
dec_buffer, dec_flags = decoder.getstate() dec_buffer, dec_flags = decoder.getstate()
if not dec_buffer and chars_decoded <= chars_to_skip: if not dec_buffer and chars_decoded <= chars_to_skip:
# Decoder buffer is empty, so this is a safe start point. # Decoder buffer is empty, so this is a safe start point.
......
...@@ -35,6 +35,10 @@ Core and Builtins ...@@ -35,6 +35,10 @@ Core and Builtins
Library Library
------- -------
- Issue #11114: Fix catastrophic performance of tell() on text files (up
to 1000x faster in some cases). It is still one to two order of magnitudes
slower than binary tell().
- Issue 10882: Add os.sendfile function. - Issue 10882: Add os.sendfile function.
- Issue #10868: Allow usage of the register method of an ABC as a class - Issue #10868: Allow usage of the register method of an ABC as a class
......
...@@ -678,12 +678,16 @@ typedef struct ...@@ -678,12 +678,16 @@ typedef struct
PyObject *pending_bytes; /* list of bytes objects waiting to be PyObject *pending_bytes; /* list of bytes objects waiting to be
written, or NULL */ written, or NULL */
Py_ssize_t pending_bytes_count; Py_ssize_t pending_bytes_count;
PyObject *snapshot;
/* snapshot is either None, or a tuple (dec_flags, next_input) where /* snapshot is either None, or a tuple (dec_flags, next_input) where
* dec_flags is the second (integer) item of the decoder state and * dec_flags is the second (integer) item of the decoder state and
* next_input is the chunk of input bytes that comes next after the * next_input is the chunk of input bytes that comes next after the
* snapshot point. We use this to reconstruct decoder states in tell(). * snapshot point. We use this to reconstruct decoder states in tell().
*/ */
PyObject *snapshot;
/* Bytes-to-characters ratio for the current chunk. Serves as input for
the heuristic in tell(). */
double b2cratio;
/* Cache raw object if it's a FileIO object */ /* Cache raw object if it's a FileIO object */
PyObject *raw; PyObject *raw;
...@@ -850,6 +854,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds) ...@@ -850,6 +854,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
self->decoded_chars_used = 0; self->decoded_chars_used = 0;
self->pending_bytes_count = 0; self->pending_bytes_count = 0;
self->encodefunc = NULL; self->encodefunc = NULL;
self->b2cratio = 0.0;
if (encoding == NULL) { if (encoding == NULL) {
/* Try os.device_encoding(fileno) */ /* Try os.device_encoding(fileno) */
...@@ -1390,6 +1395,7 @@ textiowrapper_read_chunk(textio *self) ...@@ -1390,6 +1395,7 @@ textiowrapper_read_chunk(textio *self)
PyObject *dec_flags = NULL; PyObject *dec_flags = NULL;
PyObject *input_chunk = NULL; PyObject *input_chunk = NULL;
PyObject *decoded_chars, *chunk_size; PyObject *decoded_chars, *chunk_size;
Py_ssize_t nbytes, nchars;
int eof; int eof;
/* The return value is True unless EOF was reached. The decoded string is /* The return value is True unless EOF was reached. The decoded string is
...@@ -1435,7 +1441,8 @@ textiowrapper_read_chunk(textio *self) ...@@ -1435,7 +1441,8 @@ textiowrapper_read_chunk(textio *self)
goto fail; goto fail;
assert(PyBytes_Check(input_chunk)); assert(PyBytes_Check(input_chunk));
eof = (PyBytes_Size(input_chunk) == 0); nbytes = PyBytes_Size(input_chunk);
eof = (nbytes == 0);
if (Py_TYPE(self->decoder) == &PyIncrementalNewlineDecoder_Type) { if (Py_TYPE(self->decoder) == &PyIncrementalNewlineDecoder_Type) {
decoded_chars = _PyIncrementalNewlineDecoder_decode( decoded_chars = _PyIncrementalNewlineDecoder_decode(
...@@ -1450,7 +1457,12 @@ textiowrapper_read_chunk(textio *self) ...@@ -1450,7 +1457,12 @@ textiowrapper_read_chunk(textio *self)
if (decoded_chars == NULL) if (decoded_chars == NULL)
goto fail; goto fail;
textiowrapper_set_decoded_chars(self, decoded_chars); textiowrapper_set_decoded_chars(self, decoded_chars);
if (PyUnicode_GET_SIZE(decoded_chars) > 0) nchars = PyUnicode_GET_SIZE(decoded_chars);
if (nchars > 0)
self->b2cratio = (double) nbytes / nchars;
else
self->b2cratio = 0.0;
if (nchars > 0)
eof = 0; eof = 0;
if (self->telling) { if (self->telling) {
...@@ -2139,8 +2151,12 @@ textiowrapper_tell(textio *self, PyObject *args) ...@@ -2139,8 +2151,12 @@ textiowrapper_tell(textio *self, PyObject *args)
cookie_type cookie = {0,0,0,0,0}; cookie_type cookie = {0,0,0,0,0};
PyObject *next_input; PyObject *next_input;
Py_ssize_t chars_to_skip, chars_decoded; Py_ssize_t chars_to_skip, chars_decoded;
Py_ssize_t skip_bytes, skip_back;
PyObject *saved_state = NULL; PyObject *saved_state = NULL;
char *input, *input_end; char *input, *input_end;
char *dec_buffer;
Py_ssize_t dec_buffer_len;
int dec_flags;
CHECK_INITIALIZED(self); CHECK_INITIALIZED(self);
CHECK_CLOSED(self); CHECK_CLOSED(self);
...@@ -2176,6 +2192,7 @@ textiowrapper_tell(textio *self, PyObject *args) ...@@ -2176,6 +2192,7 @@ textiowrapper_tell(textio *self, PyObject *args)
#else #else
cookie.start_pos = PyLong_AsLong(posobj); cookie.start_pos = PyLong_AsLong(posobj);
#endif #endif
Py_DECREF(posobj);
if (PyErr_Occurred()) if (PyErr_Occurred())
goto fail; goto fail;
...@@ -2190,57 +2207,99 @@ textiowrapper_tell(textio *self, PyObject *args) ...@@ -2190,57 +2207,99 @@ textiowrapper_tell(textio *self, PyObject *args)
/* How many decoded characters have been used up since the snapshot? */ /* How many decoded characters have been used up since the snapshot? */
if (self->decoded_chars_used == 0) { if (self->decoded_chars_used == 0) {
/* We haven't moved from the snapshot point. */ /* We haven't moved from the snapshot point. */
Py_DECREF(posobj);
return textiowrapper_build_cookie(&cookie); return textiowrapper_build_cookie(&cookie);
} }
chars_to_skip = self->decoded_chars_used; chars_to_skip = self->decoded_chars_used;
/* Starting from the snapshot position, we will walk the decoder /* Decoder state will be restored at the end */
* forward until it gives us enough decoded characters.
*/
saved_state = PyObject_CallMethodObjArgs(self->decoder, saved_state = PyObject_CallMethodObjArgs(self->decoder,
_PyIO_str_getstate, NULL); _PyIO_str_getstate, NULL);
if (saved_state == NULL) if (saved_state == NULL)
goto fail; goto fail;
/* Note our initial start point. */ #define DECODER_GETSTATE() do { \
if (_textiowrapper_decoder_setstate(self, &cookie) < 0) PyObject *_state = PyObject_CallMethodObjArgs(self->decoder, \
goto fail; _PyIO_str_getstate, NULL); \
if (_state == NULL) \
goto fail; \
if (!PyArg_Parse(_state, "(y#i)", &dec_buffer, &dec_buffer_len, &dec_flags)) { \
Py_DECREF(_state); \
goto fail; \
} \
Py_DECREF(_state); \
} while (0)
/* TODO: replace assert with exception */
#define DECODER_DECODE(start, len, res) do { \
PyObject *_decoded = PyObject_CallMethod( \
self->decoder, "decode", "y#", start, len); \
if (_decoded == NULL) \
goto fail; \
assert (PyUnicode_Check(_decoded)); \
res = PyUnicode_GET_SIZE(_decoded); \
Py_DECREF(_decoded); \
} while (0)
/* Fast search for an acceptable start point, close to our
current pos */
skip_bytes = (Py_ssize_t) (self->b2cratio * chars_to_skip);
skip_back = 1;
assert(skip_back <= PyBytes_GET_SIZE(next_input));
input = PyBytes_AS_STRING(next_input);
while (skip_bytes > 0) {
/* Decode up to temptative start point */
if (_textiowrapper_decoder_setstate(self, &cookie) < 0)
goto fail;
DECODER_DECODE(input, skip_bytes, chars_decoded);
if (chars_decoded <= chars_to_skip) {
DECODER_GETSTATE();
if (dec_buffer_len == 0) {
/* Before pos and no bytes buffered in decoder => OK */
cookie.dec_flags = dec_flags;
chars_to_skip -= chars_decoded;
break;
}
/* Skip back by buffered amount and reset heuristic */
skip_bytes -= dec_buffer_len;
skip_back = 1;
}
else {
/* We're too far ahead, skip back a bit */
skip_bytes -= skip_back;
skip_back *= 2;
}
}
if (skip_bytes <= 0) {
skip_bytes = 0;
if (_textiowrapper_decoder_setstate(self, &cookie) < 0)
goto fail;
}
/* Feed the decoder one byte at a time. As we go, note the /* Note our initial start point. */
* nearest "safe start point" before the current location cookie.start_pos += skip_bytes;
* (a point where the decoder has nothing buffered, so seek() cookie.chars_to_skip = chars_to_skip;
if (chars_to_skip == 0)
goto finally;
/* We should be close to the desired position. Now feed the decoder one
* byte at a time until we reach the `chars_to_skip` target.
* As we go, note the nearest "safe start point" before the current
* location (a point where the decoder has nothing buffered, so seek()
* can safely start from there and advance to this location). * can safely start from there and advance to this location).
*/ */
chars_decoded = 0; chars_decoded = 0;
input = PyBytes_AS_STRING(next_input); input = PyBytes_AS_STRING(next_input);
input_end = input + PyBytes_GET_SIZE(next_input); input_end = input + PyBytes_GET_SIZE(next_input);
input += skip_bytes;
while (input < input_end) { while (input < input_end) {
PyObject *state; Py_ssize_t n;
char *dec_buffer;
Py_ssize_t dec_buffer_len;
int dec_flags;
PyObject *decoded = PyObject_CallMethod(
self->decoder, "decode", "y#", input, 1);
if (decoded == NULL)
goto fail;
assert (PyUnicode_Check(decoded));
chars_decoded += PyUnicode_GET_SIZE(decoded);
Py_DECREF(decoded);
DECODER_DECODE(input, 1, n);
/* We got n chars for 1 byte */
chars_decoded += n;
cookie.bytes_to_feed += 1; cookie.bytes_to_feed += 1;
DECODER_GETSTATE();
state = PyObject_CallMethodObjArgs(self->decoder,
_PyIO_str_getstate, NULL);
if (state == NULL)
goto fail;
if (!PyArg_Parse(state, "(y#i)", &dec_buffer, &dec_buffer_len, &dec_flags)) {
Py_DECREF(state);
goto fail;
}
Py_DECREF(state);
if (dec_buffer_len == 0 && chars_decoded <= chars_to_skip) { if (dec_buffer_len == 0 && chars_decoded <= chars_to_skip) {
/* Decoder buffer is empty, so this is a safe start point. */ /* Decoder buffer is empty, so this is a safe start point. */
...@@ -2272,8 +2331,7 @@ textiowrapper_tell(textio *self, PyObject *args) ...@@ -2272,8 +2331,7 @@ textiowrapper_tell(textio *self, PyObject *args)
} }
} }
/* finally */ finally:
Py_XDECREF(posobj);
res = PyObject_CallMethod(self->decoder, "setstate", "(O)", saved_state); res = PyObject_CallMethod(self->decoder, "setstate", "(O)", saved_state);
Py_DECREF(saved_state); Py_DECREF(saved_state);
if (res == NULL) if (res == NULL)
...@@ -2284,8 +2342,7 @@ textiowrapper_tell(textio *self, PyObject *args) ...@@ -2284,8 +2342,7 @@ textiowrapper_tell(textio *self, PyObject *args)
cookie.chars_to_skip = Py_SAFE_DOWNCAST(chars_to_skip, Py_ssize_t, int); cookie.chars_to_skip = Py_SAFE_DOWNCAST(chars_to_skip, Py_ssize_t, int);
return textiowrapper_build_cookie(&cookie); return textiowrapper_build_cookie(&cookie);
fail: fail:
Py_XDECREF(posobj);
if (saved_state) { if (saved_state) {
PyObject *type, *value, *traceback; PyObject *type, *value, *traceback;
PyErr_Fetch(&type, &value, &traceback); PyErr_Fetch(&type, &value, &traceback);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment