Issue #15955: Add an option to limit output size when decompressing LZMA data.

Patch by Nikolaus Rath and Martin Panter.

Issue #15955: Add an option to limit output size when decompressing LZMA data.
Patch by Nikolaus Rath and Martin Panter.
26795baa · Antoine Pitrou · e262074e · 26795baa · 26795baa · 26795baa
Commit 26795baa authored Jan 17, 2015 by Antoine Pitrou
Showing with 148 additions and 20 deletions

lzma.rst Doc/library/lzma.rst +31 -6

test_lzma.py Lib/test/test_lzma.py +91 -0

NEWS Misc/NEWS +3 -0

_lzmamodule.c Modules/_lzmamodule.c +0 -0

_lzmamodule.c.h Modules/clinic/_lzmamodule.c.h +23 -14

No files found.
--- a/Doc/library/lzma.rst
+++ b/Doc/library/lzma.rst
@@ -221,13 +221,32 @@ Compressing and decompressing data in memory
      decompress a multi-stream input with :class:`LZMADecompressor`, you must
      create a new decompressor for each stream.
-   .. method:: decompress(data)
+   .. method:: decompress(data, max_length=-1)
-      Decompress *data* (a :class:`bytes` object), returning a :class:`bytes`
+      Decompress *data* (a :term:`bytes-like object`), returning
-      object containing the decompressed data for at least part of the input.
+      uncompressed data as bytes. Some of *data* may be buffered
-      Some of *data* may be buffered internally, for use in later calls to
+      internally, for use in later calls to :meth:`decompress`. The
-      :meth:`decompress`. The returned data should be concatenated with the
+      returned data should be concatenated with the output of any
-      output of any previous calls to :meth:`decompress`.
+      previous calls to :meth:`decompress`.
+      If *max_length* is nonnegative, returns at most *max_length*
+      bytes of decompressed data. If this limit is reached and further
+      output can be produced, the :attr:`~.needs_input` attribute will
+      be set to ``False``. In this case, the next call to
+      :meth:`~.decompress` may provide *data* as ``b''`` to obtain
+      more of the output.
+      If all of the input data was decompressed and returned (either
+      because this was less than *max_length* bytes, or because
+      *max_length* was negative), the :attr:`~.needs_input` attribute
+      will be set to ``True``.
+      Attempting to decompress data after the end of stream is reached
+      raises an :exc:`EOFError`.  Any data found after the end of the
+      stream is ignored and saved in the :attr:`~.unused_data` attribute.
+      .. versionchanged:: 3.5
+         Added the *max_length* parameter.
   .. attribute:: check
@@ -245,6 +264,12 @@ Compressing and decompressing data in memory
      Before the end of the stream is reached, this will be ``b""``.
+   .. attribute:: needs_input
+      ``False`` if the :meth:`.decompress` method can provide more
+      decompressed data before requiring new compressed input.
+      .. versionadded:: 3.5
 .. function:: compress(data, format=FORMAT_XZ, check=-1, preset=None, filters=None)

--- a/Lib/test/test_lzma.py
+++ b/Lib/test/test_lzma.py
@@ -135,6 +135,97 @@ class CompressorDecompressorTestCase(unittest.TestCase):
        self.assertTrue(lzd.eof)
        self.assertEqual(lzd.unused_data, b"")
+    def test_decompressor_chunks_maxsize(self):
+        lzd = LZMADecompressor()
+        max_length = 100
+        out = []
+        # Feed the first half of the input
+        len_ = len(COMPRESSED_XZ) // 2
+        out.append(lzd.decompress(COMPRESSED_XZ[:len_],
+                                  max_length=max_length))
+        self.assertFalse(lzd.needs_input)
+        self.assertEqual(len(out[-1]), max_length)
+        # Retrieve more data without providing more input
+        out.append(lzd.decompress(b'', max_length=max_length))
+        self.assertFalse(lzd.needs_input)
+        self.assertEqual(len(out[-1]), max_length)
+        # Retrieve more data while providing more input
+        out.append(lzd.decompress(COMPRESSED_XZ[len_:],
+                                  max_length=max_length))
+        self.assertLessEqual(len(out[-1]), max_length)
+        # Retrieve remaining uncompressed data
+        while not lzd.eof:
+            out.append(lzd.decompress(b'', max_length=max_length))
+            self.assertLessEqual(len(out[-1]), max_length)
+        out = b"".join(out)
+        self.assertEqual(out, INPUT)
+        self.assertEqual(lzd.check, lzma.CHECK_CRC64)
+        self.assertEqual(lzd.unused_data, b"")
+    def test_decompressor_inputbuf_1(self):
+        # Test reusing input buffer after moving existing
+        # contents to beginning
+        lzd = LZMADecompressor()
+        out = []
+        # Create input buffer and fill it
+        self.assertEqual(lzd.decompress(COMPRESSED_XZ[:100],
+                                        max_length=0), b'')
+        # Retrieve some results, freeing capacity at beginning
+        # of input buffer
+        out.append(lzd.decompress(b'', 2))
+        # Add more data that fits into input buffer after
+        # moving existing data to beginning
+        out.append(lzd.decompress(COMPRESSED_XZ[100:105], 15))
+        # Decompress rest of data
+        out.append(lzd.decompress(COMPRESSED_XZ[105:]))
+        self.assertEqual(b''.join(out), INPUT)
+    def test_decompressor_inputbuf_2(self):
+        # Test reusing input buffer by appending data at the
+        # end right away
+        lzd = LZMADecompressor()
+        out = []
+        # Create input buffer and empty it
+        self.assertEqual(lzd.decompress(COMPRESSED_XZ[:200],
+                                        max_length=0), b'')
+        out.append(lzd.decompress(b''))
+        # Fill buffer with new data
+        out.append(lzd.decompress(COMPRESSED_XZ[200:280], 2))
+        # Append some more data, not enough to require resize
+        out.append(lzd.decompress(COMPRESSED_XZ[280:300], 2))
+        # Decompress rest of data
+        out.append(lzd.decompress(COMPRESSED_XZ[300:]))
+        self.assertEqual(b''.join(out), INPUT)
+    def test_decompressor_inputbuf_3(self):
+        # Test reusing input buffer after extending it
+        lzd = LZMADecompressor()
+        out = []
+        # Create almost full input buffer
+        out.append(lzd.decompress(COMPRESSED_XZ[:200], 5))
+        # Add even more data to it, requiring resize
+        out.append(lzd.decompress(COMPRESSED_XZ[200:300], 5))
+        # Decompress rest of data
+        out.append(lzd.decompress(COMPRESSED_XZ[300:]))
+        self.assertEqual(b''.join(out), INPUT)
    def test_decompressor_unused_data(self):
        lzd = LZMADecompressor()
        extra = b"fooblibar"

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -203,6 +203,9 @@ Core and Builtins
 Library
 -------
+- Issue #15955: Add an option to limit output size when decompressing LZMA
+  data.  Patch by Nikolaus Rath and Martin Panter.
 - Issue #23250: In the http.cookies module, capitalize "HttpOnly" and "Secure"
  as they are written in the standard.

--- a/Modules/_lzmamodule.c
+++ b/Modules/_lzmamodule.c
--- a/Modules/clinic/_lzmamodule.c.h
+++ b/Modules/clinic/_lzmamodule.c.h
@@ -62,34 +62,43 @@ _lzma_LZMACompressor_flush(Compressor *self, PyObject *Py_UNUSED(ignored))
 }
 PyDoc_STRVAR(_lzma_LZMADecompressor_decompress__doc__,
-"decompress($self, data, /)\n"
+"decompress($self, /, data, max_length=-1)\n"
 "--\n"
 "\n"
-"Provide data to the decompressor object.\n"
+"Decompresses *data*, returning uncompressed data as bytes.\n"
 "\n"
-"Returns a chunk of decompressed data if possible, or b\'\' otherwise.\n"
+"If *max_length* is nonnegative, returns at most *max_length* bytes of\n"
+"decompressed data. If this limit is reached and further output can be\n"
+"produced, *self.needs_input* will be set to ``False``. In this case, the next\n"
+"call to *decompress()* may provide *data* as b\'\' to obtain more of the output.\n"
 "\n"
-"Attempting to decompress data after the end of stream is reached\n"
+"If all of the input data was decompressed and returned (either because this\n"
-"raises an EOFError.  Any data found after the end of the stream\n"
+"was less than *max_length* bytes, or because *max_length* was negative),\n"
-"is ignored and saved in the unused_data attribute.");
+"*self.needs_input* will be set to True.\n"
+"\n"
+"Attempting to decompress data after the end of stream is reached raises an\n"
+"EOFError.  Any data found after the end of the stream is ignored and saved in\n"
+"the unused_data attribute.");
 #define _LZMA_LZMADECOMPRESSOR_DECOMPRESS_METHODDEF    \
-    {"decompress", (PyCFunction)_lzma_LZMADecompressor_decompress, METH_VARARGS, _lzma_LZMADecompressor_decompress__doc__},
+    {"decompress", (PyCFunction)_lzma_LZMADecompressor_decompress, METH_VARARGS|METH_KEYWORDS, _lzma_LZMADecompressor_decompress__doc__},
 static PyObject *
-_lzma_LZMADecompressor_decompress_impl(Decompressor *self, Py_buffer *data);
+_lzma_LZMADecompressor_decompress_impl(Decompressor *self, Py_buffer *data, Py_ssize_t max_length);
 static PyObject *
-_lzma_LZMADecompressor_decompress(Decompressor *self, PyObject *args)
+_lzma_LZMADecompressor_decompress(Decompressor *self, PyObject *args, PyObject *kwargs)
 {
    PyObject *return_value = NULL;
+    static char *_keywords[] = {"data", "max_length", NULL};
    Py_buffer data = {NULL, NULL};
+    Py_ssize_t max_length = -1;
-    if (!PyArg_ParseTuple(args,
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs,
-        "y*:decompress",
+        "y*|n:decompress", _keywords,
-        &data))
+        &data, &max_length))
        goto exit;
-    return_value = _lzma_LZMADecompressor_decompress_impl(self, &data);
+    return_value = _lzma_LZMADecompressor_decompress_impl(self, &data, max_length);
 exit:
    /* Cleanup for data */
@@ -242,4 +251,4 @@ exit:
    return return_value;
 }
-/*[clinic end generated code: output=808fec8216ac712b input=a9049054013a1b77]*/
+/*[clinic end generated code: output=d17fac38b09626d8 input=a9049054013a1b77]*/