gzip.py 19.9 KB
Newer Older
1 2
"""Functions that read and write gzipped files.

3 4 5 6 7
The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module

8
import struct, sys, time, os
9
import zlib
10
import builtins
11
import io
12
import _compression
13

14
__all__ = ["GzipFile", "open", "compress", "decompress"]
15

16 17 18 19
# Bit masks tested against the flag byte of a gzip member header.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ = 1
WRITE = 2

20 21 22 23
def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    The filename argument can be an actual filename (a str, bytes or
    os.PathLike object), or an existing file object to read from or write to.

    The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for
    binary mode, or "rt", "wt", "xt" or "at" for text mode. The default mode is
    "rb", and the default compresslevel is 9.

    For binary mode, this function is equivalent to the GzipFile constructor:
    GzipFile(filename, mode, compresslevel). In this case, the encoding, errors
    and newline arguments must not be provided.

    For text mode, a GzipFile object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error handling
    behavior, and line ending(s).

    Raises ValueError when the mode contains both "t" and "b", or when
    encoding, errors or newline is given in binary mode.  Raises TypeError
    when filename is neither a path nor an object with a read() or write()
    attribute.

    """
    if "t" in mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        # The text-layer arguments are meaningless without a TextIOWrapper.
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # GzipFile itself only understands binary modes; strip the text marker.
    gz_mode = mode.replace("t", "")
    if isinstance(filename, (str, bytes, os.PathLike)):
        binary_file = GzipFile(filename, gz_mode, compresslevel)
    elif hasattr(filename, "read") or hasattr(filename, "write"):
        # An already-open file object: hand it over as fileobj.
        binary_file = GzipFile(None, gz_mode, compresslevel, filename)
    else:
        raise TypeError("filename must be a str or bytes object, or a file")

    if "t" in mode:
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    else:
        return binary_file

64
def write32u(output, value):
    """Write *value* to *output* as a 4-byte little-endian unsigned integer.

    *output* is any object with a write() method accepting bytes.
    """
    output.write(struct.pack("<L", value))

69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
class _PaddedFile:
    """Minimal read-only file object that prepends a string to the contents
    of an actual file. Shouldn't be used outside of gzip.py, as it lacks
    essential functionality."""

    def __init__(self, f, prepend=b''):
        self._buffer = prepend
        self._length = len(prepend)
        self.file = f
        self._read = 0

    def read(self, size):
        if self._read is None:
            return self.file.read(size)
        if self._read + size <= self._length:
            read = self._read
            self._read += size
            return self._buffer[read:self._read]
        else:
            read = self._read
            self._read = None
            return self._buffer[read:] + \
                   self.file.read(size-self._length+read)

93
    def prepend(self, prepend=b''):
94 95
        if self._read is None:
            self._buffer = prepend
96
        else:  # Assume data was read since the last prepend() call
97 98 99 100 101
            self._read -= len(prepend)
            return
        self._length = len(self._buffer)
        self._read = 0

102
    def seek(self, off):
103 104
        self._read = None
        self._buffer = None
105
        return self.file.seek(off)
106

107 108
    def seekable(self):
        return True  # Allows fast-forwarding even in unseekable streams
109

110
class GzipFile(_compression.BaseStream):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the truncate() method.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, use the gzip.open() function.

    """

    # Overridden with internal file object to be closed, if only a filename
    # is passed in
    myfileobj = None

    # Compression level advertised in the header's XFL byte.  The class-level
    # default keeps the historical "maximum compression" marker when the
    # header is written without __init__ having recorded a level.
    _compresslevel = 9

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, an io.BytesIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
        'xb' depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', 'a' and 'ab', and 'x' and 'xb'.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the last modification time field in the stream when compressing.
        If omitted or None, the current time is used.

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            # We own this file object, so close() must close it too.
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, (str, bytes)):
                filename = ''
        else:
            filename = os.fspath(filename)
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode.startswith('r'):
            self.mode = READ
            raw = _GzipReader(fileobj)
            self._buffer = io.BufferedReader(raw)
            self.name = filename

        elif mode.startswith(('w', 'a', 'x')):
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: raw deflate stream, the gzip framing is
            # written by this class.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
            self._write_mtime = mtime
            self._compresslevel = compresslevel
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        self.fileobj = fileobj

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias of the name attribute (with '.gz' when writing)."""
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    @property
    def mtime(self):
        """Last modification time read from stream, or None"""
        return self._buffer.raw._last_mtime

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the bookkeeping used while compressing."""
        self.name = filename
        self.crc = zlib.crc32(b"")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0
        self.offset = 0  # Current file offset for seek(), tell(), etc

    def _write_gzip_header(self):
        """Write the gzip member header to self.fileobj."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(bytes([flags]))
        mtime = self._write_mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        # XFL (RFC 1952): 2 = maximum compression, 4 = fastest algorithm,
        # 0 for anything in between.
        if self._compresslevel == 9:
            xfl = b'\002'
        elif self._compresslevel == 1:
            xfl = b'\004'
        else:
            xfl = b'\000'
        self.fileobj.write(xfl)
        self.fileobj.write(b'\377')                 # OS: unknown
        if fname:
            self.fileobj.write(fname + b'\000')

    def write(self,data):
        """Compress *data* and write it out; return the uncompressed length."""
        self._check_not_closed()
        if self.mode != WRITE:
            import errno
            raise OSError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        if isinstance(data, bytes):
            length = len(data)
        else:
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            length = data.nbytes

        if length > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += length
            self.crc = zlib.crc32(data, self.crc)
            self.offset += length

        return length

    def read(self, size=-1):
        """Read and return up to *size* uncompressed bytes."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read() on write-only GzipFile object")
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Implements BufferedIOBase.read1()

        Reads up to a buffer's worth of data if size is negative."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")

        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def peek(self, n):
        """Delegate to the internal buffered reader's peek()."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")
        return self._buffer.peek(n)

    @property
    def closed(self):
        return self.fileobj is None

    def close(self):
        """Finish the stream; closes the underlying file only if we opened it."""
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Mark as closed first so a failure below cannot cause a second
        # attempt to write the trailer.
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2 GiB, or even 4 GiB
                write32u(fileobj, self.size & 0xffffffff)
            elif self.mode == READ:
                self._buffer.close()
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush pending compressed data to the underlying file (write mode)."""
        self._check_not_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise OSError("Can't rewind in write mode")
        self._buffer.seek(0)

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the uncompressed stream position.

        In write mode only forward seeks are possible; the gap is filled by
        writing zero bytes.  In read mode the request is delegated to the
        internal buffered reader.
        """
        if self.mode == WRITE:
            if whence != io.SEEK_SET:
                if whence == io.SEEK_CUR:
                    offset = self.offset + offset
                else:
                    raise ValueError('Seek from end not supported')
            if offset < self.offset:
                raise OSError('Negative seek in write mode')
            count = offset - self.offset
            chunk = b'\0' * 1024
            for _ in range(count // 1024):
                self.write(chunk)
            self.write(b'\0' * (count % 1024))
        elif self.mode == READ:
            self._check_not_closed()
            return self._buffer.seek(offset, whence)

        return self.offset

    def readline(self, size=-1):
        """Read one line of uncompressed data via the internal buffer."""
        self._check_not_closed()
        return self._buffer.readline(size)


class _GzipReader(_compression.DecompressReader):
    """Raw reader that decodes one or more concatenated gzip members.

    Parses each member header, inflates the raw deflate payload and checks
    the CRC32 / size trailer at the end of every member.
    """

    def __init__(self, fp):
        # _PaddedFile lets us push over-read bytes back in front of fp.
        super().__init__(_PaddedFile(fp), zlib.decompressobj,
                         wbits=-zlib.MAX_WBITS)
        # Set flag indicating start of a new member
        self._new_member = True
        self._last_mtime = None

    def _init_read(self):
        """Reset the running checksum and size for a new member."""
        self._crc = zlib.crc32(b"")
        self._stream_size = 0  # Decompressed size of unconcatenated stream

    def _read_exact(self, n):
        '''Read exactly *n* bytes from `self._fp`

        This method is required because self._fp may be unbuffered,
        i.e. return short reads.
        '''

        data = self._fp.read(n)
        while len(data) < n:
            b = self._fp.read(n - len(data))
            if not b:
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")
            data += b
        return data

    def _skip_cstring(self):
        """Read and discard bytes up to and including the next NUL (or EOF)."""
        while True:
            s = self._fp.read(1)
            if not s or s == b'\000':
                break

    def _read_gzip_header(self):
        """Consume one member header.

        Returns False when the input is exhausted, True once a header has
        been read; raises OSError for a bad magic number or an unknown
        compression method.
        """
        magic = self._fp.read(2)
        if magic == b'':
            return False

        if magic != b'\037\213':
            raise OSError('Not a gzipped file (%r)' % magic)

        # method, flags, mtime; the two trailing pad bytes (XFL, OS) are
        # skipped.
        (method, flag,
         self._last_mtime) = struct.unpack("<BBIxx", self._read_exact(8))
        if method != 8:
            raise OSError('Unknown compression method')

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            extra_len, = struct.unpack("<H", self._read_exact(2))
            self._read_exact(extra_len)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            self._skip_cstring()
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            self._skip_cstring()
        if flag & FHCRC:
            self._read_exact(2)     # Read & discard the 16-bit header CRC
        return True

    def read(self, size=-1):
        """Return up to *size* decompressed bytes, b"" at end of input."""
        if size < 0:
            return self.readall()
        # size=0 is special because decompress(max_length=0) is not supported
        if not size:
            return b""

        # For certain input data, a single
        # call to decompress() may not return
        # any data. In this case, retry until we get some data or reach EOF.
        while True:
            if self._decompressor.eof:
                # Ending case: we've come to the end of a member in the file,
                # so finish up this member, and read a new gzip header.
                # Check the CRC and file size, and set the flag so we read
                # a new member
                self._read_eof()
                self._new_member = True
                self._decompressor = self._decomp_factory(
                    **self._decomp_args)

            if self._new_member:
                # If the _new_member flag is set, we have to
                # jump to the next member, if there is one.
                self._init_read()
                if not self._read_gzip_header():
                    self._size = self._pos
                    return b""
                self._new_member = False

            # Read a chunk of data from the file
            buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)

            uncompress = self._decompressor.decompress(buf, size)
            if self._decompressor.unconsumed_tail != b"":
                self._fp.prepend(self._decompressor.unconsumed_tail)
            elif self._decompressor.unused_data != b"":
                # Prepend the already read bytes to the fileobj so they can
                # be seen by _read_eof() and _read_gzip_header()
                self._fp.prepend(self._decompressor.unused_data)

            if uncompress != b"":
                break
            if buf == b"":
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

        self._add_read_data( uncompress )
        self._pos += len(uncompress)
        return uncompress

    def _add_read_data(self, data):
        """Fold *data* into the running CRC and size of the current member."""
        self._crc = zlib.crc32(data, self._crc)
        self._stream_size = self._stream_size + len(data)

    def _read_eof(self):
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32, isize = struct.unpack("<II", self._read_exact(8))
        if crc32 != self._crc:
            raise OSError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self._crc)))
        elif isize != (self._stream_size & 0xffffffff):
            raise OSError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self._fp.read(1)
        if c:
            self._fp.prepend(c)

    def _rewind(self):
        super()._rewind()
        # The next read() must parse a member header again.
        self._new_member = True
517

518 519
def compress(data, compresslevel=9, *, mtime=None):
    """Compress data in one shot and return the compressed bytes.

    compresslevel is the compression level, in the range 0-9.  mtime is an
    optional timestamp for the gzip header; it is passed to GzipFile, which
    uses the current time when it is None.  Supplying a fixed mtime makes the
    output reproducible.
    """
    buf = io.BytesIO()
    with GzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel,
                  mtime=mtime) as f:
        f.write(data)
    return buf.getvalue()

def decompress(data):
    """Decompress gzip-compressed bytes in one shot.

    The whole of *data* is read through a GzipFile wrapped around an
    in-memory buffer, and the decompressed result is returned.
    """
    source = io.BytesIO(data)
    with GzipFile(fileobj=source) as reader:
        payload = reader.read()
    return payload


535 536 537 538 539 540 541
def _test():
    """Command-line driver: compress, or with -d decompress, each argument.

    Act like gzip; with -d, act like gunzip.
    The input file is not deleted, however, nor are any other gzip
    options or features supported.  An argument of "-" (the default when
    no file is named) means stdin/stdout.
    """
    args = sys.argv[1:]
    # Named so as not to shadow the module-level decompress() function.
    decompressing = args and args[0] == "-d"
    if decompressing:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompressing:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz:", repr(arg))
                    continue
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer)
            else:
                f = builtins.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        try:
            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)
        finally:
            # Close both ends even if the copy failed, but never close the
            # process's own standard streams.
            try:
                if g is not sys.stdout.buffer:
                    g.close()
            finally:
                if f is not sys.stdin.buffer:
                    f.close()
572 573 574

# Run the command-line driver when this module is executed as a script.
if __name__ == '__main__':
    _test()