"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module

import builtins
import errno
import io
import os
import struct
import sys
import time
import zlib

# Public names exported by ``from gzip import *``.
__all__ = ["GzipFile", "open", "compress", "decompress"]

# Bit masks for the FLG byte of a gzip member header (RFC 1952).
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Internal values stored in GzipFile.mode.
READ, WRITE = 1, 2

def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    The filename argument can be an actual filename (a str or bytes object), or
    an existing file object to read from or write to.

    The mode argument can be "r", "rb", "w", "wb", "a" or "ab" for binary mode,
    or "rt", "wt" or "at" for text mode. The default mode is "rb", and the
    default compresslevel is 9.

    For binary mode, this function is equivalent to the GzipFile constructor:
    GzipFile(filename, mode, compresslevel). In this case, the encoding, errors
    and newline arguments must not be provided.

    For text mode, a GzipFile object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error handling
    behavior, and line ending(s).

    Raises ValueError if the mode mixes "t" and "b", or if a text-only
    argument is given in binary mode, and TypeError if filename is neither
    a str/bytes object nor an object with a read() or write() method.

    """
    if "t" in mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # GzipFile itself rejects "t", so strip it before delegating.
    gz_mode = mode.replace("t", "")
    if isinstance(filename, (str, bytes)):
        binary_file = GzipFile(filename, gz_mode, compresslevel)
    elif hasattr(filename, "read") or hasattr(filename, "write"):
        binary_file = GzipFile(None, gz_mode, compresslevel, filename)
    else:
        raise TypeError("filename must be a str or bytes object, or a file")

    if "t" in mode:
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    else:
        return binary_file

def write32u(output, value):
    """Write value to output as a 4-byte little-endian unsigned integer.

    output is any object with a write() method accepting bytes.  value
    must fit in an unsigned 32-bit field; struct.pack() raises
    struct.error otherwise, so callers mask with 0xffffffff first.
    """
    output.write(struct.pack("<L", value))

68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
class _PaddedFile:
    """Minimal read-only file object that prepends a string to the contents
    of an actual file. Shouldn't be used outside of gzip.py, as it lacks
    essential functionality."""

    def __init__(self, f, prepend=b''):
        self._buffer = prepend
        self._length = len(prepend)
        self.file = f
        self._read = 0

    def read(self, size):
        if self._read is None:
            return self.file.read(size)
        if self._read + size <= self._length:
            read = self._read
            self._read += size
            return self._buffer[read:self._read]
        else:
            read = self._read
            self._read = None
            return self._buffer[read:] + \
                   self.file.read(size-self._length+read)

    def prepend(self, prepend=b'', readprevious=False):
        if self._read is None:
            self._buffer = prepend
        elif readprevious and len(prepend) <= self._read:
            self._read -= len(prepend)
            return
        else:
            self._buffer = self._buffer[read:] + prepend
        self._length = len(self._buffer)
        self._read = 0

    def unused(self):
        if self._read is None:
            return b''
        return self._buffer[self._read:]

    def seek(self, offset, whence=0):
        # This is only ever called with offset=whence=0
        if whence == 1 and self._read is not None:
            if 0 <= offset + self._read <= self._length:
                self._read += offset
                return
            else:
                offset += self._length - self._read
        self._read = None
        self._buffer = None
        return self.file.seek(offset, whence)

    def __getattr__(self, name):
121
        return getattr(self.file, name)
122 123


class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, use the gzip.open() function.

    """

    # File object opened by the constructor itself (and therefore closed by
    # close()); stays None when the caller supplied fileobj.
    myfileobj = None
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', and 'a' and 'ab'.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, (str, bytes)):
                filename = ''
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode.startswith('r'):
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = b""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100
            # Wrap the source so over-read bytes can be pushed back.
            fileobj = _PaddedFile(fileobj)

        elif mode.startswith(('w', 'a')):
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: raw deflate stream, the gzip framing is
            # written by this class.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        self.fileobj = fileobj
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias for the name attribute (with '.gz' appended in
        write mode when missing)."""
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        fileobj = self.fileobj
        # Show the real underlying file rather than the internal wrapper.
        if isinstance(fileobj, _PaddedFile):
            fileobj = fileobj.file
        s = repr(fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        """Reset the per-stream state used when compressing."""
        self.name = filename
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        self.fileobj.write(b'\002')                 # extra flags (XFL)
        self.fileobj.write(b'\377')                 # OS field: unknown
        if fname:
            self.fileobj.write(fname + b'\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0

    def _read_exact(self, n):
        """Read exactly n bytes from the underlying file.

        Raises EOFError if the file ends before n bytes are available.
        """
        data = self.fileobj.read(n)
        while len(data) < n:
            b = self.fileobj.read(n - len(data))
            if not b:
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")
            data += b
        return data

    def _read_gzip_header(self):
        """Parse a member header.

        Returns False if the file is at EOF (no further member) and True
        once a header has been consumed.  Raises OSError for a bad magic
        number or an unknown compression method.
        """
        magic = self.fileobj.read(2)
        if magic == b'':
            return False

        if magic != b'\037\213':
            raise OSError('Not a gzipped file')

        method, flag, self.mtime = struct.unpack("<BBIxx", self._read_exact(8))
        if method != 8:
            raise OSError('Unknown compression method')

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            extra_len, = struct.unpack("<H", self._read_exact(2))
            self._read_exact(extra_len)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FHCRC:
            self._read_exact(2)     # Read & discard the 16-bit header CRC

        # NOTE(review): _read() calls this method before it replaces
        # self.decompress, so any bytes still buffered in the padded file
        # are fed to the previous member's decompressor here -- confirm
        # this is intended.
        unused = self.fileobj.unused()
        if unused:
            uncompress = self.decompress.decompress(unused)
            self._add_read_data(uncompress)
        return True

    def write(self,data):
        """Compress data and write it out; return the number of
        uncompressed bytes consumed."""
        self._check_closed()
        if self.mode != WRITE:
            raise OSError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Return up to size uncompressed bytes, or everything up to EOF
        when size is negative."""
        self._check_closed()
        if self.mode != READ:
            raise OSError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        readsize = 1024
        if size < 0:        # get the whole thing
            while self._read(readsize):
                readsize = min(self.max_read_chunk, readsize * 2)
            size = self.extrasize
        else:               # just get some more of it
            while size > self.extrasize:
                if not self._read(readsize):
                    if size > self.extrasize:
                        size = self.extrasize
                    break
                readsize = min(self.max_read_chunk, readsize * 2)

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def read1(self, size=-1):
        """Return buffered data, refilling the buffer only when it is
        empty; at most size bytes when size is non-negative."""
        self._check_closed()
        if self.mode != READ:
            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        # For certain input data, a single call to _read() may not return
        # any data. In this case, retry until we get some data or reach EOF.
        while self.extrasize <= 0 and self._read():
            pass
        if size < 0 or size > self.extrasize:
            size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize -= size
        self.offset += size
        return chunk

    def peek(self, n):
        """Return buffered bytes without advancing the stream position."""
        if self.mode != READ:
            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")

        # Do not return ridiculously small buffers, for one common idiom
        # is to call peek(1) and expect more bytes in return.
        if n < 100:
            n = 100
        if self.extrasize == 0:
            if self.fileobj is None:
                return b''
            # Ensure that we don't return b"" if we haven't reached EOF.
            # 1024 is the same buffering heuristic used in read()
            while self.extrasize == 0 and self._read(max(n, 1024)):
                pass
        offset = self.offset - self.extrastart
        remaining = self.extrasize
        assert remaining == len(self.extrabuf) - offset
        return self.extrabuf[offset:offset + n]

    def _unread(self, buf):
        """Push buf back: it must be the tail of the data just read."""
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read and decompress one chunk of compressed input.

        Returns False once no more data can be produced, True otherwise.
        """
        if self.fileobj is None:
            return False

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            self._init_read()
            if not self._read_gzip_header():
                return False
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof()
            self.fileobj.prepend(self.decompress.unused_data, True)
            self._read_eof()
            self._add_read_data( uncompress )
            return False

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof() and _read_gzip_header()
            self.fileobj.prepend(self.decompress.unused_data, True)
            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True
        return True

    def _add_read_data(self, data):
        """Append decompressed data to the read buffer and update the
        running CRC and size."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        # Drop the part of the buffer that has already been consumed.
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the member trailer and skip any zero padding after it.

        Raises OSError if the stored CRC or length does not match.
        """
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32, isize = struct.unpack("<II", self._read_exact(8))
        if crc32 != self.crc:
            raise OSError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise OSError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.prepend(c, True)

    @property
    def closed(self):
        """True once close() has been called."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the underlying file.

        In write mode the remaining compressed data and the CRC/size
        trailer are written first.  The object is marked closed, and a
        file opened by the constructor is closed, even if that final
        write raises.  Calling close() again is a no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush the compressor and the underlying file."""
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise OSError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = b""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
        """Move to an uncompressed offset and return the new position.

        Only whence 0 and 1 are supported.  In write mode the gap is
        filled with zero bytes and seeking backwards raises OSError; in
        read mode a backwards seek rewinds and re-reads from the start.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise OSError('Negative seek in write mode')
            count = offset - self.offset
            chunk = bytes(1024)
            for i in range(count // 1024):
                self.write(chunk)
            self.write(bytes(count % 1024))
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        """Read one line (including the trailing newline), limited to
        size bytes when size is non-negative."""
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find(b'\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxsize
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find(b'\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == b'':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return b''.join(bufs) # Return resulting line


def compress(data, compresslevel=9):
    """Compress data in one shot and return the compressed string.
    Optional argument is the compression level, in range of 0-9.
    """
    buf = io.BytesIO()
    # Leaving the with-block closes the GzipFile, which writes the trailer
    # into buf; buf itself stays open because GzipFile did not open it.
    with GzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel) as f:
        f.write(data)
    return buf.getvalue()

def decompress(data):
    """Decompress a gzip compressed string in one shot.
    Return the decompressed string.
    """
    source = io.BytesIO(data)
    with GzipFile(fileobj=source) as reader:
        payload = reader.read()
    return payload


def _test():
    """Minimal command-line driver.

    Compresses each file named on the command line to <name>.gz, or with
    -d as the first argument decompresses each <name>.gz to <name>.
    "-" (the default when no file is given) means stdin to stdout.
    """
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    args = sys.argv[1:]
    decompressing = bool(args) and args[0] == "-d"
    if decompressing:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompressing:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz:", repr(arg))
                    continue
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer)
            else:
                f = builtins.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        try:
            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)
        finally:
            # Close both ends even if the copy fails; never close the
            # process's own stdin/stdout.
            try:
                if g is not sys.stdout.buffer:
                    g.close()
            finally:
                if f is not sys.stdin.buffer:
                    f.close()


# Run the command-line driver only when executed as a script.
if "__main__" == __name__:
    _test()