"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module

import builtins
import errno
import struct
import sys
import time
import warnings
import zlib

# Names exported by "from gzip import *".
__all__ = ["GzipFile", "open"]

# Bit masks tested against / written to the flag byte of a gzip member
# header (see RFC 1952).
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode.
READ, WRITE = 1, 2


def U32(i):
    """Return i as an unsigned integer, assuming it fits in 32 bits.

    A negative i is treated as a signed 32-bit value and mapped to the
    non-negative integer with the same 32-bit pattern (i + 2**32).
    Non-negative values are returned unchanged.
    """
    if i < 0:
        i += 1 << 32
    return i


def LOWU32(i):
    """Return the low-order 32 bits, as a non-negative int"""
    return i & 0xFFFFFFFF


def write32u(output, value):
    """Write value to output as a little-endian unsigned 32-bit integer.

    output is any object with a write() method accepting bytes.  value
    must satisfy 0 <= value < 2**32: struct.pack() raises struct.error for
    anything outside that range, so callers are expected to mask wider or
    negative numbers (e.g. with "& 0xffffffff") first.
    """
    output.write(struct.pack("<L", value))


def read32(input):
    """Read four bytes from input and return them as an unsigned int.

    The bytes are interpreted as a little-endian unsigned 32-bit integer.
    struct.unpack() raises struct.error if input.read(4) returns fewer
    than four bytes.
    """
    return struct.unpack("<I", input.read(4))[0]


def open(filename, mode="rb", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    The filename argument is required; mode defaults to 'rb'
    and compresslevel defaults to 9.

    """
    return GzipFile(filename, mode, compresslevel)

class GzipFile:
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    # File object opened by the constructor itself; close() closes it.
    # Stays None when the caller supplied fileobj.
    myfileobj = None
    # Upper bound on the number of compressed bytes read() requests from
    # the underlying file in one _read() call.
    max_read_chunk = 10 * 1024 * 1024   # 10 MiB

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        Raises IOError if mode does not start with 'r', 'w' or 'a'.

        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            # The name of a file object is not always a string (it can be
            # an integer file descriptor, for instance); such a name cannot
            # go into the header, so treat it as unknown.
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, (str, bytes)):
                filename = ''
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Decompressed data not yet handed out by read(), and its length
            self.extrabuf = b""
            self.extrasize = 0
            self.name = filename
            # Chunk size used by readline(); starts small, scales
            # exponentially
            self.min_readsize = 100

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: produce a raw deflate stream; the gzip header
            # and trailer are written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        # Position in the uncompressed stream, as reported by tell()
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias for the name attribute.

        In write mode a ".gz" suffix is appended when the name does not
        already end with one.  Emits a DeprecationWarning on every access.
        """
        warnings.warn("use the name attribute", DeprecationWarning)
        name = self.name
        # Match the suffix type to the name so bytes names work too.
        suffix = b".gz" if isinstance(name, bytes) else ".gz"
        if self.mode == WRITE and name[-3:] != suffix:
            return name + suffix
        return name

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the per-stream state used while compressing."""
        self.name = filename
        # zlib.crc32() only accepts bytes-like data, hence b"" and not "".
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        fname = self.name
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            # A bytes name is written as given.
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        self.fileobj.write(b'\002')                 # extra flags byte
        self.fileobj.write(b'\377')                 # operating system byte
        if fname:
            self.fileobj.write(fname + b'\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        # zlib.crc32() only accepts bytes-like data, hence b"" and not "".
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0

    def _read_gzip_header(self):
        """Read and validate a member header, skipping its optional fields.

        Stores the header timestamp in self.mtime.  Raises IOError when the
        magic number or the compression method is not the expected one.
        """
        magic = self.fileobj.read(2)
        if magic != b'\037\213':
            raise IOError('Not a gzipped file')
        method = ord(self.fileobj.read(1))
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord(self.fileobj.read(1))
        self.mtime = read32(self.fileobj)
        # Skip the extra-flags byte and the OS byte
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present; its length is a
            # little-endian 16-bit number
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s == b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s == b'\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self, data):
        """Compress data and write it to the underlying file object.

        Raises IOError (EBADF) if the object was opened for reading and
        ValueError if it has already been closed.
        """
        if self.mode != WRITE:
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")
        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.fileobj.write(self.compress.compress(data))
            self.offset += len(data)

    def read(self, size=-1):
        """Return up to size bytes of decompressed data.

        A negative size reads everything up to end of file.  Fewer than
        size bytes are returned only when the end of the file is reached.
        Raises IOError (EBADF) if the object was opened for writing.
        """
        if self.mode != READ:
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        # Compressed bytes requested per _read() call; doubled after every
        # call up to max_read_chunk.
        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        chunk = self.extrabuf[:size]
        self.extrabuf = self.extrabuf[size:]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        """Push buf back in front of the buffered decompressed data."""
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read up to size compressed bytes and buffer their decompression.

        The decompressed data is appended to self.extrabuf.  Raises
        EOFError once there is nothing left to read.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek(pos)  # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data(uncompress)
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data(uncompress)

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek(-len(self.decompress.unused_data)+8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed data to the buffer; update CRC and size."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        self.extrabuf = self.extrabuf + data
        self.extrasize = self.extrasize + len(data)
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the CRC and length stored in the member trailer.

        Raises IOError if either value disagrees with what was computed
        from the decompressed data.
        """
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

    def close(self):
        """Finish the stream and release the underlying file object.

        In write mode the remaining compressed data and the trailer (CRC
        and size) are written first.  A file opened by the constructor is
        closed; a caller-supplied fileobj is left open.  Calling close()
        again is a no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Mark the object closed up front so that a failing write below
        # cannot leave it half-closed.
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            # Always release a file we opened ourselves, even if writing
            # the trailer raised.
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def __del__(self):
        try:
            if (self.myfileobj is None and
                self.fileobj is None):
                return
        except AttributeError:
            # __init__ did not get far enough to set fileobj
            return
        self.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush pending compressed data and the underlying file object.

        zlib_mode is passed to the compressor's flush() in write mode.
        """
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
        self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def isatty(self):
        return False

    def tell(self):
        """Return the current position in the uncompressed stream."""
        return self.offset

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = b""
        self.extrasize = 0
        self.offset = 0

    def seek(self, offset, whence=0):
        """Move to a position in the uncompressed stream and return it.

        whence may be 0 (absolute) or 1 (relative to the current
        position); any other value raises ValueError.  In write mode the
        gap is filled with zero bytes and seeking backwards raises
        IOError.  In read mode the skipped data is read and discarded; a
        backward seek rewinds to the start first.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            chunk = bytes(1024)
            for i in range(count // 1024):
                self.write(chunk)
            self.write(bytes(count % 1024))
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)
        return self.offset

    def readline(self, size=-1):
        """Read and return one line, including its trailing newline.

        At most size bytes are returned when size is non-negative.  An
        empty bytes object is returned at end of file.
        """
        if size < 0:
            size = sys.maxsize
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find(b'\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == b'':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return b''.join(bufs) # Return resulting line

    def readlines(self, sizehint=0):
        """Return a list of the remaining lines.

        Reading stops once the total length of the lines collected reaches
        sizehint; a sizehint <= 0 reads all the lines.
        """
        # Negative numbers result in reading all the lines
        if sizehint <= 0:
            sizehint = sys.maxsize
        lines = []
        while sizehint > 0:
            line = self.readline()
            if line == b"":
                break
            lines.append(line)
            sizehint = sizehint - len(line)

        return lines

    def writelines(self, L):
        """Write every item of the iterable L with write()."""
        for line in L:
            self.write(line)

    def __iter__(self):
        return self

    def __next__(self):
        line = self.readline()
        if line:
            return line
        else:
            raise StopIteration

    def __enter__(self):
        if self.fileobj is None:
            raise ValueError("I/O operation on closed GzipFile object")
        return self

    def __exit__(self, *args):
        self.close()


def _test():
    """Command-line driver: compress or decompress the named files.

    Acts like gzip; with -d as the first argument, acts like gunzip.
    The input file is not deleted, however, nor are any other gzip
    options or features supported.  The argument "-" (also the default
    when no file is named) means standard input / standard output.
    """
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz:", repr(arg))
                    continue
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer)
            else:
                f = builtins.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        while True:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        # The standard streams are used through their binary .buffer
        # objects, so those are what must be left open here.  GzipFile
        # wrappers around them are still closed (which writes the trailer
        # without closing the wrapped stream).
        if g is not sys.stdout.buffer:
            g.close()
        if f is not sys.stdin.buffer:
            f.close()


# Script entry point: run the gzip/gunzip emulation when executed directly.
if __name__ == "__main__":
    _test()