gzip.py 11 KB
Newer Older
1 2
"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module

8 9 10
import time
import string
import zlib
11
import struct
12
import __builtin__
13 14 15 16 17 18

# Bit masks for the flags byte of a gzip member header.  The header reader
# tests FEXTRA, FNAME, FCOMMENT and FHCRC; the header writer emits FNAME.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode to record whether the object reads or writes.
READ, WRITE = 1, 2

def write32(output, value):
    """Write value to output as a 4-byte little-endian signed integer."""
    packed = struct.pack("<l", value)
    output.write(packed)
20
    
21 22 23
def write32u(output, value):
    """Write value to output as a 4-byte little-endian unsigned integer."""
    packed = struct.pack("<L", value)
    output.write(packed)

24
def read32(input):
    """Read 4 bytes from input and return them decoded as a little-endian
    signed integer."""
    raw = input.read(4)
    (value,) = struct.unpack("<l", raw)
    return value
26

27
def open(filename, mode="rb", compresslevel=9):
    """Return a GzipFile for filename, opened with the given mode and
    compression level."""
    gzfile = GzipFile(filename, mode, compresslevel)
    return gzfile

class GzipFile:

32 33 34
    myfileobj = None

    def __init__(self, filename=None, mode=None, 
35 36
                 compresslevel=9, fileobj=None):
        if fileobj is None:
37
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
38
        if filename is None:
39 40
            if hasattr(fileobj, 'name'): filename = fileobj.name
            else: filename = ''
41
        if mode is None:
42
            if hasattr(fileobj, 'mode'): mode = fileobj.mode
43
            else: mode = 'rb'
44 45 46

        if mode[0:1] == 'r':
            self.mode = READ
47 48 49 50
 	    # Set flag indicating start of a new member
            self._new_member = 1 
            self.extrabuf = ""
            self.extrasize = 0
51 52
            self.filename = filename

53
        elif mode[0:1] == 'w' or mode[0:1] == 'a':
54 55 56 57 58 59 60 61 62 63 64 65 66 67
            self.mode = WRITE
            self._init_write(filename)
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED, 
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise ValueError, "Mode " + mode + " not supported"

        self.fileobj = fileobj

        if self.mode == WRITE:
            self._write_gzip_header()
68 69

    def __repr__(self):
70 71
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
72 73

    def _init_write(self, filename):
74 75 76 77 78 79 80
        if filename[-3:] != '.gz':
            filename = filename + '.gz'
        self.filename = filename
        self.crc = zlib.crc32("")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0
81 82

    def _write_gzip_header(self):
83 84 85 86 87 88 89
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        fname = self.filename[:-3]
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
90
        write32u(self.fileobj, long(time.time()))
91 92 93 94
        self.fileobj.write('\002')
        self.fileobj.write('\377')
        if fname:
            self.fileobj.write(fname + '\000')
95 96

    def _init_read(self):
97 98
        self.crc = zlib.crc32("")
        self.size = 0
99 100

    def _read_gzip_header(self):
101 102
        magic = self.fileobj.read(2)
        if magic != '\037\213':
103
            raise IOError, 'Not a gzipped file'
104 105
        method = ord( self.fileobj.read(1) )
        if method != 8:
106
            raise IOError, 'Unknown compression method'
107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
        flag = ord( self.fileobj.read(1) )
        # modtime = self.fileobj.read(4)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen=ord(self.fileobj.read(1))              
            xlen=xlen+256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while (1):
                s=self.fileobj.read(1)
                if not s or s=='\000': break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while (1):
                s=self.fileobj.read(1)
                if not s or s=='\000': break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC
130 131 132


    def write(self,data):
133 134 135 136 137 138
        if self.fileobj is None:
            raise ValueError, "write() on closed GzipFile object"
        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc)
            self.fileobj.write( self.compress.compress(data) )
139 140

    def writelines(self,lines):
141
        self.write(string.join(lines))
142

143
    def read(self, size=-1):
144 145 146 147
        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        readsize = 1024
148
        if size < 0:        # get the whole thing
149 150 151 152 153 154 155 156 157 158 159 160
            try:
                while 1:
                    self._read(readsize)
                    readsize = readsize * 2
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = readsize * 2
            except EOFError:
161 162
                if size > self.extrasize:
                    size = self.extrasize
163 164 165 166 167 168
        
        chunk = self.extrabuf[:size]
        self.extrabuf = self.extrabuf[size:]
        self.extrasize = self.extrasize - size

        return chunk
169

170
    def _unread(self, buf):
171
        self.extrabuf = buf + self.extrabuf
172
        self.extrasize = len(buf) + self.extrasize
173 174

    def _read(self, size=1024):
175 176 177 178 179 180 181 182 183 184 185
        if self.fileobj is None: raise EOFError, "Reached EOF"
 	
        if self._new_member:
            # If the _new_member flag is set, we have to 
            # 
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                self.fileobj = None
186
                raise EOFError, "Reached EOF"
187 188 189 190 191 192 193 194 195 196 197 198 199 200
            else: 
                self.fileobj.seek( pos ) # Return to original position
  
            self._init_read()       
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = 0
 
        # Read a chunk of data from the file
        buf = self.fileobj.read(size)
 
        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.
       
201 202
        if buf == "":
            uncompress = self.decompress.flush()
203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228
            self._read_eof()
            self.fileobj = None
            self._add_read_data( uncompress )
            raise EOFError, 'Reached EOF'
  
        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call 
            self._read_eof()
            self._new_member = 1        
	    
    def _add_read_data(self, data):	        
        self.crc = zlib.crc32(data, self.crc)
        self.extrabuf = self.extrabuf + data
        self.extrasize = self.extrasize + len(data)
        self.size = self.size + len(data)
229 230

    def _read_eof(self):
231 232 233 234 235
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.  
        # We check the that the computed CRC and size of the
        # uncompressed data matches the stored values.
        self.fileobj.seek(-8, 1)
236 237
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)
238
        if crc32%0x100000000L != self.crc%0x100000000L:
239
            raise ValueError, "CRC check failed"
240
        elif isize != self.size:
241 242
            raise ValueError, "Incorrect length of data produced"
          
243
    def close(self):
244 245 246 247 248 249 250 251 252 253
        if self.mode == WRITE:
            self.fileobj.write(self.compress.flush())
            write32(self.fileobj, self.crc)
            write32(self.fileobj, self.size)
            self.fileobj = None
        elif self.mode == READ:
            self.fileobj = None
        if self.myfileobj:
            self.myfileobj.close()
            self.myfileobj = None
254

255
    def __del__(self):
256 257 258 259 260 261 262
        try:
            if (self.myfileobj is None and
                self.fileobj is None):
                return
        except AttributeError:
            return
        self.close()
263
        
264
    def flush(self):
265
        self.fileobj.flush()
266 267

    def seek(self):
268
        raise IOError, 'Random access not allowed in gzip files'
269 270

    def tell(self):
271
        raise IOError, 'I won\'t tell() you for gzip files'
272 273

    def isatty(self):
274
        return 0
275 276

    def readline(self):
277 278 279 280 281 282
        bufs = []
        readsize = 100
        while 1:
            c = self.read(readsize)
            i = string.find(c, '\n')
            if i >= 0 or c == '':
283
                bufs.append(c[:i+1])
284 285 286 287
                self._unread(c[i+1:])
                return string.join(bufs, '')
            bufs.append(c)
            readsize = readsize * 2
288

289
    def readlines(self, ignored=None):
290
        buf = self.read()
291 292 293 294 295 296
        lines = string.split(buf, '\n')
        for i in range(len(lines)-1):
            lines[i] = lines[i] + '\n'
        if lines and not lines[-1]:
            del lines[-1]
        return lines
297 298

    def writelines(self, L):
299 300
        for line in L:
            self.write(line)
301 302 303 304 305 306 307 308 309 310


def _test():
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    import sys
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
311
        args = args[1:]
312
    if not args:
313
        args = ["-"]
314
    for arg in args:
315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
                g = sys.stdout
            else:
                if arg[-3:] != ".gz":
                    print "filename doesn't end in .gz:", `arg`
                    continue
                f = open(arg, "rb")
                g = __builtin__.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
            else:
                f = __builtin__.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        while 1:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        if g is not sys.stdout:
            g.close()
        if f is not sys.stdin:
            f.close()
341 342 343

# When run as a script, act as a minimal gzip/gunzip via _test().
if __name__ == '__main__':
    _test()