filecmp.py 9.28 KB
Newer Older
1
"""Utilities for comparing files and directories.
2

3 4 5 6
Classes:
    dircmp

Functions:
7
    cmp(f1, f2, shallow=1) -> int
8 9 10 11 12 13
    cmpfiles(a, b, common) -> ([], [], [])

"""

import os
import stat
14
from itertools import ifilter, ifilterfalse, imap, izip
15

# Public API: the names exported by a star-import of this module.
__all__ = ["cmp", "dircmp", "cmpfiles"]

# Results of past content comparisons, keyed by the (f1, f2) name pair.
# Each value is (sig1, sig2, outcome); cmp() reuses an entry only while both
# stored stat signatures still equal the current ones.
_cache = {}

# Chunk size, in bytes, of each read performed by _do_cmp().
BUFSIZE = 8 * 1024

21
def cmp(f1, f2, shallow=1):
    """Compare two files.

    Arguments:

    f1 -- First file name

    f2 -- Second file name

    shallow -- If true, files whose stat signatures (file type, size,
               modification time) are identical are reported equal
               without reading them.  When the signatures differ the
               contents are still compared.  Defaults to 1.

    Return value:

    True if the files are the same, False otherwise.  Anything that is
    not a regular file compares as False.

    Errors raised by os.stat() (for example a missing file) are not
    caught here and propagate to the caller.

    This function uses a cache for past comparisons and the results,
    with a cache invalidation mechanism relying on stale signatures.

    """
    s1 = _sig(os.stat(f1))
    s2 = _sig(os.stat(f2))
    # Only regular files are ever considered equal.
    if s1[0] != stat.S_IFREG or s2[0] != stat.S_IFREG:
        return False
    if shallow and s1 == s2:
        return True
    # Files of different sizes cannot have identical contents.
    if s1[1] != s2[1]:
        return False

    # Reuse a cached outcome only if neither signature changed since it
    # was recorded; otherwise re-read the files and refresh the entry.
    result = _cache.get((f1, f2))
    if result and (s1, s2) == result[:2]:
        return result[2]
    outcome = _do_cmp(f1, f2)
    _cache[f1, f2] = s1, s2, outcome
    return outcome

def _sig(st):
    """Return the signature (file type, size, mtime) of stat result *st*.

    The file type is the S_IFMT portion of st.st_mode, so it can be
    compared directly against constants such as stat.S_IFREG.
    """
    return (stat.S_IFMT(st.st_mode),
            st.st_size,
            st.st_mtime)

def _do_cmp(f1, f2):
    """Return True if files f1 and f2 have identical contents, else False.

    Both files are opened in binary mode and read in lockstep, BUFSIZE
    bytes at a time; the comparison stops at the first differing chunk.
    """
    bufsize = BUFSIZE
    with open(f1, 'rb') as fp1, open(f2, 'rb') as fp2:
        while True:
            b1 = fp1.read(bufsize)
            b2 = fp2.read(bufsize)
            if b1 != b2:
                return False
            if not b1:
                # Both reads came back empty (b1 == b2 here), so both
                # files ended together without a mismatch.
                return True

# Directory comparison class.
#
class dircmp:
    """A class that manages the comparison of 2 directories.

    dircmp(a,b,ignore=None,hide=None)
      A and B are directories.
      IGNORE is a list of names to ignore,
        defaults to ['RCS', 'CVS', 'tags'].
      HIDE is a list of names to hide,
        defaults to [os.curdir, os.pardir].

    High level usage:
      x = dircmp(dir1, dir2)
      x.report() -> prints a report on the differences between dir1 and dir2
       or
      x.report_partial_closure() -> prints report on differences between dir1
            and dir2, and reports on common immediate subdirectories.
      x.report_full_closure() -> like report_partial_closure,
            but fully recursive.

    Attributes:
     left_list, right_list: The files in dir1 and dir2,
        filtered by hide and ignore.
     common: a list of names in both dir1 and dir2.
     left_only, right_only: names only in dir1, dir2.
     common_dirs: subdirectories in both dir1 and dir2.
     common_files: files in both dir1 and dir2.
     common_funny: names in both dir1 and dir2 where the type differs between
        dir1 and dir2, or the name is not stat-able.
     same_files: list of identical files.
     diff_files: list of filenames which differ.
     funny_files: list of files which could not be compared.
     subdirs: a dictionary of dircmp objects, keyed by names in common_dirs.

    The attributes above are computed lazily: the first access to one of
    them runs the phase method registered for it in methodmap, which
    stores the result on the instance.
    """

    def __init__(self, a, b, ignore=None, hide=None): # Initialize
        self.left = a
        self.right = b
        if hide is None:
            self.hide = [os.curdir, os.pardir] # Names never to be shown
        else:
            self.hide = hide
        if ignore is None:
            self.ignore = ['RCS', 'CVS', 'tags'] # Names ignored in comparison
        else:
            self.ignore = ignore

    def phase0(self): # Compare everything except common subdirectories
        self.left_list = _filter(os.listdir(self.left),
                                 self.hide+self.ignore)
        self.right_list = _filter(os.listdir(self.right),
                                  self.hide+self.ignore)
        self.left_list.sort()
        self.right_list.sort()

    def phase1(self): # Compute common names
        # Map each case-normalized name to the name as listed, so names are
        # matched via os.path.normcase but reported in their original form.
        a = dict(zip(map(os.path.normcase, self.left_list), self.left_list))
        b = dict(zip(map(os.path.normcase, self.right_list), self.right_list))
        # Real lists (not lazy iterators): report() sorts them in place.
        self.common = [a[name] for name in a if name in b]
        self.left_only = [a[name] for name in a if name not in b]
        self.right_only = [b[name] for name in b if name not in a]

    def phase2(self): # Distinguish files, directories, funnies
        self.common_dirs = []
        self.common_files = []
        self.common_funny = []

        for x in self.common:
            a_path = os.path.join(self.left, x)
            b_path = os.path.join(self.right, x)

            # A name that cannot be stat'ed on either side is "funny".
            try:
                a_type = stat.S_IFMT(os.stat(a_path).st_mode)
                b_type = stat.S_IFMT(os.stat(b_path).st_mode)
            except os.error:
                self.common_funny.append(x)
                continue

            if a_type != b_type:
                self.common_funny.append(x)
            elif stat.S_ISDIR(a_type):
                self.common_dirs.append(x)
            elif stat.S_ISREG(a_type):
                self.common_files.append(x)
            else:
                self.common_funny.append(x)

    def phase3(self): # Find out differences between common files
        xx = cmpfiles(self.left, self.right, self.common_files)
        self.same_files, self.diff_files, self.funny_files = xx

    def phase4(self): # Find out differences between common subdirectories
        # A new dircmp object is created for each common subdirectory,
        # these are stored in a dictionary indexed by filename.
        # The hide and ignore properties are inherited from the parent
        self.subdirs = {}
        for x in self.common_dirs:
            a_x = os.path.join(self.left, x)
            b_x = os.path.join(self.right, x)
            self.subdirs[x] = dircmp(a_x, b_x, self.ignore, self.hide)

    def phase4_closure(self): # Recursively call phase4() on subdirectories
        self.phase4()
        for sd in self.subdirs.values():
            sd.phase4_closure()

    def report(self): # Print a report on the differences between a and b
        # Output format is purposely lousy
        # Each line is formatted into one string and printed with a
        # parenthesized print, which produces the same output whether
        # print is a statement (Python 2) or a function (Python 3).
        print('diff %s %s' % (self.left, self.right))
        if self.left_only:
            self.left_only.sort()
            print('Only in %s : %s' % (self.left, self.left_only))
        if self.right_only:
            self.right_only.sort()
            print('Only in %s : %s' % (self.right, self.right_only))
        if self.same_files:
            self.same_files.sort()
            print('Identical files : %s' % (self.same_files,))
        if self.diff_files:
            self.diff_files.sort()
            print('Differing files : %s' % (self.diff_files,))
        if self.funny_files:
            self.funny_files.sort()
            print('Trouble with common files : %s' % (self.funny_files,))
        if self.common_dirs:
            self.common_dirs.sort()
            print('Common subdirectories : %s' % (self.common_dirs,))
        if self.common_funny:
            self.common_funny.sort()
            print('Common funny cases : %s' % (self.common_funny,))

    def report_partial_closure(self): # Print reports on self and on subdirs
        self.report()
        for sd in self.subdirs.values():
            print('')  # blank separator line
            sd.report()

    def report_full_closure(self): # Report on self and subdirs recursively
        self.report()
        for sd in self.subdirs.values():
            print('')  # blank separator line
            sd.report_full_closure()

    # Maps each lazily computed attribute to the phase method that sets it.
    methodmap = dict(subdirs=phase4,
                     same_files=phase3, diff_files=phase3, funny_files=phase3,
                     common_dirs = phase2, common_files=phase2, common_funny=phase2,
                     common=phase1, left_only=phase1, right_only=phase1,
                     left_list=phase0, right_list=phase0)

    def __getattr__(self, attr):
        # Only reached when normal lookup fails, i.e. the attribute has not
        # been computed yet: run its phase, then fetch the stored value.
        if attr not in self.methodmap:
            raise AttributeError(attr)
        self.methodmap[attr](self)
        return getattr(self, attr)
239

240
def cmpfiles(a, b, common, shallow=1):
    """Compare common files in two directories.

    a, b -- directory names
    common -- list of file names found in both directories
    shallow -- passed through to cmp(); if true, files with identical
               stat() signatures are treated as equal without being read

    Returns a tuple of three lists:
      files that compare equal
      files that are different (this includes names that are not
        regular files, which cmp() reports as unequal)
      files that could not be compared because os.error was raised

    """
    res = ([], [], [])
    for x in common:
        ax = os.path.join(a, x)
        bx = os.path.join(b, x)
        # _cmp() returns 0, 1 or 2, which selects the result list.
        res[_cmp(ax, bx, shallow)].append(x)
    return res


# Compare two files.
# Return:
#       0 for equal
#       1 for different
#       2 for funny cases (can't stat, etc.)
#
# The abs and cmp defaults bind those functions when this def executes.
# cmp() returning true yields False (index 0) and false yields True
# (index 1), so the result can be used directly as a list index.
def _cmp(a, b, sh, abs=abs, cmp=cmp):
    try:
        return not abs(cmp(a, b, sh))
    except os.error:
        return 2


# Return a copy with items that occur in skip removed.
#
# Always builds a new list, in the original order; flist is not modified.
def _filter(flist, skip):
    return [item for item in flist if item not in skip]


# Demonstration and testing.
#
# Command line: [-r] dir1 dir2
#   -r  report recursively on common subdirectories as well.
def demo():
    import sys
    import getopt
    options, args = getopt.getopt(sys.argv[1:], 'r')
    if len(args) != 2:
        raise getopt.GetoptError('need exactly two args', None)
    dd = dircmp(args[0], args[1])
    if ('-r', '') in options:
        dd.report_full_closure()
    else:
        dd.report()

# Run the command-line demo when this file is executed as a script.
if __name__ == '__main__':
    demo()