"""Utilities for comparing files and directories.

Classes:
    dircmp

Functions:
    cmp(f1, f2, shallow=1) -> int
    cmpfiles(a, b, common) -> ([], [], [])

"""

import os
import stat
14
import warnings
15
from itertools import ifilter, ifilterfalse, imap, izip
16

Skip Montanaro's avatar
Skip Montanaro committed
17 18
# Public names exported by "from <this module> import *".
__all__ = ["cmp","dircmp","cmpfiles"]

19 20 21
# Outcomes of earlier content comparisons made by cmp(), keyed by the
# (f1, f2) pair of file names; each value is (sig1, sig2, outcome).
_cache = {}
# Number of bytes _do_cmp() reads from each file per step.
BUFSIZE=8*1024

22
def cmp(f1, f2, shallow=1, use_statcache=None):
    """Compare two files.

    Arguments:

    f1 -- First file name

    f2 -- Second file name

    shallow -- If true (the default), files whose stat signatures
               (file type, size, modification time) are identical are
               taken to be equal without reading their contents.

    use_statcache -- obsolete argument; passing anything other than
               None only triggers a DeprecationWarning.

    Return value:

    True if the files are the same, False otherwise.  False is also
    returned when either name is not a regular file.

    Raises os.error if either file cannot be stat'ed.

    This function uses a cache for past comparisons and the results,
    with a cache invalidation mechanism relying on stale signatures.

    """
    if use_statcache is not None:
        # stacklevel=2 attributes the warning to the caller's line.
        warnings.warn("use_statcache argument is deprecated",
                      DeprecationWarning, stacklevel=2)

    s1 = _sig(os.stat(f1))
    s2 = _sig(os.stat(f2))
    if s1[0] != stat.S_IFREG or s2[0] != stat.S_IFREG:
        return False
    if shallow and s1 == s2:
        return True
    if s1[1] != s2[1]:
        # Different sizes: the contents cannot match.
        return False

    # A cached outcome is only valid while both signatures are unchanged.
    result = _cache.get((f1, f2))
    if result and (s1, s2) == result[:2]:
        return result[2]
    outcome = _do_cmp(f1, f2)
    if len(_cache) > 100:
        # Keep the cache from growing without bound in long-running
        # processes that compare many distinct file pairs.
        _cache.clear()
    _cache[f1, f2] = s1, s2, outcome
    return outcome
63 64

def _sig(st):
    """Return the comparison signature of a stat result.

    The signature is the tuple (file type bits, size, modification
    time) taken from st.st_mode, st.st_size and st.st_mtime.
    """
    return (stat.S_IFMT(st.st_mode),
            st.st_size,
            st.st_mtime)
68 69

def _do_cmp(f1, f2):
    """Return True if files f1 and f2 have identical contents.

    Both files are opened in binary mode and read BUFSIZE bytes at a
    time; the comparison stops at the first pair of chunks that differ.
    Both files are closed on every exit path, including when opening
    the second file or a read raises.
    """
    bufsize = BUFSIZE
    fp1 = open(f1, 'rb')
    try:
        fp2 = open(f2, 'rb')
        try:
            while True:
                b1 = fp1.read(bufsize)
                b2 = fp2.read(bufsize)
                if b1 != b2:
                    return False
                if not b1:
                    # Both reads returned nothing: end of both files.
                    return True
        finally:
            fp2.close()
    finally:
        fp1.close()
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137

# Directory comparison class.
#
class dircmp:
    """A class that manages the comparison of 2 directories.

    dircmp(a,b,ignore=None,hide=None)
      A and B are directories.
      IGNORE is a list of names to ignore,
        defaults to ['RCS', 'CVS', 'tags'].
      HIDE is a list of names to hide,
        defaults to [os.curdir, os.pardir].

    High level usage:
      x = dircmp(dir1, dir2)
      x.report() -> prints a report on the differences between dir1 and dir2
       or
      x.report_partial_closure() -> prints report on differences between dir1
            and dir2, and reports on common immediate subdirectories.
      x.report_full_closure() -> like report_partial_closure,
            but fully recursive.

    Attributes:
     left_list, right_list: The files in dir1 and dir2,
        filtered by hide and ignore.
     common: a list of names in both dir1 and dir2.
     left_only, right_only: names only in dir1, dir2.
     common_dirs: subdirectories in both dir1 and dir2.
     common_files: files in both dir1 and dir2.
     common_funny: names in both dir1 and dir2 where the type differs between
        dir1 and dir2, or the name is not stat-able.
     same_files: list of identical files.
     diff_files: list of filenames which differ.
     funny_files: list of files which could not be compared.
     subdirs: a dictionary of dircmp objects, keyed by names in common_dirs.

    The attributes above are computed lazily: the first access to one of
    them runs the phase method that produces it (see methodmap).
     """

    def __init__(self, a, b, ignore=None, hide=None):
        """Remember the two directory names and the name filters.

        Nothing is read from disk here.
        """
        self.left = a
        self.right = b
        if hide is None:
            self.hide = [os.curdir, os.pardir] # Names never to be shown
        else:
            self.hide = hide
        if ignore is None:
            self.ignore = ['RCS', 'CVS', 'tags'] # Names ignored in comparison
        else:
            self.ignore = ignore

    def phase0(self):
        """List both directories, dropping hidden and ignored names.

        Sets left_list and right_list, each sorted.
        """
        self.left_list = _filter(os.listdir(self.left),
                                 self.hide+self.ignore)
        self.right_list = _filter(os.listdir(self.right),
                                  self.hide+self.ignore)
        self.left_list.sort()
        self.right_list.sort()

    def phase1(self):
        """Compute common names: sets common, left_only and right_only.

        Names are matched on their os.path.normcase() form, but the
        original spelling is what ends up in the result lists.
        """
        a = dict(zip(map(os.path.normcase, self.left_list), self.left_list))
        b = dict(zip(map(os.path.normcase, self.right_list), self.right_list))
        self.common = [a[key] for key in a if key in b]
        self.left_only = [a[key] for key in a if key not in b]
        self.right_only = [b[key] for key in b if key not in a]

    def phase2(self):
        """Distinguish files, directories, funnies among the common names.

        Sets common_dirs, common_files and common_funny.  A name is
        "funny" when either side cannot be stat'ed, when the two sides
        have different file types, or when it is neither a directory
        nor a regular file.
        """
        self.common_dirs = []
        self.common_files = []
        self.common_funny = []

        for x in self.common:
            a_path = os.path.join(self.left, x)
            b_path = os.path.join(self.right, x)

            try:
                a_stat = os.stat(a_path)
                b_stat = os.stat(b_path)
            except os.error:
                # Can't stat one of the two paths.
                self.common_funny.append(x)
                continue

            a_type = stat.S_IFMT(a_stat.st_mode)
            b_type = stat.S_IFMT(b_stat.st_mode)
            if a_type != b_type:
                self.common_funny.append(x)
            elif stat.S_ISDIR(a_type):
                self.common_dirs.append(x)
            elif stat.S_ISREG(a_type):
                self.common_files.append(x)
            else:
                self.common_funny.append(x)

    def phase3(self):
        """Find out differences between common files.

        Sets same_files, diff_files and funny_files from cmpfiles().
        """
        xx = cmpfiles(self.left, self.right, self.common_files)
        self.same_files, self.diff_files, self.funny_files = xx

    def phase4(self):
        """Find out differences between common subdirectories.

        A new dircmp object is created for each common subdirectory,
        these are stored in the subdirs dictionary indexed by filename.
        The hide and ignore properties are inherited from the parent.
        """
        self.subdirs = {}
        for x in self.common_dirs:
            a_x = os.path.join(self.left, x)
            b_x = os.path.join(self.right, x)
            self.subdirs[x]  = dircmp(a_x, b_x, self.ignore, self.hide)

    def phase4_closure(self):
        """Recursively call phase4() on subdirectories."""
        self.phase4()
        for sd in self.subdirs.values():
            sd.phase4_closure()

    def report(self):
        """Print a report on the differences between a and b.

        Each non-empty result list is sorted in place before printing.
        """
        # Output format is purposely lousy
        print('diff %s %s' % (self.left, self.right))
        if self.left_only:
            self.left_only.sort()
            print('Only in %s : %s' % (self.left, self.left_only))
        if self.right_only:
            self.right_only.sort()
            print('Only in %s : %s' % (self.right, self.right_only))
        if self.same_files:
            self.same_files.sort()
            print('Identical files : %s' % (self.same_files,))
        if self.diff_files:
            self.diff_files.sort()
            print('Differing files : %s' % (self.diff_files,))
        if self.funny_files:
            self.funny_files.sort()
            print('Trouble with common files : %s' % (self.funny_files,))
        if self.common_dirs:
            self.common_dirs.sort()
            print('Common subdirectories : %s' % (self.common_dirs,))
        if self.common_funny:
            self.common_funny.sort()
            print('Common funny cases : %s' % (self.common_funny,))

    def report_partial_closure(self):
        """Print reports on self and on immediate common subdirectories."""
        self.report()
        for sd in self.subdirs.values():
            print('')
            sd.report()

    def report_full_closure(self):
        """Report on self and subdirs recursively."""
        self.report()
        for sd in self.subdirs.values():
            print('')
            sd.report_full_closure()

    # Maps each lazily computed attribute to the phase that sets it.
    methodmap = dict(subdirs=phase4,
                     same_files=phase3, diff_files=phase3, funny_files=phase3,
                     common_dirs = phase2, common_files=phase2, common_funny=phase2,
                     common=phase1, left_only=phase1, right_only=phase1,
                     left_list=phase0, right_list=phase0)

    def __getattr__(self, attr):
        """Compute a missing comparison attribute on first access.

        Only called when normal lookup fails.  Runs the phase registered
        for attr in methodmap, which stores the attribute on the
        instance, then returns it.  Raises AttributeError for any name
        that is not in methodmap.
        """
        if attr not in self.methodmap:
            raise AttributeError(attr)
        self.methodmap[attr](self)
        return getattr(self, attr)
246

247
def cmpfiles(a, b, common, shallow=1, use_statcache=None):
    """Compare common files in two directories.

    a, b -- directory names
    common -- list of file names found in both directories
    shallow -- if true, files with identical stat() signatures are
               taken to be equal without reading their contents
    use_statcache -- obsolete argument; passing anything other than
               None only triggers a DeprecationWarning

    Returns a tuple of three lists:
      files that compare equal
      files that are different (this includes names that are not
        regular files)
      files that could not be compared because the comparison raised
        an OS-level error (for example, a name that cannot be stat'ed).

    """
    if use_statcache is not None:
        # stacklevel=2 attributes the warning to the caller's line.
        warnings.warn("use_statcache argument is deprecated",
                      DeprecationWarning, stacklevel=2)
    res = ([], [], [])
    for x in common:
        ax = os.path.join(a, x)
        bx = os.path.join(b, x)
        # _cmp() returns 0, 1 or 2, which selects the result list.
        res[_cmp(ax, bx, shallow)].append(x)
    return res


# Compare two files.
# Return:
#       0 for equal
#       1 for different
#       2 for funny cases (can't stat, etc.)
#
def _cmp(a, b, sh, abs=abs, cmp=cmp):
    """Classify the comparison of files a and b as 0, 1 or 2.

    sh is passed to cmp() as its shallow argument.  The abs and cmp
    parameters are bound as defaults at definition time; cmp here is
    this module's cmp(), not the builtin.  (Presumably a lookup-speed
    shortcut; callers are not expected to pass them.)

    Returns a false value (0) when cmp() reports the files equal, a
    true value (1) when it reports them different, and 2 when the
    comparison raises os.error or IOError.
    """
    try:
        return not abs(cmp(a, b, sh))
    except (os.error, IOError):
        # os.stat() failures raise os.error; on Python 2 a failure to
        # open or read a file raises IOError, which is a separate class.
        return 2


# Return a copy with items that occur in skip removed.
#
def _filter(flist, skip):
    """Return a new list of the items of flist that are not in skip.

    The order of flist is preserved and flist itself is not modified.
    """
    return [item for item in flist if item not in skip]
289 290 291 292 293 294 295 296


# Demonstration and testing.
#
def demo():
    """Command-line driver: compare the two directories named in sys.argv.

    Usage: [-r] dir1 dir2

    With -r the report recurses into common subdirectories
    (report_full_closure); otherwise only the top level is reported.
    Raises getopt.GetoptError unless exactly two directories are given.
    """
    import sys
    import getopt
    options, args = getopt.getopt(sys.argv[1:], 'r')
    if len(args) != 2:
        raise getopt.GetoptError('need exactly two args', None)
    dd = dircmp(args[0], args[1])
    if ('-r', '') in options:
        dd.report_full_closure()
    else:
        dd.report()

# Run the demonstration only when executed as a script, not on import.
if __name__ == '__main__':
    demo()