"""Utilities for comparing files and directories.

Classes:
    dircmp

Functions:
    cmp(f1, f2, shallow=True) -> bool
    cmpfiles(a, b, common, shallow=True) -> ([], [], [])

"""

import os
import stat
14
from itertools import filterfalse
15

16
# Names exported by a star-import of this module.
__all__ = ["cmp", "dircmp", "cmpfiles"]

18
# Results of earlier cmp() calls, keyed by (f1, f2); each value is the
# tuple (stat signature of f1, stat signature of f2, outcome).
_cache = {}
# Number of bytes read from each file per step of a content comparison.
BUFSIZE = 8*1024

21
def cmp(f1, f2, shallow=True):
    """Compare two files.

    Arguments:

    f1 -- First file name

    f2 -- Second file name

    shallow -- If true (the default), two files whose stat signatures
               (file type, size, modification time) are identical are
               taken to be equal without being read.  Otherwise files of
               different size are unequal and files of equal size are
               compared by content.

    Return value:

    True if the files are the same, False otherwise.  A name that is not
    a regular file always compares as False.

    Raises OSError if either file cannot be stat'ed or opened.

    This function uses a cache for past comparisons and the results,
    with a cache invalidation mechanism relying on stale signatures.
    The cache is emptied once it holds more than 100 entries so that it
    cannot grow without bound in a long-running process.

    """

    s1 = _sig(os.stat(f1))
    s2 = _sig(os.stat(f2))
    if s1[0] != stat.S_IFREG or s2[0] != stat.S_IFREG:
        return False
    if shallow and s1 == s2:
        return True
    if s1[1] != s2[1]:
        # Different sizes: the contents cannot match, no need to read.
        return False

    cached = _cache.get((f1, f2))
    if cached is not None and cached[:2] == (s1, s2):
        # Both signatures are unchanged since the stored comparison.
        return cached[2]
    outcome = _do_cmp(f1, f2)
    if len(_cache) > 100:
        # Limit the size of the cache; dropping everything is cheap and
        # only costs a re-read of files that are compared again.
        _cache.clear()
    _cache[f1, f2] = s1, s2, outcome
    return outcome

def _sig(st):
    """Return the stat signature of a file.

    st -- a stat result exposing st_mode, st_size and st_mtime

    The signature is the tuple (file type bits, size, modification time).
    """
    return (stat.S_IFMT(st.st_mode),
            st.st_size,
            st.st_mtime)

def _do_cmp(f1, f2):
    """Return True if files f1 and f2 have identical contents.

    Both files are opened in binary mode and read in steps of BUFSIZE
    bytes; the comparison stops at the first step that differs.
    """
    bufsize = BUFSIZE
    with open(f1, 'rb') as fp1, open(f2, 'rb') as fp2:
        while True:
            b1 = fp1.read(bufsize)
            b2 = fp2.read(bufsize)
            if b1 != b2:
                return False
            if not b1:
                # Both reads returned nothing: end of both files reached
                # without finding a difference.
                return True

# Directory comparison class.
#
class dircmp:
    """A class that manages the comparison of 2 directories.

    dircmp(a, b, ignore=None, hide=None)
      A and B are directories.
      IGNORE is a list of names to ignore,
        defaults to ['RCS', 'CVS', 'tags'].
      HIDE is a list of names to hide,
        defaults to [os.curdir, os.pardir].

    High level usage:
      x = dircmp(dir1, dir2)
      x.report() -> prints a report on the differences between dir1 and dir2
       or
      x.report_partial_closure() -> prints report on differences between dir1
            and dir2, and reports on common immediate subdirectories.
      x.report_full_closure() -> like report_partial_closure,
            but fully recursive.

    Attributes:
     left_list, right_list: The files in dir1 and dir2,
        filtered by hide and ignore.
     common: a list of names in both dir1 and dir2.
     left_only, right_only: names only in dir1, dir2.
     common_dirs: subdirectories in both dir1 and dir2.
     common_files: files in both dir1 and dir2.
     common_funny: names in both dir1 and dir2 where the type differs between
        dir1 and dir2, or the name is not stat-able.
     same_files: list of identical files.
     diff_files: list of filenames which differ.
     funny_files: list of files which could not be compared.
     subdirs: a dictionary of dircmp objects, keyed by names in common_dirs.

    The attributes listed above are computed on first access by the
    phase method registered for them in methodmap (see __getattr__).
    """

    def __init__(self, a, b, ignore=None, hide=None): # Initialize
        self.left = a
        self.right = b
        if hide is None:
            self.hide = [os.curdir, os.pardir] # Names never to be shown
        else:
            self.hide = hide
        if ignore is None:
            self.ignore = ['RCS', 'CVS', 'tags'] # Names ignored in comparison
        else:
            self.ignore = ignore

    def phase0(self): # List both directories
        """Set left_list and right_list: sorted names, minus hide/ignore."""
        skip = self.hide + self.ignore
        self.left_list = _filter(os.listdir(self.left), skip)
        self.right_list = _filter(os.listdir(self.right), skip)
        self.left_list.sort()
        self.right_list.sort()

    def phase1(self): # Compute common names
        """Set common, left_only and right_only.

        Names are matched on their os.path.normcase() form; the lists
        hold the names as they were listed.
        """
        a = {os.path.normcase(name): name for name in self.left_list}
        b = {os.path.normcase(name): name for name in self.right_list}
        self.common = [a[key] for key in a if key in b]
        self.left_only = [a[key] for key in a if key not in b]
        self.right_only = [b[key] for key in b if key not in a]

    def phase2(self): # Distinguish files, directories, funnies
        """Split common into common_dirs, common_files and common_funny."""
        self.common_dirs = []
        self.common_files = []
        self.common_funny = []

        for x in self.common:
            a_path = os.path.join(self.left, x)
            b_path = os.path.join(self.right, x)

            try:
                a_type = stat.S_IFMT(os.stat(a_path).st_mode)
                b_type = stat.S_IFMT(os.stat(b_path).st_mode)
            except OSError:
                # The name could not be stat'ed on at least one side.
                self.common_funny.append(x)
                continue

            if a_type != b_type:
                self.common_funny.append(x)
            elif stat.S_ISDIR(a_type):
                self.common_dirs.append(x)
            elif stat.S_ISREG(a_type):
                self.common_files.append(x)
            else:
                # Same type on both sides, but neither directory nor
                # regular file.
                self.common_funny.append(x)

    def phase3(self): # Find out differences between common files
        """Set same_files, diff_files and funny_files via cmpfiles()."""
        xx = cmpfiles(self.left, self.right, self.common_files)
        self.same_files, self.diff_files, self.funny_files = xx

    def phase4(self): # Find out differences between common subdirectories
        """Set subdirs to a dict of dircmp objects, one per common_dirs name."""
        # A new dircmp object is created for each common subdirectory,
        # these are stored in a dictionary indexed by filename.
        # The hide and ignore properties are inherited from the parent
        self.subdirs = {}
        for x in self.common_dirs:
            a_x = os.path.join(self.left, x)
            b_x = os.path.join(self.right, x)
            self.subdirs[x] = dircmp(a_x, b_x, self.ignore, self.hide)

    def phase4_closure(self): # Recursively call phase4() on subdirectories
        """Run phase4() on this object and on every nested subdirectory."""
        self.phase4()
        for sd in self.subdirs.values():
            sd.phase4_closure()

    def report(self): # Print a report on the differences between a and b
        """Print the comparison of left and right; sorts the lists in place."""
        # Output format is purposely lousy
        print('diff', self.left, self.right)
        if self.left_only:
            self.left_only.sort()
            print('Only in', self.left, ':', self.left_only)
        if self.right_only:
            self.right_only.sort()
            print('Only in', self.right, ':', self.right_only)
        if self.same_files:
            self.same_files.sort()
            print('Identical files :', self.same_files)
        if self.diff_files:
            self.diff_files.sort()
            print('Differing files :', self.diff_files)
        if self.funny_files:
            self.funny_files.sort()
            print('Trouble with common files :', self.funny_files)
        if self.common_dirs:
            self.common_dirs.sort()
            print('Common subdirectories :', self.common_dirs)
        if self.common_funny:
            self.common_funny.sort()
            print('Common funny cases :', self.common_funny)

    def report_partial_closure(self): # Print reports on self and on subdirs
        """Print report() for this object and for each immediate subdirectory."""
        self.report()
        for sd in self.subdirs.values():
            print()
            sd.report()

    def report_full_closure(self): # Report on self and subdirs recursively
        """Print report() for this object and, recursively, all subdirectories."""
        self.report()
        for sd in self.subdirs.values():
            print()
            sd.report_full_closure()

    # Maps each lazily computed attribute to the phase method that sets it.
    methodmap = dict(subdirs=phase4,
                     same_files=phase3, diff_files=phase3, funny_files=phase3,
                     common_dirs = phase2, common_files=phase2, common_funny=phase2,
                     common=phase1, left_only=phase1, right_only=phase1,
                     left_list=phase0, right_list=phase0)

    def __getattr__(self, attr):
        """Compute a missing attribute by running its phase method.

        Only called when normal lookup fails.  Raises AttributeError for
        names that are not keys of methodmap.
        """
        if attr not in self.methodmap:
            raise AttributeError(attr)
        self.methodmap[attr](self)
        # The phase method has now stored the attribute on the instance.
        return getattr(self, attr)

240
def cmpfiles(a, b, common, shallow=True):
    """Compare common files in two directories.

    a, b -- directory names
    common -- list of file names found in both directories
    shallow -- passed on to cmp(): if true, files with identical stat()
               signatures are considered equal without being read

    Returns a tuple of three lists:
      files that compare equal
      files that are different (this includes names that are not
        regular files, which never compare equal)
      files that could not be compared because an OSError was raised
        (for example a name that cannot be stat'ed)

    """
    res = ([], [], [])
    for x in common:
        ax = os.path.join(a, x)
        bx = os.path.join(b, x)
        # _cmp() yields 0, 1 or 2, which selects the list to append to.
        res[_cmp(ax, bx, shallow)].append(x)
    return res


# Compare two files.
# Return:
#       0 for equal
#       1 for different
#       2 for funny cases (can't stat, etc.)
#
def _cmp(a, b, sh, abs=abs, cmp=cmp):
    """Compare files a and b and return an index into cmpfiles()'s result.

    a, b -- file names
    sh -- the shallow flag handed to cmp()
    abs, cmp -- the functions to use, bound as defaults at definition time

    Returns a false value (0) when the files are equal, a true value (1)
    when they differ, and 2 when the comparison raised OSError.
    """
    try:
        # cmp() is true for equal files, so the result is inverted to put
        # equal files at index 0.
        return not abs(cmp(a, b, sh))
    except OSError:
        return 2


# Return a copy with items that occur in skip removed.
#
def _filter(flist, skip):
    """Return a new list of the items of flist that do not occur in skip.

    The order of flist is preserved and flist itself is not modified.
    """
    return [item for item in flist if item not in skip]


# Demonstration and testing.
#
def demo():
    """Compare the two directories named on the command line and report.

    Reads sys.argv: an optional -r flag followed by exactly two
    directory names.  With -r the report covers all common
    subdirectories recursively; without it only the two directories
    themselves.

    Raises getopt.GetoptError for an unknown option or when the number
    of directory arguments is not two.
    """
    import sys
    import getopt
    options, args = getopt.getopt(sys.argv[1:], 'r')
    if len(args) != 2:
        raise getopt.GetoptError('need exactly two args', None)
    dd = dircmp(args[0], args[1])
    if ('-r', '') in options:
        dd.report_full_closure()
    else:
        dd.report()

# Run the demonstration when this file is executed as a script.
if __name__ == '__main__':
    demo()