"""Utilities for comparing files and directories.

Classes:
    dircmp

Functions:
    cmp(f1, f2, shallow=1) -> bool
    cmpfiles(a, b, common, shallow=1) -> ([], [], [])

"""

import os
import stat
from itertools import filterfalse

# Names exported by "from filecmp import *".
__all__ = ["cmp", "dircmp", "cmpfiles"]

# Results of earlier content comparisons made by cmp().  Keys are
# (f1, f2) name pairs; values are (sig1, sig2, outcome) tuples, where the
# signatures are those returned by _sig() at the time of the comparison.
_cache = {}

# Number of bytes _do_cmp() reads from each file per iteration.
BUFSIZE = 8 * 1024


def cmp(f1, f2, shallow=1):
    """Compare two files.

    Arguments:

    f1 -- First file name

    f2 -- Second file name

    shallow -- Just check stat signature (do not read the files).
               defaults to 1.

    Return value:

    True if the files are the same, False otherwise.

    This function uses a cache for past comparisons and the results,
    with a cache invalidation mechanism relying on stale signatures.

    Any exception raised by os.stat() (or by opening/reading the files
    during a content comparison) is propagated to the caller.

    """
    s1 = _sig(os.stat(f1))
    s2 = _sig(os.stat(f2))

    # Anything that is not a regular file never compares equal.
    if s1[0] != stat.S_IFREG or s2[0] != stat.S_IFREG:
        return False
    # Shallow mode: identical (type, size, mtime) signatures are enough.
    if shallow and s1 == s2:
        return True
    # Files of different sizes cannot have identical contents.
    if s1[1] != s2[1]:
        return False

    # Reuse an earlier content comparison only when both signatures are
    # unchanged; a differing signature marks the cached entry as stale.
    result = _cache.get((f1, f2))
    if result and (s1, s2) == result[:2]:
        return result[2]

    outcome = _do_cmp(f1, f2)
    # NOTE: entries are only ever added or overwritten here; this function
    # never evicts anything from the cache.
    _cache[f1, f2] = s1, s2, outcome
    return outcome

def _sig(st):
59 60 61
    return (stat.S_IFMT(st.st_mode),
            st.st_size,
            st.st_mtime)
62 63

def _do_cmp(f1, f2):
    """Compare the contents of two files chunk by chunk.

    f1, f2 -- names of the files to compare

    Returns True if both files hold exactly the same bytes, False
    otherwise.  Errors from opening or reading the files propagate.
    """
    bufsize = BUFSIZE
    # The with statement closes both files on every exit path, including
    # the early return on the first mismatch and any read error.
    with open(f1, 'rb') as fp1, open(f2, 'rb') as fp2:
        while True:
            b1 = fp1.read(bufsize)
            b2 = fp2.read(bufsize)
            if b1 != b2:
                return False
            # Equal chunks and b1 empty: both files ended together.
            if not b1:
                return True

# Directory comparison class.
#
class dircmp:
    """A class that manages the comparison of 2 directories.

    dircmp(a,b,ignore=None,hide=None)
      A and B are directories.
      IGNORE is a list of names to ignore,
        defaults to ['RCS', 'CVS', 'tags'].
      HIDE is a list of names to hide,
        defaults to [os.curdir, os.pardir].

    High level usage:
      x = dircmp(dir1, dir2)
      x.report() -> prints a report on the differences between dir1 and dir2
       or
      x.report_partial_closure() -> prints report on differences between dir1
            and dir2, and reports on common immediate subdirectories.
      x.report_full_closure() -> like report_partial_closure,
            but fully recursive.

    Attributes:
     left_list, right_list: The files in dir1 and dir2,
        filtered by hide and ignore.
     common: a list of names in both dir1 and dir2.
     left_only, right_only: names only in dir1, dir2.
     common_dirs: subdirectories in both dir1 and dir2.
     common_files: files in both dir1 and dir2.
     common_funny: names in both dir1 and dir2 where the type differs between
        dir1 and dir2, or the name is not stat-able.
     same_files: list of identical files.
     diff_files: list of filenames which differ.
     funny_files: list of files which could not be compared.
     subdirs: a dictionary of dircmp objects, keyed by names in common_dirs.

    The attributes above are computed lazily: the first access to one of
    them runs the phase method registered for it in methodmap.
    """

    def __init__(self, a, b, ignore=None, hide=None):
        """Store the two directory names and the ignore/hide name lists."""
        self.left = a
        self.right = b
        if hide is None:
            self.hide = [os.curdir, os.pardir] # Names never to be shown
        else:
            self.hide = hide
        if ignore is None:
            self.ignore = ['RCS', 'CVS', 'tags'] # Names ignored in comparison
        else:
            self.ignore = ignore

    def phase0(self):
        """List both directories, dropping hidden and ignored names.

        Sets left_list and right_list, each sorted.
        """
        skip = self.hide + self.ignore
        self.left_list = _filter(os.listdir(self.left), skip)
        self.right_list = _filter(os.listdir(self.right), skip)
        self.left_list.sort()
        self.right_list.sort()

    def phase1(self):
        """Compute common, left_only and right_only.

        Names are matched on their os.path.normcase() form; the lists
        hold the names as they appear in the directory listings.
        """
        a = dict(zip(map(os.path.normcase, self.left_list), self.left_list))
        b = dict(zip(map(os.path.normcase, self.right_list), self.right_list))
        self.common = list(map(a.__getitem__, filter(b.__contains__, a)))
        self.left_only = list(map(a.__getitem__, filterfalse(b.__contains__, a)))
        self.right_only = list(map(b.__getitem__, filterfalse(a.__contains__, b)))

    def phase2(self):
        """Split the common names into directories, files and funnies.

        Sets common_dirs, common_files and common_funny.  A name is
        "funny" when either side cannot be stat()ed, when the two sides
        have different file types, or when the shared type is neither a
        directory nor a regular file.
        """
        self.common_dirs = []
        self.common_files = []
        self.common_funny = []

        for x in self.common:
            a_path = os.path.join(self.left, x)
            b_path = os.path.join(self.right, x)

            try:
                a_type = stat.S_IFMT(os.stat(a_path).st_mode)
                b_type = stat.S_IFMT(os.stat(b_path).st_mode)
            except OSError:
                # At least one side is not stat-able.
                self.common_funny.append(x)
                continue

            if a_type != b_type:
                self.common_funny.append(x)
            elif stat.S_ISDIR(a_type):
                self.common_dirs.append(x)
            elif stat.S_ISREG(a_type):
                self.common_files.append(x)
            else:
                self.common_funny.append(x)

    def phase3(self):
        """Compare the common files.

        Sets same_files, diff_files and funny_files from cmpfiles().
        """
        xx = cmpfiles(self.left, self.right, self.common_files)
        self.same_files, self.diff_files, self.funny_files = xx

    def phase4(self):
        """Create a dircmp object for every common subdirectory.

        The objects are stored in the subdirs dictionary, indexed by
        name.  The hide and ignore lists are inherited from this object.
        """
        self.subdirs = {}
        for x in self.common_dirs:
            a_x = os.path.join(self.left, x)
            b_x = os.path.join(self.right, x)
            self.subdirs[x] = dircmp(a_x, b_x, self.ignore, self.hide)

    def phase4_closure(self):
        """Run phase4() on this object and, recursively, on all subdirs."""
        self.phase4()
        for sd in self.subdirs.values():
            sd.phase4_closure()

    def report(self):
        """Print a report on the differences between the two directories.

        Each non-empty result list is sorted in place before printing.
        """
        # Output format is purposely lousy
        print('diff', self.left, self.right)
        if self.left_only:
            self.left_only.sort()
            print('Only in', self.left, ':', self.left_only)
        if self.right_only:
            self.right_only.sort()
            print('Only in', self.right, ':', self.right_only)
        if self.same_files:
            self.same_files.sort()
            print('Identical files :', self.same_files)
        if self.diff_files:
            self.diff_files.sort()
            print('Differing files :', self.diff_files)
        if self.funny_files:
            self.funny_files.sort()
            print('Trouble with common files :', self.funny_files)
        if self.common_dirs:
            self.common_dirs.sort()
            print('Common subdirectories :', self.common_dirs)
        if self.common_funny:
            self.common_funny.sort()
            print('Common funny cases :', self.common_funny)

    def report_partial_closure(self):
        """Print reports on self and on the immediate common subdirs."""
        self.report()
        for sd in self.subdirs.values():
            print()
            sd.report()

    def report_full_closure(self):
        """Print reports on self and on all common subdirs, recursively."""
        self.report()
        for sd in self.subdirs.values():
            print()
            sd.report_full_closure()

    # Maps each lazily computed attribute to the phase that sets it.
    methodmap = dict(subdirs=phase4,
                     same_files=phase3, diff_files=phase3, funny_files=phase3,
                     common_dirs=phase2, common_files=phase2, common_funny=phase2,
                     common=phase1, left_only=phase1, right_only=phase1,
                     left_list=phase0, right_list=phase0)

    def __getattr__(self, attr):
        """Compute a missing attribute on demand.

        Python calls this only when normal attribute lookup fails.  The
        phase registered for attr in methodmap is run, after which the
        attribute exists on the instance and is returned.  Names not in
        methodmap raise AttributeError.
        """
        if attr not in self.methodmap:
            raise AttributeError(attr)
        self.methodmap[attr](self)
        return getattr(self, attr)


def cmpfiles(a, b, common, shallow=1):
    """Compare common files in two directories.

    a, b -- directory names
    common -- list of file names found in both directories
    shallow -- if true, do comparison based solely on stat() information

    Returns a tuple of three lists:
      files that compare equal
      files that are different (this includes names that are not
        regular files, for which cmp() returns False)
      files that could not be compared because the comparison raised
        os.error (e.g. the name cannot be stat()ed)

    """
    res = ([], [], [])
    for x in common:
        ax = os.path.join(a, x)
        bx = os.path.join(b, x)
        # _cmp() returns 0, 1 or 2, which selects the list to extend.
        res[_cmp(ax, bx, shallow)].append(x)
    return res


# Compare two files.
# Return:
#       0 for equal
#       1 for different
#       2 for funny cases (can't stat, etc.)
#
def _cmp(a, b, sh, abs=abs, cmp=cmp):
    """Classify the comparison of files a and b as 0, 1 or 2.

    a, b -- names of the files to compare
    sh -- passed to cmp() as its shallow argument
    abs, cmp -- the callables to use; they default to the builtin abs()
                and this module's cmp(), bound at definition time

    Returns a false value (equal to 0) when cmp() reports the files as
    the same, a true value (equal to 1) when it reports them as
    different, and 2 when the comparison raises os.error.
    """
    try:
        # cmp() is truthy for "same"; negating it yields the index that
        # cmpfiles() uses: 0 for equal, 1 for different.
        return not abs(cmp(a, b, sh))
    except os.error:
        return 2


# Return a copy with items that occur in skip removed.
#
def _filter(flist, skip):
278
    return list(filterfalse(skip.__contains__, flist))
279 280 281 282 283 284 285 286


# Demonstration and testing.
#
def demo():
    """Compare the two directories named on the command line.

    Usage: [-r] dir1 dir2

    Prints the report for the two directories; with -r the report also
    covers all common subdirectories, recursively.  Raises
    getopt.GetoptError unless exactly two arguments are given.
    """
    import sys
    import getopt
    options, args = getopt.getopt(sys.argv[1:], 'r')
    if len(args) != 2:
        raise getopt.GetoptError('need exactly two args', None)
    dd = dircmp(args[0], args[1])
    if ('-r', '') in options:
        dd.report_full_closure()
    else:
        dd.report()

# Run the demonstration only when this file is executed as a script.
if __name__ == '__main__':
    demo()