For an explanation, see the directory above.
duplicate-files.py
See “Relearning Python #1” for more information.
Listing
#!/usr/bin/env python3
2
from pathlib import Path

#import argparse
import filecmp
import itertools
import json
import sys
9
# Names (final path components) that find_files skips by default.
STD_EXCLUDES = ['CVS', 'CVSROOT', '.DS_Store', '.git', '.gitignore', '.svn']


def find_files(sizemap, f, exclude=STD_EXCLUDES):
    """Recursively record every regular file under *f*, grouped by size.

    Args:
        sizemap: dict mapping a file size in bytes to a list of POSIX-style
            path strings. It is updated in place.
        f: ``pathlib.Path`` of the file or directory to scan.
        exclude: collection of names; any entry whose final path component
            is in it is skipped, at every depth of the walk.

    Returns:
        None. Results are accumulated in *sizemap*.
    """
    # TODO: Check we have permissions
    if f.name in exclude:
        return

    if f.is_symlink():
        # Symlinks are neither followed nor recorded.
        return

    if f.is_dir():
        for child in f.iterdir():
            # Forward the caller's exclude list; otherwise nested levels
            # would silently fall back to the default list.
            find_files(sizemap, child, exclude)
    elif f.is_file():
        size = f.stat().st_size
        # Paths are stored as strings so JSON can print them.
        path = f.as_posix()
        paths = sizemap.setdefault(size, [])
        # Compare string with string: a Path never equals a str, so testing
        # the Path object here would let the same file be listed twice.
        if path not in paths:
            paths.append(path)
31
def duplicate_files(file_i, file_j):
    """Tell whether two different paths refer to files with equal content.

    Args:
        file_i: path of the first file.
        file_j: path of the second file.

    Returns:
        False when both arguments compare equal (a file is not reported as
        a duplicate of itself); otherwise the result of a full content
        comparison of the two files.
    """
    # shallow=False makes filecmp read and compare the file contents rather
    # than trusting matching os.stat() signatures.
    return file_i != file_j and filecmp.cmp(file_i, file_j, shallow=False)
34
def add_to_dupsets(dupsets, file_i, file_j):
    """Record that *file_i* and *file_j* are duplicates of each other.

    Args:
        dupsets: dict mapping a file to the set of files known to duplicate
            it (the set always contains the file itself). Updated in place.
        file_i: first file of the duplicate pair.
        file_j: second file of the duplicate pair.

    Returns:
        None.
    """
    # setdefault creates the self-containing set on first sight of a file;
    # each file is then added to the other's set.
    dupsets.setdefault(file_i, {file_i}).add(file_j)
    dupsets.setdefault(file_j, {file_j}).add(file_i)
42
def sort_uniq(sets):
    """Return the distinct sets as a sorted list of sorted tuples.

    Args:
        sets: iterable of collections whose elements can be ordered.

    Returns:
        A list of tuples. Each tuple holds one input collection's elements
        in sorted order; identical tuples are collapsed to one, and the
        list itself is sorted.
    """
    # Tuples are hashable, so a set of them removes repeated groups.
    return sorted({tuple(sorted(s)) for s in sets})
47
def compare_files(sizemap):
    """Find groups of files with identical content among same-sized files.

    Args:
        sizemap: dict mapping a file size to a list of file paths of that
            size.

    Returns:
        A sorted list of tuples, each tuple being one group of mutually
        duplicate paths in sorted order.
    """
    dupsets = {}
    for sz, ls in sizemap.items():
        # Zero-length files are never compared or reported.
        if sz == 0:
            continue
        # Only files of equal size can match. combinations() yields each
        # unordered pair once, in list order, and never pairs an entry with
        # itself.
        for file_i, file_j in itertools.combinations(ls, 2):
            if duplicate_files(file_i, file_j):
                add_to_dupsets(dupsets, file_i, file_j)
    return sort_uniq(dupsets.values())
58
def run():
    """Scan the paths given on the command line and print duplicate groups.

    Every command-line argument is scanned with find_files, the collected
    files are compared with compare_files, and the result is printed to
    standard output as indented JSON.
    """
    sizemap = {}
    # Assume all arguments are directory names for now
    for dirname in sys.argv[1:]:
        find_files(sizemap, Path(dirname))
    result = compare_files(sizemap)
    print(json.dumps(result, sort_keys=False, indent=2))
66
# Run the scan only when executed as a script, not when imported.
if __name__ == '__main__':
    run()