duplicate-files.py - 2023-03-02

Posted: 2023-03-03
Word Count: 258
Tags: programming python python-code

Table of Contents

For an explanation, see the directory above.

duplicate-files.py

See “Relearning Python #1” for more information.

Listing

 1#!/usr/bin/env python3
 2
 3from pathlib import Path
 4
 5#import argparse
 6import filecmp
 7import json
 8import sys
 9
# Names (files or directories) that are skipped wherever they appear.
STD_EXCLUDES = ['CVS', 'CVSROOT', '.DS_Store', '.git', '.gitignore', '.svn']


def find_files(sizemap, f, exclude=STD_EXCLUDES):
    """Recursively record every regular file under *f*, grouped by size.

    Args:
        sizemap: dict mapping a file size in bytes to a list of POSIX-style
            path strings; it is updated in place.
        f: a ``pathlib.Path`` naming a file or directory to scan.
        exclude: collection of entry names to skip at every level of the
            tree. It is only read, never modified.

    Returns:
        None. Results are accumulated in *sizemap*.
    """
    # TODO: Check we have permissions
    if f.name in exclude:
        return

    # Symbolic links are neither followed nor recorded.
    if f.is_symlink():
        return

    if f.is_dir():
        for child in f.iterdir():
            # Pass the exclusion list down so it applies to the whole tree,
            # not just to the starting path.
            find_files(sizemap, child, exclude)
    elif f.is_file():
        # using strings so JSON can print them
        path = f.as_posix()
        same_size = sizemap.setdefault(f.stat().st_size, [])
        # Compare the string form: the list holds strings, so testing the
        # Path object itself would never find an existing entry.
        if path not in same_size:
            same_size.append(path)
31
def duplicate_files(file_i, file_j):
    """Return True when two different files have identical contents.

    Args:
        file_i: path of the first file.
        file_j: path of the second file.

    Returns:
        False when both arguments name the same file, otherwise the result
        of a full (non-shallow) ``filecmp.cmp`` content comparison.
    """
    if file_i == file_j:
        return False
    # Two spellings of one path (e.g. "d/x" and "d/./x", or relative and
    # absolute) name the same file; that is not a duplicate.
    if Path(file_i).resolve() == Path(file_j).resolve():
        return False
    return filecmp.cmp(file_i, file_j, shallow=False)
34
def add_to_dupsets(dupsets, file_i, file_j):
    """Record that *file_i* and *file_j* are duplicates of each other.

    Args:
        dupsets: dict mapping a file to the set of files recorded as
            identical to it; updated in place. Each set is created
            containing its own key.
        file_i: first file of the duplicate pair.
        file_j: second file of the duplicate pair.

    Returns:
        None.
    """
    # setdefault creates the self-containing set on first sight of a file.
    dupsets.setdefault(file_i, {file_i}).add(file_j)
    dupsets.setdefault(file_j, {file_j}).add(file_i)
42
def sort_uniq(sets):
    """Return the distinct groups in *sets* as a sorted list of sorted tuples.

    Args:
        sets: iterable of collections of mutually comparable items.

    Returns:
        A list of tuples. Each tuple holds one group's members in sorted
        order, identical groups appear once, and the list itself is sorted.
    """
    # Sorting each group first makes equal groups produce equal tuples, so
    # the set comprehension removes the repeats.
    return sorted({tuple(sorted(group)) for group in sets})
47
def compare_files(sizemap):
    """Find groups of identical files among files that share a size.

    Args:
        sizemap: dict mapping a file size to a list of file paths of that
            size.

    Returns:
        The result of ``sort_uniq`` over the duplicate sets that were found:
        a sorted list with one sorted tuple of paths per group.
    """
    dupsets = {}
    for sz, ls in sizemap.items():
        # Zero-length files are not reported.
        if sz == 0:
            continue
        for i, file_i in enumerate(ls):
            # Start after position i: a file is never compared with itself
            # and each unordered pair is checked exactly once.
            for file_j in ls[i + 1:]:
                if duplicate_files(file_i, file_j):
                    add_to_dupsets(dupsets, file_i, file_j)
    return sort_uniq(dupsets.values())
58
def run():
    """Scan the paths given on the command line and print duplicate groups.

    Every command-line argument is scanned with ``find_files``; the groups
    returned by ``compare_files`` are printed to standard output as
    indented JSON.
    """
    files_by_size = {}
    # Assume all arguments are directory names for now
    for root in map(Path, sys.argv[1:]):
        find_files(files_by_size, root)
    duplicates = compare_files(files_by_size)
    report = json.dumps(duplicates, sort_keys=False, indent=2)
    print(report)


if __name__ == '__main__':
    run()