For an explanation see the directory above.
Highlighting shows changes from the previous version.
duplicate-files.rb
Summary of Changes
- PRUNE list of directories now regular expressions and constant. [1]
- Use `Pathname` objects to implement file queries and sort by path elements, not ASCII sort order (e.g. ‘-’ before ‘/’).
- Reimplement `append_duplicates` to use `Array#combination`, like its younger Python cousin.
- Remove progress bar, replace with Spinner.
- Only generate performance data when output path provided.
- Deprecate `-d`, reimplement by post-processing the results list.
- BUG FIX: Make option arguments mandatory.
Usage
Usage: duplicate-files.rb [options] dir1 [dir2 ...]
-q, --[no-]quiet Run absolutely quietly
-v, --[no-]verbose Run verbosely
-d, --from DIR Compare arguments to files from DIR (deprecated)
-o, --output OUTFILE Write standard output to OUTFILE
-j, --json Write output as JSON
-y, --yaml Write output as YAML (default)
-p, --[no-]pretty Pretty-print output
-P, --perf-data DIR Write performance data to DIR
-z, --[no-]zero Include zero-length files
Listing
1#!/usr/bin/env ruby
2
3require 'find'
4require 'fileutils'
5require 'optparse'
6require 'pathname'
7require 'rational'
8require 'tempfile'
9require 'yaml'
10require 'json'
11
12
# List of file patterns to prune in search.
#
# Each pattern is tested against a path's basename (see prunable?), so the
# patterns are anchored: an unanchored /\.git/ would also prune ".github"
# or "notes.gitlab.txt", and /Trash/ would prune "Trashcan.jpg".
# Patterns without a trailing \z match any name with that prefix.
PRUNE = [
  # Version Control directories and files
  /\ACVS\z/, /\ACVSROOT\z/, /\A\.git\z/, /\A\.gitignore\z/, /\A\.svn\z/,
  # Trash folders, usually off user or media root directories
  /\ATrash\z/, /\A\.Trash/,
  # Apple Mac metadata files and folders
  /\A\.DS_Store\z/, /\A\._/, /\A\.Apple/
].freeze
22
# The UTF-8 Encoding object, or nil when this Ruby has no Encoding class.
# files_by_size only re-encodes paths when this is non-nil.
ENCODING_UTF8 =
  self.class.const_defined?(:Encoding) ? Encoding.find('UTF-8') : nil
29
# Minimal text spinner: prints one character and then overwrites it in
# place (backspace + next character) as the caller reports progress.
class Spinner
  # Characters cycled through, in display order
  SPINNER_STATES = ['-', '\\', '|', "/"]

  # Minimum time between redraws, in seconds
  UPDATE_INTERVAL = 0.5

  # io: stream the spinner is printed on
  def initialize(io=$stderr)
    @io = io
    @state = 0
    @updated = nil
  end

  # Print the first spinner character and remember when it was printed.
  def start
    @updated = Time.now
    @io.print(SPINNER_STATES.first)
  end

  # Advance to the next character, but only when more than
  # UPDATE_INTERVAL seconds have passed since the last redraw.
  # Starts the spinner first if it has never been started.
  def update
    start unless @updated

    now = Time.now
    return unless now - @updated > UPDATE_INTERVAL

    @updated = now
    @state = (@state + 1) % SPINNER_STATES.length
    @io.print("\b", SPINNER_STATES[@state])
  end

  # Overwrite the spinner character with a space.
  def stop
    @io.print("\b ")
  end
end
64
# True when the basename of `path` matches at least one PRUNE pattern,
# false otherwise.
def prunable?(path)
  basename = File.basename(path)
  PRUNE.any? { |pattern| pattern =~ basename }
end
72
# Walk every entry under `dirs` and record each regular file in `fmap`,
# a Hash shaped like { size => { Pathname => true } }.
# Entries for which prunable? is true are skipped via Find.prune.
# `spinner`, when given, is updated once per recorded file.
def files_by_size(fmap, dirs, spinner=nil)
  Find.find(*dirs) do |found|
    Find.prune if prunable?(found)
    next unless File.file?(found)

    bytes = File.size(found)
    found.encode!(ENCODING_UTF8) if ENCODING_UTF8
    bucket = (fmap[bytes] ||= {})
    bucket[Pathname(found)] = true
    spinner.update if spinner
  end
end
89
# Compare the files in `paths` pairwise and append one sorted list per
# set of identical files to `result`.
#
# result   - Array that receives the groups; it is re-sorted and returned.
# paths    - Hash whose keys are the Pathname objects to compare.
# progress - object responding to `update` (e.g. a Spinner), or nil.
# verbose  - when true, each comparison is announced on $stderr.
#
# Pairs that no longer exist, or that resolve to the same real path, are
# not compared.  Groups are kept as shared Arrays that are merged whenever
# two files compare equal, so every path ends up in at most one group even
# when some pairs were skipped.  Pairs already in the same group are known
# to be equal and are not compared again.
def append_duplicates(result, paths, progress, verbose=false)
  group_of = {}   # Pathname => the (shared) Array of files equal to it

  paths.keys.combination(2) do |path_i, path_j|
    next unless path_i.exist? && path_j.exist?
    next if path_i.realpath == path_j.realpath

    group_i = group_of[path_i]
    next if group_i && group_i.equal?(group_of[path_j])

    $stderr.print("Comparing <", path_i,
                  "> and <", path_j, "> ...") if verbose
    if FileUtils.cmp(path_i, path_j)
      merge_duplicate_groups(group_of, path_i, path_j)
    end
    progress.update if progress
    $stderr.print(" done!\n") if verbose
  end

  group_of.values.uniq { |group| group.object_id }.each do |group|
    result << group.sort
  end
  result.sort!
  result
end

# Record that `path_a` and `path_b` are identical, creating, extending or
# merging the shared group Arrays held in `group_of`.
def merge_duplicate_groups(group_of, path_a, path_b)
  group_a = group_of[path_a]
  group_b = group_of[path_b]

  if group_a.nil? && group_b.nil?
    group_of[path_a] = group_of[path_b] = [path_a, path_b]
  elsif group_b.nil?
    group_a << path_b
    group_of[path_b] = group_a
  elsif group_a.nil?
    group_b << path_a
    group_of[path_a] = group_b
  elsif !group_a.equal?(group_b)
    group_b.each { |member| group_of[member] = group_a }
    group_a.concat(group_b)
  end
end
113
# Find all files in `fmap` and append lists of equal files to `result`.
#
# result   - Array that receives the groups (via append_duplicates).
# fmap     - map of files by size: { size => { Pathname => true } }.
# opts     - options object; only `opts.verbose` is read here.
# progress - object responding to start/update/stop, or nil.
#
# Only sizes with more than one file are compared.  Size 0 is skipped
# here (the caller handles zero-length files on its own); every other
# size, including one-byte files, is compared.
def append_all_dupes(result, fmap, opts, progress)
  progress.start if progress
  fmap.each_pair do |size, paths|
    next unless size > 0 && paths.size > 1
    append_duplicates(result, paths, progress, opts.verbose)
  end
  progress.stop if progress
end
125
# Run-time configuration produced by parse_options.
Options = Struct.new(
  "Options",
  :info,      # false when --quiet was given
  :verbose,   # --verbose
  :canondir,  # --from DIR (deprecated), or nil
  :dirs,      # remaining command-line arguments
  :io,        # stream the results are written to
  :format,    # :yaml or :json
  :pretty,    # --pretty
  :perfdir,   # --perf-data DIR, or nil
  :zero       # --zero
)
130
# Parse the command line (ARGV) and return a populated Options struct.
# Recognised options are removed from ARGV; whatever remains becomes
# `dirs`.  `io` is the file opened for --output, or $stdout.
def parse_options
  settings = Options.new(true, false, nil, [],
                         nil, :yaml, false, nil, false)
  destination = nil

  parser = OptionParser.new
  parser.banner = "Usage: duplicate-files.rb [options] dir1 [dir2 ...]"

  parser.on("-q", "--[no-]quiet", "Run absolutely quietly") do |quiet|
    settings.info = !quiet
  end
  parser.on("-v", "--[no-]verbose", "Run verbosely") do |flag|
    settings.verbose = flag
  end
  parser.on("-d", "--from DIR",
            "Compare arguments to files from DIR (deprecated)") do |dir|
    settings.canondir = dir
  end
  parser.on("-o", "--output OUTFILE",
            "Write standard output to OUTFILE") do |filename|
    destination = File.new(filename, 'w')
  end
  parser.on("-j", "--json", "Write output as JSON") do |_|
    settings.format = :json
  end
  parser.on("-y", "--yaml", "Write output as YAML (default)") do |_|
    settings.format = :yaml
  end
  parser.on("-p", "--[no-]pretty", "Pretty-print output") do |flag|
    settings.pretty = flag
  end
  parser.on("-P", "--perf-data DIR",
            "Write performance data to DIR") do |dir|
    settings.perfdir = dir
  end
  parser.on("-z", "--[no-]zero", "Include zero-length files") do |flag|
    settings.zero = flag
  end

  parser.parse!

  settings.io = destination || $stdout
  settings.dirs = ARGV
  settings
end
179
# Write `data` as pretty-printed JSON to a new, time-stamped file.
#
# perfdir - directory for the file; created (mode 0755, subject to the
#           umask) when missing.  When nil a fresh temporary directory
#           is used instead.
# info    - when true, report the outcome on $stderr.
# data    - object passed to JSON.pretty_generate.
#
# Failures are reported (when `info` is set) rather than raised.  Only
# StandardError is rescued, so Interrupt and SystemExit still propagate.
def log_perf_data(perfdir, info, data)
  name = 'perf-' + Time.now.strftime('%Y-%m-%d-%H%M%S-%N') + '.json'
  if perfdir
    # Octal literal: a decimal 755 would produce mode 01363.
    Dir.mkdir(perfdir, 0o755) unless Dir.exist?(perfdir)
    dir = perfdir
  else
    dir = Dir.mktmpdir('dfperf')
  end
  path = File.join(dir, name)
  # TODO: check that file doesn't already exist.
  File.open(path, 'w') do |file|
    file.write(JSON.pretty_generate(data))
  end
  $stderr.print("Wrote performance data to ", path, "\n") if info
rescue StandardError => msg
  $stderr.print("Cannot log performance data: ", msg, "\n") if info
end
203
# Program driver: parse the options, index files by size, compare the
# candidates, then write the groups of identical files as JSON or YAML.
# Progress messages go to $stderr unless --quiet was given.
def run
  opts = parse_options

  spinner = nil
  if opts.info || opts.verbose
    $stderr.print("Looking for files in ", opts.dirs, " ... ")
    spinner = Spinner.new($stderr)
    spinner.start
  end

  # Phase 1: index every file (including those under --from) by size
  time_start = Time.now
  fmap = {}
  files_by_size(fmap, opts.dirs, spinner)
  files_by_size(fmap, [opts.canondir], spinner) if opts.canondir
  time_search = Time.now

  if spinner
    spinner.stop
    $stderr.print("done!\n")
    count = fmap.values.inject(0) { |total, group| total + group.size }
    $stderr.print("Found ", count, " files in ",
                  fmap.size, " size groups.\n")
  end

  # Phase 2: compare files of equal size
  $stderr.print("Comparing files ... ") if opts.info
  time_before_compare = Time.now
  results = []
  append_all_dupes(results, fmap, opts, spinner)
  time_compare = Time.now
  $stderr.print("done!\n") if opts.info

  if opts.zero && fmap[0]
    # All zero-length files form one group.
    # NOTE(review): an empty Pathname is added to that group as well;
    # presumably a placeholder that sorts first -- confirm the intent.
    results << (fmap[0].keys << Pathname('')).sort
    results.sort!
  end

  bubble_up(opts.canondir, results) if opts.canondir

  if opts.perfdir
    perf = {
      "ngroups" => fmap.size,
      "time-compare" => time_compare - time_before_compare,
      "time-search" => time_search - time_start,
      "data" => fmap
    }
    log_perf_data(opts.perfdir, opts.info, perf)
  end

  if opts.format == :json
    text = opts.pretty ? JSON.pretty_generate(results) : JSON.generate(results)
    opts.io.write(text)
    opts.io.write("\n")
  else
    # :line_width setting so file names w/spaces don't break across lines
    yamlopts = {:line_width => 4096}
    if opts.pretty
      # {:canonical => true} looks almost identical to pretty-printed
      # JSON so the header keeps them distinct.
      yamlopts[:header] = true
      yamlopts[:canonical] = true
    end
    # Using Pathname upstream means exposing it in YAML;
    # build new results with just Strings and Arrays
    saferesults = results.map { |set| set.map { |path| path.to_s } }
    YAML.safe_dump(saferesults, opts.io, yamlopts)
  end
end
300
# Re-sort every list in `results` in place, using canon_first with
# `canondir` as the comparison.  Returns `results`.
def bubble_up(canondir, results)
  canon = Pathname(canondir)
  results.each do |group|
    group.sort! { |left, right| canon_first(canon, left, right) }
  end
end
310
# Comparison for sorting: a path for which is_parent?(cpath, path) holds
# orders before one for which it does not.  When both agree, fall back
# to plain Pathname ordering.  Returns the result of <=> (or -1 / 1).
def canon_first(cpath, a, b)
  a_in_dir = is_parent?(cpath, a)
  b_in_dir = is_parent?(cpath, b)

  # how we originally sorted them
  return Pathname(a) <=> Pathname(b) if a_in_dir == b_in_dir

  a_in_dir ? -1 : 1
end
324
# True when `path` is `cpath` itself or lies somewhere beneath it.
#
# Both paths are first made absolute (relative to the current directory)
# and normalised with File.absolute_path, so "dir/", "./dir" and "dir"
# all name the same directory.  A plain Pathname comparison would treat
# them as different and never match, e.g. for a directory given with a
# trailing slash.  The comparison is lexical; symlinks are not resolved.
# An empty `path` never matches.
def is_parent?(cpath, path)
  return false if path.to_s.empty?

  target = Pathname(File.absolute_path(cpath.to_s))
  Pathname(File.absolute_path(path.to_s)).ascend do |candidate|
    return true if candidate == target
  end
  false
end
335
# Run only when executed as a script, so the file can also be loaded
# (e.g. from a test) without starting a search.
run if __FILE__ == $PROGRAM_NAME
remove-files.rb
Summary of Changes
None.
Usage
Usage: remove-files.rb [options] [listfile ...]
-q, --[no-]quiet Run absolutely quietly
-v, --[no-]verbose Run verbosely
-j, --[no-]json Read input as JSON
-y, --[no-]yaml Read input as YAML (default)
-x, --[no-]mixed Deduce format from files
-i, --[no-]interactive Wait for user approval before deleting.
-u, --[no-]dry-run Don't delete, just list files
Listing
1#!/usr/bin/env ruby
2
3require 'optparse'
4require 'yaml'
5require 'psych'
6require 'json'
7
# Append all but the first path from each set in `filesets` to `result`.
#
# result   - Array that receives the paths; also the return value.
# filesets - Array of Arrays of paths, or nil/false (treated as "no
#            sets").
# verbose  - when true, report on $stderr what is skipped and added.
#
# Sets that are empty (or nil) contribute nothing; `fset[1..-1]` would be
# nil for an empty set and crash the loop.
def append_files(result, filesets, verbose=true)
  return result unless filesets

  filesets.each do |fset|
    members = Array(fset)
    next if members.empty?

    # Skip the first file in each set
    delset = members.drop(1)
    if verbose
      $stderr.print("Skipping ", members.first.inspect, ";\n")
      $stderr.print("Adding ", delset, "\n")
    end
    result.concat(delset)
  end
  result
end
22
# Figure out if the `text` in `path` is JSON: true when `path` ends in
# ".json" or the text, ignoring leading whitespace, starts with `{`,
# `[` or `"`.  `text` is not modified, so frozen strings are accepted.
def is_json(path, text)
  path.end_with?(".json") or text.lstrip.start_with?("{", "[", "\"")
end
28
# Read the list file at `path`, guess whether it is JSON or YAML (see
# is_json) and append the files it lists to `files` via append_files.
#
# files   - Array receiving the paths.
# path    - name of the list file to read.
# verbose - when true, trace progress and parser messages on $stderr.
# info    - when true (the default, so three-argument callers keep
#           working), report an unparsable file on $stdout.
#
# A file that cannot be parsed is reported and skipped.  Errors while
# reading the file itself (e.g. Errno::ENOENT) are not rescued here.
def read_file(files, path, verbose, info=true)
  $stderr.print("Reading ", path, " ...\n") if verbose
  text = File.read(path)
  begin
    if is_json(path, text)
      $stderr.print("Assuming JSON: ", path, "\n") if verbose
      append_files(files, JSON.parse(text), verbose)
    else
      $stderr.print("Assuming YAML: ", path, "\n") if verbose
      append_files(files, YAML.safe_load(text), verbose)
    end
  rescue JSON::JSONError => msg
    $stderr.print(msg, "\n") if verbose
    $stdout.print(path.inspect, " isn't valid JSON; skipping.\n") if info
  rescue Psych::Exception => msg
    $stderr.print(msg, "\n") if verbose
    $stdout.print(path.inspect, " isn't valid YAML; skipping.\n") if info
  end
end
48
#
# Run the main loop: parse the options, collect the files to delete from
# the input lists, optionally ask for confirmation, then delete them.
#
# Returns 0 on completion, 1 when the user declines, and -1 when no
# answer could be read in interactive mode.
#
# Parse failures are rescued with JSON::JSONError, the base error class
# of the json library, and Psych::Exception.  Empty input (a nil parse
# result) is treated as an empty list.
#
def run
  info = true
  verbose = false
  input = :yaml
  dry = false
  interactive = false
  files = []

  OptionParser.new do |opts|
    opts.banner = "Usage: remove-files.rb [options] [listfile ...]"

    opts.on("-q", "--[no-]quiet", "Run absolutely quietly") do |v|
      info = !v
    end
    opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
      verbose = v
    end
    opts.on("-j", "--[no-]json", "Read input as JSON") do |v|
      input = :json if v
    end
    opts.on("-y", "--[no-]yaml", "Read input as YAML (default)") do |v|
      input = :yaml if v
    end
    opts.on("-x", "--[no-]mixed", "Deduce format from files") do |v|
      input = nil if v
    end
    opts.on("-i", "--[no-]interactive",
            "Wait for user approval before deleting.") do |v|
      interactive = v
    end
    opts.on("-u", "--[no-]dry-run", "Don't delete, just list files") do |v|
      dry = v
    end
  end.parse!

  begin
    if input == :json
      # Assuming we're getting a single JSON object ...
      append_files(files, JSON.parse(ARGF.read) || [], verbose)
    elsif input == :yaml || ARGV.empty?
      append_files(files, YAML.safe_load(ARGF.read) || [], verbose)
    else
      ARGV.each { |path| read_file(files, path, verbose) }
    end
  rescue Psych::Exception => msg
    $stderr.print(msg, "\n") if verbose
    $stdout.print("Input isn't valid YAML; skipping.\n") if info
  rescue JSON::JSONError => msg
    $stderr.print(msg, "\n") if verbose
    $stdout.print("Input isn't valid JSON; skipping.\n") if info
  end

  files.uniq!
  files.sort!

  if info && !files.empty?
    if interactive
      $stdout.print("About to remove the following files:\n")
    else
      $stdout.print("Removing the following files:\n")
    end
    YAML.dump(files, $stdout)
  end

  if interactive && !files.empty?
    $stdout.print("Remove ", files.size, " files? [y/N]: ")
    response = $stdin.gets
    unless response
      $stdout.print("No response; are you piping to STDIN? Exiting.\n")
      return -1
    end
    unless response.strip.start_with?('Y', 'y')
      $stdout.print("Exiting.\n")
      return 1
    end
  end

  count = 0

  files.each do |path|
    $stderr.print("-> rm ", path.inspect, "\n") if verbose
    begin
      File.delete(path) unless dry
      # Counted in dry-run mode too; the summary below says "(Not really.)"
      count += 1
    rescue StandardError => msg
      $stderr.print(msg, "\n") if verbose
      $stderr.print("Cannot remove ", path.inspect, "; skipping.\n") if info
    end
  end

  $stdout.print("Removed ", count, " files.") if info
  $stdout.print(" (Not really.)") if info && dry
  $stdout.print("\n") if info

  0
end
152
# Run only when executed as a script, and hand run's return value
# (0, 1 or -1) to the shell as the exit status instead of dropping it.
exit(run) if __FILE__ == $PROGRAM_NAME
[1] Previous versions had no options to add to or delete from the PRUNE list, but the list was part of the Options struct regardless. ↩︎