For an explanation, see the directory above.
Highlighting shows changes from the previous version.
duplicate-files.rb
Summary of Changes
- Replace Printer classes with a simple `if` statement.
- Add a progress bar during comparisons. Note that estimates are wildly inaccurate.
- Write performance data to `/tmp` or a configured directory to help deduce why estimates are so far off.
Usage
Usage: duplicate-files.rb [options] dir1 [dir2 ...]
-q, --[no-]quiet Run absolutely quietly
-v, --[no-]verbose Run verbosely
-d, --from [DIR] Compare arguments only to files from DIR
-o, --output [OUTFILE] Write standard output to OUTFILE
-j, --json Write output as JSON
-y, --yaml Write output as YAML (default)
-p, --[no-]pretty Pretty-print output
-P, --perf-data [DIR] Write performance data to a directory (default /tmp)
Listing
1#!/usr/bin/env ruby
2
3require 'find'
4require 'fileutils'
5require 'optparse'
6require 'rational'
7require 'tempfile'
8require 'yaml'
9require 'json'
10
11
# Default list of basenames to prune in a search.  Frozen because this one
# array is used as the prune list of the default configuration.
PRUNE = ['.svn', 'CVS', 'CVSROOT', '.DS_Store', '.git'].freeze
14
# Unfortunate artifact of supporting Rubies with and without the Encoding
# class: ENCODING_UTF8 is the UTF-8 encoding where Encoding exists, else nil.
ENCODING_UTF8 = defined?(Encoding) ? Encoding.find('UTF-8') : nil
21
# Simple class to manage an ASCII spinner.
#
# The spinner occupies one character on `io`; every redraw prints a backspace
# followed by the next glyph.
class Spinner
  # Glyphs shown in rotation.
  SPINNER_STATES = ['-', '\\', '|', '/'].freeze

  # Minimum time between redraws, in seconds.
  UPDATE_INTERVAL = 0.5

  # io:: stream the spinner is drawn on (default $stderr)
  def initialize(io=$stderr)
    @io = io
    @state = 0
    @updated = nil
  end

  # Draw the first glyph and (re)start the redraw timer.  The glyph index is
  # reset too, so a restarted spinner continues from the glyph it just drew.
  def start
    @state = 0
    @updated = Time.now
    @io.print(SPINNER_STATES[@state])
  end

  # Advance to the next glyph when more than UPDATE_INTERVAL seconds have
  # passed since the last draw.  Starts the spinner if it was never started.
  def update
    start unless @updated

    now = Time.now
    return unless now - @updated > UPDATE_INTERVAL

    @updated = now
    @state = (@state + 1) % SPINNER_STATES.length
    @io.print("\b", SPINNER_STATES[@state])
  end
end
52
#
# Not-so-simple class to manage a progress bar
#
# Tracks an estimated number of comparisons (`estimate`) against the number
# performed so far (`actual`) and periodically redraws a text bar on `io`.
#
class Progress
  # Width of the bar, in characters.
  BAR_LENGTH = 40

  # Minimum time between redraws, in seconds.
  UPDATE_INTERVAL = 1.0

  attr_accessor :estimate, :actual

  # io:: stream the bar is drawn on (default $stderr)
  def initialize(io=$stderr)
    @io = io
    @updated = nil
    @estimate = 0
    @actual = 0
  end

  # Estimate the number of comparisons and store it in `estimate`.
  #
  # results:: a Hash whose values are lists of paths, or any other collection
  #           of path lists
  # canon::   true when each list is compared against canonical files rather
  #           than against itself
  #
  # Iterating a Hash with a single block parameter yields [key, value] pairs,
  # whose size is always 2; the path lists are therefore taken from the
  # Hash's values.
  def do_estimate(results, canon)
    groups = results.respond_to?(:each_value) ? results.each_value : results.each
    @estimate = groups.inject(0) do |total, fileset|
      size = fileset.size
      if canon
        # One comparison per listed file for each canonical file of the same
        # size.  How many canonical files there are is not known here, so one
        # per group is assumed -- this remains a heuristic.
        total + size
      else
        # Each unordered pair within a group is one comparison.
        total + choose(size, 2)
      end
    end
    update
  end

  # n! for n >= 2; 1 for anything smaller.
  def factorial(n)
    (2..n).inject(1) { |product, i| product * i }
  end

  # Number of ways to choose `k` items from a set of `n`; 0 when the
  # arguments are out of range.
  def choose(n, k)
    return 0 if n < 1 || k < 0 || k > n

    (1..k).inject(1) { |result, i| result * (n + 1 - i) / i }
  end

  # Record `amt` more completed comparisons and redraw if due.
  def add(amt = 1)
    @actual += amt
    update
  end

  # Completed share of the estimate, scaled to `increments` and rounded down.
  # Returns 0 while there is no estimate.  May exceed `increments` if more
  # work is recorded than was estimated.
  def progress(increments = 100)
    return 0 if @estimate == 0

    (increments * @actual).div(@estimate)
  end

  # The bar itself: BAR_LENGTH characters, '#' for done and '-' for pending.
  def progress_bar
    filled = [[progress(BAR_LENGTH), 0].max, BAR_LENGTH].min
    ('#' * filled) + ('-' * (BAR_LENGTH - filled))
  end

  # Redraw "actual/estimate |bar|" when more than UPDATE_INTERVAL seconds
  # have passed since the last redraw (or since the first call).
  def update
    @updated ||= Time.now

    now = Time.now
    return unless now - @updated > UPDATE_INTERVAL

    @updated = now
    msg = "#{@actual}/#{@estimate} |#{progress_bar}|"
    # Back up over a message of the same width before printing the new one.
    @io.print("\b" * msg.size, msg)
  end
end
138
# Whether `path_i` and `path_j` refer to duplicate but not identical files:
# both exist, they are not the same file, and their contents compare equal.
def duplicate_files?(path_i, path_j)
  File.exist?(path_i) &&
    File.exist?(path_j) &&
    !File.identical?(path_i, path_j) &&
    FileUtils.cmp(path_i, path_j)
end
146
# Whether `path` should be skipped in a search: its basename is listed in
# `prune` or matches the glob `._*`.
def prunable?(path, prune=[])
  name = File.basename(path)
  prune.include?(name) || File.fnmatch('._*', name)
end
151
# Recurse through array of `dirs` and produce Hash of all files by size.
#
# dirs::    paths to walk
# prune::   basenames to skip (directories listed here are not descended)
# verbose:: when true, show a spinner and a final tally on $stderr
#
# Returns a Hash mapping each non-zero size to the paths of the regular files
# with that size.  The Hash creates an empty list for any key it is asked
# about, so look-ups with [] add entries.
def files_by_size(dirs, prune=[], verbose=false)
  result = Hash.new { |h, k| h[k] = [] }
  count = 0
  spinner = nil
  if verbose
    $stderr.print("Looking for files in ", dirs, ": ")
    spinner = Spinner.new($stderr)
    spinner.start
  end

  Find.find(*dirs) do |path|
    if prunable?(path, prune)
      Find.prune
    elsif File.file?(path)
      size = File.size(path)
      next unless size > 0

      # Counted exactly once per file, so the tally below matches the number
      # of paths stored.
      count += 1
      path.encode!(ENCODING_UTF8) if ENCODING_UTF8
      result[size] << path
      spinner.update if verbose
    end
  end

  if verbose
    $stderr.print(" done!\n")

    $stderr.print("Found ", count, " non-empty files in ",
                  result.size, " size groups.\n")
  end
  result
end
186
# Compare each file in `paths` and append lists of equal files to `result`.
#
# Every unordered pair of paths is compared once, and `progress.add` is
# called once per comparison.  Each appended list is sorted.  Returns
# `result`.
def append_duplicates(result, paths, progress)
  # Maps a path to the set (Hash used as a set) of paths found equal to it,
  # itself included.  Entries are created only when a duplicate is found.
  idsets = Hash.new { |h, k| h[k] = { k => true } }

  paths.each_with_index do |path_i, i|
    (i + 1...paths.length).each do |j|
      path_j = paths[j]
      if duplicate_files?(path_i, path_j)
        idsets[path_i][path_j] = true
        idsets[path_j][path_i] = true
      end
      progress.add
    end
  end

  # Members of one group of duplicates hold equal sets; keep one of each.
  idsets.values.uniq.each { |set| result << set.keys.sort }
  result
end
206
# Find all files in `fmap` and append lists of equal files to `result`.
# `fmap` is a map of files by size; only sizes shared by two or more paths
# are compared.  Reports each group on $stderr when `opts.verbose` is set.
def append_all_dupes(result, fmap, opts, progress)
  fmap.each_value do |paths|
    next unless paths.size > 1

    $stderr.print("Comparing ", paths, " ...") if opts.verbose
    append_duplicates(result, paths, progress)
    $stderr.print("done.\n") if opts.verbose
  end
end
218
# Find all files in `opts.canondir`, compare each to the files of the same
# size in `fmap`, and append lists of equal files to `result`.
# The path from `opts.canondir` will always be first in the list, followed by
# its duplicates in sorted order.  `progress.add` is called once per
# comparison.  Returns `result`.
def append_dir_dupes(result, fmap, opts, progress)
  $stderr.print("Looking for files in '", opts.canondir, "':\n") if opts.verbose
  Find.find(opts.canondir) do |path_i|
    if prunable?(path_i, opts.prune)
      Find.prune
    elsif File.file?(path_i)
      size = File.size(path_i)
      # fetch rather than []: if `fmap` has a default block, [] would insert
      # an empty group for every size that has no match.
      paths = fmap.fetch(size, nil)
      next unless size > 0 && paths && !paths.empty?

      $stderr.print("Comparing '", path_i,
                    "' to ", paths, " ...") if opts.verbose
      dupes = paths.select do |path_j|
        same = duplicate_files?(path_i, path_j)
        progress.add
        same
      end
      result << [path_i, *dupes.sort] unless dupes.empty?
      $stderr.print(" done.\n") if opts.verbose
    end
  end
  $stderr.print("Done!\n") if opts.verbose
  result
end
251
# Run-time configuration, in positional order: info, verbose, canondir, dirs,
# prune, io, format, pretty, perfdir.
Options = Struct.new(:info, :verbose, :canondir, :dirs, :prune,
                     :io, :format, :pretty, :perfdir)
255
# Parse the command line in ARGV into an Options value.
#
# Recognised switches are removed from ARGV; whatever remains becomes
# `dirs`.  `io` is the file named by -o, opened for writing, or $stdout.
def parse_options
  config = Options.new(true, false, nil, [], PRUNE,
                       nil, :yaml, false, nil)

  outfile = nil

  OptionParser.new do |opts|
    opts.banner = "Usage: duplicate-files.rb [options] dir1 [dir2 ...]"

    opts.on("-q", "--[no-]quiet", "Run absolutely quietly") do |v|
      config.info = !v
    end
    opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
      config.verbose = v
    end
    opts.on("-d", "--from [DIR]",
            "Compare arguments only to files from DIR") do |d|
      config.canondir = d
    end
    opts.on("-o", "--output [OUTFILE]",
            "Write standard output to OUTFILE") do |f|
      # The argument is optional, so a bare -o arrives here as nil; output
      # then stays on $stdout.
      outfile = File.new(f, 'w') if f
    end
    opts.on("-j", "--json", "Write output as JSON") do
      config.format = :json
    end
    opts.on("-y", "--yaml", "Write output as YAML (default)") do
      config.format = :yaml
    end
    opts.on("-p", "--[no-]pretty", "Pretty-print output") do |v|
      config.pretty = v
    end
    opts.on("-P", "--perf-data [DIR]",
            "Write performance data to a directory (default /tmp)") do |v|
      config.perfdir = v
    end
  end.parse!

  config.io = outfile || $stdout

  config.dirs = ARGV

  config
end
301
# Write `data` as pretty-printed JSON to a time-stamped file.
#
# perfdir:: directory to write into, created if missing; when nil, a fresh
#           temporary directory is created instead
# info::    when true, report the outcome on $stderr
# data::    value to serialise
#
# Failures are reported (if `info`) and otherwise ignored, so a problem
# writing the file does not abort the caller.
def log_perf_data(perfdir, info, data)
  name = 'perf-' + Time.now.strftime('%Y-%m-%d-%H%M%S-%N') + '.json'
  if perfdir
    # The mode must be an octal literal; decimal 755 is mode 01363.
    Dir.mkdir(perfdir, 0755) unless Dir.exist?(perfdir)
    dir = perfdir
  else
    dir = Dir.mktmpdir('dfperf')
  end
  path = File.join(dir, name)
  # TODO: check that file doesn't already exist.
  File.open(path, 'w') do |file|
    file.write(JSON.pretty_generate(data))
  end
  $stderr.print("Wrote performance data to ", path, "\n") if info
rescue StandardError => msg
  $stderr.print("Cannot log performance data: ", msg, "\n") if info
end
325
# Main entry point: find files by size, compare candidates, log timing data,
# and write the groups of duplicates to the configured output as JSON or YAML.
def run
  opts = parse_options

  time_start = Time.now

  fmap = files_by_size(opts.dirs, opts.prune, (opts.info || opts.verbose))

  time_search = Time.now

  $stderr.print("Comparing files ...\n") if opts.info

  results = []
  progress = Progress.new($stderr)

  time_before_est = Time.now

  progress.do_estimate(fmap, !opts.canondir.nil?)

  time_estimate = Time.now

  if opts.canondir
    append_dir_dupes(results, fmap, opts, progress)
  else
    append_all_dupes(results, fmap, opts, progress)
  end

  time_compare = Time.now

  $stderr.print("... done!\n") if opts.info

  log_perf_data(opts.perfdir, opts.info, {
    "actual" => progress.actual,
    "estimate" => progress.estimate,
    "ngroups" => fmap.size,
    "time-compare" => time_compare - time_estimate,
    "time-estimate" => time_estimate - time_before_est,
    "time-search" => time_search - time_start,
    "data" => fmap})

  # The pretty-print flag lives on the parsed options; there is no @pretty.
  if opts.format == :json
    text = if opts.pretty
             JSON.pretty_generate(results)
           else
             JSON.generate(results)
           end
    opts.io.write(text)
    opts.io.write("\n")
  else
    # :line_width setting so file names w/spaces don't break across lines
    yamlopts = {:line_width => 4096}
    if opts.pretty
      # {:canonical => true} looks almost identical to pretty-printed JSON
      # so the header keeps them distinct.
      yamlopts[:header] = true
      yamlopts[:canonical] = true
    end
    YAML.dump(results, opts.io, yamlopts)
  end
end
389
# Run only when executed as a script, so the file can also be loaded.
run if __FILE__ == $PROGRAM_NAME
remove-files.rb
Summary of Changes
- Huge rewrite to prevent accidentally deleting the difference file, among other mishaps.
Usage
Usage: remove-files.rb [options] [listfile ...]
-q, --[no-]quiet Run absolutely quietly
-v, --[no-]verbose Run verbosely
-j, --[no-]json Read input as JSON
-y, --[no-]yaml Read input as YAML (default)
-x, --[no-]mixed Deduce format from files
-u, --[no-]dry-run Don't delete, just list files
Listing
1#!/usr/bin/env ruby
2
3require 'optparse'
4require 'yaml'
5require 'psych'
6require 'json'
7
# Append all but the first paths from each set in `filesets` to `result`.
#
# The first path of every set is the one to keep.  A nil or false `filesets`
# (what an empty document parses to) and empty sets add nothing.  With
# `verbose`, each decision is reported on $stderr.
def append_files(result, filesets, verbose=true)
  return filesets unless filesets

  filesets.each do |fset|
    # Skip the first file in each set
    delset = fset.drop(1)
    if verbose
      $stderr.print("Skipping ", fset[0].inspect, ";\n")
      $stderr.print("Adding ", delset, "\n")
    end
    result.concat(delset)
  end
end
22
# Figure out if the `text` in `path` is JSON: either the name ends in ".json"
# or the first non-blank character opens a JSON object, array or string.
# `text` is left unmodified.
def is_json(path, text)
  path.end_with?(".json") || text.lstrip.start_with?("{", "[", "\"")
end
28
# Read the list file at `path`, guess whether it is JSON or YAML, and append
# the paths it marks for removal to `files`.
#
# verbose:: report progress and parser messages on $stderr
# info::    report skipped files (default true)
#
# A file that cannot be read or parsed is reported and skipped.
def read_file(files, path, verbose, info=true)
  $stderr.print("Reading ", path, " ...\n") if verbose
  text = File.read(path)
  if is_json(path, text)
    $stderr.print("Assuming JSON: ", path, "\n") if verbose
    append_files(files, JSON.parse(text), verbose)
  else
    $stderr.print("Assuming YAML: ", path, "\n") if verbose
    append_files(files, YAML.safe_load(text), verbose)
  end
rescue JSON::JSONError => msg
  $stderr.print(msg, "\n") if verbose
  $stdout.print(path.inspect, " isn't valid JSON; skipping.\n") if info
rescue Psych::Exception => msg
  $stderr.print(msg, "\n") if verbose
  $stdout.print(path.inspect, " isn't valid YAML; skipping.\n") if info
rescue SystemCallError => msg
  # Missing or unreadable list file.
  $stderr.print(msg, "\n") if verbose
  $stderr.print("Cannot read ", path.inspect, "; skipping.\n") if info
end
48
#
# Run the main loop: parse options, collect the paths to remove from the list
# files (or standard input), then delete them -- or only list them on a dry
# run.
#
def run
  info = true
  verbose = false
  input = :yaml
  dry = false
  files = []

  OptionParser.new do |opts|
    opts.banner = "Usage: remove-files.rb [options] [listfile ...]"

    opts.on("-q", "--[no-]quiet", "Run absolutely quietly") do |v|
      info = !v
    end
    opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
      verbose = v
    end
    opts.on("-j", "--[no-]json", "Read input as JSON") do |v|
      input = :json if v
    end
    opts.on("-y", "--[no-]yaml", "Read input as YAML (default)") do |v|
      input = :yaml if v
    end
    opts.on("-x", "--[no-]mixed", "Deduce format from files") do |v|
      input = nil if v
    end
    opts.on("-u", "--[no-]dry-run", "Don't delete, just list files") do |v|
      dry = v
    end
  end.parse!

  begin
    if input == :json
      # Assuming we're getting a single JSON object ...
      append_files(files, JSON.parse(ARGF.read), verbose)
    elsif input == :yaml || ARGV.empty?
      append_files(files, YAML.safe_load(ARGF.read), verbose)
    else
      ARGV.each { |path| read_file(files, path, verbose) }
    end
  rescue Psych::Exception => msg
    $stderr.print(msg, "\n") if verbose
    $stdout.print("Input isn't valid YAML; skipping.\n") if info
  rescue JSON::JSONError => msg
    $stderr.print(msg, "\n") if verbose
    $stdout.print("Input isn't valid JSON; skipping.\n") if info
  end

  # De-duplicate and sort first, so the list announced below is exactly the
  # list that is acted on.
  files.uniq!
  files.sort!

  if info && !files.empty?
    $stdout.print("About to remove the following files:\n")
    YAML.dump(files, $stdout)
  end

  count = 0

  files.each do |path|
    $stderr.print("-> rm ", path.inspect, "\n") if verbose
    begin
      File.delete(path) unless dry
      count += 1
    rescue StandardError => msg
      $stderr.print(msg, "\n") if verbose
      $stderr.print("Cannot remove ", path.inspect, "; skipping.\n") if info
    end
  end

  $stdout.print("Removed ", count, " files.") if info
  $stdout.print(" (Not really.)") if info && dry
  $stdout.print("\n") if info
end
126
# Run only when executed as a script, so the file can also be loaded.
run if __FILE__ == $PROGRAM_NAME