duplicate-files.rb - 2023-02-07

Posted: 2023-02-07
Word Count: 2221
Tags: programming ruby

Table of Contents

For an explanation, see the directory above.

Highlighting shows changes from the previous version.

duplicate-files.rb

Summary of Changes

Usage

Usage: duplicate-files.rb [options] dir1 [dir2 ...]
    -q, --[no-]quiet                 Run absolutely quietly
    -v, --[no-]verbose               Run verbosely
    -d, --from [DIR]                 Compare arguments only to files from DIR
    -o, --output [OUTFILE]           Write standard output to OUTFILE
    -j, --json                       Write output as JSON
    -y, --yaml                       Write output as YAML (default)
    -p, --[no-]pretty                Pretty-print output
    -P, --perf-data [DIR]            Write performance data to a directory (default /tmp)

Listing

  1#!/usr/bin/env ruby 
  2
  3require 'find'
  4require 'fileutils'
  5require 'optparse'
  6require 'rational'
  7require 'tempfile'
  8require 'yaml'
  9require 'json'
 10
 11
 12# Default list of files to prune in search
 13PRUNE = ['.svn', 'CVS', 'CVSROOT', '.DS_Store', '.git']
 14
 15# Unfortunate artifact of transition between Ruby 1.9 and 2.0
 16if self.class.const_defined?(:Encoding) then
 17    ENCODING_UTF8 = Encoding.find('UTF-8')
 18else
 19    ENCODING_UTF8 = nil
 20end
 21
 22# Simple class to manage an ASCII spinner
 23class Spinner
 24    SPINNER_STATES = ['-', '\\', '|', "/"]
 25
 26    UPDATE_INTERVAL = 0.5 # s
 27
 28    def initialize(io=$stderr)
 29        @io = io
 30        @state = 0
 31        @updated = nil
 32    end
 33
 34    def start
 35        @updated = Time.now
 36        @io.print(SPINNER_STATES[0])
 37    end
 38
 39    def update
 40        if not @updated then
 41            start()
 42        end
 43
 44        now = Time.now
 45        if now - @updated > UPDATE_INTERVAL then
 46            @updated = now
 47            @state = (@state + 1) % SPINNER_STATES.length
 48            @io.print("\b", SPINNER_STATES[@state])
 49        end
 50    end
 51end
 52
 53#
 54# Not-so-simple class to manage a progress bar
 55#
 56class Progress
 57    BAR_LENGTH = 40
 58
 59    UPDATE_INTERVAL = 1.0 # s
 60
 61    attr_accessor :estimate, :actual
 62
 63    def initialize(io=$stderr)
 64        @io = io
 65        @updated = nil
 66        @estimate = 0
 67        @actual = 0
 68    end
 69
 70    def do_estimate(results, canon)
 71        @estimate = 0
 72        results.each do |fileset|
 73            # choose(x, k) is the number of ways one can choose `k`
 74            # items from a set of `n`.
 75            size = fileset.size
 76            size += 1 if canon
 77            if size > 1 then
 78                @estimate += size * (size - 1) # choose(size, 2) # factorial(size)
 79            end
 80        end
 81        update
 82    end
 83
 84    def factorial(n)
 85        if n <= 1 then 1 else n * factorial(n-1) end
 86    end
 87
 88    def choose(n, k)
 89        if n < 1 or k < 0 or k > n then
 90            return 0
 91        end
 92        result = 1
 93        1.upto(k) do |i|
 94            result = result * (n + 1 - i) / i
 95        end
 96        return result
 97    end
 98
 99    def add(amt = 1)
100        @actual += amt
101        update
102    end
103
104    def progress(increments = 100)
105        if @estimate == 0 then
106            return 0
107        else
108            return Rational(increments * @actual, @estimate).floor.to_i
109        end
110    end
111
112    def progress_bar
113        str = String.new
114        1.upto(BAR_LENGTH) do |i|
115            if i <= progress(BAR_LENGTH) then
116                str << '#'
117            else
118                str << '-'
119            end
120        end
121        return str
122    end
123
124    def update
125        if not @updated then
126            @updated = Time.now
127        end
128
129        now = Time.now
130        if now - @updated > UPDATE_INTERVAL then
131            @updated = now
132            msg = @actual.to_s + "/" + @estimate.to_s + \
133                  " |" + progress_bar + "|"
134            @io.print("\b"*msg.size, msg)
135        end
136    end
137end
138
# Whether `path_i` and `path_j` are two existing, distinct files with
# equal contents. Two names for the very same file (File.identical?)
# do not count as duplicates.
def duplicate_files?(path_i, path_j)
    return false unless File.exist?(path_i)
    return false unless File.exist?(path_j)
    return false if File.identical?(path_i, path_j)

    # Content comparison runs last, after the cheaper checks above.
    FileUtils.cmp(path_i, path_j)
end
146
# Whether `path` should be skipped: its base name is listed in `prune`
# or matches the `._*` glob.
def prunable?(path, prune=[])
    base = File.basename(path)
    return true if prune.include?(base)

    File.fnmatch('._*', base)
end
151
# Recurse through array of `dirs` and produce Hash of all files by size.
#
# dirs::    directories (or files) to walk
# prune::   base names to skip entirely; see prunable?
# verbose:: when true, report progress and a summary on $stderr
#
# Returns a Hash mapping each non-zero file size to the list of paths
# with that size. Empty files are ignored. The Hash creates an empty
# list for any key looked up with [], so callers that must not grow it
# should use fetch or key?.
def files_by_size(dirs, prune=[], verbose=false)
    result = Hash.new {|h,k| h[k]=[] }
    count = 0
    spinner = nil
    if verbose then
        $stderr.print("Looking for files in ", dirs, ": ")
        spinner = Spinner.new($stderr)
        spinner.start
    end
    Find.find(*dirs) do |path|
        if prunable?(path, prune) then
            Find.prune()
        elsif File.file?(path) then
            size = File.size(path)
            if size > 0 then
                # Count each file exactly once, whether or not we are verbose.
                count += 1
                path.encode!(ENCODING_UTF8) if ENCODING_UTF8
                result[size] << path
                spinner.update if verbose
            end
        end
    end
    if verbose then
        $stderr.print(" done!\n")

        $stderr.print("Found ", count, " non-empty files in ",
                      result.size, " size groups.\n")
    end
    return result
end
186
# Compare every pair of files in `paths` and append one sorted list per
# group of equal files to `result`. `progress.add` is called once per
# comparison. Returns `result`.
def append_duplicates(result, paths, progress)
    # Each path maps to a Hash used as a set: the paths equal to it,
    # itself included.
    groups = Hash.new { |hash, key| hash[key] = { key => true } }
    paths.each_with_index do |first, i|
        paths.drop(i + 1).each do |second|
            if duplicate_files?(first, second)
                groups[first][second] = true
                groups[second][first] = true
            end
            progress.add
        end
    end
    # Members of one group hold equal sets, so uniq leaves one per group.
    groups.values.uniq.each { |set| result << set.keys.sort }
    result
end
206
# Compare the files within each group of `fmap` and append lists of
# equal files to `result`.
# `fmap` is a map of files by size; groups with fewer than two files
# are skipped.
def append_all_dupes(result, fmap, opts, progress)
    fmap.each_pair do |_size, candidates|
        next unless candidates.size > 1

        $stderr.print("Comparing ", candidates, " ...") if opts.verbose
        append_duplicates(result, candidates, progress)
        $stderr.print("done.\n") if opts.verbose
    end
end
218
# Find all files in `opts.canondir`, compare each to the files of the
# same size in `fmap`, and append lists of equal files to `result`.
# The path from `opts.canondir` will always be first in the list,
# followed by its duplicates in sorted order.
#
# `progress.add` is called once per comparison. Returns `result`.
def append_dir_dupes(result, fmap, opts, progress)
    $stderr.print("Looking for files in '", opts.canondir, "':\n") if opts.verbose
    Find.find(opts.canondir) do |path_i|
        if prunable?(path_i, opts.prune) then
            Find.prune()
        elsif File.file?(path_i) then
            size = File.size(path_i)
            # fetch rather than []: if `fmap` has a default block, a plain
            # lookup would insert an empty group for every size probed.
            paths = fmap.fetch(size, nil)
            if size > 0 and paths and not paths.empty? then
                dupes = []
                $stderr.print("Comparing '", path_i,
                              "' to ", paths, " ...") if opts.verbose
                paths.each do |path_j|
                    dupes << path_j if duplicate_files?(path_i, path_j)
                    progress.add
                end
                unless dupes.empty?
                    result << [path_i, *dupes.sort]
                end
                $stderr.print(" done.\n") if opts.verbose
            end
        end
    end
    $stderr.print("Done!\n") if opts.verbose
    return result
end
251
# Run-time configuration for the script, one member per setting.
Options = Struct.new(
    "Options",
    :info,      # print informational messages (off when quiet)
    :verbose,   # print detailed progress
    :canondir,  # directory given with --from, or nil
    :dirs,      # directories to search
    :prune,     # base names to skip while searching
    :io,        # stream the results are written to
    :format,    # :yaml or :json
    :pretty,    # pretty-print the output
    :perfdir    # directory for performance data, or nil
)
255
# Parse the command line (ARGV) into an Options value.
#
# Recognized switches are removed from ARGV; whatever remains becomes
# `dirs`. Defaults: informational output on, verbose off, YAML output to
# $stdout, not pretty-printed, the PRUNE list for pruning.
def parse_options

    config = Options.new(true, false, nil, [], PRUNE,
                            nil, :yaml, false, nil)

    outfile = nil

    OptionParser.new do |opts|
        opts.banner = "Usage: duplicate-files.rb [options] dir1 [dir2 ...]"

        opts.on("-q", "--[no-]quiet", "Run absolutely quietly") do |v|
            config.info = (not v)
        end
        opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
            config.verbose = v
        end
        opts.on("-d", "--from [DIR]",
                "Compare arguments only to files from DIR") do |d|
            config.canondir = d
        end
        opts.on("-o", "--output [OUTFILE]",
                "Write standard output to OUTFILE") do |f|
            # OUTFILE is declared optional; without it keep writing to
            # $stdout instead of failing on File.new(nil, ...).
            outfile = File.new(f, 'w') if f
        end
        opts.on("-j", "--json", "Write output as JSON") do |_v|
            config.format = :json
        end
        opts.on("-y", "--yaml", "Write output as YAML (default)") do |_v|
            config.format = :yaml
        end
        opts.on("-p", "--[no-]pretty", "Pretty-print output") do |v|
            config.pretty = v
        end
        opts.on("-P", "--perf-data [DIR]",
                "Write performance data to a directory (default /tmp)") do |v|
            config.perfdir = v
        end
    end.parse!

    config.io = (outfile or $stdout)

    # parse! has removed the switches, so only directories are left.
    config.dirs = ARGV

    return config
end
301
# Write `data` as pretty-printed JSON to a fresh, timestamped
# `perf-*.json` file.
#
# perfdir:: directory to write into; created (mode 0755) when missing.
#           When nil, a new temporary directory is created instead.
# info::    when true, report the outcome on $stderr
# data::    any value JSON.pretty_generate accepts
#
# Logging is best effort: ordinary errors are reported (when `info`)
# and otherwise ignored, never raised to the caller.
def log_perf_data(perfdir, info, data)
    name = 'perf-' + Time.now.strftime('%Y-%m-%d-%H%M%S-%N') + '.json'
    if perfdir then
        # The mode must be an octal literal; decimal 755 sets unrelated
        # permission bits and can leave the directory unusable.
        Dir.mkdir(perfdir, 0755) if not Dir.exist?(perfdir)
        dir = perfdir
    else
        dir = Dir.mktmpdir('dfperf')
    end
    path = File.join(dir, name)
    # TODO: check that file doesn't already exist.
    File.open(path, 'w') do |file|
        file.write(JSON.pretty_generate(data))
    end
    $stderr.print("Wrote performance data to ", path, "\n") if info
rescue StandardError => msg
    # StandardError only: interrupts and exit requests must still propagate.
    $stderr.print("Cannot log performance data: ", msg, "\n") if info
end
325
# Script entry point: parse options, group files by size, compare the
# candidates, log timing data, and write the duplicate sets as JSON or
# YAML to the configured output stream.
def run
    opts = parse_options

    time_start = Time.now

    fmap = files_by_size(opts.dirs, opts.prune, (opts.info || opts.verbose))

    time_search = Time.now

    if opts.info then
        $stderr.print("Comparing files ...\n")
    end

    results = []
    progress = Progress.new($stderr)

    time_before_est = Time.now

    progress.do_estimate(fmap, opts.canondir != nil)

    time_estimate = Time.now

    if opts.canondir then
        append_dir_dupes(results, fmap, opts, progress)
    else
        append_all_dupes(results, fmap, opts, progress)
    end

    time_compare = Time.now

    if opts.info then
        $stderr.print("... done!\n")
    end

    log_perf_data(opts.perfdir, opts.info, {
        "actual" => progress.actual,
        "estimate" => progress.estimate,
        "ngroups" => fmap.size,
        "time-compare" => time_compare - time_estimate,
        "time-estimate" => time_estimate - time_before_est,
        "time-search" => time_search - time_start,
        "data" => fmap})

    # The pretty flag lives on the parsed options; an instance variable
    # is never set at this level and would always read as nil.
    if opts.format == :json then
        if opts.pretty then
            text = JSON.pretty_generate(results)
        else
            text = JSON.generate(results)
        end
        opts.io.write(text)
        opts.io.write("\n")
    else
        # :line_width setting so file names w/spaces don't break across lines
        yamlopts = {:line_width => 4096}
        if opts.pretty then
            # {:canonical => true} looks almost identical to pretty-printed JSON
            # so the header keeps them distinct.
            yamlopts[:header] = true
            yamlopts[:canonical] = true
        end
        YAML.dump(results, opts.io, yamlopts)
    end
end
389
390run()

remove-files.rb

Summary of Changes

Usage

Usage: remove-files.rb [options] [listfile ...]
    -q, --[no-]quiet                 Run absolutely quietly
    -v, --[no-]verbose               Run verbosely
    -j, --[no-]json                  Read input as JSON
    -y, --[no-]yaml                  Read input as YAML (default)
    -x, --[no-]mixed                 Deduce format from files
    -u, --[no-]dry-run               Don't delete, just list files

Listing

  1#!/usr/bin/env ruby 
  2
  3require 'optparse'
  4require 'yaml'
  5require 'psych'
  6require 'json'
  7
  8# Append all but the first paths from each set in `filesets` to results
  9def append_files(result, filesets, verbose=true)
 10  filesets.each do |fset|
 11    delset = fset[1..-1]
 12    # Skip the first file in each set
 13    if verbose then
 14      $stderr.print("Skipping ", fset[0].inspect, ";\n");
 15      $stderr.print("Adding ", delset, "\n")
 16    end
 17    delset.each do |n|
 18      result << n
 19    end
 20  end
 21end
 22
 23# Figure out if the `text` in `path` is JSON
 24def is_json(path, text)
 25  text.strip! # Bad form to modify an argument
 26  path.end_with?(".json") or text.start_with?("{", "[", "\"")
 27end
 28
 29def read_file(files, path, verbose)
 30  $stderr.print("Reading ", path, " ...\n") if verbose
 31  text = f.read
 32  begin
 33    if is_json(path, text) then
 34      $stderr.print("Assuming JSON: ", path) if verbose
 35      append_files(files, JSON.parse(text), verbose)
 36    else
 37      $stderr.print("Assuming YAML: ", path) if verbose
 38      append_files(files, YAML.safe_load(text), verbose)
 39    end
 40  rescue JSON::JsonError => msg
 41    $stderr.print(msg, "\n") if verbose
 42    $stdout.print(path.inspect, "isn't valid JSON; skipping.\n") if info
 43  rescue Psych::Exception => msg
 44    $stderr.print(msg, "\n") if verbose
 45    $stdout.print(path.inspect, "isn't valid YAML; skipping.\n") if info
 46  end
 47end
 48
 49#
 50# Run the main loop
 51#
 52def run
 53
 54  info = true
 55  verbose = false
 56  input = :yaml
 57  dry = false
 58  files = []
 59
 60  OptionParser.new do |opts|
 61    opts.banner = "Usage: remove-files.rb [options] [listfile ...]"
 62
 63    opts.on("-q", "--[no-]quiet", "Run absolutely quietly") do |v|
 64      info = (not v)
 65    end
 66    opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
 67      verbose = v
 68    end
 69    opts.on("-j", "--[no-]json", "Read input as JSON") do |v|
 70      input = :json if v
 71    end
 72    opts.on("-y", "--[no-]yaml", "Read input as YAML (default)") do |v|
 73      input = :yaml if v
 74    end
 75    opts.on("-x", "--[no-]mixed", "Deduce format from files") do |v|
 76      input = nil if v
 77    end
 78    opts.on("-u", "--[no-]dry-run", "Don't delete, just list files") do |v|
 79      dry = v
 80    end
 81  end.parse!
 82
 83  begin
 84    if input == :json then
 85      # Assuming we're getting a single JSON object ...
 86      result = JSON.parse(ARGF.read)
 87      append_files(files, result, verbose)
 88    elsif input == :yaml or ARGV.empty? then
 89      append_files(files, YAML.safe_load(ARGF.read), verbose)
 90    else
 91      ARGV.each { |path| read_file(files, path, verbose) }
 92    end
 93  rescue Psych::Exception => msg
 94    $stderr.print(msg, "\n") if verbose
 95    $stdout.print("Input isn't valid YAML; skipping.\n") if info
 96  rescue JSON::JsonError => msg
 97    $stderr.print(msg, "\n") if verbose
 98    $stdout.print("Input isn't valid JSON; skipping.\n") if info
 99  end
100
101  if info and not files.empty? then
102    $stdout.print("About to remove the following files:\n")
103    YAML.dump(files, $stdout)
104  end
105
106  files.uniq!
107  files.sort!
108
109  count = 0
110
111  files.each do |path|
112    $stderr.print("-> rm ", path.inspect, "\n") if verbose
113    begin
114      File.delete(path) if not dry 
115      count = count + 1
116    rescue StandardError => msg
117      $stderr.print(msg, "\n") if verbose
118      $stderr.print("Cannot remove ", path.inspect, "; skipping.\n") if info
119    end
120  end
121
122  $stdout.print("Removed ", count, " files.") if info
123  $stdout.print(" (Not really.)") if info and dry
124  $stdout.print("\n") if info
125end
126
127run()