lib/crf/finder.rb from alebian/crf

lib/crf/finder.rb
Summary

Maintainability

0 mins
Test Coverage

Issues
require 'crf/repetitions_list'
require 'digest'
require 'ruby-progressbar'
require 'concurrent/executor/cached_thread_pool'

module Crf
  ##
  # Scans one or more directories for files that look identical.
  #
  # The scan runs in up to two passes: files are first grouped by size, and,
  # unless a fast scan was requested, the files that share a size are then
  # grouped again by their SHA256 digest.
  #
  class Finder
    ##
    # paths:: the directories given to the constructor.
    # repetitions:: result of the last call to #search_repeated_files (nil before the first call).
    # files:: every regular file found by the last scan (nil before the first call).
    #
    attr_reader :paths, :repetitions, :files

    ##
    # Creates the Finder object with the directories where it will look for duplicate files.
    #
    # @param paths [Array<String>] paths of the folders where the scan will start.
    # @param fast [Boolean] when true only file sizes are compared and the digest pass is skipped.
    #
    def initialize(paths, fast = false)
      @paths = paths
      @fast = fast
      initialize_thread_pool
    end

    ##
    # Runs the scan and stores the outcome in #repetitions.
    #
    # @return the repetitions collection produced by Crf::RepetitionsList: the
    #   size-based one when it is empty or a fast scan was requested, otherwise
    #   the digest-based one.
    #
    def search_repeated_files
      @repetitions = first_run
      return repetitions if repetitions.empty? || @fast
      @repetitions = second_run(repetitions)
    end

    private

    ##
    # Collects every regular file found recursively under each of the paths and
    # stores the list in @files.
    #
    # Overlapping paths can yield the same file more than once, so the list is
    # de-duplicated in a single pass (keeping the first occurrence) before the
    # entries are filtered.
    #
    # @return [Array<String>] frozen file paths, without directories or symlinks.
    #
    def all_files
      candidates = paths.flat_map { |path| Dir["#{path.chomp('/')}/**/*"] }
      @files = candidates.uniq.select { |file_path| file?(file_path) }.each(&:freeze)
    end

    ##
    # @param path [String] a path returned by the directory scan.
    # @return [Boolean] true when the path is neither a directory nor a symlink.
    #
    def file?(path)
      !File.directory?(path) && !File.symlink?(path)
    end

    ##
    # First pass: groups every scanned file by its size only.
    #
    # @return whatever Crf::RepetitionsList#repetitions returns after all the
    #   files have been added under their size.
    #
    def first_run
      repetitions_list = Crf::RepetitionsList.new
      all_files.each do |file_path|
        repetitions_list.add(file_identifier(file_path).freeze, file_path)
      end
      repetitions_list.repetitions
    end

    ##
    # @param path [String] path of the file.
    # @return [String] the size of the file in bytes, as a string.
    #
    def file_identifier(path)
      File.size(path).to_s
    end

    ##
    # Second pass: after finding files with the same size, groups those files
    # by their SHA256 digest. One task per file is posted to the thread pool.
    #
    # @param repetitions the collection returned by #first_run; its values are
    #   expected to be arrays of file paths.
    # @return whatever Crf::RepetitionsList#repetitions returns after all the
    #   digests have been added.
    #
    def second_run(repetitions)
      repetitions_list = Crf::RepetitionsList.new
      # The list is shared by every pool thread and is not known to be
      # thread-safe, so additions are serialized. Digests are still computed
      # outside the lock to keep the file reads concurrent.
      list_lock = Mutex.new

      repetitions.values.each do |repeated_array|
        repeated_array.each do |file_path|
          @thread_pool.post do
            digest = file_hash(file_path)
            list_lock.synchronize { repetitions_list.add(digest, file_path) }
          end
        end
      end
      # NOTE(review): an error raised inside a posted task (e.g. a file removed
      # between the two passes) is presumably not propagated here - confirm
      # against the pool's documentation.
      await_pool_termination

      repetitions_list.repetitions
    end

    ##
    # @param path [String] path of the file.
    # @return [String] hexadecimal SHA256 digest of the file contents.
    #
    def file_hash(path)
      Digest::SHA256.file(path).hexdigest
    end

    ##
    # Shuts the pool down, waits for it to terminate and then replaces it with
    # a new pool, since the old one has been shut down and a later search
    # needs one to post to.
    #
    def await_pool_termination
      @thread_pool.shutdown
      @thread_pool.wait_for_termination
      initialize_thread_pool
    end

    ##
    # Creates the thread pool used by the digest pass.
    #
    def initialize_thread_pool
      @thread_pool = Concurrent::CachedThreadPool.new
    end
  end
end