# lib/baw-workers/harvest/gather_files.rb
require 'pathname'
module BawWorkers
module Harvest
# Get a list of files to be harvested.
class GatherFiles
# Build a gatherer that collects the properties of files to be harvested.
# @param [Logger] logger receives progress and problem messages
# @param [BawWorkers::FileInfo] file_info_helper checks extensions and reads file properties
# @param [Array<String>] ext_include extensions passed to the helper's extension check
# @param [String] config_file_name name of the per-directory settings file
# @return [BawWorkers::Harvest::GatherFiles]
def initialize(logger, file_info_helper, ext_include, config_file_name)
  # Used as the progname argument of every log call in this class.
  @class_name = self.class.name
  @config_file_name = config_file_name
  # NOTE(review): not read by any method visible in this class.
  @ext_exclude = %w[completed log yml]
  @ext_include = ext_include # e.g. Settings.available_formats.audio
  @file_info_helper = file_info_helper
  @logger = logger
end
# Get file properties for a file, directory, or array of files or directories.
#
# Each item is handed to #process together with a base directory: the
# containing directory when the item is an existing file, otherwise the item
# itself. The base directory is expanded to an absolute path, because #file
# computes each file's relative path from it after expanding the file path;
# mixing an absolute file path with a relative base raises ArgumentError.
# @param [String, Array<String>] input file, directory, or array of files or directories
# @param [Boolean] recurse look in sub folders
# @return [Array<Hash>] file properties
def run(input, recurse = true)
  @logger.info(@class_name) { 'Gathering files.' }

  items =
    case input
    when String then [input]
    when Array then input
    else []
    end

  results = []
  if items.empty?
    msg = "'#{input}' must be a string or an array of strings."
    @logger.warn(@class_name) { msg }
  else
    items.each do |item|
      top_dir = item
      if item.is_a?(String)
        base = File.file?(item) ? File.dirname(item) : item
        top_dir = File.expand_path(base)
      end
      results.push(*process(item, top_dir, recurse))
    end
  end

  @logger.info(@class_name) { "Finished gathering files. Found #{results.size} file(s)." }
  results.compact
end
private
# Collect file properties for one path, descending into sub directories when asked.
# @param [String] path file or directory to inspect
# @param [String] top_dir base directory, passed through to #file
# @param [Boolean] recurse look in sub folders
# @return [Array<Hash>] file properties; empty when path is not a file or directory
# @raise [ArgumentError] raised by #check_directory or #file
def process(path, top_dir, recurse = true)
  results = []

  # Check before expanding: File.expand_path raises TypeError for values that
  # are not path-like (nil, numbers, ...), which made the warning unreachable.
  unless path.respond_to?(:to_str) || path.respond_to?(:to_path)
    @logger.warn(@class_name) { "Not a recognised file or directory: #{path.inspect}." }
    return results
  end

  path = File.expand_path(path)
  dirs = []

  if File.file?(path)
    @logger.info(@class_name) { "Found file #{path}." }
    current_dir = File.dirname(path)
    files = [path]
  elsif File.directory?(path)
    @logger.info(@class_name) { "Found directory #{path}." }
    current_dir = path
    check_directory(current_dir)
    files = files_in_directory(current_dir)
    dirs = directories_in_directory(current_dir) if recurse
  else
    @logger.warn(@class_name) { "Not a recognised file or directory: #{path}." }
    return results
  end

  # Settings are read once per directory and shared by every file in it.
  dir_settings = get_folder_settings(File.join(current_dir, @config_file_name))

  # process any files found
  files.each do |found|
    file_result = file(found, top_dir, dir_settings)
    results.push(file_result) unless file_result.blank?
  end

  if results.empty?
    @logger.debug(@class_name) { "No valid files in #{current_dir}." }
  else
    @logger.info(@class_name) { "Gathered info for #{results.size} valid files in #{current_dir}." }
  end

  # process any directories found
  dirs.each { |dir| results.push(*process(dir, top_dir, recurse)) }

  results
end
# Ensure a path is a directory that can be written to.
# @param [String] path directory
# @return [String] the given path
# @raise [ArgumentError] if path is not a directory, or is not writable
def check_directory(path)
  problem =
    if !File.directory?(path)
      "'#{path}' is not a directory."
    elsif !(File.writable?(path) && File.writable_real?(path))
      # Both the effective and the real user must be able to write.
      "Found read-only directory: '#{path}'."
    end

  return path if problem.nil?

  @logger.error(@class_name) { problem }
  raise ArgumentError, problem
end
# Get file properties for a single file.
# @param [String] path file
# @param [String] top_dir base directory; may be relative, it is expanded
#   before the file's relative path is calculated
# @param [Hash] dir_settings folder settings; read from the config file next
#   to the file when blank
# @return [Hash, nil] file properties including :file_rel_path; nil when the
#   extension is not accepted; an empty hash when file info is incomplete
# @raise [ArgumentError] if path is not a file
def file(path, top_dir, dir_settings = {})
  unless File.file?(path)
    msg = "'#{path}' is not a file."
    @logger.error(@class_name) { msg }
    raise ArgumentError, msg
  end

  path = File.expand_path(path)

  unless @file_info_helper.valid_ext?(path, @ext_include)
    @logger.warn(@class_name) { "Invalid extension #{path}." }
    return
  end

  @logger.debug(@class_name) { "Valid extension #{path}." }

  if dir_settings.blank?
    dir_settings = get_folder_settings(File.join(File.dirname(path), @config_file_name))
  end

  basic_info, advanced_info = file_info(path, dir_settings[:utc_offset])

  if basic_info.blank? || advanced_info.blank?
    @logger.warn(@class_name) { "Not enough information for #{path}." }
    return {}
  end

  @logger.debug(@class_name) { "Complete information found for #{path}." }

  # Later merges win: advanced info overrides folder settings, which override basic info.
  result = {}.merge(basic_info).merge(dir_settings).merge(advanced_info)

  # path is absolute here, so the base must be absolute too:
  # Pathname#relative_path_from raises ArgumentError when one side is relative.
  base = Pathname.new(File.expand_path(top_dir))
  result[:file_rel_path] = Pathname.new(path).relative_path_from(base).to_s
  result
end
# Read basic and advanced properties for a file through the file info helper.
# Errors raised by the helper are logged rather than propagated; whichever
# value had not been obtained at that point is returned as nil.
# @param [String] file
# @param [String] utc_offset passed to the helper's advanced lookup
# @return [Array] basic_info, advanced_info
def file_info(file, utc_offset)
  basic = nil
  advanced = nil

  begin
    basic = @file_info_helper.basic(file)
    advanced = @file_info_helper.advanced(file, utc_offset)

    summary = "properties for file #{file} using offset #{utc_offset}: Basic: #{basic}. Advanced: #{advanced}."
    complete = !basic.blank? && !advanced.blank?

    if complete
      @logger.debug(@class_name) { "Successfully got #{summary}" }
    else
      @logger.info(@class_name) { "Could not get #{summary}" }
    end
  rescue StandardError => error
    @logger.error(@class_name) do
      "Problem getting details for #{file} using utc offset '#{utc_offset}': #{format_error(error)}"
    end
  end

  [basic, advanced]
end
# Get all files directly inside a directory (entries starting with a dot are
# not matched by the '*' pattern).
# @param [String] path directory
# @return [Array<String>] files
def files_in_directory(path)
  # The directory is part of the glob pattern, so glob metacharacters in its
  # name (e.g. "site [1]" or "set{a,b}") must be escaped to be taken literally.
  literal_path = path.gsub(/[\\*?\[\]{}]/) { |char| "\\#{char}" }

  items_in_dir = Dir.glob(File.join(literal_path, '*'))
  files_in_dir = items_in_dir.select { |item| File.file?(item) }

  @logger.info(@class_name) { "Found #{files_in_dir.size} files in #{path}." }
  @logger.debug(@class_name) { "Files in #{path}: '#{files_in_dir.join(', ')}'." }

  files_in_dir
end
# Get all directories directly inside a directory (entries starting with a dot
# are not matched by the '*/' pattern).
# @param [String] path directory
# @return [Array<String>] directories, each ending with a path separator
def directories_in_directory(path)
  # The directory is part of the glob pattern, so glob metacharacters in its
  # name (e.g. "site [1]" or "set{a,b}") must be escaped to be taken literally.
  literal_path = path.gsub(/[\\*?\[\]{}]/) { |char| "\\#{char}" }

  # The trailing slash in the pattern restricts matches to directories.
  dirs_in_dir = Dir.glob(File.join(literal_path, '*/'))

  @logger.info(@class_name) { "Found #{dirs_in_dir.size} directories in #{path}." }
  @logger.debug(@class_name) { "Directories in #{path}: '#{dirs_in_dir.join(', ')}'." }

  dirs_in_dir
end
# Read the harvest settings for a directory from its config file.
# A missing config file is acceptable: some files carry this information in
# their file names, so a directory may have no settings file at all.
# NOTE(review): YAML.load_file is applied to a file found in the harvest
# directory; depending on the YAML library version this may instantiate
# arbitrary objects. Confirm these files are trusted, or consider YAML.safe_load.
# @param [String] file path to the config file
# @return [Hash] settings keyed by symbol; empty when the file is missing,
#   has no content, cannot be loaded, or fails validation
def get_folder_settings(file)
  if !File.file?(file)
    @logger.debug(@class_name) { "Harvest directory config file was not found '#{file}'." }
    return {}
  elsif File.size?(file).nil?
    @logger.warn(@class_name) { "Harvest directory config file had no content '#{file}'." }
    return {}
  end

  begin
    raw = YAML.load_file(file)

    loaded = {}
    %w[project_id site_id uploader_id utc_offset metadata].each do |key|
      loaded[key.to_sym] = raw[key]
    end

    # The three ids must be numeric and the offset a valid time offset;
    # metadata is carried along without validation.
    ids_valid = [:project_id, :site_id, :uploader_id].all? do |key|
      @file_info_helper.numeric?(loaded[key])
    end

    if ids_valid && @file_info_helper.time_offset?(loaded[:utc_offset])
      @logger.debug(@class_name) { "Harvest directory settings loaded from config file #{file}." }
      loaded
    else
      @logger.warn(@class_name) { "Harvest directory config file was not valid '#{file}'. Could not get all settings." }
      {}
    end
  rescue StandardError => error
    @logger.warn(@class_name) { "Harvest directory config file was not valid '#{file}'. #{format_error(error)}" }
    {}
  end
end
# Format an error and the start of its backtrace for logging.
# @param [Exception] e error
# @return [String] the error message followed by at most 8 backtrace lines
def format_error(e)
  # Exception#backtrace is nil for an exception that has not been raised,
  # so coerce it to an array before taking the leading lines.
  trace = Array(e.backtrace).first(8)
  "Error: #{e}\nBacktrace: #{trace.join("\n")}"
end
end
end
end