bin/prepare_content
#!/usr/bin/env ruby
# frozen_string_literal: true
# Used in Aug 2022 by Meagan Trott, implying a workflow in the Rumsey Map Center that involves it
# Used to stage content from Rumsey or other similar format to folder structure ready for accessioning.
# This script is only known to be used by the Maps Accessioning team (Rumsey Map Center)
# Full documentation of how it is used is here (which needs to be updated if this script moves):
# https://consul.stanford.edu/pages/viewpage.action?pageId=146704638
# Iterate through each row in the supplied CSV manifest, find files, generate contentMetadata and copies/symlinks to new location.
# Note: filenames must match exactly (no leading 0s) but can be in any sub-folder
# Peter Mangiafico
# November 14, 2017
#
# Run with
# RAILS_ENV=production bin/prepare_content INPUT_CSV_FILE.csv FULL_PATH_TO_CONTENT FULL_PATH_TO_STAGING_AREA [--no-object-folders] [--report]
# e.g.
# RAILS_ENV=production bin/prepare_content /maps/ThirdParty/Rumsey/Rumsey_Batch1.csv /maps/ThirdParty/Rumsey/content /maps/ThirdParty/Rumsey [--no-object-folders] [--report]
# the first parameter is the input CSV, with columns labeled "Object", "Image", and "Label" (Image is the filename; Object is the object identifier, which can be turned into a folder); "Sequence" and "Druid" columns, if present, are copied through to the log
# second parameter is the full path to the content folder that will be searched (i.e. the base content folder)
# Note: files will be searched recursively through all sub-folders of the base content folder
# third parameter is optional and is the full path to a folder to stage (i.e. copy or symlink) content to - if not provided, will use same path as csv file, and append "staging"
#
# if you set the --report switch, it will only produce the output report; it will not copy or symlink any files
# if you set the --no-object-folders switch, all copied/symlinked files are placed flat in the staging directory (i.e. no object-level folders); this requires all filenames to be unique across objects. If the switch is left off, a folder is created per object to hold its copied/symlinked files
# if you set the --symlink switch, files are symlinked into the staging area instead of being copied
# note that file extensions do not matter when matching
require 'optparse'
# Command-line switches. Each one starts switched off and is turned on by its flag.
# --report: only print output and write the report; nothing is copied, symlinked or created
report = false
# --no-object-folders: stage every file flat in the staging folder instead of in one folder per object
no_object_folders = false
# --symlink: symlink files into the staging area instead of copying them
symlink = false
help = "Usage:\n #{$PROGRAM_NAME} INPUT_CSV_FILE BASE_CONTENT_FOLDER [STAGING_FOLDER] [--no-object-folders] [--report] [--symlink]\n"

# parse! strips the recognised switches out of ARGV, leaving only the positional arguments behind
option_parser = OptionParser.new
option_parser.banner = help
option_parser.on('--report') { report = true }
option_parser.on('--no-object-folders') { no_object_folders = true }
option_parser.on('--symlink') { symlink = true }
option_parser.parse!
# Validate the positional arguments and derive every path the rest of the script uses.
if ARGV.size < 2
  puts help
  abort 'Incorrect number of argument provided - you need to supply an input CSV file and the folder to search for.'
end

# All paths are made absolute here, while the working directory is still the one the user launched from.
# The script later changes directory into the content folder; a relative CSV, content or staging path
# would otherwise be resolved against that folder from then on (log rows appended to the wrong file,
# staging folder created inside the content folder, copy/symlink sources not found).
csv_in = File.expand_path(ARGV[0])
base_content_folder = File.expand_path(ARGV[1])
source_path = File.dirname(csv_in)
source_name = File.basename(csv_in, File.extname(csv_in))
# the log lives next to the input CSV and is named after it
csv_out = File.join(source_path, "#{source_name}_log.csv")
action = symlink ? 'symlink' : 'copy'
# no staging path provided: use the folder of the input CSV and append "staging"; otherwise use what was provided
staging_folder = ARGV[2] ? File.expand_path(ARGV[2]) : File.join(source_path, 'staging')

# fail fast on bad input, before the slow environment load
abort "#{csv_in} not found" unless File.exist?(csv_in)
abort "#{base_content_folder} not found" unless File.directory?(base_content_folder)
### After checking to see that the options look okay, now load the environment (this is slow, ~5s)
require_relative '../config/environment'

# if we don't already have a log file, create it and write out the header row
unless File.exist?(csv_out)
  CSV.open(csv_out, 'a') do |f|
    f << %w[Object Image Filename Sequence Label Druid Success Message Time]
  end
end

# Reads every data row of a CSV with a header line into an array of hashes (tolerating a UTF-8 BOM).
# The block form of CSV.open is used so the file handle is closed as soon as the rows are loaded.
read_rows = lambda do |path|
  CSV.open(path, 'rb:bom|utf-8', headers: true) do |csv|
    csv.map { |row| row.to_hash.with_indifferent_access }
  end
end

# read in existing log file
log_file_data = read_rows.call(csv_out)
# read input manifest
csv_data = read_rows.call(csv_in)
# Record when the run began and print a banner describing the settings in effect.
start_time = Time.now
banner = ['', '***Prepare Content***']
banner << 'Only producing report' if report
banner << 'Creating object folders' unless no_object_folders
banner << 'Create symlinks instead of copy' if symlink
banner.push(
  "Input CSV File: #{csv_in}",
  "Logging to: #{csv_out}",
  "Base Content Folder: #{base_content_folder}",
  "Staging Folder: #{staging_folder}",
  "Started at: #{start_time}",
  ''
)
# puts writes each array element on its own line; the empty strings become blank lines
puts banner
$stdout.flush
# Objects already encountered in this run, plus the running tallies reported in the final summary.
found_objects = []
n = num_files_not_found = num_objects = num_files_copied = 0

# Work from inside the content folder so the listing below yields paths relative to it.
Dir.chdir(base_content_folder)
FileUtils.mkdir_p(staging_folder) unless report

# Everything beneath the content folder, listed once up front and searched for every manifest row.
ignored_entries = %w[. .. .DS_Store]
files_to_search = Dir['**/**'].reject { |entry| ignored_entries.include?(entry) }
# Index the existing log once: for each outcome ('true' / 'false') remember which Image values were logged with it.
# A blank Success cell counts as neither outcome (instead of raising on nil).
logged_images = { 'true' => {}, 'false' => {} }
log_file_data.each do |log_row|
  images_for_outcome = logged_images[log_row['Success'].to_s.downcase]
  images_for_outcome[log_row['Image']] = true if images_for_outcome
end

# Walk the manifest row by row: skip rows the log already records as successful, create the object folder
# when an object is first seen, find every content file whose name matches the row's image (ignoring the
# extension), copy or symlink each match into the staging area, and append the outcome to the log.
csv_data.each do |row|
  n += 1
  puts "Row #{n} out of #{csv_data.size}"
  $stdout.flush
  # a row without an object or image cannot be staged; report it and carry on rather than abort the whole run
  if row['Object'].to_s.strip.empty? || row['Image'].to_s.strip.empty?
    puts "......#{Time.now}: ERROR row #{n} has no Object or Image value - skipping"
    puts ''
    $stdout.flush
    next
  end
  object = row['Object'].tr(',', '-') # commas are no good in filenames, use a dash instead
  row_filename = row['Image']
  label = row['Label']
  sequence = row['Sequence']
  druid = row['Druid']
  success = false
  filename = File.basename(row_filename, File.extname(row_filename)) # remove any extension from the filename that was provided
  # the log's Image column is written without the extension (see the rows appended below),
  # so look the image up under both the name as given in the manifest and the extension-less name
  logged_names = [row_filename, filename]
  previously_found = logged_names.any? { |name| logged_images['true'].key?(name) }
  previously_missed = logged_names.any? { |name| logged_images['false'].key?(name) }
  if previously_found
    puts "......#{Time.now}: skipping #{object} - already run"
    next
  end
  # only look for this file if it has not already been found according to the output log file
  object_folder = File.join(staging_folder, object)
  unless found_objects.include?(object) # check to see if we have a new object so we can create a new output folder for it
    msg = "...#{Time.now}: Found new object: '#{object}'"
    unless no_object_folders || report
      FileUtils.mkdir_p object_folder
      msg += " - creating object folder '#{object_folder}' if it does not exist"
    end
    found_objects << object
    num_objects += 1
    puts msg
  end
  puts "......#{Time.now}: looking for file '#{filename}', object '#{object}', label '#{label}'"
  # A content file matches when its own name (in whatever sub-folder it sits) is the requested name followed
  # by an extension; the comparison ignores case and the extension itself.
  # e.g. looking for "test.csv" matches "test.csv", "test.jpg" and "dir/test.csv",
  # but NOT "0test.csv", "dir/0test.jpg" or an extension-less "test".
  # The name is escaped so characters such as "." or "(" in it are matched literally, and only
  # regular files are kept so a directory that happens to be named like the file is never copied.
  name_pattern = /\A#{Regexp.escape(filename)}\.\S/i
  files_found = files_to_search.select { |path| name_pattern.match?(File.basename(path)) && File.file?(path) }
  # if found, copies or symlinks files that match
  files_found.each do |input_file|
    input_filename = File.basename(input_file)
    message = "found #{input_file}, #{action} to object folder #{object_folder}"
    output_file_full_path = no_object_folders ? File.join(staging_folder, input_filename) : File.join(object_folder, input_filename)
    input_file_full_path = Pathname.new(File.join(base_content_folder, input_file)).cleanpath(true).to_s
    # never overwrite something that is already staged, and touch nothing in report mode
    unless report || File.exist?(output_file_full_path)
      if symlink
        FileUtils.ln_s(input_file_full_path, output_file_full_path, force: true)
      else
        FileUtils.cp(input_file_full_path, output_file_full_path)
      end
    end
    num_files_copied += 1
    success = true
    # the log is appended to after every file so progress survives an interrupted run
    CSV.open(csv_out, 'a') do |f|
      f << [object, filename, input_filename, sequence, label, druid, success, message, Time.now]
    end
    puts "......#{message}"
  end
  # do not log if it was previously missed and we missed it again
  if !previously_missed && !success
    message = "ERROR #{filename} NOT FOUND"
    num_files_not_found += 1
    CSV.open(csv_out, 'a') do |f|
      f << [object, filename, '', sequence, label, druid, success, message, Time.now]
    end
    puts "......#{message}"
  end
  puts ''
  $stdout.flush
end
# Closing summary: the tallies gathered while processing the manifest and the elapsed wall-clock time in minutes.
puts '',
     "Total objects staged: #{num_objects}",
     "Total files #{action}: #{num_files_copied}",
     "Total rows: #{csv_data.size}",
     "Total files not found: #{num_files_not_found}",
     "Completed at #{Time.now}, total time was #{format('%.2f', ((Time.now - start_time) / 60.0))} minutes"