bin/prepare_content from sul-dlss/pre-assembly

bin/prepare_content
Summary

Maintainability

Test Coverage

Issues
#!/usr/bin/env ruby
# frozen_string_literal: true

# Used in Aug 2022 by Meagan Trott, implying a workflow in the Rumsey Map Center that involves it

# Used to stage content from Rumsey or other similar format to folder structure ready for accessioning.
# This script is only known to be used by the Maps Accessioning team (Rumsey Map Center)
# Full documentation of how it is used is here (which needs to be updated if this script moves):
# https://consul.stanford.edu/pages/viewpage.action?pageId=146704638

# Iterate through each row in the supplied CSV manifest, find files, generate contentMetadata and copies/symlinks to new location.
# Note: filenames must match exactly (no leading 0s) but can be in any sub-folder

# Peter Mangiafico
# November 14, 2017
#
# Run with
# RAILS_ENV=production bin/prepare_content INPUT_CSV_FILE.csv FULL_PATH_TO_CONTENT FULL_PATH_TO_STAGING_AREA [--no-object-folders] [--report] [--symlink]
#  e.g.
# RAILS_ENV=production bin/prepare_content /maps/ThirdParty/Rumsey/Rumsey_Batch1.csv /maps/ThirdParty/Rumsey/content /maps/ThirdParty/Rumsey [--no-object-folders] [--report] [--symlink]

# the first parameter is the input CSV, with columns labeled "Object", "Image", and "Label" (Image is the filename; Object is the object identifier, which can be turned into a folder); "Sequence" and "Druid" columns, if present, are copied through to the log
# second parameter is the full path to the content folder that will be searched (i.e. the base content folder)
#      Note: files will be searched recursively through all sub-folders of the base content folder
# third parameter is optional and is the full path to a folder to stage (i.e. copy or symlink) content to - if not provided, will use the same folder as the csv file, and append "staging"
#
# if you set the --report switch, it will only produce the output report; it will not copy or symlink any files
# if you set the --no-object-folders switch, then all copied/symlinked files will be flat in the staging directory (i.e. no object level folders) -- this requires all filenames to be unique across objects; if left off, then object folders will be created to store copied/symlinked files
# if you set the --symlink switch, files are symlinked into the staging folder instead of being copied
# note that file extensions do not matter when matching

require 'optparse'
# Usage text, shown as the option parser banner and printed when too few arguments are given.
help = "Usage:\n    #{$PROGRAM_NAME} INPUT_CSV_FILE BASE_CONTENT_FOLDER [STAGING_FOLDER] [--no-object-folders] [--report] [--symlink]\n"

# Every switch defaults to off; passing the switch on the command line turns it on.
# --report: only show output and produce the report, without copying/symlinking files or creating anything
report = false
# --no-object-folders: stage all files flat in the staging folder; when off, each object gets its own folder
no_object_folders = false
# --symlink: symlink files into the output folder; when off, files are copied
symlink = false

option_parser = OptionParser.new(help)
option_parser.on('--report') { report = true }
option_parser.on('--no-object-folders') { no_object_folders = true }
option_parser.on('--symlink') { symlink = true }
# parse! strips the recognised switches out of ARGV, leaving only the positional arguments
option_parser.parse!

# Validate the positional arguments and derive every path the script works with.
# The switches have already been removed from ARGV by the option parser, so what is
# left is INPUT_CSV_FILE, BASE_CONTENT_FOLDER and (optionally) STAGING_FOLDER.
if ARGV.size < 2
  puts help
  abort 'Incorrect number of argument provided - you need to supply an input CSV file and the folder to search for.'
end

# The script later changes its working directory to the base content folder, so all
# paths are made absolute up front; a relative path would otherwise resolve against a
# different directory from that point on (e.g. the log being appended to the wrong file).
csv_in = File.expand_path(ARGV[0])
base_content_folder = File.expand_path(ARGV[1])

source_path = File.dirname(csv_in)
source_name = File.basename(csv_in, File.extname(csv_in))
csv_out = File.join(source_path, "#{source_name}_log.csv") # the log lives next to the input CSV
action = symlink ? 'symlink' : 'copy' # verb used in the log messages and the final summary

# no staging path provided: use the same folder as the input CSV and append "staging"; otherwise use what was provided
staging_folder = ARGV[2] ? File.expand_path(ARGV[2]) : File.join(source_path, 'staging')

# fail fast on unusable input, before the slow environment load
abort "#{csv_in} not found" unless File.exist?(csv_in)
abort "#{base_content_folder} is not a directory" unless File.directory?(base_content_folder)

### After checking to see that the options look okay, now load the environment (this is slow, ~5s)
require_relative '../config/environment'

# The first time a manifest is processed there is no log file yet: create it with the header row.
unless File.exist?(csv_out)
  CSV.open(csv_out, 'a') do |f|
    f << %w[Object Image Filename Sequence Label Druid Success Message Time]
  end
end

# Read in the existing log file (results of earlier runs), one hash per row.
# The block form of CSV.open is used so the file handle is closed as soon as the rows are read.
log_file_data = CSV.open(csv_out, 'rb:bom|utf-8', headers: true) do |csv|
  csv.map { |row| row.to_hash.with_indifferent_access }
end

# Read the input manifest the same way.
csv_data = CSV.open(csv_in, 'rb:bom|utf-8', headers: true) do |csv|
  csv.map { |row| row.to_hash.with_indifferent_access }
end

# Announce the run configuration before any work starts.
start_time = Time.now

banner = ['', '***Prepare Content***']
# one line per active mode
banner << 'Only producing report' if report
banner << 'Creating object folders' unless no_object_folders
banner << 'Create symlinks instead of copy' if symlink
banner.push(
  "Input CSV File: #{csv_in}",
  "Logging to: #{csv_out}",
  "Base Content Folder: #{base_content_folder}",
  "Staging Folder: #{staging_folder}",
  "Started at: #{start_time}",
  ''
)
puts banner # puts prints each array element on its own line
$stdout.flush

# Bookkeeping for the run, updated while the manifest rows are processed.
found_objects = [] # object identifiers already seen during this run
n = 0 # number of the manifest row currently being processed
num_files_not_found = num_objects = num_files_copied = 0

# Work from inside the base content folder so the glob below yields paths relative to it.
FileUtils.cd(base_content_folder)
FileUtils.mkdir_p(staging_folder) unless report

# Every entry below the base content folder, at any depth, minus the names we never want to consider.
ignored_entries = ['.', '..', '.DS_Store']
files_to_search = Dir.glob('**/**').reject { |entry| ignored_entries.include?(entry) }

# Process the manifest one row at a time: skip rows that an earlier run already staged,
# create the object folder when a new object is first seen, find every file whose name
# (ignoring extension) matches the row's Image, copy or symlink it into the staging area,
# and append the outcome to the log CSV.
csv_data.each do |row|
  n += 1
  puts "Row #{n} out of #{csv_data.size}"
  $stdout.flush

  row_filename = row['Image']

  # A row with no Object or Image cell (e.g. a trailing line of empty cells) cannot be staged;
  # report it and move on instead of crashing on nil.
  if row['Object'].nil? || row_filename.nil?
    puts "......#{Time.now}: skipping row #{n} - missing Object or Image"
    puts ''
    next
  end

  object = row['Object'].gsub(',', '-') # commas are no good in filenames, use a dash instead
  label = row['Label']
  sequence = row['Sequence']
  druid = row['Druid']

  success = false

  filename = File.basename(row_filename, File.extname(row_filename)) # remove any extension from the filename that was provided

  # The log's Image column is written without the extension, so a logged row belongs to this
  # manifest row when it equals either spelling. An empty Success cell counts as neither outcome.
  logged_names = [row_filename, filename]
  logged_outcomes = log_file_data.select { |log_row| logged_names.include?(log_row['Image']) }
                                 .map { |log_row| log_row['Success'].to_s.downcase }
  previously_found = logged_outcomes.include?('true')
  previously_missed = logged_outcomes.include?('false')

  # only look for this file if it has not already been found according to the output log file
  if previously_found
    puts "......#{Time.now}: skipping #{object} - already run"
    next
  end

  object_folder = File.join(staging_folder, object)

  unless found_objects.include?(object) # check to see if we have a new object so we can create a new output folder for it
    msg = "...#{Time.now}: Found new object: '#{object}'"
    unless no_object_folders || report
      FileUtils.mkdir_p object_folder
      msg += " - creating object folder '#{object_folder}' if it does not exist"
    end
    found_objects << object
    num_objects += 1
    puts msg
  end

  puts "......#{Time.now}: looking for file '#{filename}', object '#{object}', label '#{label}'"
  # Match entries whose name is exactly the filename followed by an extension, either at the top
  # level or directly after a path separator (i.e. in any sub-directory); case is ignored.
  # e.g. looking for "test.csv" matches "test.csv", "test.jpg", "dir/test.csv" and "dir/test.jpg",
  # but NOT "0test.jpg" or "dir/0test.jpg", and NOT a bare "test" with no extension.
  # The filename is escaped so characters such as "." or "(" are matched literally.
  files_found = files_to_search.grep(%r{(?:\A|/)#{Regexp.escape(filename)}\.\S+}i)

  # if found, copies or symlinks files that match
  files_found.each do |input_file|
    input_filename = File.basename(input_file)
    message = "found #{input_file}, #{action} to object folder #{object_folder}"
    output_file_full_path = File.join(no_object_folders ? staging_folder : object_folder, input_filename)
    input_file_full_path = Pathname.new(File.join(base_content_folder, input_file)).cleanpath(true).to_s
    # nothing is written in report mode, and an already-staged file is left alone
    unless report || File.exist?(output_file_full_path)
      if symlink
        FileUtils.ln_s(input_file_full_path, output_file_full_path, force: true)
      else
        FileUtils.cp(input_file_full_path, output_file_full_path)
      end
    end
    num_files_copied += 1
    success = true
    CSV.open(csv_out, 'a') do |f|
      f << [object, filename, input_filename, sequence, label, druid, success, message, Time.now]
    end
    puts "......#{message}"
  end

  # do not log if it was previously missed and we missed it again
  if !previously_missed && !success
    message = "ERROR #{filename} NOT FOUND"
    num_files_not_found += 1
    CSV.open(csv_out, 'a') do |f|
      f << [object, filename, '', sequence, label, druid, success, message, Time.now]
    end
    puts "......#{message}"
  end

  puts ''
  $stdout.flush
end

# Final summary: totals gathered during the run, then the wall-clock duration in minutes.
puts '',
     "Total objects staged: #{num_objects}",
     "Total files #{action}: #{num_files_copied}",
     "Total rows: #{csv_data.size}",
     "Total files not found: #{num_files_not_found}",
     ''

completed_at = Time.now
elapsed_minutes = (Time.now - start_time) / 60.0
puts "Completed at #{completed_at}, total time was #{format('%.2f', elapsed_minutes)} minutes"