lib/fedora/fedora_three_objects_migration_methods.rb
# frozen_string_literal: true
# Private helpers for exporting the datastreams of Fedora 3 objects to disk.
#
# The including object is expected to provide the instance variables these
# methods read and write (@pid, @pid_xml, @pids, @fedora_three_path,
# @fedora_username, @fedora_password, @pids_with_no_binaries,
# @pids_with_filenames, @date_time_started) and an ALLOWED_TYPES constant,
# which is used here as a mapping of MIME type => file extension.
module FedoraThreeObjectsMigrationMethods
  private

  # Creates the "emory_<pid>" working folder (no-op when it already exists).
  def make_pid_folder
    FileUtils.mkdir_p "emory_#{@pid}"
  end

  # Runs the external fedora-export.sh script for the current PID and parses
  # the FOXML file it is expected to leave at ./emory_<pid>.xml.
  #
  # NOTE(review): the exit status of the export script is not checked; a failed
  # export surfaces as Errno::ENOENT from File.open (or a stale file is parsed).
  #
  # @return the parsed Nokogiri XML document
  def pull_pid_xml
    path_parts = @fedora_three_path.split('://')
    system('fedora-export.sh', path_parts.last, @fedora_username, @fedora_password,
           "emory:#{@pid}", 'info:fedora/fedora-system:FOXML-1.1', 'migrate', '.', path_parts.first)
    # Block form closes the file handle once Nokogiri has read the document.
    File.open("./emory_#{@pid}.xml") { |file| Nokogiri::XML(file) }
  end

  # Saves every relevant datastream of the current PID into its folder.
  # PIDs without any binary datastream are only recorded in
  # @pids_with_no_binaries; nothing is written to disk for them.
  def copy_files_to_folder
    datastreams = @pid_xml.xpath('//foxml:datastream')
    if pid_lacks_binaries(datastreams)
      @pids_with_no_binaries += [@pid]
      return
    end

    make_pid_folder
    datastreams.each do |datastream|
      if test_for_audit(datastream:)
        pull_audit_object(datastream:)
      elsif test_for_xmls(datastream:)
        pull_xml_object(datastream:)
      elsif test_for_license(datastream:) || test_for_allowed_mime_type(datastream:)
        pull_binary_object(datastream:)
      end
    end
  end

  # True for XML/RDF datastreams other than AUDIT and DC. The MIME type is read
  # from the datastream's first child element.
  def test_for_xmls(datastream:)
    !%w[AUDIT DC].include?(datastream['ID']) &&
      ['text/xml', 'application/rdf+xml'].include?(datastream.elements.first['MIMETYPE'])
  end

  # True when the datastream is the AUDIT trail.
  def test_for_audit(datastream:)
    datastream['ID'] == 'AUDIT'
  end

  # True when the datastream is the SYMPLECTIC-LICENCE datastream.
  def test_for_license(datastream:)
    datastream['ID'] == 'SYMPLECTIC-LICENCE'
  end

  # True when the datastream's MIME type contains any key of ALLOWED_TYPES.
  def test_for_allowed_mime_type(datastream:)
    ALLOWED_TYPES.any? { |k, _v| datastream.elements.first['MIMETYPE'].include?(k.to_s) }
  end

  # Counts the datastreams that are neither XML, AUDIT nor licence and have an
  # allowed MIME type, stores the count in @number_of_binary_datastreams and
  # reports whether there are none.
  #
  # @return [Boolean] true when the PID has no binary datastreams
  def pid_lacks_binaries(datastreams)
    @number_of_binary_datastreams = datastreams.count do |ds|
      !test_for_xmls(datastream: ds) && !test_for_audit(datastream: ds) &&
        test_for_allowed_mime_type(datastream: ds) && !test_for_license(datastream: ds)
    end
    @number_of_binary_datastreams.zero?
  end

  # Writes the inline XML content of the AUDIT datastream to AUDIT.xml.
  def pull_audit_object(datastream:)
    # './/' keeps the search relative to this datastream; a leading '//' would
    # search the whole document and could return another datastream's content.
    audit_content = datastream.at_xpath('.//foxml:datastreamVersion/foxml:xmlContent').to_s
    File.binwrite("./emory_#{@pid}/AUDIT.xml", audit_content)
    record_filenames_with_path('AUDIT.xml')
  end

  # Downloads an XML datastream and stores it as "<last URI segment>.xml".
  def pull_xml_object(datastream:)
    xml_doc = datastream['ID']
    # Block form closes the download (and its temp file, if any) when done.
    URI.open("#{@fedora_three_path}/fedora/get/emory:#{@pid}/#{xml_doc}") do |download|
      filename = "#{download.base_uri.to_s.split('/')[-1]}.xml"
      IO.copy_stream(download, "./emory_#{@pid}/#{filename}")
      record_filenames_with_path(filename)
    end
  end

  # Downloads a binary datastream and stores it under the name chosen by
  # process_binary_filename (which also sets @binary_id).
  def pull_binary_object(datastream:)
    binary_save_name = process_binary_filename(datastream:)
    URI.open("#{@fedora_three_path}/fedora/get/emory:#{@pid}/#{@binary_id}") do |download|
      IO.copy_stream(download, "./emory_#{@pid}/#{binary_save_name}")
      record_filenames_with_path(binary_save_name)
    end
  end

  # Reads the CSV at @pids (first row is a header) and returns all remaining
  # cell values as one flat array.
  def pull_pids_csv
    # Block form closes the CSV file handle after reading.
    ::CSV.open(@pids, headers: true, return_headers: false) { |csv| csv.map(&:fields).flatten }
  end

  # Appends filename to the ';'-separated list kept for the current PID.
  def record_filenames_with_path(filename)
    recorded = @pids_with_filenames[@pid]
    @pids_with_filenames[@pid] = recorded.nil? ? filename : "#{recorded};#{filename}"
  end

  # Writes the end-of-run reports: a text file listing PIDs without binaries
  # and a CSV of PIDs with the filenames saved for each. Either report is
  # skipped when there is nothing to put in it.
  def file_end_reports
    # PIDs with no binaries report
    unless @pids_with_no_binaries.empty?
      File.write("./pids_with_no_binaries_#{@date_time_started}.txt",
                 "List of PIDs with no binary files: #{@pids_with_no_binaries.join(', ')}")
    end
    return if @pids_with_filenames.empty?

    # PIDs with binaries CSV
    ::CSV.open("./pids_with_binaries_#{@date_time_started}.csv", 'wb') do |csv|
      csv << ['pid', 'filenames']
      @pids_with_filenames.to_a.each { |row| csv << row }
    end
  end

  # Shortens filenames longer than 150 characters to the first 100 characters
  # of the text before the first dot, a TRUNCATED_FILE_NAME marker and the
  # original extension (the text after the last dot), when there is one.
  #
  # @param filename [String]
  # @return [String] filename itself when it is 150 characters or shorter
  def truncate_long_filenames(filename)
    return filename unless filename.length > 150

    stem, *rest = filename.split('.')
    # Without a dot there is no extension; do not append the whole name again.
    extension = rest.empty? ? '' : ".#{rest.last}"
    "#{stem.to_s[0..99]}TRUNCATED_FILE_NAME#{extension}"
  end

  # Chooses the name a binary datastream is saved under and remembers the
  # datastream ID in @binary_id.
  #
  # The datastream label (stripped of characters outside [\s0-9A-Za-z._-],
  # spaces turned into underscores, truncated if too long) is used unless
  # blank_filename_test rejects it, in which case "content.<ext>" is used.
  #
  # NOTE(review): for a licence datastream without a usable label the
  # extension is nil, so the result is "content." (unchanged behavior).
  def process_binary_filename(datastream:)
    @binary_id = datastream['ID']
    binary_filename =
      (datastream.elements.first&.[]('LABEL')&.gsub(/[^\s0-9A-Za-z._-]/, '') unless @binary_id == 'content')
    binary_ext = ALLOWED_TYPES.find { |k, _v| datastream.elements.first['MIMETYPE'].include?(k.to_s) }[1] unless test_for_license(datastream:)

    if blank_filename_test(datastream:, binary_filename:)
      ['content', binary_ext].join('.')
    else
      truncate_long_filenames(binary_filename.tr(' ', '_'))
    end
  end

  # Decides whether the label-derived filename must be replaced by the generic
  # "content.<ext>" name.
  #
  # @return [Boolean] true when the label is missing or contains '/', or, for
  #   non-licence datastreams, when it is the PID's only binary datastream or
  #   the label contains none of the ALLOWED_TYPES extensions
  def blank_filename_test(datastream:, binary_filename:)
    return true if binary_filename.nil? || binary_filename.include?('/')
    return false if test_for_license(datastream:)

    @number_of_binary_datastreams == 1 ||
      ALLOWED_TYPES.values.none? { |ext| binary_filename.include?(".#{ext}") }
  end
end