sul-dlss/argo

View on GitHub
app/jobs/modsulator_job.rb

Summary

Maintainability
A
0 mins
Test Coverage
A
95%
# frozen_string_literal: true

# This class defines an ActiveJob task that is started when the user uploads a bulk metadata file for
# an APO.
class ModsulatorJob < ApplicationJob
  # Timestamp format used for every time written to the job log.
  # NOTE(review): this pairs a 24-hour clock (%H) with an am/pm suffix (%P). Left untouched because
  # whatever reads the log later is not visible here and may rely on the exact format.
  TIME_FORMAT = '%Y-%m-%d %H:%M%P'

  # Does all the work of converting an input file to MODS XML, writing a log file as it goes along.
  # Later, this log file will be used to generate a nicer looking log for the user to view and to generate the list of
  # spreadsheet upload jobs within the Argo UI.
  #
  # @param  [String]  apo_id             DRUID of the DOR APO that governs all of the objects we're trying to upload metadata for.
  # @param  [String]  uploaded_filename  Full path to the temporary uploaded file. Deleted upon successful completion.
  # @param  [String]  output_directory   Where to store output (log, generated XML etc.).
  # @param  [User]    user               Acting user
  # @param  [Object]  groups             Passed to user.set_groups_to_impersonate before the Ability is built
  #                                      (presumably a list of group names -- confirm against the caller).
  # @param  [String]  filetype           One of 'xml', 'spreadsheet', or 'xml_only'. 'xml_only' input is normalized,
  #                                      anything else is converted from a spreadsheet. Unless the value is 'xml',
  #                                      the resulting MODS is also loaded into DOR.
  # @param  [String]  note               An optional note that the user entered to go with the job.
  # @return [void]
  def perform(apo_id, uploaded_filename, output_directory, user, groups, filetype = 'spreadsheet', note = '')
    original_filename = generate_original_filename(uploaded_filename)
    log_filename = generate_log_filename(output_directory)
    persist_metadata = load_to_dor?(filetype)
    # Not called `method`, which would shadow Object#method inside this scope.
    modsulator_operation = operation(filetype)

    user.set_groups_to_impersonate(groups)
    ability = Ability.new(user)

    File.open(log_filename, 'a') do |log|
      start_log(log, user, original_filename, note)

      # If a modsulator request fails, the job will fail and automatically be retried
      response_xml = if modsulator_operation == 'normalize'
                       ModsulatorClient.normalize_mods(uploaded_filename:, pretty_filename: original_filename, log:)
                     else
                       ModsulatorClient.convert_spreadsheet_to_mods(uploaded_filename:,
                                                                    pretty_filename: original_filename, log:)
                     end

      if response_xml.nil?
        log.puts('argo.bulk_metadata.bulk_log_error_exception Got no response from server')
        log.puts("argo.bulk_metadata.bulk_log_job_complete #{Time.zone.now.strftime(TIME_FORMAT)}")
        # Leaves the method early: the uploaded file is intentionally NOT removed on this path.
        return nil
      end

      metadata_path = File.join(output_directory, generate_xml_filename(original_filename))
      save_metadata_xml(response_xml, metadata_path, log)

      if persist_metadata
        log.puts('argo.bulk_metadata.bulk_log_xml_only false')
        update_metadata(apo_id, response_xml, original_filename, ability, log) # Load into DOR
      end

      log.puts("argo.bulk_metadata.bulk_log_job_complete #{Time.zone.now.strftime(TIME_FORMAT)}")
    end
    # Remove the (temporary) uploaded file only if everything worked. Removing upon catching an exception causes
    # subsequent job attempts to fail.
    FileUtils.rm(uploaded_filename, force: true)
  end

  # When filetype = 'xml' the user just wants to convert the submitted spreadsheet to MODS. No need to load into DOR.
  #
  # @param  [String] filetype  The filetype given to #perform.
  # @return [Boolean] false only for 'xml'; true for every other value.
  def load_to_dor?(filetype)
    filetype != 'xml'
  end

  # Picks which modsulator operation to run for the given filetype.
  #
  # @param  [String] filetype  The filetype given to #perform.
  # @return [String] 'normalize' for 'xml_only', otherwise 'convert'.
  def operation(filetype)
    filetype == 'xml_only' ? 'normalize' : 'convert'
  end

  # Upload metadata into DOR.
  #
  # @param  [String] druid               The governing APO's druid.
  # @param  [String] xml_string          A MODS XML string.
  # @param  [String] original_filename   The name of the uploaded file
  # @param  [Ability] ability            The abilities of the current user
  # @param  [File] log                   Log file handle.
  # @return [void]
  def update_metadata(druid, xml_string, original_filename, ability, log)
    return if xml_string.nil?

    # NOTE(review): root is nil for an empty document, and root.namespace is nil when the root element
    # carries no namespace; either case raises NoMethodError on the xpath line below. Confirm the
    # modsulator response always has a namespaced root.
    root = Nokogiri::XML(xml_string).root

    # Loop through each <xmlDoc> node and add the MODS XML that it contains to the object's descMetadata
    mods_list = root.xpath('//x:xmlDoc', 'x' => root.namespace.href)
    mods_list.each do |xmldoc_node|
      item_druid = Druid.new(xmldoc_node.attr('objectId')).with_namespace

      unless DruidTools::Druid.valid? item_druid
        log.puts("argo.bulk_metadata.bulk_log_invalid_druid #{item_druid}")
        next
      end

      begin
        cocina = Repository.find(item_druid)

        ApplyModsMetadata.new(apo_druid: druid,
                              mods: xmldoc_node.first_element_child.to_s,
                              cocina:,
                              original_filename:,
                              ability:,
                              log:).apply
      rescue Dor::Services::Client::NotFoundResponse => e
        # A missing object is logged and skipped so the remaining records are still processed.
        log.puts("argo.bulk_metadata.bulk_log_not_exist #{item_druid}")
        log.puts(e.message.to_s)
        log.puts(e.backtrace.to_s)
      end
    end
  end

  # Generate a filename for the job's log file, creating the output directory if needed.
  #
  # @param  [String] output_dir Where to store the log file.
  # @return [String] A filename for the log file.
  def generate_log_filename(output_dir)
    # mkdir_p is a no-op when the directory already exists, so no existence check is required.
    FileUtils.mkdir_p(output_dir)
    # This log will be used for generating the table of past jobs later
    File.join(output_dir, Settings.bulk_metadata.log)
  end

  # The uploaded filename is of the form <file.xlsx.TIMESTAMP> or <file.xml.TIMESTAMP> in order to prevent
  # collisions when 2 people upload the same file. We don't want to display the timestamp later, though, so this method
  # returns a nicer looking version of the filename: everything before the last '.' of the basename.
  #
  # @param  [String] uploaded_filename  The full path to the temporary uploaded file.
  # @return [String] A prettier version of the uploaded filename. A basename without any '.' is returned unchanged.
  def generate_original_filename(uploaded_filename)
    basename = File.basename(uploaded_filename)
    last_dot = basename.rindex('.')
    # rindex returns nil when there is no '.'; slicing with nil would raise TypeError.
    last_dot ? basename.slice(0, last_dot) : basename
  end

  # Write initial job information to the log file.
  #
  # @param [File] log_file The log file to write to.
  # @param [User] user The current user; their sunetid is written to the log.
  # @param [String] filename The name of this job's input file.
  # @param [String] note An optional comment that describes this job. Only logged when present.
  # @return [void]
  def start_log(log_file, user, filename, note = '')
    log_file.puts("argo.bulk_metadata.bulk_log_job_start #{Time.zone.now.strftime(TIME_FORMAT)}")
    log_file.puts("argo.bulk_metadata.bulk_log_user #{user.sunetid}")
    log_file.puts("argo.bulk_metadata.bulk_log_input_file #{filename}")
    log_file.puts("argo.bulk_metadata.bulk_log_note #{note}") if note.present?
    log_file.flush # record start in case of crash
  end

  # Writes the generated XML to output_filename on disk and updates the log with a timestamp, the file's
  # basename and a record count.
  #
  # @param  [String]  xml                An XML string, which will be written to output_filename.
  # @param  [String]  output_filename    The full path for where to store the XML file.
  # @param  [File]    log_file           The log file.
  # @return [void]
  def save_metadata_xml(xml, output_filename, log_file)
    return if xml.nil?

    File.write(output_filename, xml)
    log_file.puts("argo.bulk_metadata.bulk_log_xml_timestamp #{Time.zone.now.strftime(TIME_FORMAT)}")
    log_file.puts("argo.bulk_metadata.bulk_log_xml_filename #{File.basename(output_filename)}")
    # The record count is the number of occurrences of the literal '<xmlDoc id' in the XML text.
    log_file.puts("argo.bulk_metadata.bulk_log_record_count #{xml.scan('<xmlDoc id').size}")
  end

  # Generates a filename for the MODS XML that this job creates: the original name without its last extension,
  # followed by '-', the configured Settings.bulk_metadata.xml value and '.xml'.
  #
  # @param  [String]   original_filename    The name of the original file that the user uploaded.
  # @return [String]
  def generate_xml_filename(original_filename)
    "#{File.basename(original_filename, '.*')}-#{Settings.bulk_metadata.xml}.xml"
  end
end