UNC-Libraries/hy-c

View on GitHub
app/services/tasks/proquest_ingest_service.rb

Summary

Maintainability
A
2 hrs
Test Coverage
A
97%
# frozen_string_literal: true
module Tasks
  require 'fileutils'
  require 'htmlentities'
  require 'zip'
  require 'tasks/migration_helper'

  class ProquestIngestService < IngestService
    attr_reader :admin_set_id

    # Builds the service and caches the id of the admin set on the instance.
    #
    # Both arguments are forwarded unchanged to the superclass initializer
    # (bare `super`).
    #
    # @param config configuration passed through to IngestService
    # @param status_service status service passed through to IngestService
    def initialize(config, status_service)
      super

      # NOTE(review): @admin_set is not assigned in this class; presumably the
      # superclass initializer sets it — confirm against IngestService.
      @admin_set_id = @admin_set.id
    end

    # Name of the source this service ingests from.
    #
    # @return [String] always 'ProQuest'
    def ingest_source
      'ProQuest'
    end

    # URI representing the type of packaging used for the original deposit represented by this record, such as CDR METS or BagIt.
    #
    # @return [String] always 'http://proquest.com'
    def deposit_package_type
      'http://proquest.com'
    end

    # Subclassification of the packaging type for this deposit, such as a METS profile.
    #
    # @return [String] always 'ProQuest'
    def deposit_package_subtype
      'ProQuest'
    end

    # Ingests one ProQuest package: extracts it, creates a dissertation record
    # from the package's *_DATA.xml metadata, attaches every file listed in that
    # metadata plus the metadata file itself, and deletes the zip afterwards
    # when cleanup is enabled.
    #
    # @param package_path [String] path to the ProQuest zip file to ingest
    # @raise [RuntimeError] if the unzip directory is blank, the package does
    #   not contain exactly one top-level PDF, or no usable metadata file exists
    def process_package(package_path)
      @file_last_modified = ''
      unzipped_package_dir = unzip_dir(package_path)

      # extract files
      extract_files(package_path)

      raise "Error extracting #{package_path}: skipping zip file" if unzipped_package_dir.blank?

      metadata_file_path = metadata_file_path(dir: unzipped_package_dir)

      # Exactly one PDF is required directly inside the unzipped directory.
      # NOTE(review): the message below is also raised when there are zero PDFs.
      pdf_files = Dir.glob("#{unzipped_package_dir}/*.pdf")
      raise "Error: #{unzipped_package_dir} has more than 1 pdf file" unless pdf_files.count == 1

      raise "Package #{unzipped_package_dir} has no metadata file path" unless metadata_file_path

      raise "Package #{unzipped_package_dir} contains an empty metadata file" unless File.file?(metadata_file_path)

      # only use xml file for metadata extraction
      metadata, listed_files = proquest_metadata(metadata_file_path)

      logger.info("#{metadata_file_path}, Number of files: #{listed_files.count}")

      # create dissertation record
      resource = MigrationHelper.check_enumeration(metadata, Dissertation.new, metadata_file_path)
      resource.visibility = metadata['visibility']
      unless metadata['embargo_release_date'].blank?
        resource.visibility_during_embargo = metadata['visibility_during_embargo']
        resource.visibility_after_embargo = metadata['visibility_after_embargo']
        resource.embargo_release_date = metadata['embargo_release_date']
      end
      resource[:deposit_record] = deposit_record.id
      resource.save!

      id = resource.id

      logger.info("[#{metadata_file_path}] created dissertation: #{id}")

      # get group permissions info to use for setting work and fileset permissions
      group_permissions = MigrationHelper.get_permissions_attributes(@admin_set_id)
      resource.update permissions_attributes: group_permissions

      # Create sipity record. Each relation is resolved once and the result
      # reused, rather than calling `.first` on the relation repeatedly.
      workflow = Sipity::Workflow.joins(:permission_template)
                                 .where(permission_templates: { source_id: resource.admin_set_id }, active: true)
                                 .first
      workflow_state = Sipity::WorkflowState.where(workflow_id: workflow.id, name: 'deposited').first
      Sipity::Entity.create!(proxy_for_global_id: resource.to_global_id.to_s,
                             workflow: workflow,
                             workflow_state: workflow_state)

      # get list of all files in unzipped proquest package
      unzipped_file_list = Dir.glob("#{unzipped_package_dir}/**/*.*")

      ordered_members = []
      listed_files.each do |f|
        logger.info("[#{id}] trying...#{f}")

        # Compare literally: String#match would compile the file name into a
        # regular expression, so names containing characters such as "(", "+"
        # or "[" would fail to match (or raise) and "." would match anything.
        file_path = unzipped_file_list.find { |e| e.include?(f.to_s) }
        if file_path.blank?
          logger.error("[#{id}] cannot find #{f}")
          next
        end

        next unless File.file?(file_path)

        file_set = ingest_proquest_file(parent: resource,
                                        resource: metadata.merge({ title: [f] }),
                                        f: file_path)
        file_set.update permissions_attributes: group_permissions
        ordered_members << file_set
      end
      resource.ordered_members = ordered_members

      # Attach metadata file
      fileset_attrs = { 'title' => [File.basename(metadata_file_path)] }
      fileset = ingest_proquest_file(parent: resource, resource: fileset_attrs, f: metadata_file_path)

      # Force visibility to private since it seems to be saving as public
      fileset.visibility = Hydra::AccessControls::AccessRight::VISIBILITY_TEXT_VALUE_PRIVATE
      fileset.update permissions_attributes: group_permissions

      resource.ordered_members << fileset

      # delete zip file after files have been extracted and ingested successfully
      File.delete(package_path) if cleanup_enabled?
    end

    # Locates the single ProQuest metadata file (*_DATA.xml) directly inside +dir+.
    #
    # @param dir [String] directory to search
    # @return [String, nil] the path when exactly one match exists; otherwise
    #   nil, after logging how many matches were found
    def metadata_file_path(dir:)
      candidates = Dir.glob("#{dir}/*_DATA.xml")
      return candidates.first.to_s if candidates.count == 1

      logger.error("Error: #{dir} has #{candidates.count} xml file(s)")
      nil
    end

    # Creates a FileSet for the file at +f+, uploads its content and attaches it
    # to +parent+.
    #
    # @param parent the work the new FileSet is attached to
    # @param resource [Hash] attributes used to build the FileSet metadata
    #   (filtered through #file_record)
    # @param f [String] path of the file to ingest
    # @return [FileSet] the newly created file set
    def ingest_proquest_file(parent: nil, resource: nil, f: nil)
      logger.info("[#{parent.id}] ingesting... #{f}")
      fileset_metadata = file_record(resource)

      # Drop all embargo fields when there is no release date to go with them.
      fileset_metadata.except!('embargo_release_date', 'visibility_during_embargo', 'visibility_after_embargo') if fileset_metadata['embargo_release_date'].blank?
      file_set = FileSet.create(fileset_metadata)
      actor = Hyrax::Actors::FileSetActor.new(file_set, @depositor)
      actor.create_metadata(fileset_metadata)

      # Block form closes the handle even when content creation or attachment
      # raises; the handle stays open for both calls, as before.
      File.open(f) do |file|
        actor.create_content(file)
        actor.attach_to_work(parent)
      end

      file_set
    end

    # Parses a ProQuest *_DATA.xml file into work attributes and the list of
    # files the package declares.
    #
    # @param metadata_file [String] path to the ProQuest metadata XML file
    # @return [Array(Hash, Array<String>)] the work attributes with blank values
    #   removed, and the declared file names (DISS_binary first, followed by
    #   each DISS_attachment file name)
    # @raise [Date::Error] if DISS_comp_date cannot be parsed as a year
    def proquest_metadata(metadata_file)
      # Block form guarantees the handle is closed even if parsing raises.
      metadata = File.open(metadata_file) { |file| Nokogiri::XML(file) }

      visibility_during_embargo = Hydra::AccessControls::AccessRight::VISIBILITY_TEXT_VALUE_PRIVATE
      visibility_after_embargo = Hydra::AccessControls::AccessRight::VISIBILITY_TEXT_VALUE_PUBLIC
      embargo_release_date = ''
      visibility = Hydra::AccessControls::AccessRight::VISIBILITY_TEXT_VALUE_PUBLIC

      embargo_code = metadata.xpath('//DISS_submission/@embargo_code').text

      logger.info("[#{metadata_file}] embargo code: #{embargo_code}")

      unless embargo_code.blank?
        current_date = Date.today
        comp_date_string = metadata.xpath('//DISS_description/DISS_dates/DISS_comp_date').text
        comp_date = Date.new(comp_date_string.to_i, 12, 31)

        # The embargo period starts at the earlier of today and the last day
        # of the completion year.
        embargo_start = [current_date, comp_date].min
        embargo_release_date = case embargo_code
                               when '2' then embargo_start + 1.year
                               when '3', '4' then embargo_start + 2.years
                               else ''
                               end

        # An embargo that has already expired is treated as no embargo at all.
        embargo_release_date = '' if embargo_release_date.present? && embargo_release_date < current_date

        visibility = visibility_during_embargo unless embargo_release_date.blank?
      end

      logger.info("[#{metadata_file}] embargo release date: #{embargo_release_date}")

      title = metadata.xpath('//DISS_description/DISS_title').text

      creators = metadata.xpath('//DISS_submission/DISS_authorship/DISS_author[@type="primary"]/DISS_name').map do |creator|
        format_name(creator)
      end

      degree_granting_institution = metadata.xpath('//DISS_description/DISS_institution/DISS_inst_name').text

      keywords = metadata.xpath('//DISS_description/DISS_categorization/DISS_keyword').text.split(', ')
      keywords.concat(metadata.xpath('//DISS_description/DISS_categorization/DISS_category/DISS_cat_desc').map(&:text))

      abstract = metadata.xpath('//DISS_content/DISS_abstract').text

      # strip removes the trailing space left behind when there is no middle name
      advisor = metadata.xpath('//DISS_description/DISS_advisor/DISS_name').map do |advise|
        "#{advise.xpath('DISS_surname').text}, #{advise.xpath('DISS_fname').text} #{advise.xpath('DISS_middle').text}".strip
      end

      committee_members = metadata.xpath('//DISS_description/DISS_cmte_member/DISS_name').map do |member|
        format_name(member)
      end
      advisor += committee_members

      abbreviated_degree = metadata.xpath('//DISS_description/DISS_degree').text

      dcmi_type = 'http://purl.org/dc/dcmitype/Text'
      normalized_degree = abbreviated_degree.downcase.gsub('.', '')
      degree_map = { 'ma' => 'Master of Arts',
                     'ms' => 'Master of Science',
                     'edd' => 'Doctor of Education',
                     'de' => 'Doctor of Education',
                     'phd' => 'Doctor of Philosophy',
                     'drph' => 'Doctor of Public Health',
                     'dnp' => 'Doctor of Nursing Practice' }
      degree_name = degree_map[normalized_degree]
      if degree_name.present?
        degree = DegreesService.label(degree_name)
      else
        logger.warn("[#{metadata_file}] unknown degree: #{abbreviated_degree}")
        degree = abbreviated_degree
      end

      resource_type = %w[ma ms].include?(normalized_degree) ? 'Masters Thesis' : 'Dissertation'

      department = metadata.xpath('//DISS_description/DISS_institution/DISS_inst_contact').text.strip

      date_issued = metadata.xpath('//DISS_description/DISS_dates/DISS_comp_date').text
      date_issued = Date.strptime(date_issued, '%Y')

      # Date#year is never nil, so no fallback is needed here.
      graduation_year = date_issued.year.to_s

      language = metadata.xpath('//DISS_description/DISS_categorization/DISS_language').text
      language_label = nil
      if language == 'en'
        language = MigrationHelper.get_language_uri(['eng'])
        language_label = LanguagesService.label(language) unless language.blank?
      end

      file_full = [metadata.xpath('//DISS_content/DISS_binary').text]
      file_full += metadata.xpath('//DISS_content/DISS_attachment').map do |attachment|
        attachment.xpath('DISS_file_name').text
      end

      work_attributes = {
        'title' => [title],
        'label' => title,
        'depositor' => @depositor.uid,
        'creators_attributes' => build_person_hash(creators, department),
        'date_issued' => (Date.try(:edtf, date_issued.year) || date_issued.year).to_s,
        'abstract' => abstract.gsub(/\n/, '').strip,
        'advisors_attributes' => build_person_hash(advisor, nil),
        'dcmi_type' => dcmi_type,
        'degree' => degree,
        'degree_granting_institution' => degree_granting_institution,
        'graduation_year' => graduation_year,
        'language' => language,
        'language_label' => language_label,
        'rights_statement' => 'http://rightsstatements.org/vocab/InC-EDU/1.0/',
        'rights_statement_label' => 'In Copyright - Educational Use Permitted',
        'keyword' => keywords,
        'resource_type' => resource_type,
        'visibility' => visibility,
        'embargo_release_date' => (Date.try(:edtf, embargo_release_date.to_s)).to_s,
        'visibility_during_embargo' => visibility_during_embargo,
        'visibility_after_embargo' => visibility_after_embargo,
        'admin_set_id' => @admin_set_id
      }

      work_attributes.reject! { |_k, v| v.blank? }

      [work_attributes, file_full]
    end

    # Converts a list of names into the nested-attributes hash shape: keys are
    # the zero-based position as a string, values hold the name and a one-based
    # 'index'. Affiliation details are added when a department is given.
    #
    # @param people [Array<String>] formatted person names
    # @param department [String, nil] department to attach to every person
    # @return [Hash{String => Hash}]
    def build_person_hash(people, department)
      people.each_with_index.each_with_object({}) do |(name, position), people_by_position|
        people_by_position[position.to_s] = { 'name' => name, 'index' => position + 1 }
        add_affiliation_hash(people_by_position, position, department) unless department.nil?
      end
    end

    # Records the department on the person stored at +index+: under
    # 'affiliation' when #affiliation returns a value for it, otherwise
    # verbatim under 'other_affiliation'.
    #
    # @param person_hash [Hash{String => Hash}] hash being built; mutated in place
    # @param index [Integer] position of the person to update
    # @param department [String] department name taken from the metadata
    # @return [Hash{String => Hash}] the same +person_hash+
    def add_affiliation_hash(person_hash, index, department)
      # Look the department up once and reuse the result; the local is named
      # differently from the #affiliation method to avoid shadowing it.
      mapped_department = affiliation(department)
      if mapped_department
        person_hash[index.to_s]['affiliation'] = mapped_department
      else
        person_hash[index.to_s]['other_affiliation'] = department
      end

      person_hash
    end

    # Maps a ProQuest department string to its standard department name.
    #
    # @param department [String, nil] department name from the metadata
    # @return [String, nil] the standard name; nil when +department+ is nil or
    #   empty, or when the mapping service does not recognise it (a warning is
    #   logged in that case)
    def affiliation(department)
      return if department.nil? || department.empty?

      begin
        ProquestDepartmentMappingsService.standard_department_name(department)
      rescue ProquestDepartmentMappingsService::UnknownDepartmentError
        logger.warn("Could not map to standard department name: #{department}")
        nil
      end
    end

    # Formats a DISS_name node as "Surname, First Middle, Suffix", leaving out
    # any part that is blank.
    #
    # @param person an XML node exposing DISS_surname, DISS_fname, DISS_middle
    #   and DISS_suffix children via #xpath
    # @return [String]
    def format_name(person)
      surname = person.xpath('DISS_surname').text
      given_names = "#{person.xpath('DISS_fname').text} #{person.xpath('DISS_middle').text}".strip
      suffix = person.xpath('DISS_suffix').text

      [surname, given_names, suffix].reject(&:blank?).join(', ')
    end

    # FileSets can include any metadata listed in BasicMetadata file
    #
    # Builds the attribute hash for a new FileSet from +attrs+, keeping only
    # keys that a FileSet knows about and coercing each value to match the
    # cardinality of the corresponding FileSet attribute.
    #
    # @param attrs [Hash] candidate attributes (string or symbol keys)
    # @return [Hash] attributes for the FileSet; always contains 'date_created'
    #   and 'visibility' (possibly nil), and the three embargo keys only when
    #   an embargo release date is present
    def file_record(attrs)
      # Attributes of a fresh FileSet tell us which keys exist and whether each
      # one is multi-valued; fetch them once rather than on every iteration.
      attribute_defaults = FileSet.new.attributes
      file_attributes = {}

      # Singularize non-enumerable attributes and make sure enumerable attributes are arrays
      attrs.each do |k, v|
        attribute_name = k.to_s
        next unless attribute_defaults.key?(attribute_name)

        # The cardinality check must look at the incoming value; inspecting the
        # still-empty result hash would never detect an enumerable value.
        multi_valued_attribute = attribute_defaults[attribute_name].respond_to?(:each)
        multi_valued_input = v.respond_to?(:each)

        file_attributes[k] = if !multi_valued_attribute && multi_valued_input
                               v.first
                             elsif multi_valued_attribute && !multi_valued_input
                               Array(v)
                             else
                               v
                             end
      end

      file_attributes['date_created'] = attrs['date_created']
      file_attributes['visibility'] = attrs['visibility']
      unless attrs['embargo_release_date'].blank?
        file_attributes['embargo_release_date'] = attrs['embargo_release_date']
        file_attributes['visibility_during_embargo'] = attrs['visibility_during_embargo']
        file_attributes['visibility_after_embargo'] = attrs['visibility_after_embargo']
      end

      file_attributes
    end

    # Checks that an extracted package has the expected contents.
    #
    # File names are compared by literal suffix. Passing '.pdf' or '_DATA.xml'
    # to String#match would compile them into regular expressions, where "."
    # matches any character and the match may occur anywhere in the name.
    #
    # @param extracted_files [Hash] hash whose keys are the extracted file names
    # @return [Boolean] true when there is exactly one *_DATA.xml file and at
    #   least one *.pdf file
    def valid_extract?(extracted_files)
      file_names = extracted_files.keys.map(&:to_s)

      # There should only be one _DATA.xml file
      metadata_file_count = file_names.count { |file_name| file_name.end_with?('_DATA.xml') }
      # There should be at least one PDF file, but there could be more if there are supplemental materials
      pdf_file_count = file_names.count { |file_name| file_name.end_with?('.pdf') }

      metadata_file_count == 1 && pdf_file_count >= 1
    end
  end
end