# archivesspace/archivesspace
#
# View on GitHub
# backend/app/converters/digital_object_converter.rb
#
# Summary
#
# Maintainability
# D
# 2 days
# Test Coverage
require_relative 'converter'

class DigitalObjectConverter < Converter

  require_relative 'lib/csv_converter'
  include ASpaceImport::CSVConvert


  # Describes the import types this converter makes available.
  #
  # @param show_hidden [Boolean] accepted but not consulted by this converter
  # @return [Array<Hash>] one entry carrying the :name and :description of the
  #   digital object CSV import
  def self.import_types(show_hidden = false)
    csv_import = {
      name: "digital_object_csv",
      description: "Import Digital Object records from a CSV file"
    }

    [csv_import]
  end


  # Builds a converter for +input_file+ when +type+ names the import type this
  # class handles.
  #
  # @param type [String] requested import type name
  # @param input_file [Object] handed straight to +new+
  # @return [Object, nil] a new instance of the receiving class, or nil when
  #   +type+ is anything other than "digital_object_csv"
  def self.instance_for(type, input_file)
    return nil unless type == "digital_object_csv"

    new(input_file)
  end


  # Builds the configuration hash for the digital object CSV import
  # (presumably consumed by ASpaceImport::CSVConvert, which this class includes).
  #
  # String keys are CSV column headers. Each maps either to a
  # 'handler.property' path, or to a two-element array holding a Proc that
  # normalizes the cell value followed by that path. Symbol keys define the
  # handlers named in those paths.
  #
  # The handler Procs are created in class-method scope, so the @agent_type
  # they touch and the uri_lookup they call are class-level state shared
  # between cells and rows.
  #
  # @return [Hash] column mappings followed by handler definitions
  def self.configure
    {
      # 1. Map the cell data to schemas or handlers

      'agent_role' => 'd.agent_role',
      'agent_type' => 'agent.agent_type',

      'agent_contact_address_1' => 'agent_contact.address_1',
      'agent_contact_address_2' => 'agent_contact.address_2',
      'agent_contact_address_3' => 'agent_contact.address_3',
      'agent_contact_city' => 'agent_contact.city',
      'agent_contact_country' => 'agent_contact.country',
      'agent_contact_email' => 'agent_contact.email',
      'agent_contact_fax' => 'agent_contact.fax',
      'agent_contact_name' => 'agent_contact.name',

      'agent_contact_post_code' => 'agent_contact.post_code',
      'agent_contact_region' => 'agent_contact.region',
      'agent_contact_salutation' => 'agent_contact.salutation',
      'agent_contact_telephone' => 'agent_contact.telephone',
      'agent_contact_telephone_ext' => 'agent_contact.telephone_ext',

      'agent_name_authority_id' => 'agent_name.authority_id',
      'agent_name_dates' => 'agent_name.dates',
      'agent_name_fuller_form' => 'agent_name.fuller_form',
      'agent_name_name_order' => 'agent_name.name_order',
      'agent_name_number' => 'agent_name.number',
      'agent_name_prefix' => 'agent_name.prefix',
      'agent_name_primary_name' => 'agent_name.primary_name',
      'agent_name_qualifier' => 'agent_name.qualifier',
      'agent_name_rest_of_name' => 'agent_name.rest_of_name',
      'agent_name_rules' => 'agent_name.rules',
      'agent_name_sort_name' => 'agent_name.sort_name',
      'agent_name_source' => 'agent_name.source',
      'agent_name_subordinate_name_1' => 'agent_name.subordinate_name_1',
      'agent_name_subordinate_name_2' => 'agent_name.subordinate_name_2',
      'agent_name_suffix' => 'agent_name.suffix',

      'agent_name_description_note' => 'note_bioghist.content',
      'agent_name_description_citation' => 'note_citation.content',
      # 'agent_name_description_type' => '',

      'digital_object_acknowledgement_sent' => [normalize_boolean, 'acknowledgement_sent_event_date.boolean'],
      'digital_object_acknowledgement_sent_date' => [date_flip, 'acknowledgement_sent_event_date.expression'],

      'digital_object_agreement_received' => [normalize_boolean, 'agreement_received_event_date.boolean'],
      'digital_object_agreement_received_date' => [date_flip, 'agreement_received_event_date.expression'],

      'digital_object_agreement_sent' => [normalize_boolean, 'agreement_sent_event_date.boolean'],
      'digital_object_agreement_sent_date' => [date_flip, 'agreement_sent_event_date.expression'],

      'digital_object_cataloged' => [normalize_boolean, 'cataloged_event_date.boolean'],
      'digital_object_cataloged_date' => [date_flip, 'cataloged_event_date.expression'],

      'digital_object_processed' => [normalize_boolean, 'processed_event_date.boolean'],
      'digital_object_processed_date' => [date_flip, 'processed_event_date.expression'],
      'digital_object_processing_started_date' => 'collection_management.processing_started_date',
      'digital_object_processing_estimate' => 'collection_management.processing_hours_per_foot_estimate',
      'digital_object_processing_hours_total' => 'collection_management.processing_hours_total',
      'digital_object_processing_plan' => 'collection_management.processing_plan',
      'digital_object_processing_priority' => 'collection_management.processing_priority',

      'digital_object_processing_status' => 'collection_management.processing_status',
      'digital_object_processing_total_extent' => 'collection_management.processing_total_extent',
      'digital_object_processing_total_extent_type' => 'collection_management.processing_total_extent_type',
      'digital_object_processors' => 'collection_management.processors',

      'digital_object_rights_determined' => 'collection_management.rights_determined',
      # 'digital_object_rights_transferred' => '',
      # 'digital_object_rights_transferred_date' => '',
      # 'digital_object_rights_transferred_note' => '',

      'digital_object_title' => 'd.title',
      'digital_object_id' => 'd.digital_object_id',
      'digital_object_is_component' => [normalize_boolean, 'd.is_component'],
      'digital_object_component_id' => 'd.component_id',

      'digital_object_cataloged_note' => 'collection_management.cataloged_note',

      'digital_object_language' => 'lang_material.language',
      'digital_object_script' => 'lang_material.script',

      'digital_object_level' => 'd.level',
      'digital_object_publish' => [normalize_boolean, 'd.publish'],
      'digital_object_type' => 'd.digital_object_type',
      'digital_object_restrictions' => [normalize_boolean, 'd.restrictions'],

      'date_1_label' => 'date_1.label',
      'date_1_expression' => 'date_1.expression',
      'date_1_begin' => 'date_1.begin',
      'date_1_end' => 'date_1.end',
      'date_1_type' => 'date_1.date_type',

      'date_2_label' => 'date_2.label',
      'date_2_expression' => 'date_2.expression',
      'date_2_begin' => 'date_2.begin',
      'date_2_end' => 'date_2.end',
      'date_2_type' => 'date_2.date_type',

      # 'extent_container_summary' used to be listed twice in this group; a
      # hash literal keeps only one entry per key, so it is listed once.
      'extent_type' => 'extent.extent_type',
      'extent_container_summary' => 'extent.container_summary',
      'extent_number' => 'extent.number',
      'extent_physical_details' => 'extent.physical_details',
      'extent_portion' => 'extent.portion',
      'extent_dimensions' => 'extent.dimensions',

      'subject_source' => 'subject.source',
      'subject_term' => 'subject.term',
      'subject_term_type' => 'subject.term_type',

      'user_defined_boolean_1' => [normalize_boolean, 'user_defined.boolean_1'],
      'user_defined_boolean_2' => [normalize_boolean, 'user_defined.boolean_2'],
      'user_defined_boolean_3' => [normalize_boolean, 'user_defined.boolean_3'],
      'user_defined_date_1' => [date_flip, 'user_defined.date_1'],
      'user_defined_date_2' => [date_flip, 'user_defined.date_2'],
      'user_defined_date_3' => [date_flip, 'user_defined.date_3'],
      'user_defined_integer_1' => [to_int, 'user_defined.integer_1'],
      'user_defined_integer_2' => [to_int, 'user_defined.integer_2'],
      'user_defined_integer_3' => [to_int, 'user_defined.integer_3'],
      'user_defined_real_1' => [to_real, 'user_defined.real_1'],
      'user_defined_real_2' => [to_real, 'user_defined.real_2'],
      'user_defined_real_3' => [to_real, 'user_defined.real_3'],
      'user_defined_string_1' => 'user_defined.string_1',
      'user_defined_string_2' => 'user_defined.string_2',
      'user_defined_string_3' => 'user_defined.string_3',
      'user_defined_string_4' => 'user_defined.string_4',
      'user_defined_text_1' => 'user_defined.text_1',
      'user_defined_text_2' => 'user_defined.text_2',
      'user_defined_text_3' => 'user_defined.text_3',
      'user_defined_text_4' => 'user_defined.text_4',
      'user_defined_text_5' => 'user_defined.text_5',
      'user_defined_enum_1' => 'user_defined.enum_1',
      'user_defined_enum_2' => 'user_defined.enum_2',
      'user_defined_enum_3' => 'user_defined.enum_3',
      'user_defined_enum_4' => 'user_defined.enum_4',

      'file_version_file_uri' => 'file_version.file_uri',
      'file_version_publish' => [normalize_boolean, 'file_version.publish'],
      'file_version_use_statement' => 'file_version.use_statement',
      'file_version_xlink_actuate_attribute' => 'file_version.xlink_actuate_attribute',
      'file_version_xlink_show_attribute' => 'file_version.xlink_show_attribute',
      'file_version_file_format_name' => 'file_version.file_format_name',
      'file_version_file_format_version' => 'file_version.file_format_version',
      'file_version_file_size_bytes' => 'file_version.file_size_bytes',
      'file_version_checksum' => 'file_version.checksum',
      'file_version_checksum_method' => 'file_version.checksum_method',
      'file_version_is_representative' => [normalize_boolean, 'file_version.is_representative'],
      'file_version_caption' => 'file_version.caption',

      # 2. Define data handlers
      #    :record_type of the schema (if other than the handler key)
      #    :defaults - hash which maps property keys to default values if nothing shows up in the source data
      #    :on_row_complete - Proc to run whenever a row in the CSV table is complete
      #        param 1 is the set of objects generated by the row
      #        param 2 is an object in the row (of the type described in the handler)

      :acknowledgement_sent_event_date => event_template('acknowledgement_sent'),

      :agreement_received_event_date => event_template('agreement_received'),

      :agreement_sent_event_date => event_template('agreement_sent'),

      :cataloged_event_date => event_template('cataloged'),

      :processed_event_date => event_template('processed'),


      :agent => {
        # Remembers the agent type so the :agent_name handler can derive its
        # own record type from it.
        :record_type => Proc.new {|data|
          @agent_type = data['agent_type']
        },
        :on_row_complete => Proc.new {|cache, agent|
          digital_object = cache.find {|obj| obj.class.record_type == 'digital_object' }

          if digital_object
            # Fills in the linked agent entry that :d's on_create pushed
            # (it does so only when an agent_role was supplied).
            digital_object.linked_agents[0]['ref'] = agent.uri
          else
            # No top-level digital object in this row: drop the agent.
            cache.reject! {|obj| obj.key == agent.key}
          end
        }
      },

      :agent_contact => {
        :on_row_complete => Proc.new {|cache, this|
          agent = cache.find {|obj| obj.class.record_type =~ /^agent_(perso|corpo|famil)/}
          agent.agent_contacts << this
        }
      },

      :agent_name => {
        :record_type => Proc.new {|data|
          @agent_type.sub(/agent_/, 'name_')
        },
        :on_create => Proc.new {|data, obj|
          if @agent_type =~ /family/
            obj.family_name = data['primary_name']
          end
        },
        :on_row_complete => Proc.new {|cache, this|
          agent = cache.find {|obj| obj.class.record_type =~ /^agent_(perso|corpo|famil)/}
          agent.names << this
        }
      },

      # this might be a Digital Object, or it might be a Digital Object Component
      :d => {
        :record_type => Proc.new {|data|
          data['is_component'] ? :digital_object_component : :digital_object
        },
        :on_create => Proc.new {|data, obj|
          if obj.class.record_type == 'digital_object_component'

            unless data['digital_object_id']
              raise "Component entries must have a 'digital_object_id' to link them to a top-level record"
            end

            # uri_lookup is filled by on_row_complete of earlier top-level rows.
            do_uri = uri_lookup[data['digital_object_id']]

            unless do_uri
              raise "Components must be preceded by their top-level digital object in the CSV"
            end

            obj.digital_object = {'ref' => do_uri}
          else

            if data['agent_role']
              obj.linked_agents << {'role' => data['agent_role']}
            end
          end

        },
        :on_row_complete => Proc.new { |cache, obj|
          if obj.class.record_type == 'digital_object'

            # Record the URI so later component rows can link to this object.
            uri_lookup[obj.digital_object_id] = obj.uri

            cm = cache.find {|rec| rec.class.record_type == 'collection_management'}
            obj.collection_management = cm if cm

          else
            # ignore collection management data in a component context
            cache.reject! {|rec| rec.class.record_type == 'collection_management'}
          end
        }
      },

      :date_1 => {
        :record_type => :date,
        :defaults => date_defaults,
        :on_row_complete => attach_date,
      },

      :date_2 => {
        :record_type => :date,
        :defaults => date_defaults,
        :on_row_complete => attach_date,
      },

      :extent => {
        :defaults => {:portion => 'whole'},
        :on_row_complete => Proc.new {|cache, extent|
          digital_object = cache.find {|obj| obj.class.record_type =~ /^digital_object/ }
          digital_object.extents << extent
        }
      },

      :lang_material => {
        :on_create => Proc.new {|data, obj|
          obj.language_and_script = {'jsonmodel_type' => 'language_and_script', 'language' => data['language'], 'script' => data['script']}
        },
        :on_row_complete => Proc.new {|cache, this|
          digital_object = cache.find {|obj| obj.class.record_type =~ /^digital_object/ }
          digital_object.lang_materials << this
        }
      },

      :note_bioghist => {
        :on_create => Proc.new {|data, obj|
          obj.subnotes = [{'jsonmodel_type' => 'note_text', 'content' => data['content']}]
        },
        :on_row_complete => Proc.new {|cache, this|
          agent = cache.find {|obj| obj.class.record_type =~ /^agent_(perso|fami|corpo)/}
          agent.notes << this
        }
      },

      :note_citation => {
        :on_row_complete => Proc.new {|cache, this|
          note_bioghist = cache.find {|obj| obj.class.record_type == 'note_bioghist'}
          note_bioghist.subnotes << this
        }
      },

      :subject => {
        :on_create => Proc.new {|data, obj|
          obj.terms = [{:term => data['term'], :term_type => data['term_type'], :vocabulary => '/vocabularies/1'}]
          obj.vocabulary = '/vocabularies/1'
        },
        :on_row_complete => Proc.new {|cache, this|
          digital_object = cache.find {|obj| obj.class.record_type == 'digital_object'}
          digital_object.subjects << {'ref' => this.uri}
        }
      },

      :user_defined => {
        :on_row_complete => Proc.new {|cache, this|
          digital_object = cache.find {|obj| obj.class.record_type == 'digital_object'}
          digital_object.user_defined = this
        }
      },

      :file_version => {
        :on_row_complete => Proc.new {|cache, this|
          digital_object = cache.find {|obj| obj.class.record_type =~ /^digital_object/ }
          digital_object.file_versions << this
        }
      },
    }
  end


  private

  # Builds the handler definition shared by the "*_event_date" handlers.
  #
  # The handler yields a date record only when the row's 'boolean' value is
  # truthy. When the row completes, that date is wrapped in a new event which
  # is added to the row's objects and linked to the row's digital object.
  #
  # @param event_type [String] value stored in the generated event's event_type
  # @return [Hash] handler definition with :record_type, :defaults, :on_create
  #   and :on_row_complete entries
  def self.event_template(event_type)
    choose_record_type = Proc.new do |data|
      data['boolean'] ? :date : nil
    end

    default_expression = Proc.new do |data, obj|
      obj.expression = 'unknown' unless data['expression']
    end

    build_event = Proc.new do |cache, date|
      source_record = cache.find { |rec| rec.class.record_type == 'digital_object' }

      event = ASpaceImport::JSONModel(:event).new
      cache << event
      event.event_type = event_type
      # The executing agent is hard-wired to /agents/software/1; the built-in
      # software agent is assumed to exist under that URI.
      event.linked_agents << {'role' => 'executing_program', 'ref' => '/agents/software/1'}
      event.date = date
      # NOTE(review): raises NoMethodError when the row holds no
      # 'digital_object' record (e.g. a component row) - confirm intent.
      event.linked_records << {'role' => 'source', 'ref' => source_record.uri}
    end

    {
      :record_type => choose_record_type,
      :defaults => date_defaults,
      :on_create => default_expression,
      :on_row_complete => build_event
    }
  end


  # Default property values used by the date handlers.
  #
  # @return [Hash] a new hash on every call with :label, :date_type and :begin
  def self.date_defaults
    { label: 'other', date_type: 'single', begin: '1900' }
  end


  # Builds the on_row_complete callback used by the date handlers.
  #
  # The callback locates the first object in the row whose record type starts
  # with "digital_object" and appends the date to that object's dates.
  #
  # @return [Proc] a new callback taking (cache, date)
  def self.attach_date
    Proc.new do |cache, date|
      owner = cache.find { |rec| rec.class.record_type =~ /^digital_object/ }
      owner.dates << date
    end
  end


  # Memoized Proc that turns a cell value into true or false.
  #
  # The value is stringified and upcased; 1, T, Y, YES and TRUE give true,
  # everything else (including nil) gives false.
  #
  # @return [Proc] the same Proc object on every call
  def self.normalize_boolean
    @normalize_boolean ||= Proc.new do |val|
      candidate = val.to_s.upcase
      !candidate.match(/\A(1|T|Y|YES|TRUE)\Z/).nil?
    end
  end


  # Need to track relationships across rows
  # Lazily created class-level hash, shared by every call.
  # NOTE(review): nothing visible here ever clears it - confirm whether
  # entries are meant to survive from one import run to the next.
  #
  # @return [Hash] the same Hash object on every call
  def self.uri_lookup
    @uri_lookup ||= {}
  end


  # Need to reuse the agent type
  # Reader for the class-level @agent_type value.
  #
  # @return [Object, nil] the stored value, or nil when it is unset or falsy
  def self.agent_type
    @agent_type ||= nil
  end


  # Memoized Proc that swaps the first two fields of an "a/b/yyyy" string
  # (one or two digits, one or two digits, four digits). Values that do not
  # match that shape come back unchanged.
  #
  # @return [Proc] the same Proc object on every call
  def self.date_flip
    return @date_flip if @date_flip

    @date_flip = Proc.new do |val|
      val.sub(/^([0-9]{1,2})\/([0-9]{1,2})\/([0-9]{4})$/, '\2/\1/\3')
    end
  end


  # Memoized Proc that converts a value with to_f and renders it as a string
  # with two decimal places.
  #
  # @return [Proc] the same Proc object on every call
  def self.to_real
    @to_real ||= Proc.new do |val|
      format("%0.2f", val.to_f)
    end
  end


  # Memoized Proc that converts a value with to_i and returns the result as a
  # string.
  #
  # @return [Proc] the same Proc object on every call
  def self.to_int
    return @to_int if @to_int

    @to_int = Proc.new do |val|
      String(val.to_i)
    end
  end
end