lib/workers/past_perfect4_importer_worker.rb from kete/kete

lib/workers/past_perfect4_importer_worker.rb
Summary

Maintainability

5 days
Test Coverage

Issues
require 'rexml/document'
require 'redcloth'
require 'importer'
# past perfect 4 xml output importer
# only handling photos for the moment
# needs accompanying archives.xml file
# to create topics to group photos by
# review http://development.kete.net.nz/kete/tickets/66 for details
# right now we create a topic for the collection (ACCESSNO from image)
# and add related images to it
# see OBJECTID goes to user_reference
class PastPerfect4ImporterWorker < BackgrounDRb::MetaWorker
  set_worker_name :past_perfect4_importer_worker
  set_no_auto_load true

  # importer has the version of methods that will work in the context
  # of backgroundrb
  include Importer

  def create(args = nil)
    importer_simple_setup

    @last_related_topic = nil
    @last_related_topic_pp4_objectid = nil
  end

  def do_work(args = nil)
    logger.info('in worker')
    begin
      @zoom_class = args[:zoom_class]
      @import = Import.find(args[:import])
      @related_topic_type = @import.topic_type
      @import_type = @import.xml_type
      @import_dir_path = ::Import::IMPORTS_DIR + @import.directory
      @import_parent_dir_for_image_dirs = @import_dir_path + '/images'
      @contributing_user = @import.user
      @import_request = args[:import_request]
      @current_basket = @import.basket
      @description_end_templates['default'] = @import.default_description_end_template
      @record_interval = @import.interval_between_records

      logger.info('after description_end_template and var assigns')
      # legacy support for kete horowhenua
      if !@import_request[:host].scan('horowhenua').blank?
        @description_end_templates['default'] = 'Any use of this image must be accompanied by the credit "Horowhenua Historical Society Inc."'
        @description_end_templates[/^f\d/] = 'Any use of this image must be accompanied by the credit "Foxton Historical Society"'
        @description_end_templates["2000\.000\."] = 'Any use of this image must be accompanied by the credit "Horowhenua District Council"'

        @collections_to_skip << 'HHS Photograph Collection'
      end

      logger.info('after description_end_template reassign')

      @zoom_class_for_params = @zoom_class.tableize.singularize

      # this may not work because of scope
      params = args[:params]

      logger.info('params: ' + params.inspect)

      @import_photos_file_path = "#{@import_dir_path}/records.xml"

      # this sets the instance vars that tell us what xml element paths we are using
      # old style or new style
      determine_elements_used(@import_photos_file_path)

      # this gets rid of xml elements that have empty values
      @path_to_trimmed_photos = importer_trim_fat_from_xml_import_file(@import_photos_file_path, "#{RAILS_ROOT}/tmp/trimmed_photos_pp4.xml")
      @import_photos_xml = REXML::Document.new File.open(@path_to_trimmed_photos)

      logger.info('after first trim')

      # TODO: test what happens when there isn't an accessions.xml file
      @import_accessions_file_path = "#{@import_dir_path}/accessions.xml"
      @path_to_trimmed_accessions = importer_trim_fat_from_xml_import_file(@import_accessions_file_path, "#{RAILS_ROOT}/tmp/trimmed_accessions_pp4.xml", true)

      # open the accessions xml to search for a matching record later
      @import_accessions_xml = REXML::Document.new File.open(@path_to_trimmed_accessions)
      # this gets the first matching accession record
      logger.info('opened accession')

      @import_accessions_xml_root = @import_accessions_xml.root

      @import.update_attributes(status: 'in progress')

      # we work from the photos
      # and grab information from the accessions file
      # bases on ACCESSNO as a kind of forein key
      # as we need it
      @import_photos_xml.elements.each(@root_element_name + '/' + @record_element_path) do |record|
        # we override this locally for our customizations
        importer_process(record, params)
      end
      importer_update_processing_vars_at_end
    rescue
      importer_update_processing_vars_if_rescue
    end
  end

  def importer_process(record, params)
    current_record = @results[:records_processed] + 1
    logger.info("starting record #{current_record}")

    # clear this out so last related_topic
    # doesn't persist
    related_topic = nil
    related_topic_pp4_objectid = nil
    existing_item = nil
    image_file = nil
    image_objectid = nil
    objectid = nil

    record_hash = importer_xml_record_to_hash(record, true)

    # make sure there is a imagefile value
    # if not, log record to skipped photos file with reason skipped
    image_file = record_hash['IMAGEFILE']

    logger.info('what is image_file: ' + image_file)

    image_objectid = record_hash['OBJECTID']

    reason_skipped = nil

    path_to_file_to_grab = importer_prepare_path_to_image_file(image_file)

    logger.info("record #{current_record} : path_to_file_to_grab : " + path_to_file_to_grab)

    if image_file.blank? || !File.exist?(path_to_file_to_grab)
      # TODO: add check to see if image_file has a, b, c, versions associated with it
      # and add them is if they exist
      # change record imagefile accordingly for each and call importer_process on each
      reason_skipped = 'no image file specified or the image file isn\'t available'
      logger.info("record #{current_record} : reason skipped image")
    else
      logger.info("record #{current_record} : looking for topic")
      # grab the relate_to_topic_id
      # by getting the ACCESSNO field's value
      # see if there is a matching topic already
      # if not, create one from match in @import_accessions_xml
      # no match, log record to skipped photos file with reason skipped
      related_topic_pp4_objectid = record_hash['ACCESSNO']

      if related_topic_pp4_objectid.nil?
        # see if we can derive the accessno from the objectid
        # if we can't derive the accessno, just stick it in without
        # related topic
        objectid_parts = image_objectid.split('.')
        if objectid_parts.size > 2
          related_topic_pp4_objectid = objectid_parts[0] + '.' + objectid_parts[1]
        else
          related_topic_pp4_objectid = 0
          related_topic = 0
          # reason_skipped.add_text 'no ACCESSNO specified'
          # logger.info("record #{current_record} : reason skipped no ACCESSNO")
        end
      end

      if related_topic_pp4_objectid != 0
        # this item has the same related_topic as the last
        # don't bother looking it up again
        if !@last_related_topic_pp4_objectid.nil? && (related_topic_pp4_objectid == @last_related_topic_pp4_objectid)
          related_topic = @last_related_topic
        else
          related_topic = Topic.find(
            :first,
            conditions: "extended_content like \'%<user_reference xml_element_name=\"dc:identifier\">#{related_topic_pp4_objectid}</user_reference>%\' AND topic_type_id = #{@related_topic_type.id}"
          )

        end

        related_accession_record = nil

        # no existing related topic
        # find the record in accessions
        # and create topic from it
        if related_topic.nil?
          logger.info("record #{current_record} : no kete topic found")

          logger.info('in creation of accession record')

          logger.info('accession we are looking for: ' + related_topic_pp4_objectid)

          if !@collections_to_skip.include?(record_hash['COLLECTION'])
            # file may time out
            related_accession_record = nil

            related_accession_record = @import_accessions_xml_root.elements["#{@record_element_path}[@ACCESSNO=\'#{related_topic_pp4_objectid}\']"]
            # we have some accesion record's that are mangled
            # by being three sections
            # rather than two
            # grab only the first two elements
            # and try again before giving up
            if related_accession_record.blank?
              cleaned_accessno_array = related_topic_pp4_objectid.split('.')
              cleaned_up_accessno = cleaned_accessno_array[0] + '.' + cleaned_accessno_array[1]

              logger.info('looking for cleaned up accession: ' + cleaned_up_accessno)

              if !@last_related_topic_pp4_objectid.nil? && (cleaned_up_accessno == @last_related_topic_pp4_objectid)
                logger.info('looking for cleaned up accession: last accessno match')
                related_topic = @last_related_topic
              else
                logger.info('looking for cleaned up accession: looking for existing topic')
                related_topic = Topic.find(
                  :first,
                  conditions: "extended_content like \'%<user_reference xml_element_name=\"dc:identifier\">#{cleaned_up_accessno}</user_reference>%\' AND topic_type_id = #{@related_topic_type.id}"
                )

              end

              if related_topic.nil?
                related_accession_record = @import_accessions_xml_root.elements["#{@record_element_path}[@ACCESSNO=\'#{cleaned_up_accessno}\']"]
              end
              related_topic_pp4_objectid = cleaned_up_accessno
            end

            if !related_accession_record.blank? && related_topic.nil?
              accession_record_hash = importer_xml_record_to_hash(related_accession_record, true)

              # create a new topic from related_accession_record
              # prepare user_reference for extended_content
              accession_topic = { 'topic' => {
                topic_type_id: @related_topic_type.id,
                title: record_hash['COLLECTION']
              } }

              descrip = RedCloth.new accession_record_hash['DESCRIP']
              accession_topic['topic'][:description] = descrip.to_html
              accession_topic['topic'][:short_summary] = importer_prepare_short_summary(descrip)

              topic_params = importer_prepare_extended_field(value: related_topic_pp4_objectid, field: 'OBJECTID', zoom_class_for_params: 'topic', params: accession_topic)

              related_topic = importer_create_related_topic(topic_params)
              logger.info('after related topic creation')
            end
          end
        end
      end

      # TODO: if there is an object with a matching exactly user_reference,
      # and the modified date of the import record is later
      # do an update instead

      # base our check on OBJECTID
      objectid = record_hash['OBJECTID']

      logger.info('what is objectid: ' + objectid)

      # this relies on user_reference extended_field
      # being mapped to the particular kete content type (not content type in mime sense)
      # Walter McGinnis, 2008-10-10
      # User Reference may be used by multiple images (all under same parent record)
      # they will have different filenames, but same user reference...
      # so adding filename check as criteria
      existing_item = StillImage.find(
        :first, joins: 'join image_files on still_images.id = image_files.still_image_id',
                conditions: "filename = \'#{File.basename(path_to_file_to_grab)}\' and extended_content like \'%<user_reference xml_element_name=\"dc:identifier\">#{objectid}</user_reference>%\'"
      )

      new_record = nil
      if existing_item.nil?
        # figure out the description_end_template based on the objectid
        description_end_template = @description_end_templates['default']
        @description_end_templates.each do |pattern, text|
          if pattern != 'default'
            description_end_template = text if !image_objectid.scan(pattern).blank?
          end
        end

        new_record = create_new_item_from_record(record, @zoom_class, { params: params, record_hash: record_hash, description_end_template: description_end_template })
      else
        logger.info('what is existing item: ' + existing_item.id.to_s)
        # record exists in kete already
        reason_skipped = 'kete already has a copy of this record'
      end

      if !new_record.nil? && !new_record.id.nil?
        # we may not have a related topic, only add the relation if we do
        if !related_topic.nil? && (related_topic != 0)
          ContentItemRelation.new_relation_to_topic(related_topic.id, new_record)
          if @last_related_topic.nil? || (related_topic.id != @last_related_topic.id)
            # update the last topic, since we are done adding things to it for now
            related_topic.prepare_and_save_to_zoom
          end
        end

        new_record.prepare_and_save_to_zoom
        sleep(@record_interval) if @record_interval > 0

        # now that we know that we have a valid related_topic
        # update @last_related_topic and @last_related_topic_pp4_objectid
        @last_related_topic = related_topic
        @last_related_topic_pp4_objectid = related_topic_pp4_objectid

        importer_update_records_processed_vars
      end
    end
    # if this record was skipped, add to skipped_records
    if !reason_skipped.blank?
      importer_log_to_skipped_records(image_file, reason_skipped)
    end
    # will this help memory leaks
    record = nil
    # give zebra and our server a small break
    sleep(@record_interval) if @record_interval > 0
  end

  # customized in this worker to override importer module version
  # expects an xml element of our record
  def create_new_item_from_record(record, zoom_class, options = {})
    zoom_class_for_params = zoom_class.tableize.singularize

    params = options[:params]

    # initialize the subhash in params
    # clears it out if it does already
    params[zoom_class_for_params] = {}

    params[zoom_class_for_params][:basket_id] = @current_basket.id

    # check extended_field.import_field_synonyms
    # for which extended field to map the import_field to
    # special cases for title, short_summary, and description
    record_hash = {}
    if options[:record_hash].nil?
      record_hash = Hash.from_xml(record.to_s)

      # HACK: to go down one more level
      record_hash.keys.each do |record_field|
        record_hash = record_hash[record_field]
      end
    else
      record_hash = options[:record_hash]
    end

    field_count = 1
    tag_list_array = []
    # add support for all items during this import getting a set of tags
    # added to every item in addition to the specific ones for the item
    tag_list_array = @import.base_tags.split(',') if !@import.base_tags.blank?

    record_hash.keys.each do |record_field|
      value = record_hash[record_field]
      if !value.nil?
        value = value.strip
        # replace \r with \n
        value.tr("\r", "\n")
      end

      if !value.blank?

        case record_field
        when 'TITLE'
          params[zoom_class_for_params][:title] = value
        when 'ADMIN'
          if (zoom_class == 'Topic') || (zoom_class == 'Document')
            params[zoom_class_for_params][:short_summary] = value
          else
            if params[zoom_class_for_params][:description].nil?
              params[zoom_class_for_params][:description] = value
            else
              params[zoom_class_for_params][:description] += "\n" + value
            end
          end
        when 'IMAGEFILE'
          if zoom_class == 'StillImage'
            # we do a check earlier in the script for imagefile
            # so we should have something to work with here
            params[:image_file] = { uploaded_data: copy_and_load_to_temp_file(importer_prepare_path_to_image_file(value)) }
          end
        when 'OBJECTID'
          if zoom_class == 'Topic'
            value = record_hash['ACCESSNO']
          end
          params = importer_prepare_extended_field(value: value, field: record_field, zoom_class_for_params: zoom_class_for_params, params: params)
        when *SystemSetting.description_synonyms
          if params[zoom_class_for_params][:description].nil?
            params[zoom_class_for_params][:description] = value
          else
            params[zoom_class_for_params][:description] += "\n\n" + value
          end
        when *SystemSetting.tags_synonyms
          if record_field == 'PEOPLE'
            # each person is in the form: last name, first names
            # one name per line
            # it may have things in parentheses which we ignore
            people_in_lines = value.split("\n")
            people_in_lines.each do |person|
              names_array = person.split(',')
              first_names = ''
              if !names_array[1].nil?
                first_names = names_array[1].split('(')[0].strip
              end
              last_names = names_array[0].strip
              name = first_names + ' ' + last_names
              tag_list_array << name.strip
            end
          else
            tag_list_array << value.tr("\n", ' ')
          end
        else
          params = importer_prepare_extended_field(value: value, field: record_field, zoom_class_for_params: zoom_class_for_params, params: params)
        end
      end
      field_count += 1
    end

    logger.info('after fields')

    if !@import.description_beginning_template.blank?
      # append the citation to the description field
      if !params[zoom_class_for_params][:description].nil?
        params[zoom_class_for_params][:description] = @import.description_beginning_template + "\n\n" + params[zoom_class_for_params][:description]
      else
        params[zoom_class_for_params][:description] = @import.description_beginning_template
      end
    elsif !SystemSetting.description_template.blank?
      if !params[zoom_class_for_params][:description].nil?
        params[zoom_class_for_params][:description] = SystemSetting.description_template + "\n\n" + params[zoom_class_for_params][:description]
      else
        params[zoom_class_for_params][:description] = SystemSetting.description_template
      end
    end

    if !options[:description_end_template].nil?
      # append the description_end_template to the description field
      if !params[zoom_class_for_params][:description].nil?
        params[zoom_class_for_params][:description] += "\n\n" + options[:description_end_template]
      else
        params[zoom_class_for_params][:description] = options[:description_end_template]
      end
    end

    logger.info('after description_end_template')

    description = ''
    # used to give use better html output for descriptions
    if !params[zoom_class_for_params][:description].nil?
      description = RedCloth.new params[zoom_class_for_params][:description]
      params[zoom_class_for_params][:description] = description.to_html
    end

    logger.info('after redcloth')

    if (zoom_class == 'Topic') || zoom_class == 'Document' && params[zoom_class_for_params][:short_summary].nil?
      if !description.blank?
        params[zoom_class_for_params][:short_summary] = importer_prepare_short_summary(description)
      end
    end

    logger.info('after short summary')

    params[zoom_class_for_params][:tag_list] = tag_list_array.join(',')

    logger.info('after tag list')

    # add the uniform license chosen at import to this item
    params[zoom_class_for_params][:license_id] = @import.license.id if !@import.license.blank?

    logger.info('after license')

    # clear any lingering values for @fields
    # and instantiate it, in case we need it
    @fields = nil

    if zoom_class == 'Topic'
      params[zoom_class_for_params][:topic_type_id] = @related_topic_type.id

      @fields = @related_topic_type.topic_type_to_field_mappings

      ancestors = TopicType.find(@related_topic_type).ancestors

      if ancestors.size > 1
        ancestors.each do |ancestor|
          @fields = @fields + ancestor.topic_type_to_field_mappings
        end
      end
    else
      content_type = ContentType.find_by_class_name(zoom_class)
      @fields = content_type.content_type_to_field_mappings
    end

    if @fields.size > 0
      # we use our version of this method
      # that calls xml builder directly, rather than using partial template
      params[zoom_class_for_params.to_sym] = params[zoom_class_for_params]
      params = importer_extended_fields_update_hash_for_item(item_key: zoom_class_for_params, params: params)
    end

    logger.info('after field set up')

    # replace with something that isn't reliant on params
    replacement_zoom_item_hash = importer_extended_fields_replacement_params_hash(item_key: zoom_class_for_params, item_class: zoom_class, params: params)

    new_record = Module.class_eval(zoom_class).new(replacement_zoom_item_hash)

    # we need new_image_file's file, for our embedded metadata (if enabled)
    # thus we have to create it before the still image
    new_image_file = nil
    new_image_file = importer_add_image(params, zoom_class) unless params[:image_file].blank?

    # only necessary for still images, because attachment is in a child model
    # if we are allowing harvesting of embedded metadata from the image_file
    # we need to grab it from the image_file's file path
    if SystemSetting.enable_embedded_support && !new_image_file.nil? && zoom_class == 'StillImage'
      new_record.populate_attributes_from_embedded_in(new_image_file.full_filename)
    end

    # if still image and new_image failed, fail
    new_record_added = false
    unless zoom_class == 'StillImage'
      new_record_added = new_record.save
    else
      new_record_added = new_record.save unless new_image_file.nil?
    end

    if new_record_added
      importer_add_still_image_to(new_image_file, new_record, zoom_class) unless new_image_file.nil?

      new_record.creator = @contributing_user

      logger.info('new_record: ' + new_record.inspect)
      return new_record
    else
      # destroy images if the record wasn't added successfully
      new_image_file.destroy unless new_image_file.nil?

      logger.info('new_record not added - save failed:')
      logger.info('what are errors on save of new record: ' + new_record.errors.inspect)
      return nil
    end
  end

  # set up the correct xml paths to use
  # based on what is in the source file
  def determine_elements_used(in_file)
    # assume original style root element and paths
    @root_element_name = 'Root'
    @record_element_path = 'Information/Record'
    # this should tell us what we need to know by around the second line
    IO.foreach(in_file) do |line|
      # if exported directly from Past Perfect, should match this
      # empty means no match
      if line.include?('<VFPData>')
        @root_element_name = 'VFPData'
        @record_element_path = 'ppdata'
        return
      else
        # we have matched the previous style, return without resetting vars
        return if line.include?('<Record>')
      end
    end
  end
end