lib/importer.rb from kete/kete

lib/importer.rb
Summary

Maintainability

2 wks
Test Coverage

Issues
# -*- coding: utf-8 -*-
require 'tempfile'
require 'fileutils'
require 'mime/types'
require 'oai_dc_helpers'
require 'xml_helpers'
require 'zoom_helpers'
require 'zoom_controller_helpers'
require 'extended_content_helpers'
require 'kete_url_for'
# used by the importer worker scripts in lib/workers
module Importer
  unless included_modules.include? Importer
    # Hook invoked when Importer is mixed into a class: the helper modules
    # listed below are included into that same class, in this order.
    def self.included(klass)
      [
        KeteUrlFor,
        OaiDcHelpers,
        ZoomHelpers,
        ZoomControllerHelpers,
        ExtendedContentHelpers,
        ActionController::UrlWriter
      ].each { |helper_module| klass.send(:include, helper_module) }
    end

    # nicked from attachment_fu and modified
    # Copies the file at path +file+ into a new Tempfile and gives that
    # tempfile the accessors used for an upload: +local_path+ (alias of
    # +path+), +original_filename+ and +content_type+.
    #
    # The content type is looked up from the file name via MIME::Types; when
    # the extension is unknown we fall back to 'application/octet-stream'
    # rather than failing on nil.
    #
    # Returns the decorated Tempfile.
    def copy_and_load_to_temp_file(file)
      # derive filename from file path passed in
      filename = File.basename(file)

      # derive content_type, too (type_for returns an empty list for unknown extensions)
      mime_type = MIME::Types.type_for(filename).first
      content_type = mime_type ? mime_type.content_type : 'application/octet-stream'

      # Object#tap replaces the removed Rails `returning` helper
      Tempfile.new(filename).tap do |tmp|
        FileUtils.copy_file file, tmp.path
        tmp.singleton_class.class_eval do
          alias_method :local_path, :path
          define_method(:original_filename) { filename }
          define_method(:content_type) { content_type }
        end
      end
    end

    # For StillImage imports, builds an ImageFile from params[:image_file],
    # saves it and returns it. Returns nil for every other zoom class.
    def importer_add_image(params, zoom_class)
      return unless zoom_class == 'StillImage'

      logger.info("what is params[:image_file]: #{params[:image_file]}")
      ImageFile.new(params[:image_file]).tap { |image_file| image_file.save }
    end

    # For StillImage imports, points the image file and each of its
    # thumbnails at the newly created record, saving each one.
    # Does nothing (returns nil) for other zoom classes.
    def importer_add_still_image_to(new_image_file, new_record, zoom_class)
      return unless zoom_class == 'StillImage'

      still_image_id = new_record.id
      new_image_file.still_image_id = still_image_id
      new_image_file.save

      # attachment_fu doesn't insert our still_image_id into the thumbnails
      # automagically
      new_image_file.thumbnails.each do |thumbnail|
        thumbnail.still_image_id = still_image_id
        thumbnail.save!
      end

      logger.info('images done')
    end

    # Resets the importer's bookkeeping instance variables and publishes a
    # fresh results hash through cache[:results].
    def importer_simple_setup
      @successful = false
      @collections_to_skip = []
      @import_field_to_extended_field_map = {}
      @description_end_templates = {}

      @results = {}
      @results[:do_work_time] = Time.now.to_s
      @results[:done_with_do_work] = false
      @results[:records_processed] = 0

      cache[:results] = @results
    end

    # Loads the Import named by args[:import] and copies its settings into
    # the instance variables the rest of the importer reads.
    # Expects @description_end_templates to already be a hash.
    def importer_setup_initial_instance_vars(args)
      @zoom_class = args[:zoom_class]
      @import = Import.find(args[:import])
      @import_type = @import.xml_type
      @import_dir_path = ::Import::IMPORTS_DIR + @import.directory
      @contributing_user = @import.user
      @import_request = args[:import_request]
      @description_end_templates['default'] = @import.default_description_end_template
      @current_basket = @import.basket
      logger.info("what is current basket: #{@current_basket.inspect}")
      @import_topic_type = @import.topic_type
      @zoom_class_for_params = @zoom_class.tableize.singularize

      # a worker may have set its own path already; otherwise use the
      # import's setting, defaulting to 'records/record'
      @xml_path_to_record ||=
        if @import.xml_path_to_record.blank?
          'records/record'
        else
          @import.xml_path_to_record
        end
      @record_interval = @import.interval_between_records

      # Duplicate-record detection settings.
      # ||= keeps any value the importer worker has already chosen.
      @record_identifier_xml_field ||= @import.record_identifier_xml_field
      @extended_field_that_contains_record_identifier ||=
        @import.extended_field_that_contains_record_identifier

      # Settings for relating records to each other.
      # ||= keeps any value the importer worker has already chosen.
      @related_topics_reference_in_record_xml_field ||=
        @import.related_topics_reference_in_record_xml_field
      @related_topic_type ||= @import.related_topic_type
      @extended_field_that_contains_related_topics_reference ||=
        @import.extended_field_that_contains_related_topics_reference
    end

    # override this in your importer worker
    # if you need something more complex
    # this is what we call from the importers controller
    # for our particular importer worker
    # create method per importer worker
    # should do the setup specific to our type of importer
    # most importantly the @xml_path_to_record
    # Entry point for an import run.
    #
    # args is a hash carrying at least :zoom_class, :import, :import_request
    # and :params (see importer_setup_initial_instance_vars).
    #
    # Two modes:
    # * the import has an archive file AND params[:related_topic] is given:
    #   every extracted file under @import_dir_path is processed;
    # * otherwise records.xml (trimmed unless @skip_trimming is set) is parsed
    #   and each non-blank node at @xml_path_to_record is processed.
    #
    # Any StandardError is routed to importer_update_processing_vars_if_rescue.
    def do_work(args = nil)
      logger.info('in work')
      begin
        importer_setup_initial_instance_vars(args)

        params = args[:params]

        # some import types will take data in type specific format
        # and convert to standard records.xml that importer expects
        # this is done simply by defining a records_pre_processor method in worker class
        records_pre_processor if defined?(records_pre_processor)

        # work through records and add topics for each
        # if they don't already exist
        @results[:records_processed] = 0
        cache[:results] = @results

        # if there was an uploaded archive file (zip, tar, etc.)
        # process the extracted records
        # otherwise we expect a XML file describing the records
        if @import.import_archive_file.present? && params[:related_topic].present?

          @related_topic = Topic.find(params[:related_topic])

          # variables assigned, files good to go, we're started
          @import.update_attributes(status: I18n.t('importer_lib.do_work.in_progress'))

          importer_records_from_directory_at(@import_dir_path, params)

        else
          # trimming of file
          @path_to_trimmed_records = "#{@import_dir_path}/records_trimmed.xml"
          # @skip_trimming is set in records_pre_processor (or not if it is not run)
          # just use records.xml if we should skip trimming
          records_xml_path = "#{@import_dir_path}/records.xml"
          if @skip_trimming
            @path_to_trimmed_records = records_xml_path
          else
            @path_to_trimmed_records = importer_trim_fat_from_xml_import_file(records_xml_path, @path_to_trimmed_records)
          end

          # block form so the file handle is closed once the document is parsed
          @import_records_xml = File.open(@path_to_trimmed_records) do |records_file|
            Nokogiri::XML(records_file)
          end

          # variables assigned, files good to go, we're started
          @import.update_attributes(status: I18n.t('importer_lib.do_work.in_progress'))

          @import_records_xml.xpath(@xml_path_to_record).each do |record|
            importer_process(record, params) unless record.content.blank?
          end
        end

        importer_update_processing_vars_at_end
      rescue
        importer_update_processing_vars_if_rescue
      end
    end

    # recursively work through import directory
    # to find extracted files to be imported
    # Walks the directory at +path+: sub-directories are descended into and
    # every other entry is handed to importer_process. Entries whose name
    # starts with a dot, and a few known junk names, are skipped.
    def importer_records_from_directory_at(path, params)
      # files or directories to ignore
      ignored_names = ['Thumbs.db', 'ehthumbs.db', '__MACOSX']

      Dir.foreach(path) do |entry|
        entry_path = path + '/' + entry

        next if File.basename(entry_path).start_with?('.')
        next if ignored_names.include?(entry)

        if File.directory?(entry_path)
          # descend directories
          importer_records_from_directory_at(entry_path, params)
        else
          # process files
          importer_process(entry_path, params)
        end
      end
    end

    # Finds the topics a record refers to.
    #
    # First asks importer_locate_existing_items(options) for items already in
    # Kete. If none are found and @record_identifier_xml_field is set, looks
    # in @import_records_xml for records whose identifier field equals
    # related_topic_identifier (as given, then downcased, then upcased) and
    # imports each match via importer_process.
    #
    # Returns an array of the located or newly processed items.
    def importer_fetch_related_topics(related_topic_identifier, params, options = {})
      related_topics = []

      related_topics += importer_locate_existing_items(options)

      if related_topics.blank? && !@record_identifier_xml_field.blank?
        # HACK, for horizons agency/series import, needs to be handled better
        return [] if @import_dir == 'series'

        identifier = related_topic_identifier.strip

        # exact match first; if no matches, try downcase and upcase searches
        matching_records = []
        [identifier, identifier.downcase, identifier.upcase].each do |candidate|
          matching_records = @import_records_xml.xpath(
            "#{@xml_path_to_record}[#{@record_identifier_xml_field}=#{importer_xpath_string_literal(candidate)}]"
          )
          break if matching_records.any?
        end

        matching_records.each do |record|
          # HACK, for horizons agency/series import, needs to be handled better
          # remove agency.Successor and agency.Predecessor (causes infinite loop) for nodes for now
          record.search('//agency.Successor').each do |node|
            node.remove
          end
          record.search('//agency.Predecessor').each do |node|
            node.remove
          end

          related_topics << importer_process(record, params) unless record.blank? || record.content.blank?
        end
      end

      related_topics
    end

    # Renders +value+ as an XPath 1.0 string literal. XPath has no escape
    # character, so a value containing an apostrophe is wrapped in double
    # quotes, and a value containing both quote kinds is built with concat().
    def importer_xpath_string_literal(value)
      return "'#{value}'" unless value.include?("'")
      return "\"#{value}\"" unless value.include?('"')

      pieces = value.split("'", -1).map { |piece| "'#{piece}'" }
      "concat(#{pieces.join(%q(, "'", ))})"
    end

    # Maps one imported field/value pair onto the extended field it is a
    # synonym for and writes the value into
    # params[zoom_class_for_params]['extended_content_values'].
    #
    # options:
    #   :params                - the params hash being built up (returned)
    #   :field                 - name of the field in the import record
    #   :value                 - the field's value; blank values are ignored
    #   :zoom_class_for_params - key of the item hash inside params
    #
    # Field-to-extended-field lookups are cached in
    # @import_field_to_extended_field_map. Values are shaped according to the
    # extended field's ftype (choice/autocomplete, topic_type, year, other)
    # and whether it is multiple (comma separated values).
    #
    # Always returns params.
    def importer_prepare_extended_field(options = {})
      params = options[:params]
      field = options[:field]
      value = options[:value]
      zoom_class_for_params = options[:zoom_class_for_params]
      if !value.blank?
        # look up the synonym for the field
        # check if it's been mapped locally
        extended_field = ''
        if @import_field_to_extended_field_map[field].present?
          extended_field = @import_field_to_extended_field_map[field]
        else
          if @import_topic_type
            extended_fields = @import_topic_type.mapped_fields
          else
            # field comes from the imported data, so bind it rather than
            # interpolating it into the SQL string
            extended_fields = ExtendedField.all(conditions: ['import_synonyms like ?', "%#{field}%"])
          end

          if extended_fields.present?
            # the LIKE above is only a coarse filter; require an exact synonym match
            extended_field = extended_fields.select { |ext_field| (ext_field.import_synonyms || '').split.include?(field) }.first
            @import_field_to_extended_field_map[field] = extended_field
          else
            logger.info('field in prepare: ' + field.inspect)
            @import_field_to_extended_field_map[field] = I18n.t('importer_lib.importer_prepare_extended_field.not_available')
          end
        end

        if extended_field.present? && (extended_field != I18n.t('importer_lib.importer_prepare_extended_field.not_available'))
          # add some smarts for handling fields that are multiple
          # assumes comma separated values

          params[zoom_class_for_params]['extended_content_values'] = {} if \
            params[zoom_class_for_params]['extended_content_values'].nil?

          if %w{choice autocomplete}.include?(extended_field.ftype)
            # hierarchical choices are written as "parent->child"; each level
            # is stored under a 1-based string index
            params[zoom_class_for_params]['extended_content_values'][extended_field.label_for_params] ||= {}
            if extended_field.multiple
              value.split(',').each_with_index do |multiple_choice, multiple_index|
                params[zoom_class_for_params]['extended_content_values'][extended_field.label_for_params][(multiple_index + 1).to_s] ||= {}
                multiple_choice.strip.split('->').each_with_index do |choice, choice_index|
                  params[zoom_class_for_params]['extended_content_values'][extended_field.label_for_params][(multiple_index + 1).to_s][(choice_index + 1).to_s] = choice.strip
                end
              end
            else
              value.split('->').each_with_index do |choice, choice_index|
                params[zoom_class_for_params]['extended_content_values'][extended_field.label_for_params][(choice_index + 1).to_s] = choice.strip
              end
            end

          # Kieran Pilkington, 2009-10-28
          # The following code does not work yet
          # TODO: it looks like this still needs multiple support?
          elsif extended_field.ftype == 'topic_type' && @extended_field_that_contains_related_topics_reference.present?
            logger.info 'dealing with topic_type extended field'
            logger.info 'what is value? ' + value.inspect
            unless value =~ /http:\/\//
              logger.info 'value does not include http://'
              topic_type = TopicType.find_by_id(extended_field.topic_type)
              logger.info 'finding topic in topic type: ' + topic_type.inspect

              topics = importer_fetch_related_topics(
                value, params, {
                  item_type: 'topics',
                  topic_type: topic_type,
                  extended_field_data: {
                    label: @extended_field_that_contains_related_topics_reference.label_for_params,
                    value: value
                  }
                }
              )
              logger.info 'what is found topics? ' + topics.inspect
              return params if topics.blank?
              topic_url = url_for_dc_identifier(topics.first)
              value = { 'label' => value, 'value' => topic_url }
              logger.info 'what is resulting value? ' + value.inspect
            end
            params[zoom_class_for_params]['extended_content_values'][extended_field.label_for_params] = value

          elsif extended_field.ftype == 'year'
            # a "circa" marker is split off into its own flag; when present
            # only the first run of digits is kept as the value
            if extended_field.multiple
              multiple_values = value.split(',')
              m_field_count = 1
              params[zoom_class_for_params]['extended_content_values'][extended_field.label_for_params] = {}
              multiple_values.each do |m_field_value|
                circa = m_field_value =~ /(circa|c.?\d+)/i # circa 2010, c 2010, c.2010
                m_field_value = (m_field_value =~ /(\d+)/ && $1) if circa
                params[zoom_class_for_params]['extended_content_values'][extended_field.label_for_params][m_field_count] = { value: m_field_value.to_s.strip, circa: (circa ? '1' : '0') }
                m_field_count += 1
              end
            else
              circa = value =~ /(circa|c.?\d+)/i # circa 2010, c 2010, c.2010
              value = (value =~ /(\d+)/ && $1) if circa
              params[zoom_class_for_params]['extended_content_values'][extended_field.label_for_params] = { value: value.to_s.strip, circa: (circa ? '1' : '0') }
            end

          else
            if extended_field.multiple
              multiple_values = value.split(',')
              m_field_count = 1
              params[zoom_class_for_params]['extended_content_values'][extended_field.label_for_params] = {}
              multiple_values.each do |m_field_value|
                params[zoom_class_for_params]['extended_content_values'][extended_field.label_for_params][m_field_count] = m_field_value.to_s.strip
                m_field_count += 1
              end
            else
              params[zoom_class_for_params]['extended_content_values'][extended_field.label_for_params] = value.to_s
            end
          end
        end
      end

      params
    end

    # populate extended_fields param with xml
    # based on params from the form
    # Builds the extended content XML for an item from the values under
    # params[item_key]['extended_content_values'], one element per entry in
    # @fields, and stores it in params[item_key][:extended_content].
    # Returns params.
    def importer_extended_fields_update_hash_for_item(options = {})
      params = options[:params]
      item_key = options[:item_key].to_sym

      builder = Nokogiri::XML::Builder.new
      builder.root do |xml|
        @fields.each do |field_to_xml|
          field_name = field_to_xml.extended_field_label.downcase.tr(' ', '_')

          if field_to_xml.extended_field_multiple
            # any failure while digging for the values counts as "no values"
            hash_of_values = begin
              params[item_key]['extended_content_values'][field_name]
            rescue StandardError
              nil
            end
            next if hash_of_values.nil?

            xml.safe_send("#{field_name}_multiple") do
              hash_of_values.keys.each do |key|
                xml.safe_send(key.to_s) do
                  logger.debug('inside hash: key: ' + key.to_s)
                  extended_content_field_xml_tag(
                    xml: xml,
                    field: field_name,
                    value: hash_of_values[key],
                    xml_element_name: field_to_xml.extended_field_xml_element_name,
                    xsi_type: field_to_xml.extended_field_xsi_type,
                    extended_field: field_to_xml.extended_field
                  )
                end
              end
            end
          else
            # a missing value, or a failure while digging for it, becomes ''
            single_value = begin
              params[item_key]['extended_content_values'][field_name] || ''
            rescue StandardError
              ''
            end
            extended_content_field_xml_tag(
              xml: xml,
              field: field_name,
              value: single_value,
              xml_element_name: field_to_xml.extended_field_xml_element_name,
              xsi_type: field_to_xml.extended_field_xsi_type,
              extended_field: field_to_xml.extended_field
            )
          end
        end
      end

      params[item_key][:extended_content] = builder.to_stripped_xml
      params
    end

    # strip out raw extended_fields and create a valid params hash for new/create/update
    # Returns a new hash containing only the entries of params[item_key]
    # whose key is a column of the item class or one of the extra fields
    # ('tag_list', 'uploaded_data', 'url' plus options[:extra_fields]),
    # with :do_not_moderate set to true.
    #
    # options:
    #   :params       - params hash holding the item attributes
    #   :item_key     - key of the item hash inside params (converted to a symbol)
    #   :item_class   - class name as a string, e.g. 'Topic'
    #   :extra_fields - optional additional keys to let through; the caller's
    #                   array is not modified
    def importer_extended_fields_replacement_params_hash(options = {})
      params = options[:params]
      item_key = options[:item_key].to_sym
      item_class = options[:item_class]

      # build a fresh array rather than appending to the caller's
      extra_fields = (options[:extra_fields] || []) + %w[tag_list uploaded_data url]

      column_names = nil
      replacement_hash = {}

      params[item_key].keys.each do |field_key|
        # resolve the class once, and only when there is something to check
        column_names ||= Object.const_get(item_class).column_names

        # we only want real topic columns, not pseudo ones that are handled by extended_content xml
        if column_names.include?(field_key) || extra_fields.include?(field_key)
          replacement_hash[field_key] = params[item_key][field_key]
        end
      end

      # imports aren't moderated, at least not for the time being
      replacement_hash[:do_not_moderate] = true

      replacement_hash
    end

    # Returns the first +length+ words of source_string joined by single
    # spaces, followed by end_string when words were cut off.
    #
    # length counts words, not characters; zero or a negative length yields
    # no words. A nil source_string is treated as empty.
    def importer_prepare_short_summary(source_string, length = 25, end_string = '')
      words = source_string.to_s.split
      # clamp so that 0 does not turn into the "whole array" range 0..-1
      word_limit = [length.to_i, 0].max

      summary = words.first(word_limit).join(' ')
      words.length > word_limit ? summary + end_string : summary
    end

    # Turns an image reference of the form 'directory\file.ext' into a path
    # under @import_parent_dir_for_image_dirs.
    #
    # If the file is not found as named, a case-insensitive glob is tried,
    # then the same name with its extension downcased or upcased. When the
    # file name contains spaces and the file exists, a copy with the spaces
    # replaced by dots is made (if not already there) and that path is used.
    #
    # Returns the resulting path; the file may still not exist.
    # NOTE(review): a reference without a backslash leaves the file name nil
    # and raises a TypeError below - confirm callers always pass 'dir\file'.
    def importer_prepare_path_to_image_file(image_file)
      image_path_array = image_file.split('\\')

      # prep alternative versions of the filename
      directories_up_to = @import_parent_dir_for_image_dirs + '/' + image_path_array[0] + '/'
      the_file_name = image_path_array[1]

      path_to_file_to_grab = directories_up_to + the_file_name

      # if we can't find the file, try downcasing or upcasing the extension
      unless File.exist?(path_to_file_to_grab)
        logger.debug('path_to_file_to_grab no match yet')

        # Try case insensitive check
        # this may not work on all systems, so falling back to only checking extensions after
        case_insensitive_matches = Dir.glob(path_to_file_to_grab, File::FNM_CASEFOLD)
        if case_insensitive_matches.any?
          path_to_file_to_grab = case_insensitive_matches.first
          logger.debug('path_to_file_to_grab is different by case: ' + path_to_file_to_grab)
        else
          file_name_array = the_file_name.scan(/(.+)(\.[^\d]+$)/)[0]

          # a name without a recognisable extension has no case variants to try
          if file_name_array
            file_name_no_extension, extension = file_name_array

            downer = directories_up_to + file_name_no_extension + extension.downcase
            upper = directories_up_to + file_name_no_extension + extension.upcase

            if File.exist?(downer)
              path_to_file_to_grab = downer
              logger.debug('path_to_file_to_grab is downer: ' + path_to_file_to_grab)
            elsif File.exist?(upper)
              path_to_file_to_grab = upper
              logger.debug('path_to_file_to_grab is upper: ' + path_to_file_to_grab)
            end
          end
        end
      end

      # make a copy of any files that have spaces in their name
      # a better formed name
      # to avoid problems later
      if the_file_name.include?(' ') && File.exist?(path_to_file_to_grab)
        new_file_path = directories_up_to + the_file_name.tr(' ', '.')

        FileUtils.copy_file path_to_file_to_grab, new_file_path unless File.exist?(new_file_path)
        path_to_file_to_grab = new_file_path
      end

      path_to_file_to_grab
    end

    # Records one more successfully processed record: flags the run as
    # successful, bumps the counter in @results, republishes @results via
    # cache and stores the new count on @import.
    def importer_update_records_processed_vars
      @successful = true

      processed_so_far = @results[:records_processed] + 1
      @results[:records_processed] = processed_so_far
      cache[:results] = @results

      @import.update_attributes(records_processed: processed_so_far)
    end

    # Ends the worker by calling Kernel#exit, which raises SystemExit.
    # NOTE(review): SystemExit is not a StandardError, so the bare `rescue`
    # in do_work does not intercept it.
    def stop_worker
      exit
    end

    # Finishes a run: writes the success or failure notice (plus any recorded
    # error) into @results, marks the work as done, stores the final status
    # on @import, republishes @results via cache and stops the worker.
    def importer_update_processing_vars_at_end
      if @successful
        @results[:notice] = I18n.t('importer_lib.importer_update_processing_vars_at_end.import_successful')
      else
        @results[:notice] = I18n.t('importer_lib.importer_update_processing_vars_at_end.import_failed')
        unless @results[:error].nil?
          logger.info("import error: #{@results[:error]}")
          @results[:notice] += @results[:error]
        end
      end
      @results[:done_with_do_work] = true

      final_status =
        if @successful
          'complete'
        else
          I18n.t('importer_lib.importer_update_processing_vars_at_end.failed_status')
        end
      @import.update_attributes(status: final_status)

      cache[:results] = @results
      stop_worker
    end

    # Called from a rescue clause: records the current exception ($!) in
    # @results, marks the run as failed and done, republishes @results via
    # cache, sets the failed status on @import and stops the worker.
    #
    # @import may still be nil when the failure happened before or while the
    # Import was being loaded; the status update is skipped in that case so
    # the original error is not masked and the worker is still stopped.
    def importer_update_processing_vars_if_rescue
      @successful = false
      @results[:error] = $!.to_s
      @results[:done_with_do_work] = true
      cache[:results] = @results

      if @import
        @import.update_attributes(status: I18n.t('importer_lib.importer_update_processing_vars_if_rescue.failed_status'))
      end

      stop_worker
    end

    # Looks in @current_basket for items that match the given criteria, so
    # the importer can avoid creating duplicates.
    #
    # options (all optional):
    #   :item_type           - association name on the basket, defaults to
    #                          @zoom_class_for_params.pluralize
    #   :title               - matched case-insensitively
    #   :topic_type          - only used when item_type is 'topics'
    #   :extended_field_data - { label:, value: } matched against the
    #                          extended_content XML with a regular expression
    #   :filename            - matched case-insensitively; for 'still_images'
    #                          it is checked against image_files instead
    #
    # Title, topic type, extended field and (non-image) filename criteria are
    # checked against the item's versions via formulate_conditions.
    #
    # Returns [] when @related_topic is set or when no criteria were given
    # (there is nothing a duplicate could be matched on).
    #
    # NOTE(review): the extended field value is placed in the regular
    # expression unescaped, so regexp metacharacters in it are interpreted.
    def importer_locate_existing_items(options = {})
      # not applicable to related_topic imports, at least for the moment
      return [] if @related_topic.present?

      options = {
        item_type: @zoom_class_for_params.pluralize,
        title: nil,
        topic_type: nil,
        extended_field_data: {},
        filename: nil
      }.merge(options)

      conditions = []
      params = {}
      image_file_conditions = nil

      if options[:title].present?
        conditions << '(LOWER(title) = :title)'
        params[:title] = options[:title].downcase
      end

      if options[:item_type] == 'topics' && options[:topic_type].present?
        conditions << '(topic_type_id = :topic_type_id)'
        params[:topic_type_id] = options[:topic_type].id
      end

      if options[:filename].present?
        # if zoom_class is StillImage
        # we need to do a join on ImageFile
        # to check filename
        filename_condition = 'LOWER(filename) = :filename'
        if options[:item_type] == 'still_images'
          image_file_conditions = "id IN (SELECT still_image_id FROM image_files WHERE #{filename_condition})"
        else
          conditions << "(#{filename_condition})"
        end
        params[:filename] = options[:filename].downcase
      end

      unless options[:extended_field_data].blank?
        regexp = ActiveRecord::Base.connection.adapter_name.downcase =~ /postgres/ ? '~*' : 'REGEXP'
        ext_field_label = options[:extended_field_data][:label]
        ext_field_value = options[:extended_field_data][:value]
        conditions << "(LOWER(extended_content) #{regexp} :ext_field_data)"
        params[:ext_field_data] = "<#{ext_field_label}[^>]*>#{ext_field_value}</#{ext_field_label}>".downcase
      end

      # Select all topics where the id is within a subselect of topic versions matching criteria
      # Adds a little complexity, but gets around privacy related import issues, as well as
      # no longer adds the topic if the first version was the same title but was later changed
      clauses = []
      unless conditions.empty?
        clauses << formulate_conditions(conditions.join(' AND '), options[:item_type].singularize)
      end
      # only present for still images that were given a filename
      clauses << image_file_conditions if image_file_conditions

      # no criteria at all would produce an empty WHERE clause
      return [] if clauses.empty?

      conditions = clauses.join(' AND ')
      conditions = [conditions, params] unless params.blank?
      logger.debug('what are conditions: ' + conditions.inspect)
      @current_basket.send(options[:item_type]).find(:all, conditions: conditions)
    end

    # Wraps +conditions+ in a subselect over the "<item_type>_versions" table,
    # yielding a condition on the item's own id.
    def formulate_conditions(conditions, item_type)
      versions_subselect = "SELECT #{item_type}_id FROM #{item_type}_versions WHERE #{conditions}"
      "id IN (#{versions_subselect})"
    end

    # override in your importer worker to customize
    # takes an xml element
    # Imports a single record unless a matching item already exists.
    #
    # record is either a String path to an existing file (values are then
    # derived from the file name) or an XML element, which is converted with
    # importer_xml_record_to_hash. Anything that is not a String is treated
    # as XML, since File.exist? only accepts path-like arguments.
    #
    # Existing items are looked up by title, import topic type, record
    # identifier (when configured) and file name (when a file is involved).
    # A record that already exists is logged to the skipped records.
    #
    # Returns the existing item, the newly created record, or nil.
    def importer_process(record, params)
      current_record = @results[:records_processed] + 1
      logger.info("starting record #{current_record}")

      record_hash = {}
      # if a file is passed in, we assume embedded metadata
      # (or filename and form settings)
      # will be what we derive our hash values from
      # otherwise, we expect xml to derive hash values from
      if record.is_a?(String) && File.exist?(record)
        record_hash['placeholder_title'] = File.basename(record, File.extname(record)).tr('_', ' ')
        record_hash['path_to_file'] = record
      else
        record_hash = importer_xml_record_to_hash(record)
      end

      reason_skipped = nil

      logger.info("record #{current_record} : looking for topic")

      # will only work with topics
      # we need a title attribute
      # if this is well set up there should only be one matching record_hash key
      # that is a title synonym, we go with last match just in case
      title_synonyms = SystemSetting.title_synonyms
      title = nil
      record_hash.keys.each do |field_name|
        if field_name.casecmp('title').zero? || (title_synonyms && title_synonyms.include?(field_name))
          # to_s so an empty title element (nil) does not abort the import
          title = record_hash[field_name].to_s.strip
        end
      end

      logger.info('after record field_name loop')

      # In some cases, records may share the same name, but have a different code
      # In order to accommodate that, we check both title, extended field data
      # and topic type if available
      # Otherwise, do a very basic check against items with the same title and topic type
      options = {
        title: title,
        topic_type: @import_topic_type
      }

      if record_hash[@record_identifier_xml_field].present? && @extended_field_that_contains_record_identifier.present?
        options[:extended_field_data] = {
          label: @extended_field_that_contains_record_identifier.label_for_params,
          value: record_hash[@record_identifier_xml_field]
        }
      end

      # attachable classes may have an upload file specified in file xml element
      # if file exists, we know we are uploading files for an attachable class
      if record_hash['path_to_file'].present? &&
         File.exist?(record_hash['path_to_file'])
        logger.info('setting filename check')
        options[:filename] = File.basename(record_hash['path_to_file'])
      end
      logger.info('after path_to_file present')

      existing_item = importer_locate_existing_items(options).first

      new_record = nil
      if existing_item.blank?
        description_end_template = @description_end_templates['default']
        new_record = create_new_item_from_record(record, @zoom_class, { params: params, record_hash: record_hash, description_end_template: description_end_template })
      else
        logger.info('what is existing item: ' + existing_item.id.to_s)
        # record exists in kete already
        reason_skipped = I18n.t('importer_lib.importer_process.already_have_record')
      end

      if !new_record.nil? && !new_record.id.nil?
        logger.info('new record succeeded for insert')
        new_record.prepare_and_save_to_zoom
        importer_update_records_processed_vars
      end

      # if this record was skipped, add to skipped_records
      importer_log_to_skipped_records(title, reason_skipped) unless reason_skipped.blank?

      # give zebra and our server a small break
      # (to_f so an unset interval simply means no pause)
      sleep(@record_interval) if @record_interval.to_f > 0

      existing_item || new_record
    end

    # XPATH was proving too unreliable
    # switching to pulling record to a hash
    # and grabbing the specific fields
    # we need to check
    # Converts an XML record to a hash of its fields via Hash.from_xml,
    # stepping past the top-level key(s) so the fields are at the top level.
    # With upcase set, String keys are upcased (used to smooth some legacy
    # code in the past perfect import).
    def importer_xml_record_to_hash(record, upcase = false)
      parsed = Hash.from_xml(record.to_s)

      # HACK: to go down one more level
      record_hash = parsed.keys.inject(parsed) { |level, record_field| level[record_field] }

      if upcase
        record_hash = record_hash.each_with_object({}) do |(key, value), upcased|
          upcased[key.is_a?(String) ? key.upcase : key] = value
        end
      end

      logger.info("record_hash inspect: #{record_hash.inspect}")
      record_hash
    end

    # copied and modified from http://www.broobles.com/eml2mbox/eml2mboxscript.html (GPL 2 or later)
    # Returns +line+ with a trailing "\r\n" or "\n" replaced by a single unix
    # newline; a line without any newline gets one appended.
    #
    # The previous byte comparisons (line[-1] == 0xA) only worked while
    # String#[] returned integers; with one-character strings they never
    # matched and the method returned nil for any line containing "\n".
    def remove_non_unix_new_lines(line)
      stripped = line.sub(/\r?\n\z/, '')
      # add a unix newline if not already there
      stripped.include?("\n") ? stripped : stripped + "\n"
    end

    # override in your importer worker to customize
    # takes a potentially huge xml file and strips out all the empty fields
    # so it much more manageable
    # output is to a tmp file
    # has commented out code for replacing macronized vowels
    # uncomment if you need them
    # Copies path_to_original_file to path_to_output line by line, dropping
    # every line that contains a self-closing tag ("/>").
    #
    # When accession is non-nil only the accessno/descrip lines and their
    # containing elements are kept, and each accessno line is rewritten as
    # the opening <Record> (when @root_element_name is 'Root') or <export>
    # element, carrying the accession number as an ACCESSNO attribute.
    #
    # The output file is opened in block form so it is closed even if
    # reading or writing fails. Returns path_to_output.
    def importer_trim_fat_from_xml_import_file(path_to_original_file, path_to_output, accession = nil)
      fatty_re = %r{/>.*}
      accessno_re = /ACCESSNO>(.*)</i

      # lines kept in accession mode; putting in both styles of records
      kept_accession_markers = [
        '<ACCESSNO', '<accessno',
        '<DESCRIP', '<descrip',
        '</DESCRIP', '</descrip',
        '</Record', '</export',
        '<Information', '</Information',
        '<Root', '<VFPData',
        '</Root', '</VFPData'
      ]

      File.open(path_to_output, 'w+') do |fat_free_file|
        File.foreach(path_to_original_file) do |line|
          line = remove_non_unix_new_lines(line)

          # only add non-fat to our fat_free_file
          # keeping new lines only lines for redcloth formatting
          next if line.match(fatty_re)

          if accession.nil?
            # replace double dotted version of maori vowels
            # with macrons
            # replacements = { 'ä' => 'ā',
            #               'ë' => 'ē',
            #               'ï' => 'ī',
            #               'ö' => 'ō',
            #               'ü' => 'ū' }

            #             replacements.each do |old_style_vowel, macronized|
            #               line = line.gsub(old_style_vowel, macronized).gsub(old_style_vowel.upcase, macronized.upcase)
            #             end
            fat_free_file << line
            next
          end

          # HACK: to seriously trim down accession records
          # and make them in a form we can search easily
          # we only keep accessno and descrip
          # and their containing elements
          # but we change accessno to an attribute of record
          # rather than an element
          # this relies on the accessno line coming before the descrip line
          # it tosses the original <Record> or <export> line, so that it can be replaced
          next unless kept_accession_markers.any? { |marker| line.include?(marker) }

          unless line.include?('<accessno') || line.include?('<ACCESSNO')
            fat_free_file << line
            next
          end

          # we expect accessno to be on one line, this will break if not
          accessno_match_result = line.match(accessno_re)
          accessno = accessno_match_result && accessno_match_result[1]

          # if accessno is empty, we just open the export or Record so we have valid xml
          # otherwise set as appropriate to the source xml file's format
          record_element = @root_element_name == 'Root' ? 'Record' : 'export'
          new_start_record_line = "<#{record_element}"
          new_start_record_line += " ACCESSNO='#{accessno}'" unless accessno.blank?

          fat_free_file << new_start_record_line + ">\n"
        end
      end

      path_to_output
    end

    # Maps one imported field onto the params used to build the new item.
    #
    # record_field - name of the field as it appears in the source record
    # record_value - raw value for that field; it is stripped and carriage
    #                returns are turned into newlines before it is used
    # params       - params hash being accumulated for the item
    # zoom_class   - class name of the item being imported, e.g. 'StillImage'
    #
    # The value may land in several places at once: an extended field (via
    # importer_prepare_extended_field), title, description, short_summary,
    # url (WebLink only), @tag_list_array, or the uploaded file slot.
    #
    # Always returns params, including for ignored fields and blank values,
    # so callers can safely write `params = assign_value_to_appropriate_fields(...)`.
    def assign_value_to_appropriate_fields(record_field, record_value, params, zoom_class)
      # a bare `return` here would hand nil back to the caller
      # and wipe out the params collected for the record so far
      return params if SystemSetting.import_fields_to_ignore.include?(record_field)
      logger.debug('record_field ' + record_field.inspect)

      zoom_class_for_params = zoom_class.tableize.singularize

      record_value = record_value.strip.tr("\r", "\n") if record_value.present?
      return params unless record_value.present?

      # if it's mapped to an extended field, params are updated
      params = importer_prepare_extended_field(
        value: record_value,
        field: record_field,
        zoom_class_for_params: zoom_class_for_params,
        params: params
      )

      # true when the given synonyms setting is populated and lists this field
      field_in = lambda do |synonyms|
        !synonyms.blank? && synonyms.include?(record_field)
      end

      # the field may also be mapped to non-extended fields
      # such as tags, description, title
      # the value may be used multiple times, so a case statement isn't appropriate
      if record_field.casecmp('TITLE').zero? || field_in.call(SystemSetting.title_synonyms)
        params[zoom_class_for_params][:title] = record_value
      end

      # description and short_summary accumulate: each further matching field
      # is appended as a new paragraph
      [[:description, SystemSetting.description_synonyms],
       [:short_summary, SystemSetting.short_summary_synonyms]].each do |attribute, synonyms|
        next unless field_in.call(synonyms)

        existing = params[zoom_class_for_params][attribute]
        params[zoom_class_for_params][attribute] = existing.nil? ? record_value : existing + "\n\n" + record_value
      end

      if field_in.call(SystemSetting.tags_synonyms)
        @tag_list_array += record_value.split(',').map(&:strip)
      end

      if zoom_class == 'WebLink' && record_field.casecmp('URL').zero?
        params[zoom_class_for_params][:url] = record_value
      end

      # path_to_file is special case, we know we have an associated file that goes in uploaded_data
      if record_field == 'path_to_file'
        logger.debug('in path_to_file')
        if ::Import::VALID_ARCHIVE_CLASSES.include?(zoom_class) && File.exist?(record_value)
          # we do a check earlier in the script for imagefile
          # so we should have something to work with here
          upload_hash = { uploaded_data: copy_and_load_to_temp_file(record_value) }
          if zoom_class == 'StillImage'
            # still images keep their attachment in a separate image_file entry
            logger.debug('in image')
            params[:image_file] = upload_hash
          else
            logger.debug('in not image')
            params[zoom_class_for_params] = params[zoom_class_for_params].merge(upload_hash)
          end
        end
      end

      params
    end

    # override in your importer worker to customize
    # expects an xml element of our record or a file with a simple record_hash
    # TODO: add support for zoom_classes that may have attachments
    # steal from past perfect importer
    # record_hash has to have file key
    #
    # Builds the params for one imported record, instantiates the zoom_class
    # item from them, saves it, then attaches its image file (still images)
    # and builds its topic relations.
    #
    # record     - source record; only read (via importer_xml_record_to_hash)
    #              when options[:record_hash] is not supplied
    # zoom_class - class name of the item to create, e.g. 'Topic'
    # options    - :params (required; its zoom_class subhash is reset here),
    #              :record_hash, :basket_id, :description_end_template
    #
    # Side effects: resets @tag_list_array and @fields.
    #
    # Returns the new record whether or not it saved. On failure the errors
    # are logged and any image file created for the record is destroyed.
    def create_new_item_from_record(record, zoom_class, options = {})
      zoom_class_for_params = zoom_class.tableize.singularize

      params = options[:params]

      # initialize the subhash in params
      # clears it out if it exists already
      params[zoom_class_for_params] = {}
      params[zoom_class_for_params][:basket_id] =
        options[:basket_id].nil? ? @current_basket.id : options[:basket_id]

      # check extended_field.import_field_synonyms
      # for which extended field to map the import_field to
      # special cases for title, short_summary, and description
      record_hash = options[:record_hash].nil? ? importer_xml_record_to_hash(record) : options[:record_hash]

      # add support for all items during this import getting a set of tags
      # added to every item in addition to the specific ones for the item
      @tag_list_array = @import.base_tags.blank? ? [] : @import.base_tags.split(',').map(&:strip)

      # Run each value through any importer field methods that exist
      # and get back the value plus any other fields needing setting
      import_field_methods_file = Rails.root.join('config/importers.yml').to_s
      if File.exist?(import_field_methods_file)
        # NOTE(review): YAML.load and the eval below execute whatever
        # config/importers.yml contains, so that file must stay trusted.
        # The eval runs with this method's binding; local names in scope
        # here are deliberately left as they were.
        importer_field_methods = (YAML.load(File.read(import_field_methods_file)) || {})[@import_type.to_s]

        if importer_field_methods.is_a?(Hash)
          additional_fields_derived_from_processing_values = {}
          record_hash.each do |record_field, record_value|
            next unless record_value.present?
            next unless importer_field_methods[record_field.downcase]

            field_modifier = eval(importer_field_methods[record_field.downcase])
            # a two-argument modifier also receives the whole record
            args = field_modifier.arity == 2 ? [record_value, record_hash] : [record_value]
            # modifiers return either a value, or [value, { extra_field => value }]
            parsed_value = Array(field_modifier.call(*args))
            additional_fields_derived_from_processing_values.merge!(parsed_value.last) if parsed_value.last.is_a?(Hash)
            record_hash[record_field] = parsed_value.first
          end

          # Loop over each result, add to record_hash if it doesn't exist yet,
          # or append the value to what already exists in record_hash
          additional_fields_derived_from_processing_values.each do |record_field, record_value|
            if record_hash[record_field].present?
              record_hash[record_field] += "\n\n" + record_value
            else
              record_hash[record_field] = record_value
            end
          end
        end
      end

      # Loop over each record value and assign the value to the appropriate fields.
      # The `|| params` keeps what has been collected so far if the assignment
      # method (or a worker's override of it) returns nil for a field.
      record_hash.each do |record_field, record_value|
        params = assign_value_to_appropriate_fields(record_field, record_value, params, zoom_class) || params
      end

      logger.info('after fields')

      # the import's own beginning template wins over the site-wide one
      description_beginning =
        if !@import.description_beginning_template.blank?
          @import.description_beginning_template
        elsif !SystemSetting.description_template.blank?
          SystemSetting.description_template
        end

      unless description_beginning.nil?
        if params[zoom_class_for_params][:description].nil?
          params[zoom_class_for_params][:description] = description_beginning
        else
          params[zoom_class_for_params][:description] = description_beginning + "\n\n" + params[zoom_class_for_params][:description]
        end
      end

      unless options[:description_end_template].nil?
        # append the description_end_template to the description field
        if params[zoom_class_for_params][:description].nil?
          params[zoom_class_for_params][:description] = options[:description_end_template]
        else
          params[zoom_class_for_params][:description] += "\n\n" + options[:description_end_template]
        end
      end

      # used to give us better html output for descriptions
      unless params[zoom_class_for_params][:description].nil?
        params[zoom_class_for_params][:description] = RedCloth.new(params[zoom_class_for_params][:description]).to_html
      end

      params[zoom_class_for_params][:tag_list] = @tag_list_array.join(',')
      params[zoom_class_for_params][:raw_tag_list] = params[zoom_class_for_params][:tag_list]

      # set the chosen privacy
      private_setting = @import.private
      logger.debug('private = ' + private_setting.to_s)
      params[zoom_class_for_params][:private] = private_setting

      # set the chosen file privacy
      params[zoom_class_for_params][:file_private] = @import.file_private

      # add the uniform license chosen at import to this item
      params[zoom_class_for_params][:license_id] = @import.license.id unless @import.license.blank?

      # rebuild @fields from scratch for this item's type
      if zoom_class == 'Topic'
        params[zoom_class_for_params][:topic_type_id] = @import_topic_type.id

        @fields = @import_topic_type.topic_type_to_field_mappings

        # ancestors' field mappings apply to the topic type as well
        TopicType.find(@import_topic_type).ancestors.each do |ancestor|
          @fields += ancestor.topic_type_to_field_mappings
        end
      else
        @fields = ContentType.find_by_class_name(zoom_class).content_type_to_field_mappings
      end

      if @fields.size > 0
        logger.info('fields larger than 0')

        # we use our version of this method
        # that calls xml builder directly, rather than using partial template
        # (it is handed the subhash under a symbol key as well as the string key)
        params[zoom_class_for_params.to_sym] = params[zoom_class_for_params]
        params = importer_extended_fields_update_hash_for_item(item_key: zoom_class_for_params, params: params)
      end

      logger.info('after field set up')

      # replace with something that isn't reliant on params
      replacement_zoom_item_hash = importer_extended_fields_replacement_params_hash(item_key: zoom_class_for_params, item_class: zoom_class, params: params)

      logger.info 'what is replacement_zoom_item_hash? ' + replacement_zoom_item_hash.inspect

      new_record = Module.class_eval(zoom_class).new(replacement_zoom_item_hash)

      # we need new_image_file's file, for our embedded metadata (if enabled)
      # thus we have to create it before the still image
      new_image_file = params[:image_file].blank? ? nil : importer_add_image(params, zoom_class)

      # only necessary for still images, because attachment is in a child model
      # if we are allowing harvesting of embedded metadata from the image_file
      # we need to grab it from the image_file's file path
      if SystemSetting.enable_embedded_support && !new_image_file.nil? && zoom_class == 'StillImage'
        new_record.populate_attributes_from_embedded_in(new_image_file.full_filename)
      end

      # handle special case where title is derived from filename
      if new_record.title.blank?
        if SystemSetting.enable_embedded_support && zoom_class != 'StillImage' && ATTACHABLE_CLASSES.include?(zoom_class)
          new_record.title = '-replace-' + record_hash['placeholder_title']
        else
          new_record.title = record_hash['placeholder_title']
        end
      end

      # respect the Related Items Inset configurations
      if new_record.respond_to?(:related_items_position)
        new_record.related_items_position = SystemSetting.related_items_position_default || 'inset'
      end

      # a still image is only saved when its image file was created;
      # every other class is always saved
      saveable = zoom_class != 'StillImage' || !new_image_file.nil?
      new_record_added = saveable ? new_record.save : false

      if new_record_added
        importer_add_still_image_to(new_image_file, new_record, zoom_class) unless new_image_file.nil?

        new_record.creator = @contributing_user

        importer_build_relations_to(new_record, record_hash, options[:params])

        logger.info('in topic creation made it past creator')
      else
        # destroy images if the record wasn't added successfully
        new_image_file.destroy unless new_image_file.nil?

        logger.info('what are errors on save of new record: ' + new_record.errors.inspect)
      end

      new_record
    end

    # Relates a freshly imported record to one or more topics.
    #
    # new_record  - the item that was just created
    # record_hash - field name => value hash for the source record
    # params      - passed through to importer_fetch_related_topics
    #
    # Two modes:
    # * @related_topic is set: the record is related to that single topic.
    # * otherwise @related_topics_reference_in_record_xml_field names the
    #   record field(s) holding comma separated identifiers of the topics to
    #   relate to. A String value of that instance variable is split on commas
    #   and stored back as an Array.
    #
    # @last_related_topic_identifier and @last_related_topics remember the most
    # recent lookup so that repeating the same identifier does not fetch again.
    def importer_build_relations_to(new_record, record_hash, params)
      logger.info('building relations for new record')

      if @related_topics_reference_in_record_xml_field.blank? && @related_topic.blank?
        logger.info('no relations to be made for new record')
        return
      end

      if @related_topic.present?
        # add relation to related_topic
        ContentItemRelation.new_relation_to_topic(@related_topic.id, new_record)

        # it would be faster to do this just once after all new records
        # were related, but doing it for each new record means that
        # if the import fails, every record related up to the failure
        # has its relationship reflected in the related topic
        @related_topic.prepare_and_save_to_zoom
      else
        # We need an array to loop over, but we also allow single values as strings, so convert as needed
        # Split by commas in case multiple ones are provided, and strip whitespace
        if @related_topics_reference_in_record_xml_field.is_a?(String)
          @related_topics_reference_in_record_xml_field = @related_topics_reference_in_record_xml_field.split(',').map(&:strip)
        end

        @related_topics_reference_in_record_xml_field.each do |reference_field|
          next if reference_field.blank?
          if record_hash[reference_field].blank?
            logger.info("no relational field found with name of #{reference_field}")
            next
          end

          record_hash[reference_field].split(',').each do |related_topic_identifier|
            related_topic_identifier = related_topic_identifier.strip

            related_topics = nil
            if @last_related_topic_identifier.blank? || @last_related_topic_identifier != related_topic_identifier
              # a fetch is only possible when we know which extended field holds the reference
              if @extended_field_that_contains_related_topics_reference.present?
                related_topics = importer_fetch_related_topics(
                  related_topic_identifier, params, {
                    item_type: 'topics',
                    topic_type: @related_topic_type,
                    extended_field_data: {
                      label: @extended_field_that_contains_related_topics_reference.label_for_params,
                      value: related_topic_identifier
                    }
                  }
                )
              end
            else
              related_topics = @last_related_topics
            end

            next if related_topics.blank?

            # flatten before de-duplicating, otherwise a topic that appears in
            # more than one nested result set would be related more than once
            related_topics.flatten.compact.uniq.each do |related_topic|
              next if related_topic == new_record
              ContentItemRelation.new_relation_to_topic(related_topic, new_record)
            end

            @last_related_topic_identifier = related_topic_identifier
            @last_related_topics = related_topics
          end
        end
      end
      logger.info('finished building relations for new record')
    end

    # override in your importer worker to customize
    #
    # Default behaviour: write one "identifier: reason" line to the log
    # at info level.
    def importer_log_to_skipped_records(identifier, reason_skipped)
      skipped_message = format('%s: %s', identifier, reason_skipped)
      logger.info(skipped_message)
    end

    # Creates (and returns) a topic of @related_topic_type for imported
    # records to be related to.
    #
    # topic_params - hash whose 'topic' (string key) entry holds the
    #                attributes for the new topic
    #
    # Side effects: resets @fields to the field mappings of
    # @related_topic_type plus those of all of its ancestors.
    #
    # Raises whatever Topic.create! raises when the topic cannot be created.
    def importer_create_related_topic(topic_params)
      # clear any lingering values for @fields by rebuilding it from scratch
      @fields = @related_topic_type.topic_type_to_field_mappings

      # every ancestor contributes its mappings, including a sole ancestor
      @related_topic_type.ancestors.each do |ancestor|
        @fields += ancestor.topic_type_to_field_mappings
      end

      # we use our version of this method
      # that calls xml builder directly, rather than using partial template
      # HACK, conflict with symbol vs string for hash key
      # so the 'topic' entry is duplicated under a symbol key
      temp_params = { topic: topic_params['topic'] }
      topic_params = importer_extended_fields_update_hash_for_item(item_key: 'topic', params: temp_params)

      topic_params[:topic][:basket_id] = @current_basket.id

      # add the uniform license chosen at import to this item, if there is one
      topic_params[:topic][:license_id] = @import.license.blank? ? nil : @import.license.id

      # we set the virtual attribute, do_not_moderate to true
      # so that our imported topics go live right away
      # and thus can be found (since then they won't have blank attributes)
      related_topic = Topic.create!(
        title: topic_params[:topic][:title],
        description: topic_params[:topic][:description],
        short_summary: topic_params[:topic][:short_summary],
        extended_content: topic_params[:topic][:extended_content],
        basket_id: topic_params[:topic][:basket_id],
        license_id: topic_params[:topic][:license_id],
        topic_type_id: topic_params[:topic][:topic_type_id],
        do_not_moderate: true,
        related_items_position: SystemSetting.related_items_position_default || 'inset'
      )

      related_topic.creator = @contributing_user
      related_topic
    end
  end
end