app/models/extended_content_parser.rb from kete/kete

app/models/extended_content_parser.rb
Summary

Maintainability

1 hr
Test Coverage

Issues
# Converts a content item's extended content (as exposed by item.xml_attributes) into
# [name, value] pairs suitable for display or for building OAI/Dublin Core records.
#
# History (ROB): this logic was lifted from oai_dc_xml_dc_extended_content() and
# oai_dc_xml_for_field_dataset() in lib/extended_content_helpers.rb and refactored so that it
# takes values and returns values instead of emitting XML tags from deep inside nested
# conditionals.
#
# Relies on ActiveSupport (Time.zone, Object#present?).
class ExtendedContentParser
  # Suffix that marks a field key as holding several instances of the same field.
  MULTIPLE_SUFFIX = /_multiple$/

  # Converts an item's extended content into key/value pairs for display (in RSS for instance).
  #
  # In the existing oai-xml, anonymous pairs are shown inside <description></description> tags,
  # whereas non-anonymous pairs are shown unwrapped.
  #
  # An example xml tag:  <creation_date xml_element_name="dc:date">16/04/2005</creation_date>
  # generates the pair:  ["dc:date", "16/04/2005"]
  #
  # item - any object responding to xml_attributes (see attribute_pairs_to_process).
  #
  # Returns a two-element array: [anonymous_pairs, non_anonymous_pairs], each an array of
  # [name, value] arrays. The item's xml_attributes hash is not modified.
  def self.key_value_pairs(item)
    anonymous_pairs = []
    non_anonymous_pairs = []

    attribute_pairs_to_process(item).each do |field_key, field_data|
      anonymous_pairs << get_anonymous_key_value_pair(field_key, field_data)
      non_anonymous_pairs.concat(get_non_anonymous_key_value_pairs(field_key, field_data))
    end

    # Fields that carry an xml_element_name produce no anonymous pair (nil), so drop those.
    [anonymous_pairs.compact, non_anonymous_pairs]
  end

  # Flattens item.xml_attributes into an ordered list of [field_key, field_data] pairs.
  #
  # item.xml_attributes is keyed by field position, e.g.
  #
  #   '1':
  #     creator:
  #       xml_element_name: dc:creator
  #   '2':
  #     creation_date:
  #       xml_element_name: dc:date
  #
  # and the data for a field can take several shapes (example from
  # oai_dc_xml_dc_extended_content()):
  #
  #   {
  #     "text_field_multiple" => {
  #       "2" => { "text_field" => { "xml_element_name" => "dc:description", "value" => "Value" } },
  #       "3" => { "text_field" => { "xml_element_name" => "dc:description", "value" => "Second value" } }
  #     },
  #     "married" => "No",
  #     "check_boxes_multiple" => { "1" => { "check_boxes" => "Yes" } },
  #     "vehicle_type" => { "1" => "Car", "2" => "Coupé" },
  #     "truck_type_multiple" => {
  #       "1" => { "truck_type" => { "1" => "Lorry" } },
  #       "2" => { "truck_type" => { "1" => "Tractor Unit", "2" => "Tractor with one trailer" } }
  #     }
  #   }
  #
  # Keys ending in "_multiple" are expanded into one pair per instance (keeping the
  # "_multiple" key); within each position those pairs come before the regular ones.
  #
  # Returns an array of [field_key, field_data] arrays. A nil xml_attributes yields [].
  def self.attribute_pairs_to_process(item)
    fields_by_position = item.xml_attributes || {}

    # Positions are numeric strings, so order them numerically ("2" before "10"); the string
    # form is only a tie-breaker that keeps the order deterministic for non-numeric keys.
    ordered_fields =
      fields_by_position
        .sort_by { |position, _fields| [position.to_s.to_i, position.to_s] }
        .map { |_position, fields| fields }

    ordered_fields.flat_map do |field_hash|
      # If this is google map contents, and no_map is '1', then do not use this data
      usable_fields =
        field_hash.reject do |_field_key, field_data|
          field_data.is_a?(Hash) && field_data['no_map'] == '1'
        end

      multi_instance_fields, regular_fields =
        usable_fields.partition { |field_key, _field_data| field_key =~ MULTIPLE_SUFFIX }

      multi_instance_pairs =
        multi_instance_fields.flat_map do |field_key, instances|
          # Each instance wraps the real data in a single-entry hash keyed by the field name.
          instances.map { |_index, instance| [field_key, instance.values.first] }
        end

      multi_instance_pairs + regular_fields
    end
  end

  # Builds the pairs for a field whose name is explicitly given by the data.
  #
  # field_key - the field's key (unused here; kept for symmetry with
  #             get_anonymous_key_value_pair).
  # data      - the field data. Only Hash data can produce pairs; a String (simple
  #             "key" => "value" content) or any other non-Hash value yields [].
  #
  # Returns an array of [name, value] arrays:
  # * for circa dates (a 'value' with 'circa' == '1'), two 'dc:date' pairs covering five years
  #   before and after the given year;
  # * when 'xml_element_name' is present, one pair named after it.
  #
  # The data hash passed in is not modified.
  def self.get_non_anonymous_key_value_pairs(field_key, data)
    return [] unless data.is_a?(Hash)

    # Work on a copy: the date conversion below rewrites 'value' in place, and the same hash
    # is also handed to get_anonymous_key_value_pair by key_value_pairs.
    data = data.dup

    # We add a dc:date for 5 years before and after the value specified
    # We also convert the single YYYY value to a format Zebra can search against
    date_conversion_for_extended_content_hash!(data)

    pairs = []

    if data['circa'] == '1' && data.key?('value')
      # The converted value still starts with the year, so to_i recovers it.
      year = data['value'].to_i
      pairs << ['dc:date', Time.zone.parse("#{year - 5}-01-01").xmlschema]
      pairs << ['dc:date', Time.zone.parse("#{year + 5}-12-31").xmlschema]
    end

    xml_name = data['xml_element_name']
    if xml_name.present?
      # Escaping drops the namespace from the element and therefore our dc elements
      # would not be parsed by zebra, so only escape non-dc element names.
      xml_name = escape_xml_name(xml_name) unless xml_name.include?('dc:')
      pairs << [xml_name, flatten_any_extended_content_trees(data)]
    end

    pairs
  end

  # Builds the pair for a field that has no explicit xml element name.
  #
  # field_key - the field's key; a trailing "_multiple" is stripped from the resulting name.
  # data      - the field data: a String, or a Hash as described in
  #             attribute_pairs_to_process.
  #
  # Returns [name, value], or nil when the data carries an 'xml_element_name' (that case is
  # handled by get_non_anonymous_key_value_pairs) or is neither a String nor a Hash.
  # The data hash passed in is not modified.
  def self.get_anonymous_key_value_pair(field_key, data)
    original_field_key = field_key.sub(MULTIPLE_SUFFIX, '')

    # In the most simple case, the content is represented as "key" => "value", so use this
    # directly.
    return [original_field_key, data] if data.is_a?(String)
    return nil unless data.is_a?(Hash)
    return nil if data['xml_element_name'].present?

    # Work on a copy so the date conversion does not leak into the caller's hash.
    data = data.dup
    date_conversion_for_extended_content_hash!(data)

    [original_field_key, flatten_any_extended_content_trees(data)]
  end

  # Rewrites data['value'] IN PLACE from a bare year ("1990") to the xmlschema timestamp of
  # 1 January of that year, but only when the hash has both a 'value' and a 'circa' key.
  #
  # Not idempotent: calling it twice on the same hash converts the already-converted value
  # again, so callers should pass a copy they own.
  #
  # Returns the new value, or nil when nothing was converted.
  def self.date_conversion_for_extended_content_hash!(data)
    return unless data.key?('value') && data.key?('circa')

    data['value'] = Time.zone.parse("#{data['value']}-01-01").xmlschema
  end

  # Reduces a field's data hash to a single value.
  #
  # Example of what we might have in data at this point:
  #
  #   {"xml_element_name"=>"dc:subject",
  #    "1"=>{"value"=>"Recreation", "label"=>"Sports & Recreation"},
  #    "2"=>"Festivals",
  #    "3"=>"New Year"}
  #
  # which becomes ":Recreation:Festivals:New Year:".
  #
  # Returns data['value'] when that key exists; otherwise the nested values joined and
  # wrapped with ':' (nil entries are skipped), or nil when there are no values.
  def self.flatten_any_extended_content_trees(data)
    return data['value'] if data.key?('value')

    # This means we're dealing with a second set of nested values. Each entry is either
    # {'label' => 'Something', 'value' => 'This'} or a plain 'This' (or a mix of both),
    # so collect the actual values.
    nested_values =
      data
        .reject { |key, _entry| key == 'xml_element_name' || key == 'label' }
        .values
        .map { |entry| entry.is_a?(Hash) && entry['value'] ? entry['value'] : entry }
        .flatten
        .compact

    nested_values.empty? ? nil : ":#{nested_values.join(':')}:"
  end

  # Make sure that the name is a valid XML name and escape common patterns (spaces to
  # underscores) to prevent import errors: every non-word character becomes '_', then
  # leading and trailing underscores are removed.
  def self.escape_xml_name(name)
    name.to_s.gsub(/\W/, '_').gsub(/\A_+|_+\z/, '')
  end
end