app/services/notes_grouper.rb from sul-dlss/argo

app/services/notes_grouper.rb
Summary

Maintainability

0 mins
Test Coverage

100%
Issues
Coverage
# frozen_string_literal: true

# For grouping note attributes, we list the unique note display label and type
# values and their counts in the descriptions and put them in descending count
# order (most frequent note labels/types first, least frequent last), and
# resulting in a mapping of note numbers to note labels/types, e.g.:
#
# {"note1"=>[nil, "abstract"], "note2"=>["Provenance", "ownership"]}
#
# We expand this mapping as we examine more descriptions in the set, e.g., if a record has multiple
# notes w/ a nil display label and type "abstract", the above mapping would expand to:
#
# {"note1"=>[nil, "abstract"], "note2"=>[nil, "abstract"], "note3"=>["Provenance", "ownership"]}
#
# We then use this mapping to group all given descriptions.
#
# NOTE: This class is tested in the context of the DescriptionsGrouper
class NotesGrouper
  def self.group(descriptions:)
    new(descriptions:).group
  end

  def initialize(descriptions:)
    @descriptions = descriptions
    # Grab labels and types early, before any mutation happens, and put the note
    # label and type values in order from most ubiquitous to least. Why? This
    # way the leftmost columns will be more populated, with less used values
    # pushed to the right.
    @ordered_mapping = ordered_mapping_from(descriptions)
  end

  def group
    descriptions.transform_values do |description|
      # Build up a description-specific mapping as a way to track for the current description
      # how we are mapping old note elements to new ones.
      mapping = {}

      # First, we rename all the note header values, e.g., "note1.type" to "old_note1.type"
      # so that we can track which values have already been changed, else we get collisions and are
      # unable to distinguish already grouped data from yet-to-be-grouped data.
      description.transform_keys! { |key| key.match?(/^note\d+/) ? key.sub(/^(\D+)/, 'old_\1') : key }

      description.transform_keys! do |key|
        # Short-circuit by returning the key unchanged if not note-related.
        next key unless (note_number = key.match(/^old_note(\d+)/))

        # If we already have a description-specific mapping for the note_number
        # being operated on, use it and move on
        next key.sub(/^old_note\d+/, mapping[note_number[0]]) if mapping.key?(note_number[0])

        # Get the displayLabel value of the note number corresponding to the key currently being transformed
        label_for_note_number = description.slice(*description.keys.grep(/^old_note#{note_number[1]}\.displayLabel/)).values.first
        # Get the type value of the note number corresponding to the key currently being transformed
        type_for_note_number = description.slice(*description.keys.grep(/^old_note#{note_number[1]}\.type/)).values.first

        # Look up the note number of the [displayLabel, type] tuple value
        new_note_number = case description.slice(*description.keys.grep(/note.+(displayLabel|type)/))
                                          .group_by { |k, _v| k.match(/(.*note\d+)\./)[1] }
                                          .count do |_key, value|
                                 hash = value.to_h
                                 num = hash.keys.first[/\d+/]
                                 [hash["old_note#{num}.displayLabel"],
                                  hash["old_note#{num}.type"]] == [label_for_note_number, type_for_note_number] ||
                                   [hash["note#{num}.displayLabel"],
                                    hash["note#{num}.type"]] == [label_for_note_number, type_for_note_number]
                               end

                          when 1
                            # If there is only one matching note number in the mapping, use it and move on.
                            ordered_mapping.key([label_for_note_number, type_for_note_number])
                          else
                            # If there are multiple notes of this type, e.g., an
                            # item with multiple notes of type "abstract" with a
                            # nil display label, use the first note number not
                            # already used.
                            # Also applies when there are no displayLabels or types for the note
                            ordered_mapping.find do |mapped_note_number, (label_value, type_value)|
                              [label_value,
                               type_value] == [label_for_note_number,
                                               type_for_note_number] && !description.key?(key.sub(/^old_note\d+/,
                                                                                                  mapped_note_number))
                            end&.first
                          end

        # Fall back to original number if look-up returned nil
        new_note_number ||= "note#{note_number[1]}"

        # Extend the description-specific mapping with the above information
        mapping[note_number[0]] = new_note_number

        # Map the current "old" note number element to the new one.
        key.sub(/^old_note\d+/, new_note_number)
      end
    end
  end

  private

  attr_reader :descriptions, :ordered_mapping

  def ordered_mapping_from(descriptions)
    label_and_type_values = descriptions
                            .values
                            .map do |description|
      notes_count = description.keys.grep(/^note\d+\./).max_by { |field| field[/\d+/].to_i }
      next if notes_count.nil?

      1.upto(notes_count[/\d+/].to_i).map do |note_number|
        [
          description["note#{note_number}.displayLabel"],
          description["note#{note_number}.type"]
        ]
      end
    end
    # e.g., when passed a set of six descriptions
    # [
    #   [[nil, "statement of responsibility"], [nil, nil], [nil, "system details"], ["Display label", nil], ["Another label", "condition"], ["Display label", nil]],
    #   [[nil, "abstract"], [nil, nil], [nil, nil], [nil, nil], [nil, "system details"], ["Display label", nil], ["Another label", "condition"]],
    #   [[nil, "abstract"], [nil, "statement of responsibility"], [nil, nil], [nil, nil], [nil, "condition"]],
    #   [[nil, nil], [nil, nil], [nil, nil]],
    #   [[nil, "abstract"], [nil, "date/sequential designation"], [nil, nil]],
    #   [[nil, "abstract"], ["Contents", "table of contents"]]
    # ]
    return {} unless label_and_type_values.any?

    # Order based on frequency of a given tuple
    unique_types_in_order = label_and_type_values
                            .flatten(1)
                            .index_with { |note_type| label_and_type_values.flatten(1).count(note_type) }
                            .sort_by { |_value, count| -count }
                            .to_h
                            .keys
    # e.g.:
    # [
    #   [nil, nil],
    #   [nil, "abstract"],
    #   ["Display label", nil],
    #   [nil, "statement of responsibility"],
    #   [nil, "system details"],
    #   ["Another label", "condition"],
    #   [nil, "condition"],
    #   [nil, "date/sequential designation"],
    #   ["Contents", "table of contents"]
    # ]

    repeat_types_counts = {}
    label_and_type_values
      .map { |row| row&.select { |field| row.count(field) > 1 } }
      .compact_blank
      .each do |repeats|
      repeat_types_counts.merge!(
        repeats.index_with { |e| repeats.count(e) }
      )
    end
    # e.g.: {["Display label", nil]=>2, [nil, nil]=>3}

    repeat_types_counts.each do |value, count|
      unique_types_in_order.insert(unique_types_in_order.index(value), *Array.new(count - 1) { value })
    end
    # unique_types_in_order is now:
    #
    # [
    #   [nil, nil],
    #   [nil, nil],
    #   [nil, nil],
    #   [nil, "abstract"],
    #   ["Display label", nil],
    #   ["Display label", nil],
    #   [nil, "statement of responsibility"],
    #   [nil, "system details"],
    #   ["Another label", "condition"],
    #   [nil, "condition"],
    #   [nil, "date/sequential designation"],
    #   ["Contents", "table of contents"]
    # ]
    #
    # What's important here is the repeat tuples (e.g., double nils) are immediately adjacent

    unique_types_in_order.map.with_index(1).to_h { |key, index| ["note#{index}", key] }
    # {"note1"=>[nil, nil], "note2"=>[nil, nil], "note3"=>[nil, nil], "note4"=>[nil, "abstract"],
    #  "note5"=>["Display label", nil], "note6"=>["Display label", nil], "note7"=>[nil, "statement of responsibility"],
    #  "note8"=>[nil, "system details"], "note9"=>["Another label", "condition"], "note10"=>[nil, "condition"],
    #  "note11"=>[nil, "date/sequential designation"], "note12"=>["Contents", "table of contents"]}
  end
end