scottwillson/racing_on_rails

View on GitHub
app/models/categories/name_normalization.rb

Summary

Maintainability
D
2 days
Test Coverage
# frozen_string_literal: true

module Categories
  module NameNormalization
    extend ActiveSupport::Concern

    RACING_ASSOCIATIONS = %w[ ABA ATRA CBRA GBRA MBRA NABRA OBRA WSBA ].freeze

    included do
      def self.find_or_create_by_normalized_name(name)
        Category.find_or_create_by name: normalized_name(name)
      end

      def self.normalized_name(name)
        _name = strip_whitespace(name)
        _name = split_camelcase(_name)
        _name = normalize_punctuation(_name)
        _name = normalize_case(_name)
        _name = replace_roman_numeral_categories(_name)
        normalize_spelling _name
      end

      def self.strip_whitespace(name)
        if name
          name = name.to_s.strip
          name = name.gsub(/\s+/, " ")
          # E.g., 30 - 39
          name = name.gsub(/(\d+)\s?-\s?(\d+)/, '\1-\2')

          # 1 / 2 => 1/2
          name = name.gsub(%r{\s?/\s?}, "/")

          # 6- race
          name = name.gsub(/\s+-\s?/, " - ")
          name = name.gsub(/\s?-\s+/, " - ")

          # 40 + => 40+
          name = name.gsub(/(\d+)\s+\+/, '\1+')

          # U 14, U-14
          name = name.gsub(/U[ -](\d\d)/, 'U\1')
        end
        name
      end

      def self.split_camelcase(name)
        return name if name.nil?

        unless name.downcase == name || name.upcase == name
          name = name.gsub(/([A-Z\d]+)([A-Z][a-z])/, '\1 \2')
          name = name.gsub(/([a-z\d])([A-Z])/, '\1 \2')
        end

        # Men30+, W4
        name.gsub(/(masters|master|men|women|m|w)(\d)/i, '\1 \2')
      end

      def self.normalize_punctuation(name)
        if name
          # trailing punctuation
          name = name.gsub(%r{[/:.,"]\z}, "")

          # Men (Juniors)
          name = name.gsub(/\((masters|master|juniors|junior|men|women)\)/i, '\1')

          name = normalize_ability_punctuation(name)
          name = normalize_age_group_punctuation(name)

          name = name.gsub(%r{//+}, "/")
          name = name.gsub(/\+ -( ?)/, "+ ")
          name = name.gsub(/\+ -( ?)/, "+ ")
          name = name.delete("*")

          # (200+)
          name = name.gsub(/\((\d\d+\+)\)/i, '\1')

          # split_camelcase may have alredy split this
          name = name.gsub(/\((\d ?k pursuit)\)/i, '\1')
          name = name.gsub(/\((\d ?k)\)/i, '\1')
          name = name.gsub(/six[ -]?day/i, "Six-day")

          name = name.gsub(/(\d+) day/i, '\1-Day')
          name = name.gsub(/(\d+) hour/i, '\1-Hour')
          name = name.gsub(/(\d+) mile/i, '\1-Mile')

          name = name.gsub(/(\d+) man/i, '\1-Man')
          name = name.gsub(/(\d+) person/i, '\1-Person')
          name = name.gsub(/(one|two|three|four|five|six)[ -]man/i, '\1-Man')
          name = name.gsub(/(one|two|three|four|five|six)[ -]person/i, '\1-Person')

          name = name.gsub(/(\d+) lap/i, '\1-Lap') unless name[/laps/i]

          name = name.gsub("Category 1/Pro", "Pro/Category 1")
        end
        name
      end

      # 1 2, 2 3, 3.4.5, 2-3-4 to 1/2/3
      def self.normalize_ability_punctuation(name)
        # Don't combine Junior Men 9-12 3/4/5
        return name if name[%r{\d-1\d \d/\d/\d}]

        5.downto(2).each do |length|
          ["P", 1, 2, 3, 4, 5].each_cons(length) do |cats|
            [" ", "\.", "-"].each do |delimiter|
              # Don't combine 1/2/3 40+
              next if name[%r{[/ ]\d#{delimiter}\d\d}]

              name = name.gsub(/( ?)#{cats.join(delimiter)}( ?)/, "\\1#{cats.join('/')}\\2")
            end
          end
        end
        name
      end

      def self.normalize_age_group_punctuation(name)
        (9..17).each do |age|
          name = name.gsub(%r{#{age}/#{age + 1}}, "#{age}-#{age + 1}")
        end

        (9..16).each do |age|
          name = name.gsub(%r{#{age}/#{age + 2}}, "#{age}-#{age + 2}")
        end

        (9..15).each do |age|
          name = name.gsub(%r{#{age}/#{age + 3}}, "#{age}-#{age + 3}")
        end

        (30..90).each do |age|
          name = name.gsub(%r{#{age}/#{age + 9}}, "#{age}-#{age + 5}")
          name = name.gsub(%r{#{age}/#{age + 4}}, "#{age}-#{age + 9}")
          name = name.gsub(%r{#{age}/#{age + 14}}, "#{age}-#{age + 14}")
        end

        name.gsub(/\((\d\d-\d\d)\)/, '\1')
      end

      def self.normalize_case(name)
        if name
          name = name.split.map do |token|
            # Calling RacingAssociation.current triggers an infinite loop
            if token[/of/i]
              "of"
            elsif token[/women/i]
              "Women"
            elsif token[/\Ai+\z/i] || token[/\A\d[a-z]/i]
              token.upcase
            elsif token[/\Ac{1,2}x\z/i] || token[/\At{2,3}\z/i] || token[/\Abmx\z/i]
              token.upcase
            elsif token[/\Att-?\w*/i] || token[/\A-?tt\w*/i]
              token.gsub(/tt/i, "TT")
            elsif token[/\Attt-?\w*/i] || token[/\A-?ttt\w*/i]
              token.gsub(/ttt/i, "TTT")
            elsif token.in?(%w[ MTB SS TT TTT ]) || token[/\A[A-Z][a-z]/]
              token
            elsif token.in?(RACING_ASSOCIATIONS) || RACING_ASSOCIATIONS.any? { |association| association.in?(token) }
              token
            else
              token.downcase.gsub(/\A[a-z]/) { $&.upcase }.gsub(/[[:punct:]][a-z]/) { $&.upcase }
            end
          end.join(" ")
        end
        name
      end

      def self.replace_roman_numeral_categories(name)
        if name
          name = name.split.map do |token|
            case token
            when "I"
              "1"
            when "II"
              "2"
            when "III"
              "3"
            when "IV"
              "4"
            when "V"
              "5"
            else
              token
            end
          end.join(" ")
        end
        name
      end

      def self.normalize_spelling(name)
        if name
          name = name.split.map do |token|
            if token[/\A(cat|caat|categpry|categroy|cateogry|categegory|catgory|caegory|ct)\.?\z/i]
              "Category"
            elsif token[/\ds\z/i]
              token.gsub(/(\d)s\z/i, '\1')
            elsif token == "1/23" || token == "12/3" || token == "123"
              "1/2/3"
            elsif token[/\A(sr|sen)\.?\z/i] || token[/\Aseniors\z/i] || token[/\Asenoir\z/i]
              "Senior"
            elsif token[/\Ajr\.?\z/i] || token[/\Ajuniors\z/i] || token[/\Ajrs\.?\z/i] || token[/\Ajunior(s)?:\z/i] ||
                  token[/\Ajnr\.?\z/i]
              "Junior"
            elsif token[/\Awjr\z/i]
              "Junior Women"
            elsif token[/\Amaster\z/i] || token[/\Amas\z/i] || token[/\Amstr?\z/i] || token[/\Amaster's\z/i] ||
                  token[/\Amast.?\z/i] || token[/\Amaasters\z/i] || token[/\Amastes\z/i] || token[/\Amastres\z/i] ||
                  token[/\Amater\z/i] || token[/\Amaser\z/i] || token[/\Amst\z/i]

              "Masters"
            elsif token[/\Amas\d\d\+\z/i]
              token.gsub(/\Amas(\d\d\+)\z/i, 'Masters \1')
            elsif token[/\Awmas\z/i]
              "Masters Women"
            elsif token[/\Aveteran'?s\z/i] || token[/\Aveteren\z/i] || token[/\A(vet|vt)\.?\z/i]
              "Veteran"
            elsif token[/\Avsty\z/i]
              "Varsity"
            elsif token[/\Aspt\z/i]
              "Sport"
            elsif token[/\Ajv\z/i]
              "Junior Varsity"
            elsif token[/\Aclydesdales\z/i] || token[/\Aclyde(s)?\z/i] || token[/\Aclydsdales\z/i] || token[/\Aclydesdatle\z/i]
              "Clydesdale"
            elsif token[/\Awomen'?s\z/i] || token[/\Awoman'?s\z/i]
              "Women"
            elsif token[/\Awmn?\.?\z/i] || token[/\Awom\.?\z/i] || token[/\Aw\z/i] || token[/\Awmen?\.?\z/i] || token[/\Awomenen\z/i]
              "Women"
            elsif token[/\Afemale\z/i] || token[/\Awommen:\z/i] || token[/\Aw\z/i] || token[/\Awome\z/i] || token[/\Awomwen\z/i]
              "Women"
            elsif token[/\Amen'?s\z/i] || token[/\Amale\Z/i] || token[/\Amen:\z/i] || token[/\Amed\z/i] || token[/\Amens's\z/i]
              "Men"
            elsif token[/\A\dmen\z/i]
              token.gsub(/\A(\d)men\z/i, '\1 Men')
            elsif token[/\Aco(-)?ed\z/i]
              "Co-ed"
            elsif token[/\Abeg?\.?\z/i] || token[/\Abg\.?\z/i] || token[/\Abegin?\.?\z/i] || token[/\Abeginners\z/i] || token[/\Abeg:\z/i] ||
                  token[/\ABeginning\z/i]

              "Beginner"
            elsif token[/\A(exp|expt|ex|exeprt|exb|exper|exprert)\.?\z/i]
              "Expert"
            elsif token[/\Asprt\.?\z/i]
              "Sport"
            elsif token[/\Asinglespeeds?\z/i] || token[/\Ass\z/i]
              "Singlespeed"
            elsif token[/fix gear/i]
              "Fixed Gear"
            elsif token[/\Atand?\z/i] || token[/\Atandems\z/i]
              "Tandem"
            elsif token[/\Auni\z/i] || token[/\AUnicycles\z/i]
              "Unicycle"
            elsif token[/\A\d\dU\z/i]
              # 14U => U14
              token.gsub(/(\d\d)U/, 'U\1')
            elsif token[/\A\d\d>\z/i]
              # Example: Men 30> => Men 30+
              token.gsub(/(\d\d)>/, '\1+')
            elsif token[/\Apursuite\z/i]
              "Pursuit"
            elsif token == "Mdison"
              "Madison"
            elsif token == "Kilom"
              "Kilometer"
            elsif token == "Siixday"
              "Six-day"
            elsif token == "&"
              "and"
            else
              token
            end
          end.join(" ")

          name = normalize_category_spelling(name)
          name = normalize_junior_spelling(name)
          name = normalize_masters_spelling(name)
          name = normalize_ability_spelling(name)
          name = normalize_mtb_spelling(name)
          name = normalize_competition_spelling(name)
          name = normalize_time_spelling(name)
          name = normalize_weight_spelling(name)
          name = normalize_distance_spelling(name)
          name = normalize_order(name)

          name = name.gsub(/\bAnd\b/, "and")

          name = name.gsub(/\A\d+\) ?/, "")
        end
        name
      end

      def self.normalize_category_spelling(name)
        name.gsub(/cat ?(\d)/i, 'Category \1')
            .gsub(/category(\d)/i, 'Category \1')
            .gsub(%r{category (\d)/ }i, 'Category \1 ')
      end

      # 14 and Under, 14U, 14 & U
      def self.normalize_junior_spelling(name)
        name.gsub(/junior m /i, "Junior Men ")
            .gsub(/Espior/i, "Espoir ")
            .gsub(/(\d+) (and|&) U\z/i, 'U\1')
            .gsub(/(\d+)& U\z/i, 'U\1')
            .gsub(/under (\d{2,3})/i, 'U\1')
            .gsub(/(\d+) ?(and)? ?(under|younger|up to)/i, 'U\1')
            .gsub(/(\d+) ?& ?under|younger|up to/i, 'U\1')
            .gsub(/ 0-(\d+)/i, ' U\1')
            .gsub(/ U (\d+)/i, ' U\1')
      end

      def self.normalize_masters_spelling(name)
        name = name.gsub(/mm (\d\d)\+/i, 'Masters Men \1+')
                   .gsub(/\Am (\d\d)\+/i, 'Masters \1+')
                   .gsub(/ m (\d\d)\+/i, ' Masters \1+')

        if name[/\bM [1-5]+\b/i]
          categories = name[/M ([1-5]+)/i, 1].split("")
          name = name.gsub(/M [1-5]+/i, "Men #{categories.join('/')}")
        end

        name.gsub(/masters (\d\d)\Z/i, 'Masters \1+')
            .gsub(/masters (\d\d) /i, 'Masters \1+ ')
            .gsub(/(\d+) ?and ?(over|older)/i, '\1+')
            .gsub(/(\d+) ?& ?(over|older)/i, '\1+')
      end

      def self.normalize_ability_spelling(name)
        name.gsub(%r{M P/1/2}i, "Men Pro/1/2")
            .gsub(%r{P/1/2}i, "Pro/1/2")
            .gsub(%r{Pro.*1/2}i, "Pro/1/2")
            .gsub(%r{Pr([/, ])}i, 'Pro\1')
      end

      def self.normalize_mtb_spelling(name)
        name.gsub(/semi( ?)pro/i, "Semi-Pro")
            .gsub(%r{exp/pro}i, "Pro/Expert")
            .gsub(/varsity junior/i, "Junior Varsity")
            .gsub(/jr. varsity/i, "Junior Varsity")
            .gsub(/single speeds?/i, "Singlespeed")
            .gsub(/sgl spd/i, "Singlespeed")
            .gsub(/sgl speed/i, "Singlespeed")
            .gsub(/hard tail/i, "Hardtail")
      end

      def self.normalize_time_spelling(name)
        name.gsub(/( ?)hr( ?)/i, '\1Hour\2')
            .gsub(/(\d+) ?hour/i, '\1-Hour')
      end

      def self.normalize_weight_spelling(name)
        name.gsub(/(\d{3})\+ (lbs|lb)(\.)?/i, '\1+')
            .gsub(/(\d{3})( )?(lbs|lb) \+/i, '\1+')
            .gsub(/(\d{3})( )?(lbs|lb)(.)?\+/i, '\1+')
            .gsub(/\((\d\d+\+)\)/i, '\1')
      end

      def self.normalize_distance_spelling(name)
        name = name.gsub(/\bmeter(s)?/i, "m")
                   .gsub(/metre/i, "m")
                   .gsub(/(\d) ?m\b/i, '\1m')
                   .gsub(/(\d\d\d\d) ?m\b/i, '\1m')

        # Not masters Kilometer
        name = name.gsub(/(\d+) ?(kilometer|kilometre|kilos|km|k)\b/i, '\1K') unless name[/\d\d-\d\d Kilometer/]
        name
      end

      def self.normalize_competition_spelling(name)
        name.gsub(/hot spot/i, "Hotspot")
            .gsub(/iron man/i, "Ironman")
            .gsub(/multi[ -]person/i, "Multiperson")
            .gsub(/miss.*out/i, "Miss and Out")
            .gsub(/win.*out/i, "Win and Out")
            .gsub(/Eddie/, "Eddy")
      end

      # Men Masters => Masters Men
      def self.normalize_order(name)
        %w[Masters Juniors Beginner Novice Sport Expert Semi-Pro Elite Singlespeed].each do |cat|
          name = name.gsub("Men #{cat}", "#{cat} Men")
          name = name.gsub("Women #{cat}", "#{cat} Women")
        end
        name
      end
    end
  end
end