app/models/name/parse.rb from MushroomObserver/mushroom-observer

app/models/name/parse.rb
Summary

Maintainability

0 mins
Test Coverage

Issues
# frozen_string_literal: true

# Usage: in class Name, `extend Parse`, not `include Parse`.
# Extending makes these module methods into class methods of Name.
module Name::Parse
  # RankMatcher:: Lightweight pairing of a rank with the Regexp that
  # identifies that rank in a text string.
  # Use:
  #   XXX_MATCHERS = [RankMatcher.new("Rank1", /regexp1/),
  #                   ...
  #                   RankMatcher.new("RankN", /regexpN/)]
  #
  #   def guess_rank(text_name)
  #     XXX_MATCHERS.find { |matcher| matcher.match?(text_name) }.rank
  #   end
  #
  class RankMatcher
    attr_reader :rank, :pattern

    # rank::    value reported for strings that match +pattern+ (may be nil)
    # pattern:: Regexp tested against candidate strings
    def initialize(rank, pattern)
      @pattern = pattern
      @rank = rank
    end

    # True if +str+ matches +pattern+ (tested with String#match?).
    def match?(str)
      str.match?(pattern)
    end
  end

  # All abbreviations for a given rank.
  # Every pattern uses the x flag (whitespace inside the pattern is ignored)
  # and the i flag (case-insensitive). None of them is anchored.
  # Used by RANK_FROM_ABBREV_MATCHERS and by the name patterns in this file.
  SUBG_ABBR    = / subgenus | subgen\.? | subg\.?          /xi
  SECT_ABBR    = / section | sect\.?                       /xi
  SUBSECT_ABBR = / subsection | subsect\.?                 /xi
  STIRPS_ABBR  = / stirps                                  /xi
  SP_ABBR      = / species | sp\.?                         /xi
  # A bare "s" or "s." is accepted as an abbreviation of subspecies.
  SSP_ABBR     = / subspecies | subsp\.? | ssp\.? | s\.?   /xi
  VAR_ABBR     = / variety | var\.? | v\.?                 /xi
  F_ABBR       = / forma | form\.? | fo\.? | f\.?          /xi
  # "clade" and "complex" are treated as kinds of group.
  GROUP_ABBR   = / group | gr\.? | gp\.? | clade | complex /xi

  # Match text_name to rank.
  # guess_rank returns the rank of the FIRST entry whose pattern matches, so
  # order matters:
  # - the infrageneric / infraspecific abbreviations are tried first;
  # - / / makes any remaining name that contains a space a "Species";
  # - the suffix patterns therefore only ever see single-word names;
  # - the final empty pattern matches every string, so a search always
  #   finds an entry.
  # Patterns here have no i flag, i.e. they are case-sensitive.
  TEXT_NAME_MATCHERS = [
    RankMatcher.new("Group",      / (group|clade|complex)$/),
    RankMatcher.new("Form",       / f\. /),
    RankMatcher.new("Variety",    / var\. /),
    RankMatcher.new("Subspecies", / subsp\. /),
    RankMatcher.new("Stirps",     / stirps /),
    RankMatcher.new("Subsection", / subsect\. /),
    RankMatcher.new("Section",    / sect\. /),
    RankMatcher.new("Subgenus",   / subg\. /),
    RankMatcher.new("Species",    / /),
    RankMatcher.new("Family",     /^\S+aceae$/),
    RankMatcher.new("Family",     /^\S+ineae$/), # :Suborder
    RankMatcher.new("Order",      /^\S+ales$/),
    RankMatcher.new("Order",      /^\S+mycetidae$/), # :Subclass
    RankMatcher.new("Class",      /^\S+mycetes$/),
    RankMatcher.new("Class",      /^\S+mycotina$/), # :Subphylum
    RankMatcher.new("Phylum",     /^\S+mycota$/),
    RankMatcher.new("Phylum",     /^Fossil-/),
    RankMatcher.new("Genus",      //) # match anything else
  ].freeze

  # Match a rank abbreviation to its rank.
  # The *_ABBR patterns are not anchored, and RankMatcher#match? tests them
  # with String#match?, so a pattern matches if it occurs anywhere in the
  # string being tested. SP_ABBR and GROUP_ABBR have no entry here.
  # The last entry matches every string and carries a nil rank.
  RANK_FROM_ABBREV_MATCHERS = [
    RankMatcher.new("Subgenus",   SUBG_ABBR),
    RankMatcher.new("Section",    SECT_ABBR),
    RankMatcher.new("Subsection", SUBSECT_ABBR),
    RankMatcher.new("Stirps",     STIRPS_ABBR),
    RankMatcher.new("Subspecies", SSP_ABBR),
    RankMatcher.new("Variety",    VAR_ABBR),
    RankMatcher.new("Form",       F_ABBR),
    RankMatcher.new(nil,          //) # match anything else
  ].freeze

  # Abbreviations that can occur in the author part of a name.
  # All use the x flag (pattern whitespace ignored) and the i flag.
  AUCT_ABBR    = / auct\.? /xi
  INED_ABBR    = / in\s?ed\.? /xi
  NOM_ABBR     = / nomen | nom\.? /xi
  COMB_ABBR    = / combinatio | comb\.? /xi
  SENSU_ABBR   = / sensu?\.? /xi
  # NOV_ABBR and PROV_ABBR are not part of ANY_AUTHOR_ABBR; within this file
  # they are only used by standardize_author, after "comb. " or "nom. ".
  NOV_ABBR     = / nova | novum | nov\.? /xi
  PROV_ABBR    = / provisional | prov\.? /xi
  CRYPT_ABBR   = / crypt\.? \s temp\.? /xi

  # Any abbreviation for a rank between genus and species.
  ANY_SUBG_ABBR   = / #{SUBG_ABBR} | #{SECT_ABBR} | #{SUBSECT_ABBR} |
                      #{STIRPS_ABBR} /x
  # Any abbreviation for a rank below species.
  ANY_SSP_ABBR    = / #{SSP_ABBR} | #{VAR_ABBR} | #{F_ABBR} /x
  # Any rank abbreviation that can appear in the name proper.
  ANY_NAME_ABBR   = / #{ANY_SUBG_ABBR} | #{SP_ABBR} | #{ANY_SSP_ABBR} |
                      #{GROUP_ABBR} /x
  # An author abbreviation that is followed by whitespace or end of line.
  ANY_AUTHOR_ABBR = / (?: #{AUCT_ABBR} | #{INED_ABBR} | #{NOM_ABBR} |
                          #{COMB_ABBR} | #{SENSU_ABBR} | #{CRYPT_ABBR} )
                      (?:\s|$) /x

  # A capitalized word of at least two characters: an uppercase ASCII letter,
  # then lowercase letters / "ë" / hyphens, ending in a letter or "ë".
  # The second alternative is the same word in double quotes, where periods
  # are also allowed inside the word.
  UPPER_WORD = /
                [A-Z][a-zë-]*[a-zë] | "[A-Z][a-zë\-.]*[a-zë]"
  /x
  # A lowercase word of at least two characters. The negative lookahead
  # (no "sensu", "van" or "de") applies only to the unquoted alternative.
  # The quoted alternative accepts any \w characters, hyphens and periods
  # after the first lowercase letter.
  LOWER_WORD = /
    (?!(?:sensu|van|de)\b) [a-z][a-zë-]*[a-zë] | "[a-z][\wë\-.]*[\wë]"
    /x
  # Genus followed by a specific epithet.
  BINOMIAL   = / #{UPPER_WORD} \s #{LOWER_WORD} /x
  # Either a LOWER_WORD that does not start with "sp " / "sp" at end of line /
  # "species", or "sp." followed by whitespace and a token containing a digit.
  LOWER_WORD_OR_SP_NOV = / (?! sp\s|sp$|species) #{LOWER_WORD} |
                           sp\.\s\S*\d\S* /x

  # Matches the last epithet in a (standardized) name,
  # including preceding abbreviation if there is one.
  # The parsers remove this match from a name to derive its parent name.
  LAST_PART = / (?: \s[a-z]+\.? )? \s \S+ $/x

  # What the author part of a name may begin with: an author abbreviation
  # (ANY_AUTHOR_ABBR), a "van " / "de " / "du " particle, an uppercase letter
  # (ASCII or one of the listed accented letters) or "(", or a double quote
  # followed by a character that is neither a lowercase ASCII letter nor
  # whitespace.
  AUTHOR_START = /
    #{ANY_AUTHOR_ABBR} |
    van\s | d[eu]\s |
    [A-ZÀÁÂÃÄÅÆÇĐÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞČŚŠ(] |
    "[^a-z\s]
  /x

  # AUTHOR_PAT is separate from, and can't include GENUS_OR_UP_TAXON, etc.
  #   AUTHOR_PAT ensures "sp", "ssp", etc., aren't included in author.
  #   AUTHOR_PAT removes the author first thing.
  # Then the other parsers have a much easier job.
  #
  # Capture 1 is the name, capture 2 the author (parse_author strips both).
  # The name is an UPPER_WORD optionally followed by one of:
  #   - one or more (infrageneric abbreviation, UPPER_WORD) pairs;
  #   - a LOWER_WORD plus any number of (infraspecific abbreviation,
  #     LOWER_WORD) pairs;
  #   - a species abbreviation.
  # The author must not begin with a rank abbreviation followed by whitespace.
  AUTHOR_PAT =
    /^
      ( "?
        #{UPPER_WORD}
        (?:
            # >= 1 of (rank Epithet)
            \s     #{ANY_SUBG_ABBR} \s #{UPPER_WORD}
            (?: \s #{ANY_SUBG_ABBR} \s #{UPPER_WORD} )* "?
          |
            \s (?! #{AUTHOR_START} | #{ANY_SUBG_ABBR} ) #{LOWER_WORD}
            (?: \s #{ANY_SSP_ABBR} \s #{LOWER_WORD} )* "?
          |
            "? \s #{SP_ABBR}
        )?
      )
      ( \s (?! #{ANY_NAME_ABBR} \s ) #{AUTHOR_START}.* )
    $/x

  # Disable cop to allow alignment and easier comparison of regexps
  # rubocop:disable Layout/LineLength

  # Taxa without authors (for use by GROUP_PAT and the per-rank *_PAT
  # patterns). In each pattern the parenthesized group is the taxon itself;
  # the whole taxon may be wrapped in optional double quotes.
  # Each infrageneric pattern allows the higher infrageneric ranks to
  # appear, in order, before the rank it is named for.
  #
  # GENUS_OR_UP_TAXON: one word, optionally prefixed "Fossil-"; a trailing
  # species abbreviation is accepted but left outside the captured taxon.
  GENUS_OR_UP_TAXON = /("? (?:Fossil-)? #{UPPER_WORD} "?) (?: \s #{SP_ABBR} )?/x
  SUBGENUS_TAXON    = /("? #{UPPER_WORD} \s (?: #{SUBG_ABBR} \s #{UPPER_WORD}) "?)/x
  SECTION_TAXON     = /("? #{UPPER_WORD} \s (?: #{SUBG_ABBR} \s #{UPPER_WORD} \s)?
                       (?: #{SECT_ABBR} \s #{UPPER_WORD}) "?)/x
  SUBSECTION_TAXON  = /("? #{UPPER_WORD} \s (?: #{SUBG_ABBR} \s #{UPPER_WORD} \s)?
                       (?: #{SECT_ABBR} \s #{UPPER_WORD} \s)?
                       (?: #{SUBSECT_ABBR} \s #{UPPER_WORD}) "?)/x
  STIRPS_TAXON      = /("? #{UPPER_WORD} \s (?: #{SUBG_ABBR} \s #{UPPER_WORD} \s)?
                       (?: #{SECT_ABBR} \s #{UPPER_WORD} \s)?
                       (?: #{SUBSECT_ABBR} \s #{UPPER_WORD} \s)?
                       (?: #{STIRPS_ABBR} \s #{UPPER_WORD}) "?)/x
  # Genus plus either a specific epithet or a provisional "sp. <id>" form.
  SPECIES_TAXON     = /("? #{UPPER_WORD} \s #{LOWER_WORD_OR_SP_NOV} "?)/x
  # rubocop:enable Layout/LineLength

  # Whole-line patterns for each rank: a taxon optionally followed by an
  # author. Capture 1 is the taxon and capture 2 (nil when absent) is the
  # author including its leading whitespace; the parse_* methods read them
  # as match[1] and match[2].
  GENUS_OR_UP_PAT = /^ #{GENUS_OR_UP_TAXON} (\s #{AUTHOR_START}.*)? $/x
  SUBGENUS_PAT    = /^ #{SUBGENUS_TAXON}    (\s #{AUTHOR_START}.*)? $/x
  SECTION_PAT     = /^ #{SECTION_TAXON}     (\s #{AUTHOR_START}.*)? $/x
  SUBSECTION_PAT  = /^ #{SUBSECTION_TAXON}  (\s #{AUTHOR_START}.*)? $/x
  STIRPS_PAT      = /^ #{STIRPS_TAXON}      (\s #{AUTHOR_START}.*)? $/x
  SPECIES_PAT     = /^ #{SPECIES_TAXON}     (\s #{AUTHOR_START}.*)? $/x
  # Binomial + subspecies epithet.
  SUBSPECIES_PAT  = /^ ("? #{BINOMIAL} (?: \s #{SSP_ABBR} \s #{LOWER_WORD}) "?)
                       (\s #{AUTHOR_START}.*)?
                   $/x
  # Binomial + optional subspecies + variety epithet.
  VARIETY_PAT     = /^ ("? #{BINOMIAL} (?: \s #{SSP_ABBR} \s #{LOWER_WORD})?
                         (?: \s #{VAR_ABBR} \s #{LOWER_WORD}) "?)
                       (\s #{AUTHOR_START}.*)?
                   $/x
  # Binomial + optional subspecies + optional variety + form epithet.
  FORM_PAT        = /^ ("? #{BINOMIAL} (?: \s #{SSP_ABBR} \s #{LOWER_WORD})?
                         (?: \s #{VAR_ABBR} \s #{LOWER_WORD})?
                         (?: \s #{F_ABBR} \s #{LOWER_WORD}) "?)
                       (\s #{AUTHOR_START}.*)?
                   $/x

  # Whole-line pattern for a group / clade / complex name: a taxon of any
  # rank (named capture <taxon>) followed by either
  #   - a group word, optionally followed by an author, or
  #   - an author followed by a group word.
  # parse_group uses this only as a yes/no test; the group word is then
  # located and removed with GROUP_CHUNK.
  GROUP_PAT       = /^(?<taxon>
                        #{GENUS_OR_UP_TAXON} |
                        #{SUBGENUS_TAXON}    |
                        #{SECTION_TAXON}     |
                        #{SUBSECTION_TAXON}  |
                        #{STIRPS_TAXON}      |
                        #{SPECIES_TAXON}     |
                        (?: "? #{UPPER_WORD} # infra-species taxa
                          (?: \s #{LOWER_WORD}
                            (?: \s #{SSP_ABBR} \s #{LOWER_WORD})?
                            (?: \s #{VAR_ABBR} \s #{LOWER_WORD})?
                            (?: \s #{F_ABBR}   \s #{LOWER_WORD})?
                          )? "?
                        )
                      )
                      (
                        ( # group, optionally followed by author
                          \s #{GROUP_ABBR} (\s (#{AUTHOR_START}.*))?
                        )
                        | # or
                        ( # author followed by group
                          ( \s (#{AUTHOR_START}.*)) \s #{GROUP_ABBR}
                        )
                      )
                    $/x

  # group or clade part of name, with
  # <group_wd> capture group capturing the stripped group or clade abbr
  # (the leading whitespace is matched but is not part of <group_wd>).
  # The abbreviation must be followed by a word boundary.
  GROUP_CHUNK     = /\s (?<group_wd>#{GROUP_ABBR}) \b/x

  # matches to ranks that are included in the name proper
  # subspecies is not included because it's the catchall default
  # Only the START of the word is matched (case-insensitively), so e.g.
  # "forma", "form." and "f." all yield "f".
  RANK_START_MATCHER = /^(f|sect|stirps|subg|subsect|v)/i

  # convert rank start_match to standard form of rank
  # subspecies is not included because it's the catchall default
  # Keys are the downcased, symbolized text matched by RANK_START_MATCHER.
  STANDARD_SECONDARY_RANKS = {
    f: "f.",
    sect: "sect.",
    stirps: "stirps",
    subg: "subg.",
    subsect: "subsect.",
    v: "var."
  }.freeze

  # Raised by make_sure_ranks_ordered_right! when ranks found while fixing
  # an autonym are out of order. parse_genus_or_up and parse_below_genus
  # rescue it and return nil.
  class RankMessedUp < ::StandardError; end

  # Parse a name given no additional information.
  # Cleans +str+, then tries the rank-specific parsers in a fixed order and
  # returns the first truthy result. The genus-or-higher parser runs last
  # and is the only one that is given +rank+.
  # Returns a ParsedName instance, or nil if no parser accepts the string.
  def parse_name(str, rank: "Genus", deprecated: false)
    cleaned = clean_incoming_string(str)
    specific_parsers = %i[
      parse_group parse_subgenus parse_section parse_subsection parse_stirps
      parse_subspecies parse_variety parse_form parse_species
    ]
    specific_parsers.each do |parser|
      parsed = send(parser, cleaned, deprecated)
      return parsed if parsed
    end
    parse_genus_or_up(cleaned, deprecated, rank)
  end

  # Guess rank of +text_name+: the rank of the first TEXT_NAME_MATCHERS
  # entry whose pattern matches it.
  def guess_rank(text_name)
    first_hit = TEXT_NAME_MATCHERS.find do |matcher|
      matcher.match?(text_name)
    end
    first_hit.rank
  end

  # Split +str+ into name and author using AUTHOR_PAT.
  # Returns [name, author], both stripped, or [cleaned string, nil] when
  # AUTHOR_PAT does not match.
  def parse_author(str)
    cleaned = clean_incoming_string(str)
    match = AUTHOR_PAT.match(cleaned)
    return [cleaned, nil] unless match

    [match[1].strip, match[2].strip]
  end

  # Parse a group / clade / complex name.
  # Returns nil unless +str+ matches GROUP_PAT and the name that remains
  # after removing the group word can itself be parsed. Otherwise returns
  # that ParsedName with the standardized group word added to each name
  # field, rank set to "Group" and parent_name defaulting to "".
  def parse_group(str, deprecated = false)
    return unless GROUP_PAT.match(str)

    parsed = parse_name(str_without_group(str),
                        rank: "Group", deprecated: deprecated)
    return nil unless parsed

    group_word = standardized_group_abbr(str)
    parsed.text_name += " #{group_word}"

    if parsed.author.present?
      # Insert the group word in front of the trailing author.
      trailing_author = /(#{Regexp.escape(parsed.author)})$/
      parsed.search_name.sub!(trailing_author, "#{group_word} \\1")
      parsed.sort_name.sub!(trailing_author, " #{group_word}  \\1")
      parsed.display_name.sub!(trailing_author, "#{group_word} \\1")
    else
      # No author: the group word simply goes at the end.
      parsed.search_name += " #{group_word}"
      parsed.sort_name += "   #{group_word}"
      parsed.display_name += " #{group_word}"
    end

    parsed.rank = "Group"
    parsed.parent_name ||= ""
    parsed
  end

  # Remove the first group / clade / complex word (GROUP_CHUNK) from +str+,
  # together with the period that ends an abbreviated form.
  #
  # GROUP_CHUNK ends in \b. For "gr." / "gp." followed by a space or by the
  # end of the string, \b cannot match after the period, so the regexp
  # backtracks and matches only "gr" / "gp". Unless that period is consumed
  # too, a stray "." stays attached to the taxon ("Amanita muscaria.").
  #
  # Returns a new String; +str+ is returned unchanged (as a copy) when it
  # contains no group word.
  def str_without_group(str)
    str.sub(/#{GROUP_CHUNK}\.?/o, "")
  end

  # Canonical group word for +str+: a group word beginning with "g"
  # (group, gr., gp.) becomes "group"; any other group word is returned
  # lowercased as written.
  def standardized_group_abbr(str)
    group_word = group_wd(str.to_s.downcase)
    return "group" if group_word.start_with?("g")

    group_word
  end

  # The group word found in +str+ by GROUP_CHUNK, without the whitespace
  # that precedes it. Raises NoMethodError if +str+ has no group word.
  def group_wd(str)
    chunk = GROUP_CHUNK.match(str)
    chunk[:group_wd]
  end

  # Parse a name of rank genus or higher (GENUS_OR_UP_PAT).
  # +rank+ is kept only if Name.ranks_above_genus includes it; otherwise the
  # rank is guessed from the name. fix_autonym may then change the name,
  # author and rank again.
  # Returns a ParsedName, or nil if the pattern does not match or the ranks
  # found by fix_autonym are out of order (RankMessedUp).
  def parse_genus_or_up(str, deprecated = false, rank = "Genus")
    match = GENUS_OR_UP_PAT.match(str)
    return nil unless match

    name = match[1]
    rank = guess_rank(name) unless Name.ranks_above_genus.include?(rank)
    name, author, rank = fix_autonym(name, match[2], rank)
    author = standardize_author(author)
    author_suffix = author.blank? ? "" : " #{author}"
    text_name = name.tr("ë", "e")
    # Only names that ended up below genus have a parent name.
    below_genus = Name.ranks_below_genus.include?(rank)
    parent_name = below_genus ? name.sub(LAST_PART, "") : nil
    ParsedName.new(
      text_name: text_name,
      search_name: text_name + author_suffix,
      sort_name: format_sort_name(text_name, author),
      display_name: format_autonym(name, author, rank, deprecated),
      parent_name: parent_name,
      rank: rank,
      author: author
    )
  rescue RankMessedUp
    nil
  end

  # Shared parser for every rank below genus.
  # +pattern+ must capture the taxon as group 1 and the optional author as
  # group 2. The name is normalized (sp. nov. variants for species, autonym
  # fix, standard rank abbreviations) before the ParsedName is built; the
  # parent name is the name with its LAST_PART removed.
  # Returns a ParsedName, or nil if +pattern+ does not match or the ranks
  # found by fix_autonym are out of order (RankMessedUp).
  def parse_below_genus(str, deprecated, rank, pattern)
    match = pattern.match(str)
    return nil unless match

    name = match[1]
    name = standardize_sp_nov_variants(name) if rank == "Species"
    name, author, rank = fix_autonym(name, match[2].to_s, rank)
    name = standardize_name(name)
    author = standardize_author(author)
    author_suffix = author.blank? ? "" : " #{author}"
    text_name = name.tr("ë", "e")
    ParsedName.new(
      text_name: text_name,
      search_name: text_name + author_suffix,
      sort_name: format_sort_name(text_name, author),
      display_name: format_autonym(name, author, rank, deprecated),
      parent_name: name.sub(LAST_PART, ""),
      rank: rank,
      author: author
    )
  rescue RankMessedUp
    nil
  end

  # Parse +str+ as a subgenus: delegates to parse_below_genus with rank
  # "Subgenus" and SUBGENUS_PAT, and returns its result.
  def parse_subgenus(str, deprecated = false)
    parse_below_genus(str, deprecated, "Subgenus", SUBGENUS_PAT)
  end

  # Parse +str+ as a section: delegates to parse_below_genus with rank
  # "Section" and SECTION_PAT, and returns its result.
  def parse_section(str, deprecated = false)
    parse_below_genus(str, deprecated, "Section", SECTION_PAT)
  end

  # Parse +str+ as a subsection: delegates to parse_below_genus with rank
  # "Subsection" and SUBSECTION_PAT, and returns its result.
  def parse_subsection(str, deprecated = false)
    parse_below_genus(str, deprecated, "Subsection", SUBSECTION_PAT)
  end

  # Parse +str+ as a stirps: delegates to parse_below_genus with rank
  # "Stirps" and STIRPS_PAT, and returns its result.
  def parse_stirps(str, deprecated = false)
    parse_below_genus(str, deprecated, "Stirps", STIRPS_PAT)
  end

  # Parse +str+ as a species: delegates to parse_below_genus with rank
  # "Species" and SPECIES_PAT, and returns its result.
  def parse_species(str, deprecated = false)
    parse_below_genus(str, deprecated, "Species", SPECIES_PAT)
  end

  # Parse +str+ as a subspecies: delegates to parse_below_genus with rank
  # "Subspecies" and SUBSPECIES_PAT, and returns its result.
  def parse_subspecies(str, deprecated = false)
    parse_below_genus(str, deprecated, "Subspecies", SUBSPECIES_PAT)
  end

  # Parse +str+ as a variety: delegates to parse_below_genus with rank
  # "Variety" and VARIETY_PAT, and returns its result.
  def parse_variety(str, deprecated = false)
    parse_below_genus(str, deprecated, "Variety", VARIETY_PAT)
  end

  # Parse +str+ as a form: delegates to parse_below_genus with rank
  # "Form" and FORM_PAT, and returns its result.
  def parse_form(str, deprecated = false)
    parse_below_genus(str, deprecated, "Form", FORM_PAT)
  end

  # Convert a rank abbreviation (e.g. "subsect.", "var.", "f.") to the rank
  # of the first RANK_FROM_ABBREV_MATCHERS entry it matches.
  # Returns nil if +str+ is not a recognized abbreviation.
  #
  # Each pattern must match the WHOLE of +str+. The patterns themselves are
  # unanchored, and Section is listed before Subsection, so a plain
  # "matches anywhere" test lets SECT_ABBR match inside "subsect." or
  # "subsection" and reports those as "Section".
  def parse_rank_abbreviation(str)
    matcher = RANK_FROM_ABBREV_MATCHERS.find do |candidate|
      /\A(?:#{candidate.pattern})\z/.match?(str)
    end
    matcher&.rank
  end

  # Standardize various ways of writing sp. nov.  Convert to: Amanita "sp-T44"
  #
  # With three or more words (e.g. Amanita sp. "T44") the first word is kept
  # as the genus and the third, minus any surrounding double quotes, becomes
  # the provisional epithet. With fewer words, a quoted epithet that starts
  # with "sp." has that prefix rewritten to "sp-".
  #
  # Returns a new String. +name+ itself is never modified, so frozen strings
  # are accepted and the caller's string is left intact.
  def standardize_sp_nov_variants(name)
    genus, _abbr, epithet = name.split
    return name.sub(/ "sp\./i, ' "sp-') unless epithet

    "#{genus} \"sp-#{epithet.sub(/^"(.*)"$/, '\1')}\""
  end

  # Fix common error: Amanita vaginatae Author var. vaginatae
  # Convert to: Amanita vaginatae var. vaginatae Author
  #
  # If +author+ ends with one or more "<rank abbr> <last word of name>"
  # pairs, those pairs are moved onto the end of +name+ and +rank+ becomes
  # the rank of the last pair. Each pair's rank is checked against the
  # previous one with make_sure_ranks_ordered_right!, which may raise
  # RankMessedUp.
  #
  # Returns [name, author, rank]; all three are returned unchanged when the
  # author contains no such pairs.
  def fix_autonym(name, author, rank)
    # Parentheses are dropped and the rest is escaped, so the epithet is
    # always matched literally (a "." in a quoted name must not match any
    # character).
    last_word = Regexp.escape(name.split.last.gsub(/[()]/, ""))
    match = author.to_s.match(
      /^(.*?)(( (#{ANY_SUBG_ABBR}|#{ANY_SSP_ABBR}) #{last_word})+)$/
    )
    return [name, author, rank] unless match

    name = "#{name}#{match[2]}"
    author = match[1].strip
    # match[2] alternates rank abbreviation, epithet.
    match[2].split.each_slice(2) do |abbr, _epithet|
      next_rank = parse_rank_abbreviation(abbr)
      make_sure_ranks_ordered_right!(rank, next_rank)
      rank = next_rank
    end
    [name, author, rank]
  end

  # Raise RankMessedUp unless +next_rank+ may follow +prev_rank+.
  # The pair is rejected when compare_ranks(prev_rank, next_rank) is <= 0,
  # or when +prev_rank+ is one of Name.ranks_above_species while +next_rank+
  # is one of Name.ranks_below_species. Returns nil otherwise.
  def make_sure_ranks_ordered_right!(prev_rank, next_rank)
    misordered =
      compare_ranks(prev_rank, next_rank) <= 0 ||
      (Name.ranks_above_species.include?(prev_rank) &&
       Name.ranks_below_species.include?(next_rank))
    raise(RankMessedUp.new) if misordered
  end

  # Format a name ranked below genus, moving the author to before the var.
  # in natural varieties such as
  # "__Acarospora nodulosa__ (Dufour) Hue var. __nodulosa__".
  #
  # Without an author the name is simply formatted. Otherwise, if the last
  # word also occurs 6, 4 or 2 words earlier (checked in that order), the
  # author is placed right after that earlier occurrence and the remaining
  # (abbreviation, epithet) pairs follow it. If it occurs at none of those
  # positions the author is appended to the formatted name.
  def format_autonym(name, author, _rank, deprecated)
    words = name.split
    return format_name(name, deprecated) if author.blank?

    [7, 5, 3].each do |span|
      next unless words[-span] == words[-1]

      head = format_name(words[0..-span].join(" "), deprecated)
      # The last span - 1 words alternate abbreviation, epithet.
      tail = words.last(span - 1).each_slice(2).flat_map do |abbr, epithet|
        [abbr, format_name(epithet, deprecated)]
      end
      return [head, author, *tail].join(" ")
    end

    format_name(name, deprecated) + " " + author
  end

  # Replace every rank abbreviation in +str+ with its standard form.
  # Working back from the next-to-last word, every other word is taken to be
  # an abbreviation (the first word is never touched). A word whose start
  # matches RANK_START_MATCHER is replaced from STANDARD_SECONDARY_RANKS;
  # anything else becomes "subsp.".
  def standardize_name(str)
    words = str.split
    (words.length - 2).step(1, -2) do |idx|
      rank_start = words[idx][RANK_START_MATCHER]
      words[idx] =
        if rank_start
          STANDARD_SECONDARY_RANKS[rank_start.downcase.to_sym]
        else
          "subsp."
        end
    end
    words.join(" ")
  end

  # Standardize the abbreviations an author string may start with
  # (auct., ined., nom., comb., sensu) and, directly after "comb. " or
  # "nom. ", the nov. / prov. suffix. The result is then passed through
  # strip_squeeze and squeeze_author. nil is treated as "".
  #
  # NOTE(review): the leading patterns have no trailing word boundary, so
  # they also match the start of longer words — confirm that is intended.
  def standardize_author(str)
    # Applied in this order; each pair is [pattern, replacement].
    fixes = [
      [/^ ?#{AUCT_ABBR}/o,  "auct. "],
      [/^ ?#{INED_ABBR}/o,  "ined. "],
      [/^ ?#{NOM_ABBR}/o,   "nom. "],
      [/^ ?#{COMB_ABBR}/o,  "comb. "],
      [/^ ?#{SENSU_ABBR}/o, "sensu "],
      # Having fixed comb. & nom., standardize their suffixes
      [/(?<=comb. |nom. ) ?#{NOV_ABBR}/o,  "nov. "],
      [/(?<=comb. |nom. ) ?#{PROV_ABBR}/o, "prov. "]
    ]
    fixed = fixes.reduce(str.to_s) do |text, (pattern, replacement)|
      text.sub(pattern, replacement)
    end
    squeeze_author(fixed.strip_squeeze)
  end

  # Squeeze "A. H. Smith" into "A.H. Smith": remove each single space that
  # sits between two initials (an uppercase ASCII letter plus a period).
  def squeeze_author(str)
    str.gsub(/(?<=[A-Z]\.) (?=[A-Z]\.)/, "")
  end

  # Add italics and boldface markup to a standardized name (without author).
  # With an even number of words the first two are joined into one unit
  # (genus plus epithet). Every other unit, counting from the first, is then
  # wrapped in "__" (and in "**" as well unless +deprecated+ is truthy); the
  # units in between, the rank abbreviations, are left bare.
  def format_name(str, deprecated = false)
    marker = deprecated ? "" : "**"
    parts = str.split
    if parts.length.even?
      genus = parts.shift
      parts[0] = genus + " " + parts[0]
    end
    marked = parts.each_with_index.map do |part, idx|
      idx.even? ? "#{marker}__#{part}__#{marker}" : part
    end
    marked.join(" ")
  end

  # Normalize raw input before parsing: nil becomes "", curly double and
  # single quotes become straight ones, U+2028 is removed, every run of
  # whitespace becomes one space, and the result goes through strip_squeeze.
  def clean_incoming_string(str)
    straight_quoted = str.to_s.
                      tr("“”", '"'). # let RedCloth format quotes
                      tr("‘’", "'")
    # U+2028 (LINE SEPARATOR) is seen occasionally as a line separator
    without_separators = straight_quoted.delete("\u2028")
    without_separators.gsub(/\s+/, " ").strip_squeeze
  end

  # Build a string that collates correctly for a name.
  # name::   the name without author (callers in this file pass text_name)
  # author:: the author; if present it is appended after two spaces
  #
  # Steps, in order:
  # 1. format_name is called with a truthy +deprecated+ so only "__" markup
  #    is produced; that markup is then removed again.
  # 2. Quoted and provisional epithets and the rank abbreviations are
  #    rewritten with "{" prefixes ("{1subg." .. "{7f.").
  # 3. A single-word name ending in a higher-taxon suffix has the suffix
  #    replaced by "!7" (-aceae) down to "!1" (-mycota).
  # 4. A word that is repeated later in the name gets "!" put in front of
  #    the repeat.
  def format_sort_name(name, author)
    str = format_name(name, :deprecated).
          sub(/^_+/, "").
          gsub(/_+/, " "). # put genus at the top
          sub(/ "(sp[-.])/, ' {\1'). # put "sp-1" at end
          gsub(/"([^"]*")/, '\1'). # collate "baccata" with baccata
          sub(" subg. ", " {1subg. ").
          sub(" sect. ",    " {2sect. ").
          sub(" subsect. ", " {3subsect. ").
          sub(" stirps ",   " {4stirps ").
          sub(" subsp. ",   " {5subsp. ").
          sub(" var. ",     " {6var. ").
          sub(" f. ", " {7f. ").
          strip.
          sub(/(^\S+)aceae$/,        '\1!7').
          sub(/(^\S+)ineae$/,        '\1!6').
          sub(/(^\S+)ales$/,         '\1!5').
          sub(/(^\S+?)o?mycetidae$/, '\1!4').
          sub(/(^\S+?)o?mycetes$/,   '\1!3').
          sub(/(^\S+?)o?mycotina$/,  '\1!2').
          sub(/(^\S+?)o?mycota$/,    '\1!1')

    # put autonyms at the top
    # (repeat until no word is followed later by an unmarked copy of itself)
    1 while str.sub!(/(^| )([A-Za-z-]+) (.*) \2( |$)/, '\1\2 \3 !\2\4')

    if author.present?
      # Only the opening quote of a quoted string is removed here.
      str += "  " + author.
             gsub(/"([^"]*")/, '\1'). # collate "baccata" with baccata
             gsub(/[Đđ]/, "d"). # mysql isn't collating these right
             gsub(/[Øø]/, "O").
             strip
    end
    str
  end
end