david-mccullars/text_rank

View on GitHub
lib/text_rank/tokenizer/money.rb

Summary

Maintainability
A
0 mins
Test Coverage
module TextRank
  module Tokenizer

    # Source text of a regex character class ("[...]") that matches any single one
    # of the symbols listed below. It is a String rather than a Regexp so that it
    # can be interpolated into larger patterns. None of the listed characters has
    # a special meaning inside a character class, so no escaping is required.
    # The result is frozen so the shared constant cannot be mutated in place.
    CURRENCY_SYMBOLS = "[#{[
      "\u00a4", # Generic Currency Symbol
      "\u0024", # Dollar Sign
      "\u00a2", # Cent Sign
      "\u00a3", # Pound Sterling
      "\u00a5", # Yen Symbol
      "\u20a3", # Franc Sign
      "\u20a4", # Lira Symbol
      "\u20a7", # Peseta Sign
      "\u20ac", # Euro Symbol
      "\u20b9", # Rupee
      "\u20a9", # Won Sign
      "\u20b4", # Hryvnia Sign
      "\u20af", # Drachma Sign
      "\u20ae", # Tugrik Sign
      "\u20b0", # German Penny Sign
      "\u20b2", # Guarani Sign
      "\u20b1", # Peso Sign
      "\u20b3", # Austral Sign
      "\u20b5", # Cedi Sign
      "\u20ad", # Kip Sign
      "\u20aa", # New Sheqel Sign
      "\u20ab", # Dong Sign
      "\u0025", # Percent Sign (a ratio, not a currency)
      "\u2030", # Per Mille Sign (per thousand; also a ratio, not a currency)
    ].join}]".freeze
    # Hide the raw character-class string from code outside this module. Within this
    # file it is used solely for interpolation into the Money regex.
    private_constant :CURRENCY_SYMBOLS # Do not expose this to avoid confusion

    ##
    # A tokenizer regex that preserves money or formatted numbers as a single token. A match
    # is one symbol from CURRENCY_SYMBOLS combined with a +Number+, and the whole match is
    # captured as group 1. The 24 recognized symbols are 22 currency signs plus the percent
    # and per mille signs:
    #
    # * ¤
    # * $
    # * ¢
    # * £
    # * ¥
    # * ₣
    # * ₤
    # * ₧
    # * €
    # * ₹
    # * ₩
    # * ₴
    # * ₯
    # * ₮
    # * ₰
    # * ₲
    # * ₱
    # * ₳
    # * ₵
    # * ₭
    # * ₪
    # * ₫
    # * %
    # * ‰
    #
    # Three layouts are accepted, tried in this order: the symbol followed by an optional
    # minus sign and the number; an optional minus sign followed by the symbol and the number;
    # and the symbol and number wrapped in parentheses. The last two are the alternative
    # formats for negatives. In every layout the symbol precedes the number.
    #
    # +Number+ is defined elsewhere in TextRank::Tokenizer and is not shown here; the optional
    # three digit comma separation and optional decimals presumably come from it -- confirm
    # against its definition.
    #
    # The literal uses extended (/x) mode, so the whitespace and the trailing example comments
    # inside it are ignored by the regex engine.
    ##
    # rubocop:disable Naming/ConstantName
    Money = /
      (
        #{CURRENCY_SYMBOLS} -? #{Number}       # $-45,231.21
        |
        -? #{CURRENCY_SYMBOLS} #{Number}       # -$45,231.21
        |
        \( #{CURRENCY_SYMBOLS} #{Number} \)    # ($45,231.21)
      )
    /x
    # rubocop:enable Naming/ConstantName

  end
end