twitter/twitter-cldr-rb

View on GitHub
lib/twitter_cldr/shared/hyphenator.rb

Summary

Maintainability
A
2 hrs
Test Coverage
# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

# Documentation: https://github.com/hunspell/hyphen/blob/21127cc8493a68d4fe9adbb71377b469b4f2b550/doc/tb87nemeth.pdf

module TwitterCldr
  module Shared
    class Hyphenator
      # Raised by Hyphenator.get when no hyphenation resource can be found
      # for the requested locale.
      class UnsupportedLocaleError < StandardError; end

      # Path segments under which the per-locale hyphenation resources
      # are looked up.
      BASE_RESOURCE_PATH = %w(shared hyphenation).freeze
      # Fallbacks used when a resource's options do not provide
      # :lefthyphenmin, :righthyphenmin or :nohyphen respectively.
      DEFAULT_LEFT_HYPHEN_MIN = 2
      DEFAULT_RIGHT_HYPHEN_MIN = 2
      DEFAULT_NO_HYPHEN = "-'’".freeze

      class << self
        # Returns the hyphenator for the given locale.
        #
        # The locale is resolved with find_supported_locale; one instance
        # is built per resolved locale and reused on subsequent calls.
        #
        # Raises UnsupportedLocaleError when no hyphenation resource
        # matches the requested locale.
        def get(locale)
          supported_locale = find_supported_locale(locale)

          # The requested locale is kept in its own variable so the error
          # message names what the caller actually asked for (previously
          # the variable was overwritten with nil before interpolation).
          unless supported_locale
            raise UnsupportedLocaleError,
              "'#{locale}' is not a supported hyphenation locale"
          end

          cache[supported_locale] ||= begin
            resource = resource_for(supported_locale)
            new(resource[:rules], supported_locale, resource[:options])
          end
        end

        # True when a hyphenation resource can be resolved for the given
        # locale, false otherwise.
        def supported_locale?(locale)
          find_supported_locale(locale) ? true : false
        end

        # Lists the locales that have a hyphenation resource file, derived
        # from the names of the '*.yml' files in the hyphenation resource
        # directory. The list is computed once and memoized.
        def supported_locales
          @supported_locales ||= begin
            resource_dir = TwitterCldr.absolute_resource_path(
              File.join(BASE_RESOURCE_PATH)
            )
            pattern = File.join(resource_dir, '*.yml')

            Dir.glob(pattern).map do |path|
              File.basename(path).chomp('.yml')
            end
          end
        end

        private

        # Maximizes the given locale and returns the first of its
        # '-'-joined permutations for which a hyphenation resource exists,
        # or nil when none does.
        def find_supported_locale(locale)
          candidates = Locale.parse(locale.to_s).maximize.permutations('-')

          candidates.find do |candidate|
            TwitterCldr.resource_exists?(*BASE_RESOURCE_PATH, candidate)
          end
        end

        # Lazily created hash holding the hyphenator instances built by
        # get, keyed by resolved locale.
        def cache
          @cache ||= Hash.new
        end

        # Fetches the hyphenation resource for the given locale.
        def resource_for(locale)
          resource_path = BASE_RESOURCE_PATH + [locale]
          TwitterCldr.get_resource(*resource_path)
        end
      end

      # Read-only accessors for the values assigned in #initialize.
      attr_reader :rules, :locale, :options, :trie

      # Stores the rules, locale and options, and builds a trie from the
      # rules via build_trie_from.
      def initialize(rules, locale, options)
        @rules, @locale, @options = rules, locale, options
        @trie = build_trie_from(rules)
      end

      # Joins the chunks produced by #each_chunk with +hyphen+ and returns
      # the resulting string. The default separator, 0x00AD, is a soft
      # hyphen.
      def hyphenate(text, hyphen = "\u00AD")
        chunks = each_chunk(text).to_a
        chunks.join(hyphen)
      end

      # Yields the consecutive slices of +text+ delimited by the indices
      # that #each_position reports, followed by whatever remains after
      # the last index. Returns an enumerator when no block is given.
      def each_chunk(text)
        return to_enum(__method__, text) unless block_given?

        chunk_start = 0

        each_position(text) do |pos|
          chunk = text[chunk_start...pos]
          # Advance before yielding, as the next chunk starts here.
          chunk_start = pos
          yield chunk
        end

        # Emit the tail that follows the final break position, if any.
        yield text[chunk_start..-1] if chunk_start < text.size
      end

      # Yields every index whose break weight is odd, restricted to the
      # window delimited by the left and right hyphen minimums. Returns an
      # enumerator when no block is given.
      def each_position(text)
        return to_enum(__method__, text) unless block_given?

        # The text is wrapped in '.' on both sides before the break
        # weights are computed.
        padded = ".#{text}."
        weights = break_weights_for(padded)

        first = left_hyphen_min
        limit = padded.size - right_hyphen_min - 2

        (first...limit).each do |idx|
          yield idx if weights[idx].odd?
        end
      end

      private

      # Computes the break weights for +text+: starting from every
      # character offset, walks the trie along the following characters
      # and applies each rule found on the way with update_break_weights.
      # The weights are then passed through remove_illegal_hyphens, whose
      # result is returned.
      def break_weights_for(text)
        weights = Array.new(text.size - 1, 0)

        text.size.times do |start|
          node = trie.root
          cursor = start

          # Descend until no child exists for the next character.
          while node
            node = node.child(text[cursor])
            cursor += 1

            next unless node && node.has_value?
            update_break_weights(node.value, weights, start)
          end
        end

        remove_illegal_hyphens(weights, text)
      end

      # Applies the digits of +pattern+ to +break_weights+ in place. Each
      # digit targets the slot at start_idx plus the number of non-digit
      # characters seen so far, minus one, and replaces the stored weight
      # unless the stored weight is greater. Processing stops at the first
      # digit that falls beyond the end of the array.
      def update_break_weights(pattern, break_weights, start_idx)
        letters_seen = 0

        pattern.each_char do |char|
          unless char =~ /\d/
            letters_seen += 1
            next
          end

          target = (start_idx + letters_seen) - 1
          break if target >= break_weights.size

          weight = char.to_i
          break_weights[target] = weight unless break_weights[target] > weight
        end
      end

      # Returns a copy of +break_weights+ in which every weight after the
      # first is set to 0 when the character at the preceding index of
      # +text+ is one of the no_hyphen characters.
      def remove_illegal_hyphens(break_weights, text)
        break_weights.each_with_index.map do |weight, idx|
          if idx.zero? || !no_hyphen.include?(text[idx - 1])
            weight
          else
            0
          end
        end
      end

      # Integer value of the :lefthyphenmin option, falling back to
      # DEFAULT_LEFT_HYPHEN_MIN when the option is absent. Memoized.
      def left_hyphen_min
        @left_hyphen_min ||= begin
          configured = options.fetch(:lefthyphenmin, DEFAULT_LEFT_HYPHEN_MIN)
          configured.to_i
        end
      end

      # Integer value of the :righthyphenmin option, falling back to
      # DEFAULT_RIGHT_HYPHEN_MIN when the option is absent. Memoized.
      def right_hyphen_min
        @right_hyphen_min ||= begin
          configured = options.fetch(:righthyphenmin, DEFAULT_RIGHT_HYPHEN_MIN)
          configured.to_i
        end
      end

      # Value of the :nohyphen option, falling back to DEFAULT_NO_HYPHEN
      # when the option is absent. Memoized.
      def no_hyphen
        @no_hyphen ||= begin
          fallback = DEFAULT_NO_HYPHEN
          options.fetch(:nohyphen, fallback)
        end
      end

      # Builds a trie holding every rule, keyed by the rule's characters
      # with all digits removed; the full rule is stored as the value.
      def build_trie_from(rules)
        result = TwitterCldr::Utils::Trie.new

        rules.each do |rule|
          key = rule.gsub(/\d/, '').each_char
          result.add(key, rule)
        end

        result
      end
    end
  end
end