diasks2/pragmatic_segmenter

View on GitHub
lib/pragmatic_segmenter/punctuation_replacer.rb

Summary

Maintainability
A
1 hr
Test Coverage
# -*- encoding : utf-8 -*-
# frozen_string_literal: true

module PragmaticSegmenter
  # This class replaces punctuation that is typically a sentence boundary
  # but in this case is not a sentence boundary.
  class PunctuationReplacer
    # Rule sets applied by replace_punctuation: the escaping set is applied
    # before the placeholder substitutions and the un-escaping set afterwards.
    #
    # NOTE(review): the arguments are presumably (pattern, replacement) --
    # confirm against the definition of Rule, which is not in this file.
    module Rules
      # Pairs each of ( ) [ ] - with the same character preceded by a
      # backslash.
      module EscapeRegexReservedCharacters
        LeftParen = Rule.new('(', '\\(')
        RightParen = Rule.new(')', '\\)')
        LeftBracket = Rule.new('[', '\\[')
        RightBracket = Rule.new(']', '\\]')
        Dash = Rule.new('-', '\\-')

        # Frozen so the shared rule list cannot be modified by accident.
        All = [LeftParen, RightParen,
               LeftBracket, RightBracket, Dash].freeze
      end

      # Mirror image of EscapeRegexReservedCharacters: pairs each
      # backslash-prefixed character with the bare character.
      module SubEscapedRegexReservedCharacters
        SubLeftParen = Rule.new('\\(', '(')
        SubRightParen = Rule.new('\\)', ')')
        SubLeftBracket = Rule.new('\\[', '[')
        SubRightBracket = Rule.new('\\]', ']')
        SubDash = Rule.new('\\-', '-')

        # Frozen so the shared rule list cannot be modified by accident.
        All = [SubLeftParen, SubRightParen,
               SubLeftBracket, SubRightBracket, SubDash].freeze
      end
    end

    # Read-only accessors for the three values stored by the constructor.
    attr_reader :matches_array, :text, :match_type
    # Stores the inputs for a later call to #replace. Nothing is copied: the
    # objects passed in are kept by reference.
    #
    # @param text [String] text in which the substitutions will be made
    # @param matches_array [Array<String>, nil] fragments of +text+ to process
    # @param match_type [String, nil] when equal to 'single', apostrophes
    #   inside the fragments are left alone
    def initialize(text:, matches_array:, match_type: nil)
      @match_type = match_type
      @matches_array = matches_array
      @text = text
    end

    # Public entry point: hands the stored matches to replace_punctuation.
    #
    # @return [Object, nil] whatever replace_punctuation returns -- nil when
    #   the matches array is nil or empty, otherwise the result of its final
    #   Rule.apply call
    def replace
      matches = matches_array
      replace_punctuation(matches)
    end

    private

    # Ordered [character, placeholder] pairs applied to every match. Each
    # pair is applied to the result of the previous one, so order matters.
    #
    # NOTE(review): as written, the third, fifth and seventh pairs repeat a
    # character that an earlier pair has already replaced, so they can never
    # match. The distinct placeholders suggest different (possibly
    # full-width) characters were intended -- confirm before changing them.
    PLACEHOLDER_SUBSTITUTIONS = [
      ['.', '∯'],
      ['。', '&ᓰ&'],
      ['.', '&ᓱ&'],
      ['!', '&ᓳ&'],
      ['!', '&ᓴ&'],
      ['?', '&ᓷ&'],
      ['?', '&ᓸ&']
    ].freeze

    # Replaces punctuation inside each match with placeholder tokens and
    # mirrors every replacement in @text (through sub_characters).
    #
    # @text and each match are first passed through the escaping rule set;
    # once all matches are processed, @text is passed through the un-escaping
    # rule set.
    #
    # @param array [Array<String>, nil] fragments to process
    # @return [Object, nil] nil when +array+ is nil or empty, otherwise the
    #   result of the final Rule.apply call
    def replace_punctuation(array)
      return if !array || array.empty?

      Rule.apply(@text, Rules::EscapeRegexReservedCharacters::All)
      # Apostrophes are only swapped when the match type is not 'single'.
      swap_apostrophes = !match_type.eql?('single')
      array.each do |match|
        # The return value is discarded, so this relies on Rule.apply
        # changing +match+ in place -- presumably it does; confirm against
        # Rule.
        Rule.apply(match, Rules::EscapeRegexReservedCharacters::All)
        masked = PLACEHOLDER_SUBSTITUTIONS.reduce(match) do |current, (char, placeholder)|
          sub_characters(current, char, placeholder)
        end
        sub_characters(masked, "'", '&⎋&') if swap_apostrophes
      end
      Rule.apply(@text, Rules::SubEscapedRegexReservedCharacters::All)
    end

    # Swaps +char_a+ for +char_b+ inside +string+ and mirrors that change in
    # @text: every literal occurrence of the original +string+ found in @text
    # is replaced, in place, by the substituted copy.
    #
    # @param string [String] fragment to look for in @text
    # @param char_a [String] text to look for inside +string+
    # @param char_b [String] text inserted wherever +char_a+ was found
    # @return [String] +string+ with every +char_a+ replaced by +char_b+
    def sub_characters(string, char_a, char_b)
      sub = string.gsub(char_a, char_b)
      # A String pattern matches literally, and the block form inserts +sub+
      # verbatim. Passing +sub+ as the replacement argument instead would
      # make gsub! interpret backslash sequences in it (\\, \1, \', ...) as
      # escapes or back-references, corrupting text that contains
      # backslashes.
      @text.gsub!(string) { sub }
      sub
    end
  end
end