lib/pragmatic_segmenter/punctuation_replacer.rb
# -*- encoding : utf-8 -*-
# frozen_string_literal: true
module PragmaticSegmenter
# This class replaces punctuation that is typically a sentence boundary
# but in this case is not a sentence boundary.
class PunctuationReplacer
module Rules
module EscapeRegexReservedCharacters
LeftParen = Rule.new('(', '\\(')
RightParen = Rule.new(')', '\\)')
LeftBracket = Rule.new('[', '\\[')
RightBracket = Rule.new(']', '\\]')
Dash = Rule.new('-', '\\-')
All = [ LeftParen, RightParen,
LeftBracket, RightBracket, Dash ]
end
module SubEscapedRegexReservedCharacters
SubLeftParen = Rule.new('\\(', '(')
SubRightParen = Rule.new('\\)', ')')
SubLeftBracket = Rule.new('\\[', '[')
SubRightBracket = Rule.new('\\]', ']')
SubDash = Rule.new('\\-', '-')
All = [ SubLeftParen, SubRightParen,
SubLeftBracket, SubRightBracket, SubDash ]
end
end
attr_reader :matches_array, :text, :match_type
def initialize(text:, matches_array:, match_type: nil)
@text = text
@matches_array = matches_array
@match_type = match_type
end
def replace
replace_punctuation(matches_array)
end
private
def replace_punctuation(array)
return if !array || array.empty?
Rule.apply(@text, Rules::EscapeRegexReservedCharacters::All)
array.each do |a|
Rule.apply(a, Rules::EscapeRegexReservedCharacters::All)
sub = sub_characters(a, '.', '∯')
sub_1 = sub_characters(sub, '。', '&ᓰ&')
sub_2 = sub_characters(sub_1, '.', '&ᓱ&')
sub_3 = sub_characters(sub_2, '!', '&ᓳ&')
sub_4 = sub_characters(sub_3, '!', '&ᓴ&')
sub_5 = sub_characters(sub_4, '?', '&ᓷ&')
sub_6 = sub_characters(sub_5, '?', '&ᓸ&')
unless match_type.eql?('single')
sub_7 = sub_characters(sub_6, "'", '&⎋&')
end
end
Rule.apply(@text, Rules::SubEscapedRegexReservedCharacters::All)
end
def sub_characters(string, char_a, char_b)
sub = string.gsub(char_a, char_b)
@text.gsub!(/#{Regexp.escape(string)}/, sub)
sub
end
end
end