diasks2/pragmatic_segmenter

View on GitHub
lib/pragmatic_segmenter/processor.rb

Summary

Maintainability
A
25 mins
Test Coverage
# -*- encoding : utf-8 -*-
# frozen_string_literal: true

require 'pragmatic_segmenter/punctuation_replacer'
require 'pragmatic_segmenter/between_punctuation'


require 'pragmatic_segmenter/list'
require 'pragmatic_segmenter/abbreviation_replacer'
require 'pragmatic_segmenter/exclamation_words'

module PragmaticSegmenter
  # This class processes the text and splits it into segments.
  class Processor

    attr_reader :text
    # Creates a processor bound to a language namespace.
    #
    # @param language [Module] namespace whose rule and regex constants
    #   (e.g. Punctuations, SENTENCE_BOUNDARY_REGEX) are looked up by the
    #   processing steps; defaults to Languages::Common
    def initialize(language: Languages::Common)
      @language = language
    end

    # Segments +text+.
    #
    # The preparation steps run in a fixed order and all operate on @text:
    # list line breaks, abbreviations, numbers, continuous punctuation,
    # numeric references, then the email, geo-location and file-format rules.
    # The prepared text is finally handed to split_into_segments.
    #
    # @param text [String] the text to segment
    # @return [Array] whatever split_into_segments returns for the prepared text
    def process(text:)
      @text = List.new(text: text).add_line_break
      replace_abbreviations
      replace_numbers
      replace_continuous_punctuation
      replace_periods_before_numeric_references
      # Applied one at a time, in this order, exactly as before.
      [
        @language::Abbreviations::WithMultiplePeriodsAndEmailRule,
        @language::GeoLocationRule,
        @language::FileFormatRule
      ].each { |rule| Rule.apply(@text, rule) }
      split_into_segments
    end

    private

    # Turns the prepared @text into the final list of segments.
    #
    # The text is cut at "\r", each piece gets the SingleNewLineRule and
    # EllipsisRules, and check_for_punctuation may break a piece into several.
    # After that the SubSymbolsRules are applied, every piece goes through
    # post_process_segments, nil and empty results are discarded, and the
    # SubSingleQuoteRule is applied last.
    #
    # @return [Array] the remaining segments
    def split_into_segments
      pieces = check_for_parens_between_quotes(@text).split("\r")
      pieces = pieces.map do |piece|
        Rule.apply(piece, @language::SingleNewLineRule, @language::EllipsisRules::All)
      end

      sentences = pieces.map { |piece| check_for_punctuation(piece) }.flatten
      sentences = sentences.map do |sentence|
        Rule.apply(sentence, @language::SubSymbolsRules::All)
      end

      finished = sentences.map { |sentence| post_process_segments(sentence) }
      finished = finished.flatten.compact.reject(&:empty?)
      finished.map { |sentence| Rule.apply(sentence, @language::SubSingleQuoteRule) }
    end

    # Finalises a single segment.
    #
    # @param txt [String] a candidate segment
    # @return [String, Array<String>, nil]
    #   - txt itself when it is empty or a single ASCII letter;
    #   - nil when it is any other one-character string, or is made up solely
    #     of runs of three or more underscores;
    #   - the pieces from splitting on
    #     SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX when
    #     QUOTATION_AT_END_OF_SENTENCE_REGEX matches;
    #   - otherwise txt with newlines removed and outer whitespace stripped.
    def post_process_segments(txt)
      # \z rather than \Z: \Z also matches just before a trailing newline, so
      # a lone "\n" used to pass this guard and be returned as a segment.
      return txt if txt.length < 2 && txt =~ /\A[a-zA-Z]*\z/
      return if consecutive_underscore?(txt) || txt.length < 2

      # Keep the returned string (as process_text does) instead of depending
      # on Rule.apply having changed txt in place.
      txt = Rule.apply(
        txt,
        @language::ReinsertEllipsisRules::All,
        @language::ExtraWhiteSpaceRule
      )

      if txt =~ @language::QUOTATION_AT_END_OF_SENTENCE_REGEX
        txt.split(@language::SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX)
      else
        txt.tr("\n", '').strip
      end
    end

    # Wherever PARENS_BETWEEN_DOUBLE_QUOTES_REGEX matches, replaces the
    # whitespace directly before "(" and directly after ")" with "\r", the
    # character split_into_segments later splits on.
    #
    # @param txt [String] text to scan; changed in place when the pattern matches
    # @return [String] txt itself
    def check_for_parens_between_quotes(txt)
      quoted_parens = @language::PARENS_BETWEEN_DOUBLE_QUOTES_REGEX
      if txt =~ quoted_parens
        txt.gsub!(quoted_parens) do |quoted|
          quoted.gsub(/\s(?=\()/, "\r").gsub(/(?<=\))\s/, "\r")
        end
      else
        txt
      end
    end

    # Inside every CONTINUOUS_PUNCTUATION_REGEX match in @text, swaps each "!"
    # for "&ᓴ&" and each "?" for "&ᓷ&". @text is modified in place.
    #
    # @return [String, nil] @text, or nil when the pattern matched nothing
    def replace_continuous_punctuation
      placeholders = { '!' => '&ᓴ&', '?' => '&ᓷ&' }
      @text.gsub!(@language::CONTINUOUS_PUNCTUATION_REGEX) do |run|
        run.gsub(/[!?]/, placeholders)
      end
    end

    # Rewrites every NUMBERED_REFERENCE_REGEX match in @text as "∯", then
    # capture group 2, then "\r", then capture group 7. @text is modified in
    # place.
    #
    # @return [String, nil] @text, or nil when the pattern matched nothing
    def replace_periods_before_numeric_references
      @text.gsub!(@language::NUMBERED_REFERENCE_REGEX) do
        "∯#{Regexp.last_match(2)}\r#{Regexp.last_match(7)}"
      end
    end

    # Tells whether nothing is left of +txt+ once every run of three or more
    # underscores has been removed (an empty string therefore also qualifies).
    #
    # @param txt [String]
    # @return [Boolean]
    def consecutive_underscore?(txt)
      # Rubular: http://rubular.com/r/fTF2Ff3WBL
      remainder = txt.gsub(/_{3,}/, '')
      remainder.empty?
    end

    # Sends +txt+ through process_text only when it contains at least one of
    # the language's Punctuations; otherwise hands it back unchanged.
    #
    # @param txt [String]
    # @return [String, Object] txt itself, or the result of process_text
    def check_for_punctuation(txt)
      has_punctuation = @language::Punctuations.any? { |mark| txt.include?(mark) }
      return txt unless has_punctuation

      process_text(txt)
    end

    # Splits a punctuation-bearing piece of text at its sentence boundaries.
    #
    # Steps, in order: append "ȸ" when the last character is not one of the
    # language's Punctuations; run ExclamationWords and between_punctuation on
    # txt (their return values are not used); apply the double-punctuation,
    # question-mark-in-quotation and exclamation-point rules; replace list
    # parens; finally hand the text to sentence_boundary_punctuation.
    #
    # @param txt [String] text to process; "ȸ" may be appended to it in place
    # @return [Array] the result of sentence_boundary_punctuation
    def process_text(txt)
      last_char = txt[-1]
      txt << 'ȸ' if @language::Punctuations.none? { |mark| last_char.include?(mark) }

      ExclamationWords.apply_rules(txt)
      between_punctuation(txt)

      ruled = Rule.apply(
        txt,
        @language::DoublePunctuationRules::All,
        @language::QuestionMarkInQuotationRule,
        @language::ExclamationPointRules::All
      )
      sentence_boundary_punctuation(List.new(text: ruled).replace_parens)
    end

    # Applies the language's Numbers::All rules to @text via Rule.apply.
    #
    # @return [Object] whatever Rule.apply returns
    def replace_numbers
      number_rules = @language::Numbers::All
      Rule.apply(@text, number_rules)
    end

    # Picks the abbreviation replacer class: the language's own
    # AbbreviationReplacer when it defines one, otherwise the default
    # AbbreviationReplacer.
    #
    # @return [Class]
    def abbreviations_replacer
      return @language::AbbreviationReplacer if defined?(@language::AbbreviationReplacer)

      AbbreviationReplacer
    end

    # Builds the replacer chosen by abbreviations_replacer with the current
    # text and language, and stores the result of its #replace in @text.
    #
    # @return [Object] the new value of @text
    def replace_abbreviations
      replacer = abbreviations_replacer.new(text: @text, language: @language)
      @text = replacer.replace
    end

    # Picks the between-punctuation class: the language's own
    # BetweenPunctuation when it defines one, otherwise the default
    # BetweenPunctuation.
    #
    # @return [Class]
    def between_punctuation_processor
      return @language::BetweenPunctuation if defined?(@language::BetweenPunctuation)

      BetweenPunctuation
    end

    # Runs +txt+ through the class chosen by between_punctuation_processor.
    #
    # @param txt [String]
    # @return [Object] the result of that processor's #replace
    def between_punctuation(txt)
      processor = between_punctuation_processor.new(text: txt)
      processor.replace
    end

    # Applies the optional colon-between-numbers and non-sentence-boundary
    # comma rules (each only when the language defines it), then collects
    # every SENTENCE_BOUNDARY_REGEX match.
    #
    # @param txt [String]
    # @return [Array] the matches returned by String#scan
    def sentence_boundary_punctuation(txt)
      if defined?(@language::ReplaceColonBetweenNumbersRule)
        txt = Rule.apply(txt, @language::ReplaceColonBetweenNumbersRule)
      end
      if defined?(@language::ReplaceNonSentenceBoundaryCommaRule)
        txt = Rule.apply(txt, @language::ReplaceNonSentenceBoundaryCommaRule)
      end

      txt.scan(@language::SENTENCE_BOUNDARY_REGEX)
    end
  end
end