diasks2/pragmatic_segmenter

View on GitHub
lib/pragmatic_segmenter/list.rb

Summary

Maintainability
B
6 hrs
Test Coverage
# -*- encoding : utf-8 -*-
# frozen_string_literal: true

module PragmaticSegmenter
  # Searches a string for list items and inserts a line break ("\r") before
  # each item it recognises.
  #
  # Lists handled by this class:
  # * numbers followed by a period      - "1. foo 2. bar", "1.) foo 2.) bar"
  # * numbers followed by a parenthesis - "1) foo 2) bar"
  # * letters / roman numerals          - "a. foo b. bar", "(i) foo (ii) bar"
  #
  # A marker is only treated as a list item when a neighbouring marker is
  # the next or previous value in the sequence, which keeps isolated matches
  # such as a lone "e." from being split.
  #
  # The instance operates on its own copy of the text (see #initialize) and
  # mutates that copy in place.
  class List
    # Roman numerals 1-20 in ascending order. Adjacency of two markers is
    # decided by their positions in this array, so every numeral must appear
    # exactly once (Array#index returns the first occurrence only).
    ROMAN_NUMERALS = %w[
      i ii iii iv v vi vii viii ix x
      xi xii xiii xiv xv xvi xvii xviii xix xx
    ].freeze

    # The letters a-z in order, used the same way as ROMAN_NUMERALS.
    LATIN_NUMERALS = ('a'..'z').to_a.freeze

    # Rubular: http://rubular.com/r/XcpaJKH0sz
    ALPHABETICAL_LIST_WITH_PERIODS =
      /(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)/

    # Rubular: http://rubular.com/r/Gu5rQapywf
    ALPHABETICAL_LIST_WITH_PARENS =
      /(?<=\()[a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))/i

    # '♨' temporarily stands in for the period of a numbered list item and is
    # turned into '∯' once line breaks have been added.
    SubstituteListPeriodRule = Rule.new(/♨/, '∯')
    # '☝' temporarily tags the number of an "N)" list item and is removed
    # once line breaks have been added.
    ListMarkerRule = Rule.new(/☝/, '')

    # Rubular: http://rubular.com/r/Wv4qLdoPx7
    SpaceBetweenListItemsFirstRule = Rule.new(/(?<=\S\S|^)\s(?=\S\s*\d{1,2}♨)/, "\r")

    # Rubular: http://rubular.com/r/AizHXC6HxK
    SpaceBetweenListItemsSecondRule = Rule.new(/(?<=\S\S|^)\s(?=\d{1,2}♨)/, "\r")

    # Rubular: http://rubular.com/r/GE5q6yID2j
    SpaceBetweenListItemsThirdRule = Rule.new(/(?<=\S\S|^)\s(?=\d{1,2}☝)/, "\r")

    # Finds the numbers of "N." / "N.)" items (optionally prefixed by "-" or
    # "⁃"). The period is left out of the match so the result converts to an
    # Integer with #to_i.
    NUMBERED_LIST_REGEX_1 =
      /\s\d{1,2}(?=\.\s)|^\d{1,2}(?=\.\s)|\s\d{1,2}(?=\.\))|^\d{1,2}(?=\.\))|(?<=\s\-)\d{1,2}(?=\.\s)|(?<=^\-)\d{1,2}(?=\.\s)|(?<=\s\⁃)\d{1,2}(?=\.\s)|(?<=^\⁃)\d{1,2}(?=\.\s)|(?<=\s\-)\d{1,2}(?=\.\))|(?<=^\-)\d{1,2}(?=\.\))|(?<=\s\⁃)\d{1,2}(?=\.\))|(?<=^\⁃)\d{1,2}(?=\.\))/
    # Same positions as NUMBERED_LIST_REGEX_1, but the match includes the
    # period so that it can be swapped for a marker.
    NUMBERED_LIST_REGEX_2 =
      /(?<=\s)\d{1,2}\.(?=\s)|^\d{1,2}\.(?=\s)|(?<=\s)\d{1,2}\.(?=\))|^\d{1,2}\.(?=\))|(?<=\s\-)\d{1,2}\.(?=\s)|(?<=^\-)\d{1,2}\.(?=\s)|(?<=\s\⁃)\d{1,2}\.(?=\s)|(?<=^\⁃)\d{1,2}\.(?=\s)|(?<=\s\-)\d{1,2}\.(?=\))|(?<=^\-)\d{1,2}\.(?=\))|(?<=\s\⁃)\d{1,2}\.(?=\))|(?<=^\⁃)\d{1,2}\.(?=\))/
    # One or two digits directly followed by ")" and whitespace.
    NUMBERED_LIST_PARENS_REGEX = /\d{1,2}(?=\)\s)/

    # Rubular: http://rubular.com/r/NsNFSqrNvJ
    EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX =
      /\([a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))/i

    # Rubular: http://rubular.com/r/wMpnVedEIb
    ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX =
      /(?<=^)[a-z]\.|(?<=\A)[a-z]\.|(?<=\s)[a-z]\./i

    # Rubular: http://rubular.com/r/GcnmQt4a3I
    ROMAN_NUMERALS_IN_PARENTHESES =
      /\(((?=[mdclxvi])m*(c[md]|d?c*)(x[cl]|l?x*)(i[xv]|v?i*))\)(?=\s[A-Z])/

    attr_reader :text

    # @param text [String] the text to scan; it is duplicated so the caller's
    #   string is never modified.
    def initialize(text:)
      @text = text.dup
    end

    # Runs every list detector in turn: alphabetical, roman numeral, numbered
    # with periods, numbered with parentheses.
    #
    # @return the result of the final Rule.apply call (the ListMarkerRule
    #   pass in #format_numbered_list_with_parens)
    def add_line_break
      format_alphabetical_lists
      format_roman_numeral_lists
      format_numbered_list_with_periods
      format_numbered_list_with_parens
    end

    # Rewrites a parenthesised roman numeral that is followed by whitespace
    # and a capital letter, e.g. "(iv) Foo", as "&✂&iv&⌬& Foo".
    #
    # @return [String] the (mutated) working copy of the text
    def replace_parens
      text.gsub!(ROMAN_NUMERALS_IN_PARENTHESES, '&✂&\1&⌬&'.freeze)
      text
    end

    private

    # "1) foo 2) bar": tag the numbers with '☝', break between the items,
    # then drop the tags again via ListMarkerRule.
    def format_numbered_list_with_parens
      replace_parens_in_numbered_list
      add_line_breaks_for_numbered_list_with_parens
      Rule.apply(@text, ListMarkerRule)
    end

    # "1. foo 2. bar": swap the item periods for '♨', break between the
    # items, then hand the markers to SubstituteListPeriodRule.
    def format_numbered_list_with_periods
      replace_periods_in_numbered_list
      add_line_breaks_for_numbered_list_with_periods
      Rule.apply(@text, SubstituteListPeriodRule)
    end

    # Handles "a." and "a)" style lists using the latin alphabet.
    def format_alphabetical_lists
      add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: false)
      add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: false)
    end

    # Handles "i." and "i)" style lists using ROMAN_NUMERALS.
    def format_roman_numeral_lists
      add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: true)
      add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: true)
    end

    # Replaces the period of each sequential "N." item with '♨'.
    def replace_periods_in_numbered_list
      scan_lists(NUMBERED_LIST_REGEX_1, NUMBERED_LIST_REGEX_2, '♨', strip: true)
    end

    # Inserts breaks before '♨'-marked items. Nothing is done when two
    # markers are already separated by a "\n" or "\r", or when a marked
    # number directly follows the word "for" and precedes a lowercase letter.
    def add_line_breaks_for_numbered_list_with_periods
      if @text.include?('♨') && @text !~ /♨.+\n.+♨|♨.+\r.+♨/ && @text !~ /for\s\d{1,2}♨\s[a-z]/
        Rule.apply(@text, SpaceBetweenListItemsFirstRule, SpaceBetweenListItemsSecondRule)
      end
    end

    # Tags the number of each sequential "N)" item with '☝'.
    #
    # The scan is deliberately run twice: numbers tagged by the first pass no
    # longer match NUMBERED_LIST_PARENS_REGEX, so the second pass sees only
    # the leftovers and can pair up numbers that were separated by
    # already-tagged items (e.g. 3 and 4 in the order 3, 1, 2, 4).
    def replace_parens_in_numbered_list
      2.times do
        scan_lists(NUMBERED_LIST_PARENS_REGEX, NUMBERED_LIST_PARENS_REGEX, '☝')
      end
    end

    # Inserts breaks before '☝'-tagged items unless two tags are already
    # separated by a "\n" or "\r".
    def add_line_breaks_for_numbered_list_with_parens
      if @text.include?('☝') && @text !~ /☝.+\n.+☝|☝.+\r.+☝/
        Rule.apply(@text, SpaceBetweenListItemsThirdRule)
      end
    end

    # Collects the numbers matched by +regex1+ and, for every number that
    # sits next to its successor/predecessor in that collection, marks all
    # of its occurrences matched by +regex2+ with +replacement+.
    #
    # A number qualifies when the following number is one higher, the
    # preceding number is one lower, it is a 0 preceded by a 9, or it is a 9
    # followed by a 0.
    #
    # @param regex1 [Regexp] finds the candidate numbers
    # @param regex2 [Regexp] finds the text to substitute
    # @param replacement [String] marker appended to the number
    # @param strip [Boolean] forwarded to #substitute_found_list_items
    def scan_lists(regex1, regex2, replacement, strip: false)
      numbers = @text.scan(regex1).map(&:to_i)
      numbers.each_with_index do |number, i|
        # The first number has no predecessor. Reading index -1 would wrap
        # around to the *last* number and compare against the wrong item.
        preceding = i.zero? ? nil : numbers[i - 1]
        following = numbers[i + 1]

        sequential = following == number + 1 ||
                     preceding == number - 1 ||
                     (number.zero? && preceding == 9) ||
                     (number == 9 && following == 0)
        next unless sequential

        substitute_found_list_items(regex2, number, strip, replacement)
      end
    end

    # Replaces every match of +regex+ that denotes the number +a+ with
    # "<a><replacement>"; all other matches are left as they are.
    #
    # @param regex [Regexp]
    # @param a [Integer] the list number to mark
    # @param strip [Boolean] when true the match is stripped and its final
    #   character (the period) removed before it is compared with +a+
    # @param replacement [String]
    def substitute_found_list_items(regex, a, strip, replacement)
      wanted = a.to_s
      @text.gsub!(regex) do |match|
        candidate = strip ? match.strip.chop : match
        candidate == wanted ? "#{wanted}#{replacement}" : match
      end
    end

    def add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: false)
      iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PERIODS, roman_numeral: roman_numeral)
    end

    def add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: false)
      iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PARENS,
                             parens: true,
                             roman_numeral: roman_numeral)
    end

    # Rewrites every "<a>." item as "\r<a>∯".
    #
    # @param a [String] lowercase list letter
    def replace_alphabet_list(a)
      @text.gsub!(ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX) do |match|
        a == match.chomp('.') ? "\r#{a}∯" : match
      end
    end

    # Puts "\r" in front of every "<a>)" item. An opening parenthesis, when
    # present, is replaced by "&✂&". The original letter case is kept.
    #
    # @param a [String] lowercase list letter(s)
    def replace_alphabet_list_parens(a)
      @text.gsub!(EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX) do |match|
        letters = match.delete('(')
        # Compare on a copy so the text that is written back is untouched.
        next match unless a == Unicode::downcase(letters.dup)

        match.include?('(') ? "\r&✂&#{letters}" : "\r#{letters}"
      end
    end

    # Dispatches to the period or the parenthesis flavour of the rewrite.
    def replace_correct_alphabet_list(a, parens)
      if parens
        replace_alphabet_list_parens(a)
      else
        replace_alphabet_list(a)
      end
    end

    # Rewrites the final marker +a+ when the marker before it is its direct
    # neighbour in +alphabet+.
    #
    # @param a [String] the marker at position +i+
    # @param i [Integer] position of +a+ in +list_array+
    # @param alphabet [Array<String>] ordered sequence of valid markers
    # @param list_array [Array<String>] all markers found in the text
    # @param parens [Boolean] whether the list uses parentheses
    def last_array_item_replacement(a, i, alphabet, list_array, parens)
      # A lone marker has nothing to be adjacent to.
      return if i.zero?

      current = alphabet.index(a)
      preceding = alphabet.index(list_array[i - 1])
      return if current.nil? || preceding.nil?
      return unless (preceding - current).abs == 1

      replace_correct_alphabet_list(a, parens)
    end

    # Rewrites a non-final marker +a+ when the next marker is its successor
    # in +alphabet+ or the previous marker is its direct neighbour.
    #
    # Parameters are the same as for #last_array_item_replacement.
    def other_items_replacement(a, i, alphabet, list_array, parens)
      current = alphabet.index(a)
      following = alphabet.index(list_array[i + 1])
      return if current.nil? || following.nil?

      followed_by_successor = following - current == 1
      if i.zero?
        # The first marker has no predecessor. Reading index -1 would wrap
        # around to the last marker of the text, so only look forwards.
        return unless followed_by_successor
      else
        preceding = alphabet.index(list_array[i - 1])
        return if preceding.nil?
        return unless followed_by_successor || (preceding - current).abs == 1
      end

      replace_correct_alphabet_list(a, parens)
    end

    # Collects the markers matched by +regex+, keeps those that belong to
    # the chosen alphabet and rewrites each one that is part of a sequence.
    #
    # @param regex [Regexp] finds candidate markers
    # @param parens [Boolean] whether the list uses parentheses
    # @param roman_numeral [Boolean] use ROMAN_NUMERALS instead of
    #   LATIN_NUMERALS
    def iterate_alphabet_array(regex, parens: false, roman_numeral: false)
      alphabet = roman_numeral ? ROMAN_NUMERALS : LATIN_NUMERALS
      list_array = @text.scan(regex)
                        .map { |s| Unicode::downcase(s) }
                        .select { |item| alphabet.include?(item) }
      last_index = list_array.length - 1

      list_array.each_with_index do |a, i|
        if i == last_index
          last_array_item_replacement(a, i, alphabet, list_array, parens)
        else
          other_items_replacement(a, i, alphabet, list_array, parens)
        end
      end
    end
  end
end