lib/pragmatic_segmenter/list.rb
# -*- encoding : utf-8 -*-
# frozen_string_literal: true
module PragmaticSegmenter
# This class searches for lists (numbered, lettered or Roman-numeral) within a
# string and inserts a carriage return ("\r") before each list item.
class List
# Lowercase Roman numerals i..xx in ascending order. List detection compares
# positions in this array (Array#index) to decide whether two markers are
# adjacent, so each numeral must appear exactly once and in sequence.
ROMAN_NUMERALS = %w(i ii iii iv v vi vii viii ix x xi xii xiii xiv xv xvi xvii xviii xix xx).freeze
# Lowercase Latin letters a..z, used as the alphabet for lettered lists.
LATIN_NUMERALS = ('a'..'z').to_a.freeze
# Rubular: http://rubular.com/r/XcpaJKH0sz
# Matches a single lowercase letter that sits at the start of a line/string or
# right after whitespace and is immediately followed by a period. The period
# is only looked at (lookahead), it is not part of the match.
ALPHABETICAL_LIST_WITH_PERIODS =
/(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)/
# Rubular: http://rubular.com/r/Gu5rQapywf
# Matches one or more letters (case-insensitive) that are followed by ")" and
# preceded by "(", a line/string start, or whitespace. Neither parenthesis is
# part of the match.
ALPHABETICAL_LIST_WITH_PARENS =
/(?<=\()[a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))/i
# NOTE(review): Rule is defined outside this file; the comments below assume
# Rule.new(pattern, replacement) describes a pattern -> replacement
# substitution that Rule.apply performs on the text -- confirm against Rule.

# Pattern: the temporary list-period marker (♨). Replacement: ∯.
SubstituteListPeriodRule = Rule.new(/♨/, '∯')
# Pattern: the temporary numbered-list paren marker (☝). Replacement: empty string.
ListMarkerRule = Rule.new(/☝/, '')
# Rubular: http://rubular.com/r/Wv4qLdoPx7
# Pattern: one whitespace character that follows two non-space characters (or a
# line start) and precedes a single non-space character, optional whitespace,
# and a 1-2 digit number carrying the ♨ marker. Replacement: "\r".
SpaceBetweenListItemsFirstRule = Rule.new(/(?<=\S\S|^)\s(?=\S\s*\d{1,2}♨)/, "\r")
# Rubular: http://rubular.com/r/AizHXC6HxK
# Pattern: as above, but the whitespace is directly followed by the 1-2 digit
# number carrying the ♨ marker. Replacement: "\r".
SpaceBetweenListItemsSecondRule = Rule.new(/(?<=\S\S|^)\s(?=\d{1,2}♨)/, "\r")
# Rubular: http://rubular.com/r/GE5q6yID2j
# Pattern: whitespace directly followed by a 1-2 digit number carrying the ☝
# marker (numbered lists written with parentheses). Replacement: "\r".
SpaceBetweenListItemsThirdRule = Rule.new(/(?<=\S\S|^)\s(?=\d{1,2}☝)/, "\r")
# Finds the NUMBER of a candidate list item written as "1. " or "1.)",
# optionally prefixed by "-" or "⁃". The first four alternatives include the
# leading whitespace in the match (callers convert with to_i, which ignores
# it); the period is never part of the match.
# The ninth alternative used to read (?<=s\-), i.e. a literal "s" before the
# dash; it now requires whitespace like the parallel alternative in
# NUMBERED_LIST_REGEX_2.
NUMBERED_LIST_REGEX_1 =
  /\s\d{1,2}(?=\.\s)|^\d{1,2}(?=\.\s)|\s\d{1,2}(?=\.\))|^\d{1,2}(?=\.\))|(?<=\s\-)\d{1,2}(?=\.\s)|(?<=^\-)\d{1,2}(?=\.\s)|(?<=\s\⁃)\d{1,2}(?=\.\s)|(?<=^\⁃)\d{1,2}(?=\.\s)|(?<=\s\-)\d{1,2}(?=\.\))|(?<=^\-)\d{1,2}(?=\.\))|(?<=\s\⁃)\d{1,2}(?=\.\))|(?<=^\⁃)\d{1,2}(?=\.\))/
# Same positions as NUMBERED_LIST_REGEX_1, but the match is the number TOGETHER
# with its trailing period (and without any leading whitespace), so that the
# period can be swapped for a marker.
NUMBERED_LIST_REGEX_2 =
  /(?<=\s)\d{1,2}\.(?=\s)|^\d{1,2}\.(?=\s)|(?<=\s)\d{1,2}\.(?=\))|^\d{1,2}\.(?=\))|(?<=\s\-)\d{1,2}\.(?=\s)|(?<=^\-)\d{1,2}\.(?=\s)|(?<=\s\⁃)\d{1,2}\.(?=\s)|(?<=^\⁃)\d{1,2}\.(?=\s)|(?<=\s\-)\d{1,2}\.(?=\))|(?<=^\-)\d{1,2}\.(?=\))|(?<=\s\⁃)\d{1,2}\.(?=\))|(?<=^\⁃)\d{1,2}\.(?=\))/
# A 1-2 digit number that is followed by ")" and whitespace, e.g. the "1" in "1) ".
NUMBERED_LIST_PARENS_REGEX = /\d{1,2}(?=\)\s)/
# Rubular: http://rubular.com/r/NsNFSqrNvJ
# Like ALPHABETICAL_LIST_WITH_PARENS, except that an opening "(" IS part of the
# match (first alternative), so the substitution can tell "(a)" from "a)".
# The closing ")" is only looked at, never consumed.
EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX =
/\([a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))/i
# Rubular: http://rubular.com/r/wMpnVedEIb
# A single letter (case-insensitive) plus its trailing period, at the start of
# a line/string or right after whitespace.
ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX =
/(?<=^)[a-z]\.|(?<=\A)[a-z]\.|(?<=\s)[a-z]\./i
# Rubular: http://rubular.com/r/GcnmQt4a3I
# A lowercase Roman numeral wrapped in parentheses that is followed by
# whitespace and an uppercase letter. Capture group 1 is the numeral without
# the parentheses; the lookahead (?=[mdclxvi]) rejects empty parentheses.
ROMAN_NUMERALS_IN_PARENTHESES =
/\(((?=[mdclxvi])m*(c[md]|d?c*)(x[cl]|l?x*)(i[xv]|v?i*))\)(?=\s[A-Z])/
# The working copy of the input; the formatting methods of this class mutate
# it in place (gsub!).
attr_reader :text
# text: the string to search for lists. It is duplicated, so the in-place
# substitutions performed by this class never touch the caller's object.
def initialize(text:)
@text = text.dup
end
# Runs every list-formatting pass over the text in a fixed order: lettered
# lists, Roman-numeral lists, numbered lists with periods, and finally
# numbered lists with parentheses. Returns the value of the last pass.
def add_line_break
  passes = %i[
    format_alphabetical_lists
    format_roman_numeral_lists
    format_numbered_list_with_periods
    format_numbered_list_with_parens
  ]
  # send is used because the passes are private helpers.
  passes.map { |pass| send(pass) }.last
end
# Rewrites every match of ROMAN_NUMERALS_IN_PARENTHESES in place, replacing
# "(" with "&✂&" and ")" with "&⌬&" around the captured numeral.
# Returns the (mutated) text, whether or not anything matched.
def replace_parens
  text.tap do |content|
    content.gsub!(ROMAN_NUMERALS_IN_PARENTHESES, '&✂&\1&⌬&'.freeze)
  end
end
private
# Handles numbered lists written as "1) ... 2) ...": tags the list numbers
# with the ☝ marker, inserts breaks between items, then applies
# ListMarkerRule to the text. Returns the result of Rule.apply.
def format_numbered_list_with_parens
  replace_parens_in_numbered_list
  add_line_breaks_for_numbered_list_with_parens
  cleanup_rule = ListMarkerRule
  Rule.apply(@text, cleanup_rule)
end
# Handles numbered lists written as "1. ... 2. ...": swaps the period of each
# list number for the ♨ marker, inserts breaks between items, then applies
# SubstituteListPeriodRule to the text. Returns the result of Rule.apply.
def format_numbered_list_with_periods
  replace_periods_in_numbered_list
  add_line_breaks_for_numbered_list_with_periods
  period_rule = SubstituteListPeriodRule
  Rule.apply(@text, period_rule)
end
# Marks lettered list items (a. b. c. / a) b) c)) using the Latin alphabet:
# first the period form, then the parenthesis form.
def format_alphabetical_lists
  latin = { roman_numeral: false }
  add_line_breaks_for_alphabetical_list_with_periods(**latin)
  add_line_breaks_for_alphabetical_list_with_parens(**latin)
end
# Marks list items numbered with Roman numerals (i. ii. / i) ii)): first the
# period form, then the parenthesis form.
def format_roman_numeral_lists
  roman = { roman_numeral: true }
  add_line_breaks_for_alphabetical_list_with_periods(**roman)
  add_line_breaks_for_alphabetical_list_with_parens(**roman)
end
# Replaces the period after each detected list number with the ♨ marker.
# NUMBERED_LIST_REGEX_1 collects the numbers; NUMBERED_LIST_REGEX_2 matches
# "number + period" for the substitution, hence strip: true.
def replace_periods_in_numbered_list
  number_finder = NUMBERED_LIST_REGEX_1
  number_with_period = NUMBERED_LIST_REGEX_2
  scan_lists(number_finder, number_with_period, '♨', strip: true)
end
# Applies the two "space between list items" rules for ♨-marked numbers.
# Nothing is done (nil is returned) when
#   * the text holds no ♨ marker,
#   * two markers are already separated by a "\n" or "\r", or
#   * a marker appears in a phrase like "for 5♨ x" (number after "for",
#     followed by a lowercase letter).
def add_line_breaks_for_numbered_list_with_periods
  return unless @text.include?('♨')
  return if @text =~ /♨.+\n.+♨|♨.+\r.+♨/
  return if @text =~ /for\s\d{1,2}♨\s[a-z]/

  Rule.apply(@text, SpaceBetweenListItemsFirstRule, SpaceBetweenListItemsSecondRule)
end
# Tags each detected list number of the form "1) " with the ☝ marker.
# scan_lists is deliberately run twice with the same arguments. The second
# run is not a no-op: numbers tagged in the first run no longer match the
# regex, so the remaining numbers are re-evaluated against each other.
# NOTE(review): whether that second pass is intended is not visible here.
def replace_parens_in_numbered_list
  scan_args = [NUMBERED_LIST_PARENS_REGEX, NUMBERED_LIST_PARENS_REGEX, '☝']
  scan_lists(*scan_args)
  scan_lists(*scan_args)
end
# Applies SpaceBetweenListItemsThirdRule for ☝-marked numbers. Nothing is
# done (nil is returned) when the text has no ☝ marker or when two markers
# are already separated by a "\n" or "\r".
def add_line_breaks_for_numbered_list_with_parens
  return unless @text.include?('☝')
  return if @text =~ /☝.+\n.+☝|☝.+\r.+☝/

  Rule.apply(@text, SpaceBetweenListItemsThirdRule)
end
# Finds numbers that look like consecutive list items and tags them.
#
# regex1      - collects candidate numbers from the text (converted with to_i).
# regex2      - passed on to substitute_found_list_items to locate the text to rewrite.
# replacement - marker appended to each accepted number.
# strip:      - forwarded to substitute_found_list_items.
#
# A number is accepted when its successor in the text is number + 1, its
# predecessor is number - 1, or it takes part in a 9 -> 0 roll-over.
# Returns the array of scanned numbers.
def scan_lists(regex1, regex2, replacement, strip: false)
  list_array = @text.scan(regex1).map(&:to_i)
  list_array.each_with_index do |a, i|
    # The first number has no predecessor. Without this guard list_array[-1]
    # would wrap around to the LAST number in the text and could wrongly
    # mark the first one as a list item.
    preceding = i.zero? ? nil : list_array[i - 1]
    following = list_array[i + 1]
    next unless (a + 1).eql?(following) ||
                (a - 1).eql?(preceding) ||
                (a.eql?(0) && preceding.eql?(9)) ||
                (a.eql?(9) && following.eql?(0))
    substitute_found_list_items(regex2, a, strip, replacement)
  end
end
# Rewrites, in place, every match of regex whose number equals a, replacing
# the whole match with "<a><replacement>". All other matches are put back
# unchanged.
#
# strip - when true the match is stripped and its last character dropped
#         before the comparison (for regexes whose match includes the
#         trailing period, e.g. "1.").
# Returns what String#gsub! returns: the text, or nil when regex matched nothing.
def substitute_found_list_items(regex, a, strip, replacement)
  number = a.to_s
  @text.gsub!(regex) do |match|
    candidate = strip ? match.strip.chop : match
    if number.eql?(candidate)
      Regexp.escape(number) + replacement
    else
      match
    end
  end
end
# Processes list markers of the form "a." (letter followed by a period).
# roman_numeral selects the Roman-numeral alphabet instead of a..z.
def add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: false)
  marker_regex = ALPHABETICAL_LIST_WITH_PERIODS
  iterate_alphabet_array(marker_regex, roman_numeral: roman_numeral)
end
# Processes list markers of the form "a)" or "(a)" (letters followed by a
# closing parenthesis). roman_numeral selects the Roman-numeral alphabet
# instead of a..z.
def add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: false)
  options = { parens: true, roman_numeral: roman_numeral }
  iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PARENS, **options)
end
# For every "letter + period" marker in the text whose letter equals a
# (case-sensitive), replaces the marker in place with "\r<a>∯": a break
# before the item, and ∯ standing in for the period. Other markers are left
# as they are.
def replace_alphabet_list(a)
  @text.gsub!(ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX) do |marker|
    next marker unless a.eql?(marker.chomp('.'))

    "\r#{Regexp.escape(a.to_s)}∯"
  end
end
# For every parenthesised list marker in the text whose letters, downcased,
# equal a, inserts a break before it in place:
#   "(x" becomes "\r&✂&x"  (the opening parenthesis is swapped for &✂&)
#   "x"  becomes "\rx"
# The original casing of the letters is kept; the closing ")" is not part of
# the match and stays where it is. Non-matching markers are left unchanged.
def replace_alphabet_list_parens(a)
  @text.gsub!(EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX) do |marker|
    if marker.include?('(')
      next marker unless a.eql?(Unicode::downcase(marker.dup).delete('('))

      "\r&✂&#{Regexp.escape(marker.delete('('))}"
    else
      next marker unless a.eql?(Unicode::downcase(marker.dup))

      "\r#{Regexp.escape(marker)}"
    end
  end
end
# Dispatches to the substitution matching the marker style: the parenthesis
# variant when parens is truthy, the period variant otherwise.
def replace_correct_alphabet_list(a, parens)
  return replace_alphabet_list_parens(a) if parens

  replace_alphabet_list(a)
end
# Decides whether the LAST marker found in the text is a list item and, if so,
# performs the substitution for it.
#
# a          - the marker (already downcased by the caller)
# i          - its index in list_array
# alphabet   - ordered array of valid markers
# list_array - all markers found in the text
# parens     - marker style, forwarded to replace_correct_alphabet_list
#
# The marker is accepted only when it and its predecessor are both in the
# alphabet and sit exactly one position apart (in either direction).
# Returns nil when the marker is rejected.
def last_array_item_replacement(a, i, alphabet, list_array, parens)
  previous = list_array[i - 1]
  return if (alphabet & list_array).empty?
  return unless alphabet.include?(previous) && alphabet.include?(a)

  gap = (alphabet.index(previous) - alphabet.index(a)).abs
  replace_correct_alphabet_list(a, parens) if gap == 1
end
# Decides whether a marker that is NOT the last one found in the text is a
# list item and, if so, performs the substitution for it.
#
# a          - the marker (already downcased by the caller)
# i          - its index in list_array
# alphabet   - ordered array of valid markers
# list_array - all markers found in the text
# parens     - marker style, forwarded to replace_correct_alphabet_list
#
# The marker is accepted when it and its neighbours are in the alphabet and
# either the next marker is its direct successor or the previous marker is one
# position away (in either direction). Returns nil when the marker is rejected.
def other_items_replacement(a, i, alphabet, list_array, parens)
  # The first marker has no predecessor. Without this guard list_array[-1]
  # would wrap around to the LAST marker in the text and could wrongly
  # accept the first one.
  has_previous = i > 0
  previous = has_previous ? list_array[i - 1] : nil
  following = list_array[i + 1]

  return if (alphabet & list_array).empty?
  return unless alphabet.include?(a) && alphabet.include?(following)
  return if has_previous && !alphabet.include?(previous)

  position = alphabet.index(a)
  precedes_next = alphabet.index(following) - position == 1
  beside_previous = has_previous && (alphabet.index(previous) - position).abs == 1
  return unless precedes_next || beside_previous

  replace_correct_alphabet_list(a, parens)
end
# Collects the list markers matched by regex, downcases them, keeps those that
# occur inside at least one entry of the chosen alphabet (substring test), and
# then lets the per-item helpers decide which of them get rewritten.
#
# regex          - marker regex (without capture groups)
# parens:        - marker style, forwarded to the helpers
# roman_numeral: - use ROMAN_NUMERALS instead of LATIN_NUMERALS
#
# Returns the filtered array of markers.
def iterate_alphabet_array(regex, parens: false, roman_numeral: false)
  alphabet = roman_numeral ? ROMAN_NUMERALS : LATIN_NUMERALS
  candidates = @text.scan(regex).map { |found| Unicode::downcase(found) }
  list_array = candidates.select do |item|
    alphabet.any? { |entry| entry.include?(item) }
  end

  final_index = list_array.length - 1
  list_array.each_with_index do |a, i|
    if i.eql?(final_index)
      last_array_item_replacement(a, i, alphabet, list_array, parens)
    else
      other_items_replacement(a, i, alphabet, list_array, parens)
    end
  end
end
end
end