# lib/truty/general.rb
module Truty
# Module with general typography fixes for all the languages. The fixes in here should not be language specific.
# @author Matěj Kašpar Jirásek
module General
# Improves the typography of a large plain text made of paragraphs. The text is split
# on newlines and every paragraph is handed to the Truty method named by +lang+.
#
# @param input [String] The text which will be converted.
# @param lang [Symbol] Sets the language (English name like +:czech+, +:german+, etc.). Falls back to +:general+ when Truty does not respond to it.
# @param convert [Array<Symbol>] Features that should be improved; passed unchanged to the language method.
# @return [String] Text with improved typography.
def fix(input, lang = :general, convert = [:all])
  # NOTE(review): any public method Truty responds to is accepted here, not only
  # language names; consider checking against an explicit list of languages.
  lang = :general unless Truty.respond_to?(lang)
  # A negative limit keeps trailing empty strings, so blank lines at the end of
  # the text survive the split/join round trip instead of being dropped.
  paragraphs = input.split("\n", -1)
  paragraphs.map { |paragraph| Truty.public_send(lang, paragraph, convert) }.join("\n")
end
# Improves basic non-language specific issues in typography by running the
# individual fixes of this module in a fixed order.
#
# @param input [String] The paragraph which will be converted.
# @param convert [Array<Symbol>] Features that should be improved. Handled here: +all+, +ellipsis+, +characters+, +brackets+, +dashes+, +abbreviations+, +multiplication+, +numbers+, +units+, +widows+. Other symbols are ignored by this method.
# @return [String] Paragraph with improved typography.
def general(input, convert = [:all])
  # Feature symbol => fixing method, in the order the fixes are applied.
  # +dashes+ appears twice on purpose: it enables both the emdash and the endash fix.
  steps = [
    [:ellipsis, :ellipsis],
    [:characters, :multicharacters],
    [:brackets, :brackets_whitespace],
    [:dashes, :emdash],
    [:dashes, :endash],
    [:abbreviations, :name_abbreviations],
    [:multiplication, :multiplication_sign],
    [:numbers, :space_between_numbers],
    [:units, :units],
    [:widows, :widows]
  ]
  everything = convert.include?(:all)
  steps.reduce(input) do |output, (feature, fixer)|
    if everything || convert.include?(feature)
      send(fixer, output)
    else
      output
    end
  end
end
# Replaces every run of three or more periods with a single ellipsis character.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with ellipses.
def ellipsis(input)
  dots = /\.{3,}/
  input.gsub(dots) { "…" }
end
# Surrounds an emdash with thin spaces. Two or three hyphens standing between
# whitespace are converted to an emdash as well.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with corrected emdashes.
def emdash(input)
  # U+2009 THIN SPACE is written as an escape so the character stays visible in
  # the source and cannot be silently normalised to an ordinary space.
  input.gsub(/\s+(—|-{2,3})\s+/, "\u2009—\u2009")
end
# Puts a non-breaking space before an endash and an ordinary space after it.
# A single hyphen standing between whitespace is converted to an endash too.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with corrected endashes.
def endash(input)
  # U+00A0 NO-BREAK SPACE is written as an escape so it stays visible in the source.
  input.gsub(/\s+(–|-)\s+/, "\u00A0– ")
end
# Adds soft hyphens to the input. Words shorter than six characters, the last
# word, URIs and e-mail addresses are left untouched; runs of spaces between
# words are collapsed to a single space.
#
# @param input [String] The paragraph which will be converted.
# @param lang [String] Sets the language of hyphenation. One of the languages a {http://www.rubydoc.info/gems/text-hyphen/ text-hyphen gem} can use.
# @param left [Integer] Number of characters at the beginning of a word which cannot be hyphenated.
# @param right [Integer] Number of characters at the end of a word which cannot be hyphenated.
# @param char [String] The character which will be added to hyphenation places (U+00AD SOFT HYPHEN by default).
# @return [String] Paragraph with added hyphenation characters.
def soft_hyphens(input, lang = "en_us", left = 2, right = 2, char = "\u00AD")
  hyphenator = Text::Hyphen.new(:language => lang, :left => left, :right => right)
  # URI::REGEXP is a module, not a Regexp, so it can never be matched against;
  # ask the parser for a real URI pattern instead.
  uri = URI::DEFAULT_PARSER.make_regexp
  email = /\A[\w+\-.]+@[a-z\d\-]+(\.[a-z]+)*\.[a-z]+\z/i
  words = input.split(/ +/)
  last = words.size - 1
  hyphenated = words.each_with_index.map do |word, index|
    skip = word.length < 6 || index == last || word =~ uri || word =~ email
    skip ? word : hyphenator.visualise(word, char)
  end
  hyphenated.join(" ")
end
# Converts quotes to the typographic ones and trims whitespace just inside them.
#
# @param input [String] The paragraph which will be converted.
# @param type [String] Characters which are treated as quotes and substituted by the correct ones.
# @param start_quotes [String] The character used for starting quotes.
# @param end_quotes [String] The character used for ending quotes.
# @return [String] Paragraph with correct double quotes.
def quotes(input, type = '"“”„', start_quotes = "“", end_quotes = "”")
  # Escape the characters so that "]", "^", "-" or "\" in +type+ cannot break
  # or alter the character class.
  marks = Regexp.escape(type)
  quoted = Regexp.new("[#{marks}][^#{marks}]*[#{marks}]")
  input.gsub(quoted) do |match|
    # \A and \z trim only the edges of the quoted text (not every line in it);
    # \u00A0 covers non-breaking spaces, which \s does not match.
    inner = match[1..-2].gsub(/\A[\s\u00A0]+|[\s\u00A0]+\z/, "")
    "#{start_quotes}#{inner}#{end_quotes}"
  end
end
# Replaces the letter X used for multiplication with the multiplication sign:
# between two numbers ("3x4", "3 x 4") and directly after a number ("4x").
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with correct multiplication signs.
def multiplication_sign(input)
  # The second operand is only looked at, not consumed, so it can serve as the
  # first operand of the next multiplication ("3 x 4 x 5").
  # NOTE(review): the separators around the sign are ordinary spaces here;
  # confirm whether non-breaking spaces are wanted.
  input
    .gsub(/(\d+)\s?[Xx]\s?(?=\d)/, '\1 × ')
    .gsub(/(\d+)[Xx]/, '\1×')
end
# Replaces whitespace between two digits with a thin non-breaking space.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with correct spaces between numbers.
def space_between_numbers(input)
  # The following digit is matched by a lookahead so it is not consumed and can
  # start the next match ("1 2 3"). U+202F is NARROW NO-BREAK SPACE, written as
  # an escape to keep it visible in the source.
  input.gsub(/(\d)\s+(?=\d)/, "\\1\u202F")
end
# Normalises whitespace around round, square and curly brackets: none directly
# inside the brackets, and a single space where whitespace stood outside them.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with correct spaces around brackets.
def brackets_whitespace(input)
  rules = [
    [/([\(\[\{])\s*/, '\1'],      # drop whitespace after an opening bracket
    [/\s*([\]\)\}])/, '\1'],      # drop whitespace before a closing bracket
    [/\s+([\(\[\{])\s*/, ' \1'],  # squeeze whitespace before an opening bracket
    [/\s*([\]\)\}])\s+/, '\1 ']   # squeeze whitespace after a closing bracket
  ]
  rules.reduce(input) { |text, (pattern, replacement)| text.gsub(pattern, replacement) }
end
# Substitutes character sequences which should be a single character, like
# "(c)" for "©", "(tm)" for "™", "+-" for "±", "No. 5" for "№5" or "°C" for "℃".
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with converted characters.
def multicharacters(input)
  input
    .gsub(/\(c\)/i, "©")
    .gsub(/\(p\)/i, "℗")
    .gsub(/\(r\)/i, "®")
    .gsub(/\(sm\)/i, "℠")
    .gsub(/\(tm\)/i, "™")
    .gsub(/\+-/, "±")
    .gsub(/-\+/, "∓")
    # \b keeps words that merely end in "no" ("PIANO 5") from being rewritten.
    .gsub(/\bN[oO]\.?\s*(\d+)/, '№\1')
    .gsub(/°C/, '℃')
    .gsub(/°F/, '℉')
end
# Fixes spaces around punctuation: whitespace before a run of punctuation marks
# is removed and exactly one space follows it, unless the run ends the input.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with correct spaces around punctuation.
def punctuation_whitespace(input)
  input.gsub(/\s*([!?.,;:…]+)\s*/) do
    match = Regexp.last_match
    # No separator after the final punctuation, otherwise every paragraph
    # ending in "." would gain a trailing space.
    match.post_match.empty? ? match[1] : "#{match[1]} "
  end
end
# Puts a non-breaking space between a number and the unit following it (mainly
# SI units with optional prefixes, plus percent, degree and currency signs) and
# between a leading mark (*, §, #, †) and the number following it.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with correct spaces between number and unit.
def units(input)
  # U+00A0 NO-BREAK SPACE, written as an escape to keep it visible in the source.
  nbsp = "\u00A0"
  output = input.gsub(/(\d+)\s+(%|‰|‱|℃|℉|°|€|Kč|(Y|Z|E|P|T|G|M|k|h|da|d|m|µ|n|p|f|a|z|y)?(m(²|³)?|g|s|h|A|K|cd|mol|Ω|℃|℉))/, "\\1#{nbsp}\\2")
  output.gsub(/(\*|§|#|†)\s+(\d+)/, "\\1#{nbsp}\\2")
end
# Replaces the whitespace character before the last word of the paragraph (of
# every line, when the input has several) with a non-breaking space.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with removed widows.
def widows(input)
  # "$" is the end-of-line anchor; the escaped "\$" used before matched a literal
  # dollar sign, so "5$" in the middle of a sentence was treated as the last word.
  # [^\S\n] is any whitespace except a newline, so line breaks are never replaced.
  input.gsub(/[^\S\n](\S+)$/, "\u00A0\\1")
end
# Strips the whitespace which precedes the end of a line or the end of the input.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph without trailing spaces.
def trailing_spaces(input)
  trailing = /\s*($|\z)/
  input.gsub(trailing) { '' }
end
# Adds a non-breaking space after one-character name abbreviations ("A.", "J.",
# "Š.", etc.) which stand at the start of a line or after whitespace.
#
# @param input [String] The paragraph which will be converted.
# @return [String] Paragraph with non-breaking spaces after name abbreviations.
def name_abbreviations(input)
  # \p{Lu} matches any uppercase letter, so initials outside A-Z are covered too.
  input.gsub(/(\s|^)((?:\p{Lu}\.\s+)+)/) do
    # Keep the match data: the nested gsub below overwrites the last match.
    match = Regexp.last_match
    # U+00A0 NO-BREAK SPACE, written as an escape to keep it visible in the source.
    match[1] + match[2].gsub(/ +/, "\u00A0")
  end
end
end
end