joegattnet/joegattnet_v3

View on GitHub
app/helpers/formatting_helper.rb

Summary

Maintainability
C
7 hrs
Test Coverage
# encoding: utf-8

# REVIEW: All these functions should be moved to Nokogiri

module FormattingHelper
  # Full rendering pipeline for a note body: resolves {link|blurb|text}
  # references to other notes/citations, sanitizes the stored HTML, wraps
  # headers/sections/paragraphs, and either extracts [annotations] into
  # numbered endnotes or strips them. Returns HTML-safe markup ('' for blank).
  # NOTE(review): links_citation_style is accepted but never used here.
  def bodify(text, books = [], books_citation_style = 'citation.book.inline_annotated_html', links_citation_style = 'citation.link.inline_annotated_html', annotated = true)
    return '' if text.blank?
    # REVIEW: Add settings condition
    text = related_notify(text)
    text = related_citationify(text)
    text = sanitize_from_db(text)
    text = clean_whitespace(text)
    text = bookify(text, books, books_citation_style) if NB.books_section == 'true'
    text = relativize(text)
    text = headerize(text)
    text = sectionize(text)
    text = paragraphize(text)
    text = annotated ? annotate(text) : remove_annotations(text)
    text = denumber_headers(text)
    clean_up_via_dom(text, false, true) # unwrap_p: false, number_paragraphs: true
  end

  # Builds a bilingual, paragraph-aligned rendering: the source text is run
  # through a reduced pipeline (annotations always removed), the target text
  # gets the full bodify-style pipeline, and the two are interleaved
  # paragraph-by-paragraph via collate.
  # NOTE(review): links_citation_style is accepted but never used here.
  def bodify_collate(source_text, target_text, source_lang, books = [], books_citation_style = 'citation.book.inline_annotated_html', links_citation_style = 'citation.link.inline_annotated_html', annotated = true)
    return '' if source_text.blank? || target_text.blank?
    source_text = sanitize_from_db(source_text)
    source_text = clean_whitespace(source_text)
    source_text = headerize(source_text)
    source_text = sectionize(source_text)
    source_text = paragraphize(source_text)
    source_text = remove_annotations(source_text)
    source_text = denumber_headers(source_text)
    source_text = clean_up_via_dom(source_text)
    # REVIEW: Add settings condition
    target_text = related_notify(target_text)
    target_text = related_citationify(target_text)
    target_text = sanitize_from_db(target_text)
    target_text = clean_whitespace(target_text)
    target_text = bookify(target_text, books, books_citation_style) if NB.books_section == 'true'
    target_text = relativize(target_text)
    target_text = headerize(target_text)
    target_text = sectionize(target_text)
    target_text = paragraphize(target_text)
    target_text = annotated ? annotate(target_text) : remove_annotations(target_text)
    target_text = denumber_headers(target_text)
    target_text = clean_up_via_dom(target_text)

    collate(source_text, target_text, source_lang)
  end

  # Renders a short, heading-free blurb of a note body for listings:
  # expands related-note/citation references, sanitizes, strips headings,
  # and (by default) strips all remaining tags to plain text.
  # NOTE(review): links_citation_style is accepted but never used here, and
  # the strip_tags parameter shadows the Rails strip_tags helper — the
  # parenthesised strip_tags(text) call below still reaches the helper
  # method rather than the boolean local.
  def blurbify(text, books = [], books_citation_style = 'citation.book.inline_unlinked_html', links_citation_style = 'citation.link.inline_unlinked_html', strip_tags = true)
    return '' if text.blank?
    # REVIEW: Add settings condition
    text = related_notify(text, true)
    text = related_citationify(text)
    text = sanitize_from_db(text)
    text = clean_whitespace(text)
    text = deheaderize(text)
    text = bookify(text, books, books_citation_style) if NB.books_section == 'true'
    text = relativize(text)
    text = clean_up_via_dom(text, true)
    text = strip_tags(text) if strip_tags
    text
  end

  # Sanitizes text against the site-configured tag/attribute whitelist.
  # <span> is excluded from the output whitelist even when configured
  # (REVIEW in original: unclear why — spans ARE allowed in simple_blurbify).
  def sanitize_by_settings(text, allowed_tags = NB.allowed_html_tags)
    separator = /, ?| /
    tags = allowed_tags.split(separator) - ['span'] # REVIEW: Why remove <span> here?
    attributes = NB.allowed_html_attributes.split(separator)
    sanitize(text, tags: tags, attributes: attributes)
  end

  # Blurbifies a link title after dropping any trailing "| ..." or "— ..."
  # segment (typically a site-name suffix).
  def simple_blurbify_link(text, allowed_tags = NB.allowed_html_tags)
    without_pipe_suffix = text.gsub(/ *\|.*$/, '')
    without_dash_suffix = without_pipe_suffix.gsub(/ *—.*$/, '')
    simple_blurbify(without_dash_suffix, allowed_tags)
  end

  # Lightweight cleanup for short strings (titles, captions): sanitize with
  # the whitelist plus <span>, normalise whitespace, smarten punctuation, and
  # straighten any smart quotes that ended up inside tags.
  def simple_blurbify(text, allowed_tags = NB.allowed_html_tags)
    return '' if text.blank?
    allowed = allowed_tags.split(/, ?| /) + ['span'] # REVIEW: Why add <span> here? For Feature titles
    cleaned = sanitize(text, { tags: allowed })
    cleaned = clean_whitespace(cleaned)
    cleaned = smartify_punctuation(cleaned)
    # FIXME: Clean up smart quotes inside tags
    cleaned.gsub(/<([^<]*)("|“|\u201C|\u201D)([^>]*)("|“|\u201C|\u201D)([^>]*)>/, "<\\1\"\\3\"\\5>").html_safe
  end

  # Prepares a user comment for display: sanitize allowing only <a>, wrap
  # lines in paragraphs, smarten punctuation, and straighten any smart
  # quotes that ended up inside tags.
  def commentify(text)
    cleaned = sanitize_from_db(text, ['a'])
    cleaned = paragraphize(cleaned)
    cleaned = smartify_punctuation(cleaned)
    # FIXME: Clean up smart quotes inside tags
    cleaned.gsub(/<([^<]*)("|“|\u201C|\u201D)([^>]*)("|“|\u201C|\u201D)([^>]*)>/, "<\\1\"\\3\"\\5>").html_safe
  end

  # REVIEW: Overkill with allowed_tags = NB.allowed_html_tags
  # Converts stored (Evernote-flavoured) HTML into the app's reduced tag set:
  # truncates at the configured cut-off marker, turns <br> into newlines,
  # normalises <b> and all <h*> to <strong> (headerize later promotes
  # stand-alone <strong> lines to <h2>), applies the settings whitelist, and
  # expands {quote:}/{code:} markers while dropping editorial instructions.
  def sanitize_from_db(text, allowed_tags = NB.allowed_html_tags)
    text = sanitize_from_evernote(text)
    text = text.gsub(/#{ NB.truncate_after_regexp }.*\Z/m, '') # Drop everything after the truncation marker
           .gsub(/<br[^>]*?>/, "\n")
           .gsub(/<b>|<h\d>/, '<strong>')
           .gsub(%r{</b>|</h\d>}, '</strong>')
    # OPTIMIZE: Here we need to allow a few more tags than we do on output
    #  e.g. image tags for inline image.
    text = sanitize_by_settings(text, allowed_tags)
    text = format_blockquotes(text)
    text = format_code(text)
    text = remove_instructions(text) # NOTE(review): assignment is redundant — value is the return
  end

  # First-stage import cleanup for Evernote HTML: flattens all newlines,
  # makes links to this host relative, converts Evernote's indented divs
  # into {quote:} markers, then re-introduces newlines around <p>/<div>
  # boundaries so the later line-based regexes can operate. The result is
  # padded with a leading and trailing newline.
  def sanitize_from_evernote(text)
    # Make all local links relative and
    #  Evernote expects all paragraphs to be wrapped in divs
    #  See: http://dev.evernote.com/doc/articles/enml.php#plaintext
    text = text.gsub(/\n|\r/, '')
           .gsub(%r{^http:\/\/[a-z0-9]*\.?#{ NB.host }}, '')
           .gsub(/(<div style="padding\-left: 30px;">)(.*?)(<\/div>)/mi, "<div>{quote:\n\\2\n}</div>")
           .gsub(/(<div style="padding\-left: 30px;">)(.*?)(<\/div>)/i, "<div>{quote:\\2}</div>")
           .gsub(/(<p>|<div)/i, "\n\\1")
           .gsub(/(<\/p>|<\/div>)/i, "\\1\n")
    #.gsub(/(<aside|<blockquote|<br|<div|<fig|<p|<ul|<ol|<li|<nav|<section|<table)/i, "\n\\1")
    #.gsub(/(<\/aside>|<\/blockquote>|<\/br>|<\/div>|<\/figure>|<\/p>|<\/figcaption>|<\/ul>|<\/ol>|<\/li>|<\/nav>|<\/section>|<\/table>)/i, "\\1\n")
    text = "\n#{ text }\n"
  end

  # Expands {quote:...} and {quote:... -- attribution} markers into
  # <blockquote> elements (the attribution is kept in square brackets).
  # The final two brace-less patterns are a fallback for markers that lost
  # their closing brace.
  # NOTE(review): the original also chained /mi variants of the first two
  # patterns with newline-padded replacements; since the patterns contain no
  # '.', the /m flag changed nothing ([^}] already matches newlines) and the
  # identical /i pass had already consumed every match — removed as dead code.
  def format_blockquotes(text)
    text.gsub(/{\s*quote:([^}]*?)\n? ?-- *([^}]*?)\s*}/i, "\n<blockquote>\\1[\\2]</blockquote>\n")
      .gsub(/{\s*quote:([^}]*?)\s*}/i, "\n<blockquote>\\1</blockquote>\n")
      .gsub(/\s*quote:([^}]*?)\n? ?-- *([^\}]*?)\s*/i, "\n<blockquote>\\1[\\2]</blockquote>\n")
      .gsub(/\s*quote:([^}]*?)\s*/i, "\n<blockquote>\\1</blockquote>\n")
  end

  # Expands {code:...} markers into <pre><code> elements; an optional
  # "-- label" suffix is kept in square brackets. The final two brace-less
  # patterns are a fallback for markers that lost their closing brace.
  # NOTE(review): as in format_blockquotes, the original chained /mi
  # variants of the first two patterns; with no '.' in the patterns the /m
  # flag was inert and those passes could never match — removed as dead code.
  def format_code(text)
    text.gsub(/{\s*code:([^}]*?)\n? ?-- *([^}]*?)\s*}/i, "\n<pre><code>\\1[\\2]</code></pre>\n")
      .gsub(/{\s*code:([^}]*?)\s*}/i, "\n<pre><code>\\1</code></pre>\n")
      .gsub(/\s*code:([^}]*?)\n? ?-- *([^\}]*?)\s*/i, "\n<pre><code>\\1[\\2]</code></pre>\n")
      .gsub(/\s*code:([^}]*?)\s*/i, "\n<pre><code>\\1</code></pre>\n")
  end

  # Strips editorial instruction markers ({fork:...} and
  # {cap|alt|description|credit|intro:...}) from the text.
  # NOTE(review): '.*' is greedy, so two markers on one line are removed
  # together with everything between them.
  def remove_instructions(text)
    without_forks = text.gsub(/\{fork:.*\}/i, '')
    without_forks.gsub(/\{(cap|alt|description|credit|intro):.*\}/i, '')
  end

  # Normalises entity and whitespace noise in one ordered pass: decodes a few
  # common entities, collapses space runs, converts CR/CRLF to LF, collapses
  # blank lines, and strips the ends. Order matters (e.g. CRs become LFs
  # before blank-line runs are collapsed).
  def clean_whitespace(text)
    rules = [
      [/\n(<\/)/, '\1'],    # Pull closing tags back onto the previous line
      [/&amp;/, '&'],
      [/&quot;/, '"'],
      [/&nbsp;/, ' '],
      [/ +/m, ' '],         # Collapse runs of spaces
      [/\r\n?/, "\n"],      # Normalise CR / CRLF to LF
      [/\n\n+/, "\n"],      # Collapse blank lines
      [/^ +$/, '']          # Blank out whitespace-only lines
    ]
    rules.reduce(text) { |result, (pattern, replacement)| result.gsub(pattern, replacement) }.strip
  end

  # Replaces each book's inline tag with a rendered citation (via the given
  # I18n citation_style key) and, where a tagged quote sits inside
  # <figure><blockquote>, adds a cite attribute pointing at the book's URL.
  # Mutates text in place via gsub! and returns it.
  def bookify(text, books, citation_style)
    books.each do |book|
      text.gsub!(/(<figure>\s*<blockquote)>(.*?#{ book.tag }.*?<\/figure>)/m, "\\1 cite=\"#{ url_for book }\">\\2")
      text.gsub!(/#{ book.tag }/, t(citation_style, path: book_path(book), title: book.headline, author: book.author_sort, publisher: book.publisher, published_year: book.published_date.year))
    end
    text
  end

  # Strips the absolute scheme+host from anchors pointing at this site,
  # leaving a root-relative href.
  def relativize(text)
    local_link_pattern = /(<a href=")http:\/\/#{ NB.host }([^"]*?"[^>]*?>)/
    text.gsub(local_link_pattern, '\1\2')
  end

  # Expands {link}, {blurb} and {text} markers that reference other notes —
  # matched by path ({link /texts/42}) or by headline — into an inline link,
  # a rendered blurb partial, or the full note body plus a link.
  # Mutates text in place via gsub! and returns it.
  # NOTE(review): the strip_tags parameter shadows the Rails strip_tags
  # helper, but the parenthesised strip_tags(text) call below still reaches
  # the helper method rather than the boolean local.
  def related_notify(text, strip_tags = false)
    # REVIEW: Do this with note titles?
    note_ids = mentioned_notes(text)
    related_notes = Note.related_notes(note_ids)
    related_notes.each do |note|
      body = strip_tags ? sanitize(note.clean_body) : note.body
      text.gsub!(/\{link:? *#{ note_or_feature_path(note) }\}/, link_to(note.headline, note_path(note)))
      text.gsub!(/\{link:? *#{ note.headline }\}/, link_to(note.headline, note_path(note)))
      text.gsub!(/\{blurb:? *#{ note_or_feature_path(note) }\}/, (render 'shared/note_blurb', note: note))
      text.gsub!(/\{blurb:? *#{ note.headline }\}/, (render 'shared/note_blurb', note: note))
      text.gsub!(/\{text:? *#{ note_or_feature_path(note) }\}/, "#{ body }\n[#{ link_to(note.headline, note_path(note)) }]")
      text.gsub!(/\{text:? *#{ note.headline }\}/, "#{ body }\n[#{ link_to(note.headline, note_path(note)) }]")
    end
    # text.gsub!(/\{[^\}]*?\}/, '') # Clean up faulty references
    text = strip_tags(text) if strip_tags
    text
  end

  # Collects the numeric ids (as strings) of notes referenced via
  # {link|blurb|text /texts/<id>} markers, in order of appearance.
  def mentioned_notes(text)
    reference_pattern = /\{ *(link|blurb|text)\:? *\/texts\/([\d]+) *\}/
    text.scan(reference_pattern).flat_map(&:last)
  end

  # Expands {link}, {blurb} and {text} markers that reference citations (by
  # path only) into an inline link or the citation body. Mutates text in
  # place via gsub! and returns it.
  # NOTE(review): as in related_notify, the strip_tags parameter shadows the
  # Rails helper; the parenthesised call below still reaches the helper.
  def related_citationify(text, strip_tags = false)
    citation_ids = mentioned_citations(text)
    related_citations = Note.related_citations(citation_ids)
    related_citations.each do |citation|
      body = strip_tags ? sanitize(citation.clean_body) : citation.body
      text.gsub!(/\{link:? *#{ citation_path(citation) }\}/, link_to(citation.headline, citation_path(citation)))
      text.gsub!(/\{blurb:? *#{ citation_path(citation) }\}/, body)
      text.gsub!(/\{text:? *#{ citation_path(citation) }\}/, "#{ body }\n") # REVIEW: Also link to citation?
    end
    # text.gsub!(/\{[^\}]*?\}/, '') # Clean up faulty references
    text = strip_tags(text) if strip_tags
    text
  end

  # Collects the numeric ids (as strings) of citations referenced via
  # {link|blurb|text /citations/<id>} markers, in order of appearance.
  def mentioned_citations(text)
    reference_pattern = /\{ *(link|blurb|text)\:? *\/citations\/([\d]+) *\}/
    text.scan(reference_pattern).flat_map(&:last)
  end

  # Extracts bracketed [annotations] from the text, replaces each with a
  # numbered anchor link, and renders the annotated-text partial (or the
  # plain text partial when there are none). An annotation is a bracketed
  # span containing a space and not starting with a dot.
  def annotate(text)
    text.gsub!(/(\[[^\]]*)\[([^\]]*)\]([^\[]*\])/, '\1\3') # Remove any nested annotations
    annotations = text.scan(/\[([^\.].*? .*?)\]/)
    if !annotations.empty?
      # gsub! without a replacement returns an enumerator; iterating it with
      # a block substitutes each match with the block's return value, so each
      # annotation span becomes its numbered anchor link.
      text.gsub!(/\s*( *\[)([^\.].*? .*?)(\])/m).each_with_index do |_match, index|
        %(<a href="#annotation-#{ index + 1 }" id="annotation-mark-#{ index + 1 }">#{ index + 1 }</a>)
      end
      render 'notes/annotated_text.html', text: text, annotations: annotations.flatten
    else
      render 'notes/text.html', text: text
    end
  end

  # Strips all [annotation] spans (after flattening any nested ones) and
  # wraps the remainder in the standard <section class="body"> container.
  # NOTE(review): gsub! returns nil when nothing matches; that is fine here
  # because the mutated text variable — not the gsub! return value — is used.
  def remove_annotations(text)
    text.gsub!(/(\[[^\]]*)\[([^\]]*)\]([^\[]*\])/, '\1\3') # Remove any nested annotations
    text.gsub!(/\[([^\.].*? .*?)\]/, '')
    "<section class=\"body\">#{ text.html_safe }</section>"
  end

  # Regex-only cleanup used where the DOM pass is unavailable: removes empty
  # paragraphs, optionally hyper-conforms, collapses double spaces, tightens
  # page-number citations, and treats any remaining straight quotes as
  # opening quotes. Returns an HTML-safe string.
  # NOTE(review): _clean_up_dom is accepted but unused in this method.
  def clean_up(text, _clean_up_dom = true)
    # REVIEW: These operations should not be necessary!
    # .gsub(/(<[^>"]*?)[\u201C|\u201D]([^<"]*?>)/, '\1"\2') # FIXME: This is for links in credits but it should not be necessary
    text.gsub!(/^<p>\s*<\/p>$/m, '') # Removes empty paragraphs # FIXME
    text = hyper_conform(text) if NB.hyper_conform == 'true'
    text = text.gsub(/  +/m, ' ') # FIXME
           .gsub(/ ?\, ?p\./, 'p.') # Clean up page numbers (we don't always want this) # language-dependent
           .gsub(/"/, "\u201C") # Assume any remaining quotes are opening quotes.
           .gsub(/'/, "\u2018") # Same here
           .html_safe
  end

  # DOM-based cleanup: collapses runs of spaces, applies optional
  # hyper-conformance and exponent markup, then parses with Nokogiri,
  # normalises the tree (see clean_up_dom) and returns the <body> children
  # as HTML-safe markup.
  def clean_up_via_dom(text, unwrap_p = false, number_paragraphs = false)
    text = text.gsub(/ +/m, ' ')
    text = hyper_conform(text) if NB.hyper_conform == 'true'
    text = smartify_numbers(text)
    dom = Nokogiri::HTML(text)
    dom = clean_up_dom(dom, unwrap_p, number_paragraphs)
    dom.css('body').children.to_html.html_safe
  end

  # Normalises a parsed Nokogiri document: drops empty elements, flattens
  # invalid nesting, strips leading numbers from <h2> headings, optionally
  # numbers paragraphs, smartens punctuation in every text node, and
  # optionally pretty-prints / unwraps the first paragraph.
  def clean_up_dom(dom, unwrap_p = false, number_paragraphs = false)
    dom.css('a, h2, header, p, section').each { |e| e.remove if e.content.blank? } # Remove empty tags
    dom.css('h2 p, cite cite, p section, p header, p p, p h2, blockquote blockquote').each { |e| e.replace e.inner_html } # Sanitise wrong nesting
    # Denumber headers. Fix: the original matched /(<h2>)\d+\.? */ against
    # Node#content, which contains only the heading's text (never the literal
    # "<h2>" markup), so the substitution could never fire. Strip the leading
    # number itself, mirroring denumber_headers.
    dom.css('h2').each { |h| h.content = h.content.sub(/\A\d+\.? */, '') }

    # Number paragraphs
    dom.css('p').each_with_index { |e, i| e['id'] = "paragraph-#{ i + 1 }" } if number_paragraphs

    dom.xpath('//text()').each do |t|
      t.content = smartify_punctuation(t.content)
      # t.content = t.content.strip ... we only want to strip from the beginning of files
      # t.content = hyper_conform(t.content)
    end
    dom = indent_dom(dom) if NB.html_pretty_body == 'true'
    unwrap_from_paragraph_tag(dom) if unwrap_p
    dom
  end

  # Interleaves source- and target-language paragraphs for side-by-side
  # display: the i-th paragraph pair is wrapped in <div id="paragraph-N">,
  # with source paragraphs carrying lang/dir attributes. Returns HTML-safe
  # markup of the cleaned target document body.
  def collate(source_text, target_text, source_lang)
    source_dom = Nokogiri::HTML(source_text)
    source_paragraphs = source_dom.css('p')

    target_dom = Nokogiri::HTML(target_text)
    target_paragraphs = target_dom.css('p')

    annotations = target_dom.css('.annotations') # NOTE(review): collected but never used

    source_paragraphs.each do |p|
      # REVIEW: We can also add 'notranslate' here rather than as a metatag
      #  https://support.google.com/translate/?hl=en-GB#2641276
      p['class'] = 'source'
      p['lang'] = lang_attr(source_lang)
      p['dir'] = dir_attr(source_lang) unless dir_attr(source_lang).blank?
    end

    target_paragraphs.each_with_index do |p, i|
      p['class'] = 'target'
      source_paragraph_html = source_paragraphs[i].nil? ? '<!-- -->' : source_paragraphs[i].to_html
      target_paragraph_html = target_paragraphs[i].nil? ? '<!-- -->' : target_paragraphs[i].to_html
      # Fix: the id value was emitted unquoted — in a double-quoted string
      # "\p" and "\>" are just "p" and ">", so the original produced
      # <div id=paragraph-N>. Quote the attribute properly.
      p.replace "<div id=\"paragraph-#{ i + 1 }\">#{ source_paragraph_html }#{ target_paragraph_html }</div>"
    end

    dom = clean_up_dom(target_dom)
    dom.css('body').children.to_html.html_safe
  end


  # Pretty-prints the document by transforming it with the vendored Tidy
  # XSL stylesheet. Fix: read the stylesheet with File.read instead of
  # File.open — the original opened the file without a block or close,
  # leaking a file handle on every call.
  def indent_dom(dom)
    tidy = Nokogiri::XSLT(File.read('vendor/tidy.xsl'))
    tidy.transform(dom)
  end

  # Replaces the first <p> in the body with its inner HTML (used when a
  # blurb should not be wrapped in a paragraph). Returns the dom unchanged
  # when there is no paragraph.
  def unwrap_from_paragraph_tag(dom)
    first_paragraph = dom.at_css('body p')
    return dom unless first_paragraph
    first_paragraph.replace(first_paragraph.inner_html)
  end

  # Applies the standard typographic substitutions: em-dashes, curly
  # quotation marks, and (when configured) forced double quotes.
  # Fix: the original's last expression was the trailing modifier-if, so
  # whenever NB.force_double_quotes was not 'true' the method returned nil,
  # crashing callers such as simple_blurbify (nil.gsub) and blanking text
  # nodes in clean_up_dom. Return text explicitly.
  def smartify_punctuation(text)
    text = smartify_hyphens(text)
    text = smartify_quotation_marks(text)
    text = force_double_quotes(text) if NB.force_double_quotes == 'true'
    text
  end

  # Replaces any whitespace-surrounded run of hyphens/en-dashes with a
  # single em-dash (the surrounding whitespace is removed).
  def smartify_hyphens(text)
    dash_run = /\s+[\-\u2013]+\s+/
    text.gsub(dash_run, "\u2014") # Em-dashes for everything.
  end

  # Mixed dash policy: en-dash pairs for parenthetical asides ("a - b - c"),
  # em-dash for remaining spaced hyphens.
  # NOTE(review): the second replacement drops the captured context and
  # trailing spaces, matching the original behavior.
  def smartify_hyphens_mixed(text)
    with_parentheticals = text.gsub(/ +- +([^-^.]+) +- +/, "\u2013\\1\u2013")
    with_parentheticals.gsub(/(^|>| +)--?( +)/, "\u2014")
  end

  # TODO: This needs to be language dependent
  # The following assumes we are not running this on HTML text. This is not hugely concerning since for body text we
  #  run this via Nokogiri and other strings should not be marked up. (But: cite links in headers?)
  # Applied strictly in order: entity decoding, apostrophes, single quotes,
  # double quotes, and finally quote/punctuation reordering.
  # NOTE(review): &#39; (an apostrophe entity) is decoded to a straight
  # DOUBLE quote just before the double-quote rules run — confirm intent.
  def smartify_quotation_marks(text)
    rules = [
      [/'([\d]{2})/, "\u2019\\1"],            # Abbreviated years: '95
      [/\&lsquo\;/, "\u2018"],
      [/\&rsquo\;/, "\u2019"],
      [/\&\#x27\;/, "\u2019"],
      [/s' /, "s\u2019 "],                    # Plural possessives
      [/(\b)'(\b)/, "\u2019"],                # Internal apostrophes
      [/(\w)'(\w)/, "\\1\u2019\\2"],
      [/'(\w|<)/, "\u2018\\1"],               # Opening single quotes
      [/([\w\.\,\?\!>])'/, "\\1\u2019"],      # Closing single quotes
      [/\&\#39\;/, '"'],
      [/"(\w|<)/, "\u201C\\1"],               # Opening double quotes
      [/([\w\.\,\?\!>])"/, "\\1\u201D"],      # Closing double quotes
      [/(\u2019|\u201C)([\.\,<])/, '\\2\\1']  # Move punctuation outside the quote
    ]
    rules.reduce(text) { |result, (pattern, replacement)| result.gsub(pattern, replacement) }
  end

  # Promotes single-quoted spans — straight or already-curly — to curly
  # double quotes (used when the site style forbids single quotation marks).
  def force_double_quotes(text)
    straight_done = text.gsub(/'(\w|<)(.*?)([\w\.\,\?\!>])'(\W)/, "\u201C\\1\\2\\3\u201D\\4")
    straight_done.gsub(/\u2018(\w|<)(.*?)([\w\.\,\?\!>])\u2019(\W)/, "\u201C\\1\\2\\3\u201D\\4")
  end

  # Renders caret exponent notation ("2^10") as superscript markup.
  def smartify_numbers(text)
    text.gsub(/(\d)\^([\d\,\.]+)/) { "#{ Regexp.last_match(1) }<sup>#{ Regexp.last_match(2) }</sup>" }
  end

  # Promotes whole lines consisting of a single bold span into
  # <header><h2> elements. Handles <strong> (bare and <p>-wrapped) and bare
  # <b>.
  # NOTE(review): the original repeated the <b> pattern twice; the second
  # pass could never match anything the first had not already replaced, so
  # it is removed as dead code. A <p><b>…</b></p> variant (mirroring the
  # <strong> pair) may have been intended — not added, to preserve behavior.
  def headerize(text)
    text.gsub(/^\s*<strong>(.+?)<\/strong>\s*$/m, '<header><h2>\1</h2></header>')
      .gsub(/^\s*<p><strong>(.+?)<\/strong><\/p>\s*$/m, '<header><h2>\1</h2></header>')
      .gsub(/^\s*<b>(.+?)<\/b>\s*$/m, '<header><h2>\1</h2></header>')
  end

  # Removes <strong>/<h2> headings entirely (for blurbs, where headings are
  # noise).
  # NOTE(review): the open and close tag names are captured independently,
  # so a mismatched "<strong>…</h2>" span would also be removed.
  def deheaderize(text)
    heading_pattern = /<(strong|h2)>.*?<\/(strong|h2)>/m
    text.gsub(heading_pattern, '')
  end

  # Strips a leading "3." / "12 " style number from <h2> headings in raw
  # HTML text (the DOM pass in clean_up_dom covers parsed documents).
  def denumber_headers(text)
    text.gsub(/(<h2>)\d+\.? */) { Regexp.last_match(1) }
  end

  # Wraps free-standing lines in <p> tags, in three passes:
  # 1. lines that do not begin with any tag,
  # 2. lines that begin with an inline tag,
  # 3. lines that end with a closing inline tag.
  def paragraphize(text)
    wrapped = text.gsub(/^\s*([^<].*?)\s*$/, "<p>\\1</p>")
    wrapped = wrapped.gsub(/^\s*(<a|<del|<em|<i|<ins|<strong)(.*?)\s*$/, "<p>\\1\\2</p>")
    wrapped.gsub(/^\s*(.*?)(a>|del>|em>|i>|ins>|strong>)\s*$/, "<p>\\1\\2</p>")
  end

  # Wraps content in <section> blocks: first splitting on horizontal rules
  # (***, ---, <hr>), then splitting again before each <header>.
  def sectionize(text)
    wrap = ->(chunks) { chunks.reject(&:empty?).map { |content| "<section>\n#{ content }\n</section>" }.join }
    unless text[/^\s*(\*\*+|\-\-+)|<hr ?\/?>\s*$/].blank?
      text = wrap.call(text.split(/^\*\*\*+$|^\-\-\-+$|<hr ?\/?>/))
    end
    unless text[/<header/].blank?
      text = wrap.call(text.split(/(?=<header)/))
    end
    text
  end

  # Aggressive typographic conformance (enabled via NB.hyper_conform):
  # normalises spacing around punctuation, upcases sentence/list starts,
  # forces terminal punctuation, wraps acronyms in <abbr>, and tucks
  # annotation markers against the preceding word. Returns HTML-safe text.
  def hyper_conform(text)
    text.gsub(/\s+([\)\n\.\,\?\!])/m, '\1') # Ensure no space before certain punctuation
      .gsub(/([\(])\s+/m, '\1') # Ensure no space after certain elements
      .gsub(/([\.\,\?\!])([a-zA-Z])/m, '\1 \2') # Ensure space after certain punctuation
      .gsub(/([[:upper:]]{3,})/, '<abbr>\1</abbr>') # Wrap all-caps in <abbr>
      .gsub(/\b([A-Z]{1})\./, '\1') # Drop the dot after a single-letter initial
      .gsub(/(<p>|<li>)([[:lower:]])/) { "#{ Regexp.last_match[1] }#{ Regexp.last_match[2].upcase }" } # Always start with a capital
      .gsub(/(\.|\?|\!) ([[:lower:]])/) { "#{ Regexp.last_match[1] }#{ Regexp.last_match[2].upcase }" } # Always start with a capital
      .gsub(/(\w)(<\/p>|<\/li>)/, '\1.\2') # Always end with punctuation -- What about verse? __VERSE ? (& lists?)
      .gsub(/\s+(<a href=\"#annotation-)/m, '\1') # No whitespace before annotation marks
      .gsub(/ *(<a href=\"#annotation-.*?<\/a>) *([\.\,\;\?\!])/, '\2\1') # Punctuation before the annotation mark
      .gsub(/([\.\?\!])(<\/cite>)([\.\?\!])/, '\1\2') # Ensure no double punctuation after titles
      .html_safe
  end
end