app/services/documents/plaintext_service.rb from indentlabs/notebook

app/services/documents/plaintext_service.rb
Summary

Maintainability

5 hrs
Test Coverage

Issues
module Documents
  class PlaintextService < Service
    PLAINTEXT_LINES_PER_PAGE = 25

    # Converts an HTML string into wrapped plain text.
    #
    # Adapted from Premailer's HTML-to-plain-text converter:
    # https://github.com/alexdunae/premailer/blob/master/lib/premailer/html_to_plain_text.rb
    #
    # Before the markup is flattened, Notebook.ai tokens of the form
    # [[PAGE_TYPE-PAGE_ID]] are replaced with the referenced page's name, or
    # with "[Missing PAGE_TYPE]" when the lookup finds nothing.
    #
    # @param html [String, nil] markup to convert
    # @param line_length [Integer] maximum width used for wrapping and for heading rules
    # @param from_charset [String] not used by this method; retained so existing calls keep working
    # @return [String] plain text with surrounding whitespace stripped ("" when html is nil)
    def self.from_html(html, line_length = 80, from_charset = 'UTF-8')
      return "" if html.nil?

      txt = html.dup

      # Do Notebook.ai token replacements in the form of [[PAGE_TYPE-PAGE_ID]] -> Page->Name
      Rails.application.config.content_types[:all].each do |content_type|
        # One lookup per distinct id; the cache is scoped to a single content type.
        names_by_id = {}

        # Block form on purpose: the page name is inserted literally, so a name
        # containing "\0" or "\1" is not treated as a backreference. Matching the
        # digits directly also replaces tokens written with leading zeros.
        txt.gsub!(/\[\[#{content_type}-(\d+)\]\]/) do
          # to_i turns the captured digits into a plain integer before the lookup
          content_id = Regexp.last_match(1).to_i

          names_by_id[content_id] ||= begin
            page = content_type.find_by(id: content_id)
            page ? page.name.to_s : "[Missing #{content_type}]"
          end
        end
      end

      # strip text ignored html. Useful for removing
      # headers and footers that aren't needed in the
      # text version
      txt.gsub!(/<!-- start text\/html -->.*?<!-- end text\/html -->/m, '')

      # replace images with their alt attributes, for double-quoted attributes,
      # with or without a closing slash:
      #   <img alt="" />
      #   <img alt="">
      txt.gsub!(/<img.+?alt=\"([^\"]*)\"[^>]*\>/i, '\1')

      # same for single-quoted attributes:
      #   <img alt='' />
      #   <img alt=''>
      txt.gsub!(/<img.+?alt=\'([^\']*)\'[^>]*\>/i, '\1')

      # links - we strip the link out completely and keep only the link text.
      # [\s\S] matches any character including newlines without the ambiguous
      # (.|\s) alternation, which backtracks heavily on long inputs.
      txt.gsub!(/<a\s.*?href=["'](mailto:)?([^"']*)["'][^>]*>([\s\S]*?)<\/a>/i) do
        link_text = Regexp.last_match(3)
        link_text.empty? ? '' : "#{link_text.strip} "
      end

      # handle headings (H1-H6)
      txt.gsub!(/(<\/h[1-6]>)/i, "\n\\1") # move closing tags to new lines
      txt.gsub!(/[\s]*<h([1-6]+)[^>]*>[\s]*(.*)[\s]*<\/h[1-6]+>/i) do
        # Read both captures before running further regexes, which reset the last match.
        hlevel = Regexp.last_match(1).to_i
        htext = Regexp.last_match(2)

        htext = htext.gsub(/<br[\s]*\/?>/i, "\n") # handle <br>s
        htext = htext.gsub(/<\/?[^>]*>/i, '')     # strip tags

        # rule width: the longest stripped heading line, capped at line_length
        hlength = htext.each_line.map { |heading_line| heading_line.strip.length }.max || 0
        hlength = line_length if hlength > line_length

        htext =
          case hlevel
          when 1 # H1, asterisks above and below
            "#{'*' * hlength}\n#{htext}\n#{'*' * hlength}"
          when 2 # H2, dashes above and below
            "#{'-' * hlength}\n#{htext}\n#{'-' * hlength}"
          else   # H3-H6, dashes below
            "#{htext}\n#{'-' * hlength}"
          end

        "\n\n#{htext}\n\n"
      end

      # wrap spans
      txt.gsub!(/(<\/span>)[\s]+(<span)/mi, '\1 \2')

      # lists -- TODO: should handle ordered lists
      txt.gsub!(/[\s]*(<li[^>]*>)[\s]*/i, '* ')
      # list not followed by a newline
      txt.gsub!(/<\/li>[\s]*(?![\n])/i, "\n")

      # paragraphs and line breaks
      txt.gsub!(/<\/p>/i, "\n\n")
      txt.gsub!(/<br[\/ ]*>/i, "\n")

      # strip remaining tags
      txt.gsub!(/<\/?[^>]*>/, '')

      # decode HTML entities
      txt = HTMLEntities.new.decode(txt)

      # collapse runs of two or more spaces into one
      txt.gsub!(/ {2,}/, " ")

      txt = word_wrap(txt, line_length)

      # remove linefeeds (\r\n and \r -> \n)
      txt.gsub!(/\r\n?/, "\n")

      # strip extra spaces
      txt.gsub!(/[ \t]*\302\240+[ \t]*/, " ") # non-breaking spaces -> spaces
      txt.gsub!(/\n[ \t]+/, "\n") # space at start of lines
      txt.gsub!(/[ \t]+\n/, "\n") # space at end of lines

      # no more than two consecutive newlines
      txt.gsub!(/[\n]{3,}/, "\n\n")

      # word wrapping can split "( http://... )" across lines; pull the parens
      # back around the URL and keep a newline on whichever side it was found
      txt.gsub!(/\(([ \n])(http[^)]+)([\n ])\)/) do
        lead, url, trail = Regexp.last_match.captures
        before = lead == "\n" ? "\n" : ''
        after = trail == "\n" ? "\n" : ''
        "#{before}( #{url} )#{after}"
      end

      txt.strip
    end

    # Wraps each line of +text+ so that it is at most +line_length+ characters
    # wide, breaking only at whitespace. Lines that already fit are returned
    # unchanged. Modelled on Rails' word_wrap helper
    # (http://api.rubyonrails.org/classes/ActionView/Helpers/TextHelper.html#method-i-word_wrap).
    #
    # @param text [String] text to wrap; split on "\n" before wrapping
    # @param line_length [Integer] maximum width of a wrapped line
    # @return [String] the wrapped lines joined with "\n"
    def self.word_wrap(text, line_length)
      wrapped_lines = text.split("\n").map do |current_line|
        # Short enough already: pass through untouched.
        next current_line unless current_line.length > line_length

        # Take up to line_length characters that end at whitespace or end of
        # line, and replace that trailing whitespace with a newline.
        broken = current_line.gsub(/(.{1,#{line_length}})(\s+|$)/, "\\1\n")
        broken.strip
      end

      wrapped_lines.join("\n")
    end
  end
end