indentlabs/notebook

View on GitHub
app/models/concerns/has_parseable_text.rb

Summary

Maintainability
A
0 mins
Test Coverage
require 'active_support/concern'

module HasParseableText
  extend ActiveSupport::Concern

  included do
    def plaintext(line_length = 80, from_charset = 'UTF-8')
      @plaintext ||= Documents::PlaintextService.from_html(self.body, line_length, from_charset)
    end

    def characters
      @characters ||= plaintext.chars
    end

    def paragraphs
      @paragraphs ||= begin
        # Normalize text
        ## We use paragraph tags by default, but people might paste in divs also
        paragraphed_sanity = ActionController::Base.helpers.sanitize(body, tags: %w(div p), attributes: %w())
        if paragraphed_sanity.nil?
          paragraphs = []
        else
          paragraphed_sanity.gsub!('<div></div>', '')
          paragraphed_sanity.gsub!('<p></p>', '')
          
          paragraphs =  paragraphed_sanity.scan(/<p>[^<]+<\/p>/).map {     |text| ActionView::Base.full_sanitizer.sanitize(text) }
          paragraphs << paragraphed_sanity.scan(/<div>[^<]+<\/div>/).map { |text| ActionView::Base.full_sanitizer.sanitize(text) }
        end
      end.flatten
    end

    def sentences
      @sentences ||= sentences_with_newlines.map { |sentence| sentence.gsub("\n", ' ') }
    end

    def sentences_with_newlines
      @sentences_with_newlines ||= plaintext(line_length=Float::INFINITY).scan(/[^\.!?]+[\.!?]+/)
    end

    def words
      @words ||= plaintext.downcase.gsub(/[^\s\w\d']/, '').split(' ')
    end

    def pages
      # todo this might make more sense as a word count splitter instead of lines?
      @pages ||= plaintext.split("\n").each_slice(Documents::PlaintextService::PLAINTEXT_LINES_PER_PAGE)
    end

    def acronyms
      @acroynyms ||= words
        .select { |word| word == word.upcase && word.length > 1 && !is_numeric?(word) }
        .uniq
        .sort
    end

    # As defined by Robert Gunning in the GFI and SMOG
    def complex_words
      @complex_words ||= unique_words.select { |word| Documents::Analysis::SyllablesService.count(word) >= 3 }
    end

    def simple_words
      @simple_words ||= unique_words - complex_words
    end

    def unique_words
      words.map(&:downcase).uniq
    end

    def words_with_syllables syllable_count
      words.select { |word| Documents::Analysis::SyllablesService.count(word) == syllable_count }
    end

    def word_syllables
      words.map { |word| Documents::Analysis::SyllablesService.count(word) }
    end

    def is_numeric?(string)
      true if Float(string) rescue false
    end
  end
end