app/services/documents/analysis/readability_service.rb from indentlabs/notebook

app/services/documents/analysis/readability_service.rb
Summary

Maintainability

0 mins
Test Coverage

Issues
module Documents
  module Analysis
    class ReadabilityService < Service
      # Computes the readability metrics for the document behind a
      # DocumentAnalysis record, assigns them to the record, and saves it.
      #
      # @param analysis_id [Object] identifier handed to DocumentAnalysis.find
      # @return [Object] whatever `analysis.save!` returns
      def self.analyze(analysis_id)
        analysis = DocumentAnalysis.find(analysis_id)
        document = analysis.document

        analysis.flesch_kincaid_grade_level     = FleschKincaidService.grade_level(document)
        analysis.flesch_kincaid_age_minimum     = FleschKincaidService.age_minimum(document)
        analysis.flesch_kincaid_reading_ease    = FleschKincaidService.reading_ease(document)

        analysis.forcast_grade_level            = self.forcast_grade_level(document)
        analysis.coleman_liau_index             = self.coleman_liau_index(document)
        analysis.automated_readability_index    = self.automated_readability_index(document)
        analysis.gunning_fog_index              = self.gunning_fog_index(document)
        analysis.smog_grade                     = self.smog_grade(document)
        analysis.linsear_write_grade            = self.linsear_write_grade(document)
        analysis.dale_chall_grade               = self.dale_chall_grade(document)
        # TODO: clamp/scale these from 0..16?

        # Previously: analysis.combined_average_reading_level = self.combined_average_grade_level(analysis)
        #
        # The second argument asks for float output. It is passed positionally:
        # writing `float_output=true` in a Ruby call only assigns a throwaway
        # local variable and then passes its value, it is not a keyword argument.
        analysis.combined_average_reading_level = TextStat.text_standard(document.plaintext, true)

        # Must run after the individual scores above are assigned, because
        # readability_score reads them back from the analysis record.
        analysis.readability_score              = self.readability_score(analysis)

        analysis.save!
      end

      # FORCAST grade level: 20 minus one tenth of the number of
      # single-syllable words per 150 words of text, limited to 1..16.
      #
      # The value is recomputed on every call. Caching it in an instance
      # variable of this class would keep the first document's score for the
      # lifetime of the process and return it for every other document.
      #
      # @param document [#words, #words_with_syllables] document to score
      # @return [Numeric] grade level between 1 and 16
      def self.forcast_grade_level(document)
        single_syllable_ratio = document.words_with_syllables(1).length.to_f / document.words.length

        # Clamp the final grade (not just the subtracted term) so the result
        # stays on the same 1..16 scale as the other grade-level metrics.
        (20 - (single_syllable_ratio * 150) / 10.0).clamp(1, 16)
      end

      # Coleman-Liau index: 0.0588 * L - 0.296 * S - 15.8, where L is the number
      # of non-whitespace characters per 100 words and S is the number of
      # sentences per 100 words. The result is limited to 1..16.
      #
      # Recomputed on every call; a class-level cache would return the first
      # document's score for every later document.
      #
      # @param document [#characters, #words, #sentences] document to score
      # @return [Numeric] index between 1 and 16
      def self.coleman_liau_index(document)
        whitespace   = [" ", "\t", "\r", "\n"]
        word_count   = document.words.length
        letter_count = document.characters.count { |char| !whitespace.include?(char) }

        [
          0.0588 * 100 * letter_count.to_f / word_count,
          -0.296 * 100 / (word_count.to_f / document.sentences.length),
          -15.8
        ].sum.clamp(1, 16)
      end

      # Automated Readability Index:
      # 4.71 * (non-blank characters / words) + 0.5 * (words / sentences) - 21.43,
      # limited to 1..16.
      #
      # Recomputed on every call; a class-level cache would return the first
      # document's score for every later document.
      #
      # @param document [#characters, #words, #sentences] document to score
      # @return [Numeric] index between 1 and 16
      def self.automated_readability_index(document)
        word_count      = document.words.length
        non_blank_count = document.characters.count { |char| !char.blank? }

        [
          4.71 * non_blank_count.to_f / word_count,
          0.5 * word_count.to_f / document.sentences.length,
          -21.43
        ].sum.clamp(1, 16)
      end

      # Gunning fog index: 0.4 * (words per sentence + percentage of complex
      # words), limited to 1..16. "Complex words" are whatever
      # document.complex_words returns.
      #
      # Recomputed on every call; a class-level cache would return the first
      # document's score for every later document.
      #
      # @param document [#words, #sentences, #complex_words] document to score
      # @return [Numeric] index between 1 and 16
      def self.gunning_fog_index(document)
        # TODO: GFI word/suffix exclusions
        word_count         = document.words.length
        words_per_sentence = word_count.to_f / document.sentences.length
        complex_percentage = 100 * (document.complex_words.length.to_f / word_count)

        # The clamp must wrap the whole product: clamping only the inner sum
        # before multiplying by 0.4 would cap the index at 6.4.
        (0.4 * (words_per_sentence + complex_percentage)).clamp(1, 16)
      end

      # SMOG grade: 1.043 * sqrt(complex words * 30 / sentences) + 3.1291,
      # limited to 1..16. "Complex words" are whatever document.complex_words
      # returns.
      #
      # Recomputed on every call; a class-level cache would return the first
      # document's score for every later document.
      #
      # @param document [#complex_words, #sentences] document to score
      # @return [Numeric] grade between 1 and 16
      def self.smog_grade(document)
        complex_per_30_sentences = document.complex_words.length.to_f * (30.0 / document.sentences.length)

        (1.043 * Math.sqrt(complex_per_30_sentences) + 3.1291).clamp(1, 16)
      end

      # Linsear Write grade as computed by TextStat.linsear_write_formula on
      # the document's plaintext, limited to 1..16.
      #
      # Recomputed on every call; a class-level cache would return the first
      # document's score for every later document.
      #
      # @param document [#plaintext] document to score
      # @return [Numeric] grade between 1 and 16
      def self.linsear_write_grade(document)
        TextStat.linsear_write_formula(document.plaintext).clamp(1, 16)
      end

      # Dale-Chall readability score as computed by
      # TextStat.dale_chall_readability_score on the document's plaintext,
      # limited to 1..10.
      #
      # Recomputed on every call; a class-level cache would return the first
      # document's score for every later document.
      #
      # @param document [#plaintext] document to score
      # @return [Numeric] score between 1 and 10
      def self.dale_chall_grade(document)
        TextStat.dale_chall_readability_score(document.plaintext).clamp(1, 10)
      end

      # Trimmed mean of the grade-level scores stored on an analysis: the
      # lowest and highest usable scores are dropped and the rest averaged,
      # then limited to 0..16.
      #
      # @deprecated Use TextStat.text_standard(text, true) instead.
      # @param analysis [#automated_readability_index, #coleman_liau_index,
      #   #flesch_kincaid_grade_level, #forcast_grade_level, #gunning_fog_index,
      #   #smog_grade] record holding the individual scores
      # @return [Numeric, nil] averaged grade level, or nil when fewer than
      #   three usable scores are available
      def self.combined_average_grade_level(analysis)
        # TODO: need to normalize all these scores to 1..16
        # TODO: need to exclude scores here that aren't actually grade level output?
        grade_levels = [
          analysis.automated_readability_index, # TODO: this is an age that needs converting to a grade level
          analysis.coleman_liau_index,
          analysis.flesch_kincaid_grade_level,
          analysis.forcast_grade_level,
          analysis.gunning_fog_index,
          analysis.smog_grade
        ]

        # Keep only real, finite numbers: nil, non-numeric values, +/-Infinity
        # and NaN (which would make the sort below raise) are all discarded.
        usable = grade_levels.select { |value| value.is_a?(Numeric) && value.finite? }
        return nil unless usable.length > 2

        # Drop the single lowest and highest score, then average what is left.
        # Dividing by the actual count (rather than a fixed 4) keeps the mean
        # correct when some of the six scores were discarded.
        trimmed = usable.sort[1..-2]
        (trimmed.sum.to_f / trimmed.length).clamp(0, 16)
      end

      # Overall readability score on a 0..100 scale, where a higher value means
      # easier text: each usable score is multiplied by 8, the lowest and
      # highest are dropped, and the mean of the rest is subtracted from 100.
      #
      # @param analysis [#coleman_liau_index, #flesch_kincaid_reading_ease,
      #   #forcast_grade_level, #gunning_fog_index, #smog_grade,
      #   #linsear_write_grade] record holding the individual scores
      # @return [Numeric, nil] score between 0 and 100, or nil when fewer than
      #   three usable scores are available
      def self.readability_score(analysis)
        # TODO: need to normalize all these scores to 0..100
        raw_scores = [
          analysis.coleman_liau_index,
          analysis.flesch_kincaid_reading_ease,
          analysis.forcast_grade_level,
          analysis.gunning_fog_index,
          analysis.smog_grade,
          analysis.linsear_write_grade
        ]

        # Keep only real, finite numbers: nil, non-numeric values, +/-Infinity
        # and NaN (which would make the sort below raise) are all discarded.
        usable = raw_scores.select { |value| value.is_a?(Numeric) && value.finite? }

        # Multiply by a factor of 8 for a rough 1..100 normalization scale.
        scaled = usable.map { |score| score * 8 }
        return nil unless scaled.length > 2

        # Drop the single lowest and highest score, then average what is left.
        # Dividing by the actual count (rather than a fixed 4) keeps the mean
        # correct when some of the six scores were discarded.
        trimmed = scaled.sort[1..-2]
        (100 - trimmed.sum.to_f / trimmed.length).clamp(0, 100)
      end

      # Maps a 0..100 readability score to a difficulty label.
      #
      # The ranges are contiguous so fractional scores (e.g. 35.5 or 65.9) are
      # categorized too; whole-number scores map exactly as 0..35 => 'Hard',
      # 36..65 => 'Average', 66..100 => 'Easy'.
      #
      # @param score [Numeric, nil] readability score
      # @return [String, nil] 'Hard', 'Average' or 'Easy'; nil when the score
      #   is nil or outside 0..100
      def self.readability_score_category(score)
        case score
        when 0...36  then 'Hard'
        when 36...66 then 'Average'
        when 66..100 then 'Easy'
        end
      end

      # Builds the human-readable summary sentence for an analysis, based on
      # the category of its readability score and its combined reading level
      # (rounded and rendered as an ordinal).
      #
      # @param analysis [#readability_score, #combined_average_reading_level]
      # @return [String, nil] summary text, or nil when the score has no category
      def self.readability_score_text(analysis)
        category = self.readability_score_category(analysis.readability_score)
        return nil unless ['Easy', 'Average', 'Hard'].include?(category)

        # Only looked up once a category matched, as the sentence needs it.
        grade = analysis.combined_average_reading_level.round.ordinalize

        parts =
          if category == 'Easy'
            [
              "Nice!",
              "This document is easily understandable by readers with at least a",
              grade,
              "grade reading level."
            ]
          elsif category == 'Average'
            [
              "Not bad!",
              "This document is understandable by readers with at least a",
              grade,
              "grade reading level."
            ]
          else
            [
              "This document is mostly understandable by readers with at least a",
              grade,
              "grade reading level.",
              "It could definitely be improved!"
            ]
          end

        parts.join(' ')
      end

    end
  end
end