david-mccullars/text_rank

View on GitHub
lib/text_rank/char_filter/strip_html.rb

Summary

Maintainability
A
0 mins
Test Coverage
require 'nokogiri'
require 'stringio'

module TextRank
  module CharFilter
    ##
    # Character filter to remove HTML tags and convert HTML entities to text.
    #
    # = Example
    #
    #  StripHtml.new.filter!(""Optimism", said Cacambo, "What is that?"")
    #  => "\"Optimism\", said Cacambo, \"What is that?\""
    #
    #  StringHtml.new.filter!("<b>Alas! It is the <u>obstinacy</u> of maintaining that everything is best when it is worst.</b>")
    #  => "Alas! It is the obstinacy of maintaining that everything is best when it is worst."
    ##
    class StripHtml < Nokogiri::XML::SAX::Document

      def initialize
        super
        @text = StringIO.new
      end

      # Perform the filter
      # @param text [String]
      # @return [String]
      def filter!(text)
        @text.rewind
        Nokogiri::HTML::SAX::Parser.new(self).parse(text)
        @text.string
      end

      protected

      def characters(string)
        @text << ' '
        @text << string
      end

    end
  end
end