louismullie/treat

View on GitHub
lib/treat/workers/processors/chunkers/html.rb

Summary

Maintainability
B
6 hrs
Test Coverage
# Chunks an HTML document into Treat entities by walking the DOM that
# Nokogiri builds from the entity's value: headings open sections and
# become titles, paragraphs become zones, and <ul>/<ol> become lists.
class Treat::Workers::Processors::Chunkers::HTML

  require 'nokogiri'

  # Matches a heading tag name ("h" followed by one digit) and captures
  # the digit, which is used as the section level.
  HEADING_TAG = /\Ah([0-9])\z/

  # Tag names that open a list container.
  LIST_TAGS = ['ul', 'ol'].freeze

  # A <p> whose text is shorter than this may be treated as a title
  # (see the condition in recurse).
  MAX_TITLE_LENGTH = 45

  # Parse the entity's value as HTML and attach the resulting entities
  # to the entity.
  #
  # entity  - the entity to chunk. check_hasnt_children is called on it
  #           first (defined elsewhere; presumably it rejects entities
  #           that already have children - TODO confirm).
  # options - accepted for interface compatibility; not read here.
  #
  # Returns whatever recurse returns.
  def self.chunk(entity, options = {})
    entity.check_hasnt_children
    doc = Nokogiri::HTML(entity.value)
    recurse(entity, doc)
  end

  # Walk the children of html_node depth-first and append the matching
  # entities to node.
  #
  # node      - the entity currently receiving children. It is rebound
  #             when a heading opens a new section, so that the
  #             following siblings land inside that section.
  # html_node - the Nokogiri node whose children are visited.
  # level     - the heading level currently in effect (defaults to 1).
  #
  # Returns the result of iterating html_node.children.
  def self.recurse(node, html_node, level = 1)

    html_node.children.each do |child|

      next if child.name == 'text'
      txt = child.inner_text
      heading = HEADING_TAG.match(child.name)

      # Set only for <ul>/<ol>: the list receives the nested items but
      # must not capture the siblings that follow the list element.
      list = nil

      if heading ||
        (child.name == 'p' && txt.length < MAX_TITLE_LENGTH &&
        node.parent && node.parent.type == :section)

        if heading
          lvl = heading[1].to_i
          node = open_section(node, lvl, level)
          level = lvl
        end

        node << Treat::Entities::Title.new(txt)

      elsif child.name == 'p'

        node << Treat::Entities::Zone.from_string(txt)

      elsif LIST_TAGS.include?(child.name)

        list = node << Treat::Entities::List.new

      elsif child.name == 'li'

        node << Treat::Entities::Entity.zone_from_string(txt)

      end

      if child.children.size > 0
        recurse(list || node, child, level)
      end

    end

  end

  # Create a new section for a heading of level lvl and return it.
  #
  # When the heading is not deeper than the current level, the new
  # section is attached to the ancestor section whose level is lvl - 1
  # (sections without a :level feature count as level 1). If no such
  # ancestor exists, it is attached to node itself.
  # NOTE(review): for lvl == 1 no ancestor can match (level 0), so a
  # second top-level heading nests under the first - confirm whether
  # that is intended.
  #
  # This relies on `<<` returning the entity that was appended, as the
  # original code did.
  def self.open_section(node, lvl, level)
    if lvl <= level
      node.ancestors_with_type(:section).each do |ancestor|
        ancestor_level = ancestor.has?(:level) ? ancestor.level : 1
        node = ancestor if ancestor_level == lvl - 1
      end
    end
    section = node << Treat::Entities::Section.new
    section.set :level, lvl
    section
  end
  private_class_method :open_section

end