# haml/html2haml
#
# View on GitHub
# lib/html2haml/html.rb
#
# Summary
#
# Maintainability
# F
# 3 days
# Test Coverage
require 'cgi'
require 'nokogiri'
require 'html2haml/html/erb'

# Haml monkeypatches various Nokogiri classes
# to add methods for conversion to Haml.
# @private
module Nokogiri

  module XML
    # @see Nokogiri
    class Node
      # Whether this node has already been converted to Haml.
      # Only used for text nodes and elements.
      #
      # @return [Boolean]
      attr_accessor :converted_to_haml

      # Renders this node as Haml text.
      #
      # Nodes already marked as converted, and nodes whose string form is
      # blank, render as an empty string. Otherwise literal `#{` sequences are
      # escaped and, unless the text ends in a newline, the inline nodes that
      # follow are appended before the text is formatted.
      #
      # @param tabs [Fixnum] The indentation level of the resulting Haml.
      # @option options (see Html2haml::HTML#initialize)
      # @return [String] the Haml source for this node
      def to_haml(tabs, options)
        return "" if converted_to_haml || to_s.strip.empty?

        escaped = uninterp(to_s)
        # A trailing newline ends the run of inline nodes.
        escaped << process_inline_nodes(next_sibling) unless escaped.end_with?("\n")
        parse_text_with_interpolation(escaped, tabs)
      end

      private

      # Rewrites `<haml_loud>` placeholders embedded in +text+ as Ruby
      # string interpolation.
      #
      # Returns +text+ itself unless options[:erb] is set. Otherwise the text
      # is HTML-escaped (after escaping literal `#{`), the `<haml_loud>` tags
      # are restored, and the result is parsed as an XML fragment: text nodes
      # are unescaped as-is and every other child becomes `#{...}` around its
      # stripped, unescaped inner text.
      #
      # @param text [String] raw text that may contain `<haml_loud>` tags
      # @param options [Hash] conversion options; only :erb is read here
      # @return [String]
      def erb_to_interpolation(text, options)
        return text unless options[:erb]

        escaped = CGI.escapeHTML(uninterp(text))
        %w[<haml_loud> </haml_loud>].each do |marker|
          escaped.gsub!(CGI.escapeHTML(marker), marker)
        end

        pieces = ::Nokogiri::XML.fragment(escaped).children.map do |elem|
          if elem.is_a?(::Nokogiri::XML::Text)
            CGI.unescapeHTML(elem.to_s)
          else # <haml_loud> element
            '#{' + CGI.unescapeHTML(elem.inner_text.strip) + '}'
          end
        end
        pieces.inject("", :+)
      end

      # Builds the leading whitespace for a line nested +tabs+ levels deep,
      # using two spaces per level.
      #
      # @param tabs [Fixnum] nesting depth
      # @return [String]
      def tabulate(tabs)
        indent_unit = '  '
        indent_unit * tabs
      end

      # Escapes every literal `#{` in +text+ by prefixing it with a
      # backslash. The input string is left untouched.
      #
      # @param text [String]
      # @return [String] a new string with `#{` replaced by `\#{`
      def uninterp(text)
        text.gsub('#{') { '\#{' }
      end

      # Returns this node's attributes as a plain Hash whose keys and values
      # have both been converted with #to_s.
      #
      # @return [Hash{String => String}]
      def attr_hash
        attributes.each_with_object({}) do |(key, value), result|
          result[key.to_s] = value.to_s
        end
      end

      # Escapes literal `#{` in +text+ and then formats it as indented Haml
      # plain text.
      #
      # @param text [String]
      # @param tabs [Fixnum] indentation level
      # @return [String]
      def parse_text(text, tabs)
        escaped = uninterp(text)
        parse_text_with_interpolation(escaped, tabs)
      end

      # Formats +text+ as Haml plain text at the given indentation level.
      #
      # Surrounding whitespace is removed from the text as a whole and from
      # each line. Every line is indented, prefixed with a backslash when its
      # first character is one of Haml::Parser::SPECIAL_CHARACTERS, and
      # terminated with a newline. Blank input yields an empty string.
      #
      # The argument is not modified, so frozen strings are accepted.
      #
      # @param text [String] text in which `#{` is already escaped as needed
      # @param tabs [Fixnum] indentation level
      # @return [String]
      def parse_text_with_interpolation(text, tabs)
        stripped = text.strip
        return "" if stripped.empty?

        indent = tabulate(tabs)
        stripped.split("\n").map do |line|
          line = line.strip
          escape = Haml::Parser::SPECIAL_CHARACTERS.include?(line[0]) ? '\\' : ''
          "#{indent}#{escape}#{line}\n"
        end.join
      end

      # Collects the run of inline `haml_loud` elements starting at +node+
      # into a single interpolated string.
      #
      # Each `haml_loud` element contributes `#{...}` around its unescaped
      # inner text (newlines plus following whitespace collapsed to one
      # space). A text node directly after such an element is appended with
      # `#{` escaped. Every node consumed is flagged via
      # `converted_to_haml = true`. Collection stops at the first node that
      # is not a `haml_loud` element.
      #
      # @param node [Nokogiri::XML::Node, nil] first candidate node
      # @return [String] the collected text; empty when nothing was consumed
      def process_inline_nodes(node)
        collected = ""
        while node.is_a?(::Nokogiri::XML::Element) && node.name == "haml_loud"
          node.converted_to_haml = true
          code = CGI.unescapeHTML(node.inner_text).gsub(/\n\s*/, ' ').strip
          collected << '#{' << code << '}'

          trailing = node.next_sibling
          if trailing.is_a?(::Nokogiri::XML::Text)
            collected << uninterp(trailing.to_s)
            trailing.converted_to_haml = true
            node = trailing
          end

          node = node.next_sibling
        end
        collected
      end
    end
  end
end

# Names of the placeholder elements that Element#to_haml handles specially
# when the :erb option is set. Frozen so the shared list cannot be modified
# by accident.
# @private
HAML_TAGS = %w[haml_block haml_loud haml_silent].freeze
#
# HAML_TAGS.each do |t|
#   Nokogiri::XML::ElementContent[t] = {}
#   Nokogiri::XML::ElementContent.keys.each do |key|
#     Nokogiri::XML::ElementContent[t][key.hash] = true
#   end
# end
#
# Nokogiri::XML::ElementContent.keys.each do |k|
#   HAML_TAGS.each do |el|
#     val = Nokogiri::XML::ElementContent[k]
#     val[el.hash] = true if val.is_a?(Hash)
#   end
# end

module Html2haml
  # Converts HTML documents into Haml templates.
  # Depends on [Nokogiri](http://nokogiri.org/) for HTML parsing.
  # If ERB conversion is being used, also depends on
  # [Erubis](http://www.kuwata-lab.com/erubis) to parse the ERB
  # and [ruby_parser](http://parsetree.rubyforge.org/) to parse the Ruby code.
  #
  # Example usage:
  #
  #     HTML.new("<a href='http://google.com'>Blat</a>").render
  #       #=> "%a{:href => 'http://google.com'} Blat"
  class HTML
    # @param template [String, Nokogiri::Node] The HTML template to convert
    # @option options :erb [Boolean] (false) Whether or not to parse
    #   ERB's `<%= %>` and `<% %>` into Haml's `=` and `-`
    # @option options :xhtml [Boolean] (false) Whether or not to parse
    #   the HTML strictly as XHTML
    def initialize(template, options = {})
      @options = options

      if template.is_a? Nokogiri::XML::Node
        @template = template
      else
        if template.is_a? IO
          template = template.read
        end

        template = Haml::Util.check_encoding(template) {|msg, line| raise Haml::Error.new(msg, line)}

        if @options[:erb]
          require 'html2haml/html/erb'
          template = ERB.compile(template)
        end

        @template = detect_proper_parser(template)
      end
    end

    # Chooses a Nokogiri parser for +template+ and returns the parsed result.
    #
    # * A line starting with a doctype, or an `<html>` tag anywhere, is
    #   parsed as a complete HTML document.
    # * A line starting with `<head>`, or a `<body>` tag anywhere, is parsed
    #   as a document and the children of `<html>` are returned.
    # * Anything else is parsed as an HTML fragment, falling back to the
    #   document parser or the XML fragment parser for the error codes
    #   handled below.
    #
    # Tag names must end at a name boundary, so `<header>` is not mistaken
    # for `<head>`.
    #
    # @param template [String] HTML source
    # @return [Nokogiri::XML::Node, Nokogiri::XML::NodeSet]
    def detect_proper_parser(template)
      if template =~ /^\s*<!DOCTYPE|<html(?![-\w])/i
        return Nokogiri.HTML(template)
      end

      if template =~ /^\s*<head(?![-\w])|<body(?![-\w])/i
        return Nokogiri.HTML(template).at('/html').children
      end

      parsed = Nokogiri::HTML::DocumentFragment.parse(template)

      # detect misplaced head or body tag
      # XML_HTML_STRUCURE_ERROR : 800
      if parsed.errors.any? {|e| e.code == 800 }
        return Nokogiri.HTML(template).at('/html').children
      end

      # in order to support CDATA in HTML (which is invalid) try using the XML parser
      # we can detect this when libxml returns error code XML_ERR_NAME_REQUIRED : 68
      if parsed.errors.any? {|e| e.code == 68 } || template =~ /CDATA/
        return Nokogiri::XML.fragment(template)
      end

      parsed
    end

    # Converts the parsed template to Haml, starting at indentation level 0
    # and passing along the options given to the constructor.
    #
    # Also available as #to_haml.
    #
    # @return [String] the Haml template
    def render
      root_tabs = 0
      @template.to_haml(root_tabs, @options)
    end
    alias_method :to_haml, :render

    # Matches a single line of text, capturing its leading whitespace in
    # group 1.
    # NOTE(review): not referenced anywhere in the visible source — confirm
    # whether other code still relies on it before removing.
    TEXT_REGEXP = /^(\s*).*$/


    # @see Nokogiri
    # @private
    class ::Nokogiri::XML::Document
      # Concatenates the Haml of every child node. Children are always
      # rendered at indentation level 0; +tabs+ is not used.
      #
      # @see Html2haml::HTML::Node#to_haml
      def to_haml(tabs, options)
        haml = ''
        (children || []).each { |child| haml << child.to_haml(0, options) }
        haml
      end
    end

    class ::Nokogiri::XML::DocumentFragment
      # Concatenates the Haml of every child node. Children are always
      # rendered at indentation level 0; +tabs+ is not used.
      #
      # @see Html2haml::HTML::Node#to_haml
      def to_haml(tabs, options)
        (children || []).each_with_object('') do |child, haml|
          haml << child.to_haml(0, options)
        end
      end
    end

    class ::Nokogiri::XML::NodeSet
      # Concatenates the Haml of every node in the set, rendering each one at
      # the given indentation level.
      #
      # @see Html2haml::HTML::Node#to_haml
      def to_haml(tabs, options)
        each_with_object('') do |node, haml|
          haml << node.to_haml(tabs, options)
        end
      end
    end

    # @see Nokogiri
    # @private
    class ::Nokogiri::XML::ProcessingInstruction
      # Renders every processing instruction as Haml's XML prolog (`!!! XML`)
      # at the given indentation level.
      #
      # @see Html2haml::HTML::Node#to_haml
      def to_haml(tabs, options)
        tabulate(tabs) + "!!! XML\n"
      end
    end

    # @see Nokogiri
    # @private
    class ::Nokogiri::XML::CDATA
      # Renders the section as a `:cdata` filter whose body is this node's
      # content (with ERB turned into interpolation when enabled), indented
      # one level deeper than the filter line.
      #
      # @see Html2haml::HTML::Node#to_haml
      def to_haml(tabs, options)
        interpolated = erb_to_interpolation(self.content, options)
        body = parse_text_with_interpolation(interpolated, tabs + 1)
        tabulate(tabs) + ":cdata\n" + body
      end

      # Returns the content with the CDATA start and stop marker lines
      # removed.
      def content_without_cdata_tokens
        opening_marker = /^\s*<!\[CDATA\[\n/
        closing_marker = /^\s*\]\]>\n/
        content.gsub(opening_marker, "").gsub(closing_marker, "")
      end
    end

    # @see Nokogiri
    # @private
    class ::Nokogiri::XML::DTD
      # Renders the doctype as a Haml `!!!` line.
      #
      # The type, version and strictness are extracted from the external id
      # (all blank when there is none). For the "html" type the version is
      # dropped and a blank strictness becomes "strict". Version "1.0" and
      # strictness "transitional" are omitted from the output; the remaining
      # parts are capitalized and appended after `!!!`.
      #
      # @raise [Haml::SyntaxError] if the external id cannot be parsed
      # @see Html2haml::HTML::Node#to_haml
      def to_haml(tabs, options)
        parts =
          if external_id.nil?
            ["", "", ""]
          else
            external_id.scan(/DTD\s+([^\s]+)\s*([^\s]*)\s*([^\s]*)\s*\/\//)[0]
          end
        raise Haml::SyntaxError.new("Invalid doctype") if parts.nil?

        type, version, strictness = parts.map(&:downcase)
        if type == "html"
          version = ""
          strictness = "strict" if strictness == ""
        end

        version = nil if version == "1.0" || version.empty?
        strictness = nil if strictness == 'transitional' || strictness.empty?

        version_part = version ? " #{version.capitalize}" : nil
        strictness_part = strictness ? " #{strictness.capitalize}" : nil

        "#{tabulate(tabs)}!!!#{version_part}#{strictness_part}\n"
      end
    end

    # @see Nokogiri
    # @private
    class ::Nokogiri::XML::Comment
      # Renders the comment as a Haml `/` comment.
      #
      # Content of the form `[condition]>...<![endif]` is split into the
      # bracketed condition, written directly after the `/`, and the inner
      # content. Content containing a newline is emitted as an indented block
      # under the comment marker; otherwise it is written on the same line.
      #
      # @see Html2haml::HTML::Node#to_haml
      def to_haml(tabs, options)
        body = self.content
        conditional = /\A(\[[^\]]+\])>(.*)<!\[endif\]\z/m.match(body)
        if conditional
          condition = conditional[1]
          body = conditional[2]
        end

        marker = "#{tabulate(tabs)}/#{condition}"
        if body.include?("\n")
          "#{marker}\n#{parse_text(body, tabs + 1)}"
        else
          "#{marker} #{body.strip}\n"
        end
      end
    end

    # @see Nokogiri
    # @private
    class ::Nokogiri::XML::Element
      # Renders this element and its descendants as Haml.
      #
      # Handling, in order:
      # * an element already marked as converted renders as an empty string;
      # * `script`/`style` elements with no attributes other than the default
      #   `type` become `:javascript`/`:css` filters;
      # * when options[:erb] is set, the placeholder elements named in
      #   HAML_TAGS become `=`/`!=` lines, `-` lines, or just their rendered
      #   children;
      # * everything else becomes a `%tag` line with `#id`/`.class`
      #   shorthand, remaining attributes and the `>` and `/` modifiers. It
      #   is wrapped in `= succeed` when non-whitespace text directly follows
      #   the element and whitespace removal cannot be used.
      #
      # Side effects: static `id`/`class` attributes are removed from or
      # rewritten on the element, and the leading non-whitespace run of a
      # following text node is moved into the `succeed` call.
      #
      # @param tabs [Fixnum] The indentation level of the resulting Haml.
      # @option options (see Html2haml::HTML#initialize)
      # @return [String] the Haml source for this element
      # @see Html2haml::HTML::Node#to_haml
      def to_haml(tabs, options)
        return "" if converted_to_haml

        if name == "script" &&
            (attr_hash['type'].nil? || attr_hash['type'].to_s == "text/javascript") &&
            (attr_hash.keys - ['type']).empty?
          return to_haml_filter(:javascript, tabs, options)
        elsif name == "style" &&
            (attr_hash['type'].nil? || attr_hash['type'].to_s == "text/css") &&
            (attr_hash.keys - ['type']).empty?
          return to_haml_filter(:css, tabs, options)
        end

        output = tabulate(tabs)
        if options[:erb] && HAML_TAGS.include?(name)
          case name
          when "haml_loud"
            lines = CGI.unescapeHTML(inner_text).split("\n").
              map {|s| s.rstrip}.reject {|s| s.strip.empty?}

            if attribute("raw")
              lines.first.gsub!(/^[ \t]*/, "!= ")
            else
              lines.first.gsub!(/^[ \t]*/, "= ")
            end

            if lines.size > 1 # Multiline script block
              # Normalize the indentation so that the last line is the base
              indent_str = lines.last[/^[ \t]*/]
              indent_re = /^[ \t]{0,#{indent_str.count(" ") + 8 * indent_str.count("\t")}}/
              # Non-bang gsub: the bang form returns nil when nothing matches,
              # which would put nil into the array.
              lines.map! {|s| s.gsub(indent_re, '')}

              # Add an extra "  " to make it indented relative to "= "
              lines[1..-1].each {|s| s.gsub!(/^/, "  ")}

              # Add | at the end, properly aligned
              length = lines.map {|s| s.size}.max + 1
              lines.map! {|s| "%#{-length}s|" % s}

              # Separate two consecutive multiline blocks with a Haml comment
              if next_sibling && next_sibling.is_a?(Nokogiri::XML::Element) && next_sibling.name == "haml_loud" &&
                  next_sibling.inner_text.split("\n").reject {|s| s.strip.empty?}.size > 1
                lines << "-#"
              end
            end
            return lines.map {|s| output + s + "\n"}.join
          when "haml_silent"
            return CGI.unescapeHTML(inner_text).split("\n").map do |line|
              next "" if line.strip.empty?
              "#{output}- #{line.strip}\n"
            end.join
          when "haml_block"
            return render_children("", tabs, options)
          end
        end

        if self.next && self.next.text? && self.next.content =~ /\A[^\s]/
          if self.previous.nil? || self.previous.text? &&
              (self.previous.content =~ /[^\s]\Z/ ||
               self.previous.content =~ /\A\s*\Z/ && self.previous.previous.nil?)
            nuke_outer_whitespace = true
          else
            # Only the leading run of non-whitespace moves into the succeed
            # call. Slice it off a local copy and write the remainder back,
            # so the rest of the text node is still rendered afterwards
            # instead of being discarded.
            following = self.next.content
            succeeded = following.slice!(/\A[^\s]+/)
            output << "= succeed #{succeeded.dump} do\n"
            tabs += 1
            output << tabulate(tabs)
            self.next.content = following
          end
        end

        # A div with a static id or a usable static class is written with the
        # implicit-div shorthand, so the tag name is omitted.
        output << "%#{name}" unless name.to_s == 'div' &&
          (static_id?(options) ||
           static_classname?(options) &&
           attr_hash['class'].to_s.split(' ').any?(&method(:haml_css_attr?)))

        if attr_hash

          if static_id?(options)
            output << "##{attr_hash['id'].to_s}"
            remove_attribute('id')
          end
          if static_classname?(options)
            # Classes usable as shorthand are appended to the tag; the rest
            # stay behind in the class attribute.
            leftover = attr_hash['class'].to_s.split(' ').reject do |c|
              next unless haml_css_attr?(c)
              output << ".#{c}"
            end
            remove_attribute('class')
            set_attribute('class', leftover.join(' ')) unless leftover.empty?
          end
          output << haml_attributes(options) if attr_hash.length > 0
        end

        output << ">" if nuke_outer_whitespace
        output << "/" if to_xhtml.end_with?("/>")

        if children && children.size == 1
          child = children.first
          if child.is_a?(::Nokogiri::XML::Text)
            if !child.to_s.include?("\n")
              text = child.to_haml(tabs + 1, options)
              return output + " " + text.lstrip.gsub(/^\\/, '') unless text.chomp.include?("\n") || text.empty?
              return output + "\n" + text
            elsif ["pre", "textarea"].include?(name) ||
                (name == "code" && parent.is_a?(::Nokogiri::XML::Element) && parent.name == "pre")
              return output + "\n#{tabulate(tabs + 1)}:preserve\n" +
                inner_text.gsub(/^/, tabulate(tabs + 2))
            end
          elsif child.is_a?(::Nokogiri::XML::Element) && child.name == "haml_loud"
            return output + child.to_haml(tabs + 1, options).lstrip
          end
        end

        render_children(output + "\n", tabs, options)
      end

      private

      # Appends the Haml of every child, rendered one indentation level
      # deeper than +tabs+, to +so_far+.
      #
      # @param so_far [String] Haml produced so far for this element
      # @param tabs [Fixnum] indentation level of this element
      # @param options [Hash] conversion options passed to each child
      # @return [String]
      def render_children(so_far, tabs, options)
        haml = so_far
        (children || []).each do |child|
          haml += child.to_haml(tabs + 1, options)
        end
        haml
      end

      # Returns the attributes whose values contain `<haml...>` markup,
      # with each value rewritten as Ruby source.
      #
      # A value that is exactly one `haml_loud` element whose text can be
      # used as bare Ruby becomes that stripped text. Any other value becomes
      # a double-quoted string in which each non-empty `haml_loud` element is
      # replaced by a `#{...}` interpolation. The result is recomputed, and
      # stored in @dynamic_attributes, on every call.
      #
      # @return [Hash{String => String}] attribute name => Ruby source
      def dynamic_attributes
        # keep only the attributes that contain <haml> markup
        @dynamic_attributes = attr_hash.select {|_, raw| raw =~ %r{<haml.*</haml} }
        @dynamic_attributes.each_value do |raw|
          parsed = Nokogiri::XML.fragment(CGI.unescapeHTML(raw))

          # a lone <haml_loud> can be unwrapped when its code is simple enough
          lone_loud = parsed.children.size == 1 && parsed.child.name == 'haml_loud'
          if lone_loud && attribute_value_can_be_bare_ruby?(parsed.text)
            raw.replace(parsed.text.strip)
            next
          end

          # otherwise every non-empty <haml_loud> becomes an interpolation
          parsed.css('haml_loud').each do |loud|
            code = loud.text.strip
            loud.replace('#{' + code + '}') unless code == ""
          end

          # and the whole value is wrapped in a string literal
          raw.replace('"' + parsed.text.strip + '"')
        end
      end

      # Decides whether +value+ is Ruby code simple enough to be used directly
      # as an attribute value.
      #
      # The code is parsed with RubyParser. It qualifies when the top-level
      # node is of type :str, :dstr or :lit, or is a :call whose `mass` is 1.
      # Code that fails to parse, or parses to nothing, does not qualify.
      #
      # @param value [String] Ruby source
      # @return [Boolean]
      def attribute_value_can_be_bare_ruby?(value)
        sexp =
          begin
            RubyParser.for_current_ruby.parse(value)
          rescue Racc::ParseError, RubyParser::SyntaxError
            return false
          end
        return false if sexp.nil?

        # plain string, interpolated string, or literal
        return true if [:str, :dstr, :lit].include?(sexp.sexp_type)

        # a call made up of a single node
        sexp.sexp_type == :call && sexp.mass == 1
      end


      # Renders this element's text as a Haml filter block (for example
      # `:javascript` or `:css`).
      #
      # The source is taken from the first child when that is a CDATA node
      # (without its marker lines), otherwise from the inner text. Entities
      # are decoded and ERB is turned into interpolation. Leading blank lines
      # are dropped, then the text is re-indented one level below the filter
      # line: if every non-blank line shares the first line's indentation,
      # that prefix is swapped for the new one; if not, the text is
      # left-stripped and each line is simply prefixed.
      #
      # @param filter [Symbol] filter name written after the colon
      # @param tabs [Fixnum] indentation level of the filter line
      # @param options [Hash] conversion options
      # @return [String]
      def to_haml_filter(filter, tabs, options)
        first_child = children.first
        raw =
          if first_child && first_child.cdata?
            first_child.content_without_cdata_tokens
          else
            inner_text
          end

        body = erb_to_interpolation(decode_entities(raw), options)
        body.gsub!(/\A\s*\n(\s*)/, '\1')
        base_indent = body[/\A(\s*)/, 1]
        filter_indent = tabulate(tabs + 1)

        consistent = body.split("\n").all? {|l| l.strip.empty? || l =~ /^#{base_indent}/}
        if consistent
          body.gsub!(/^#{base_indent}/, filter_indent)
        else
          # Indentation is inconsistent. Strip whitespace from start and indent all
          # to ensure valid Haml.
          body.lstrip!
          body.gsub!(/^/, filter_indent)
        end

        body.rstrip!
        "#{tabulate(tabs)}:#{filter}\n#{body}\n"
      end

      # Replaces named HTML entities (such as `&lt;`) in +str+ with the
      # characters they stand for.
      #
      # Each `&name;` sequence is looked up in
      # Nokogiri::HTML::NamedCharacters. Known names are replaced by the
      # UTF-8 encoding of their code point; anything else, including numeric
      # references, is left untouched. An entity name cannot contain
      # whitespace, `&` or `;`, so adjacent entities are decoded separately.
      #
      # TODO: find a better way to decode HTML entities.
      #
      # @param str [String]
      # @return [String] a new string with known entities decoded
      def decode_entities(str)
        str.gsub(/&[^\s;&]+;/) do |entity|
          codepoint = Nokogiri::HTML::NamedCharacters[entity[1..-2]]
          # pack("U") produces UTF-8, so code points above 255 survive intact
          codepoint ? [codepoint].pack("U") : entity
        end
      end

      # Whether the attribute +name+ is present on this element and is not a
      # dynamic (ERB-driven) attribute.
      #
      # @param name [String] attribute name
      # @param options [Hash] conversion options
      # @return [Boolean, nil] nil when the attribute is absent
      def static_attribute?(name, options)
        present = attr_hash[name]
        present && !dynamic_attribute?(name, options)
      end

      # Whether the attribute +name+ is one of the dynamic attributes. Only
      # consulted when options[:erb] is set; otherwise that falsy option
      # value is returned as-is.
      #
      # @param name [String] attribute name
      # @param options [Hash] conversion options
      # @return [Boolean, nil]
      def dynamic_attribute?(name, options)
        options[:erb] && dynamic_attributes.key?(name)
      end

      # Whether this element has a static `id` attribute whose value can be
      # written with Haml's `#id` shorthand. The result is truthy or falsy
      # rather than strictly boolean.
      #
      # @param options [Hash] conversion options
      def static_id?(options)
        id_is_static = static_attribute?('id', options)
        id_is_static && haml_css_attr?(attr_hash['id'])
      end

      # Whether this element has a static (non-dynamic) `class` attribute.
      #
      # @param options [Hash] conversion options
      # @return [Boolean, nil] whatever #static_attribute? returns for 'class'
      def static_classname?(options)
        class_attribute = 'class'
        static_attribute?(class_attribute, options)
      end

      # Whether +attr+ can be written with Haml's `#id` / `.class` shorthand,
      # i.e. consists solely of word characters, hyphens and colons.
      #
      # The pattern is anchored to the whole string, so a value containing a
      # newline is rejected even if one of its lines would match.
      #
      # @param attr [String] an id or a single class name
      # @return [Integer, nil] 0 when usable, nil otherwise
      def haml_css_attr?(attr)
        attr =~ /\A[-:\w]+\z/
      end

      # Formats this element's attributes, sorted by name, as Haml attribute
      # syntax: `(a=1 b=2)` when options[:html_style_attributes] is set,
      # otherwise a Ruby-style `{...}` hash with comma-separated pairs.
      #
      # @param options [Hash] conversion options
      # @return [String]
      def haml_attributes(options)
        pairs = attr_hash.sort.map {|name, value| haml_attribute_pair(name, value.to_s, options) }
        return "(#{pairs.join(' ')})" if options[:html_style_attributes]

        "{#{pairs.join(', ')}}"
      end

      # Formats one attribute as a key/value pair.
      #
      # The value is the Ruby source from #dynamic_attributes for a dynamic
      # attribute, otherwise +value+ inspected as a string literal. The pair
      # is written as `name=value` for HTML-style attributes, as
      # `"name" => value` when the name contains a non-word character, as
      # `name: value` for Ruby 1.9 style, and as `:name => value` otherwise.
      #
      # @param name [String] attribute name
      # @param value [String] static attribute value
      # @param options [Hash] conversion options
      # @return [String]
      def haml_attribute_pair(name, value, options)
        rendered =
          if dynamic_attribute?(name, options)
            dynamic_attributes[name]
          else
            value.inspect
          end

        if options[:html_style_attributes]
          "#{name}=#{rendered}"
        elsif name.index(/\W/)
          "#{name.inspect} => #{rendered}"
        elsif options[:ruby19_style_attributes]
          "#{name}: #{rendered}"
        else
          ":#{name} => #{rendered}"
        end
      end
    end
  end
end