lib/html2haml/html.rb
require 'cgi'
require 'nokogiri'
require 'html2haml/html/erb'
# Haml monkeypatches various Nokogiri classes
# to add methods for conversion to Haml.
# @private
module Nokogiri
module XML
# @see Nokogiri
class Node
# Whether this node has already been converted to Haml.
# Only used for text nodes and elements.
#
# @return [Boolean]
attr_accessor :converted_to_haml
# Returns the Haml representation of the given node.
#
# @param tabs [Fixnum] The indentation level of the resulting Haml.
# @option options (see Html2haml::HTML#initialize)
def to_haml(tabs, options)
return "" if converted_to_haml || to_s.strip.empty?
text = uninterp(self.to_s)
#ending in a newline stops the inline nodes
if text.end_with?("\n")
parse_text_with_interpolation(text, tabs)
else
text << process_inline_nodes(next_sibling)
parse_text_with_interpolation(text, tabs)
end
end
private
def erb_to_interpolation(text, options)
return text unless options[:erb]
text = CGI.escapeHTML(uninterp(text))
%w[<haml_loud> </haml_loud>].each {|str| text.gsub!(CGI.escapeHTML(str), str)}
::Nokogiri::XML.fragment(text).children.inject("") do |str, elem|
if elem.is_a?(::Nokogiri::XML::Text)
str + CGI.unescapeHTML(elem.to_s)
else # <haml_loud> element
str + '#{' + CGI.unescapeHTML(elem.inner_text.strip) + '}'
end
end
end
def tabulate(tabs)
' ' * tabs
end
def uninterp(text)
text.gsub('#{', '\#{') #'
end
def attr_hash
Hash[attributes.map {|k, v| [k.to_s, v.to_s]}]
end
def parse_text(text, tabs)
parse_text_with_interpolation(uninterp(text), tabs)
end
def parse_text_with_interpolation(text, tabs)
text.strip!
return "" if text.empty?
text.split("\n").map do |line|
line.strip!
"#{tabulate(tabs)}#{'\\' if Haml::Parser::SPECIAL_CHARACTERS.include?(line[0])}#{line}\n"
end.join
end
def process_inline_nodes(node)
text = ""
while node.is_a?(::Nokogiri::XML::Element) && node.name == "haml_loud"
node.converted_to_haml = true
text << '#{' <<
CGI.unescapeHTML(node.inner_text).gsub(/\n\s*/, ' ').strip << '}'
if node.next_sibling.is_a?(::Nokogiri::XML::Text)
node = node.next_sibling
text << uninterp(node.to_s)
node.converted_to_haml = true
end
node = node.next_sibling
end
text
end
end
end
end
# @private
HAML_TAGS = %w[haml_block haml_loud haml_silent]
#
# HAML_TAGS.each do |t|
# Nokogiri::XML::ElementContent[t] = {}
# Nokogiri::XML::ElementContent.keys.each do |key|
# Nokogiri::XML::ElementContent[t][key.hash] = true
# end
# end
#
# Nokogiri::XML::ElementContent.keys.each do |k|
# HAML_TAGS.each do |el|
# val = Nokogiri::XML::ElementContent[k]
# val[el.hash] = true if val.is_a?(Hash)
# end
# end
module Html2haml
# Converts HTML documents into Haml templates.
# Depends on [Nokogiri](http://nokogiri.org/) for HTML parsing.
# If ERB conversion is being used, also depends on
# [Erubis](http://www.kuwata-lab.com/erubis) to parse the ERB
# and [ruby_parser](http://parsetree.rubyforge.org/) to parse the Ruby code.
#
# Example usage:
#
# HTML.new("<a href='http://google.com'>Blat</a>").render
# #=> "%a{:href => 'http://google.com'} Blat"
class HTML
# @param template [String, Nokogiri::Node] The HTML template to convert
# @option options :erb [Boolean] (false) Whether or not to parse
# ERB's `<%= %>` and `<% %>` into Haml's `=` and `-`
# @option options :xhtml [Boolean] (false) Whether or not to parse
# the HTML strictly as XHTML
def initialize(template, options = {})
@options = options
if template.is_a? Nokogiri::XML::Node
@template = template
else
if template.is_a? IO
template = template.read
end
template = Haml::Util.check_encoding(template) {|msg, line| raise Haml::Error.new(msg, line)}
if @options[:erb]
require 'html2haml/html/erb'
template = ERB.compile(template)
end
@template = detect_proper_parser(template)
end
end
def detect_proper_parser(template)
if template =~ /^\s*<!DOCTYPE|<html/i
return Nokogiri.HTML(template)
end
if template =~ /^\s*<head|<body/i
return Nokogiri.HTML(template).at('/html').children
end
parsed = Nokogiri::HTML::DocumentFragment.parse(template)
#detect missplaced head or body tag
#XML_HTML_STRUCURE_ERROR : 800
if parsed.errors.any? {|e| e.code == 800 }
return Nokogiri.HTML(template).at('/html').children
end
#in order to support CDATA in HTML (which is invalid) try using the XML parser
# we can detect this when libxml returns error code XML_ERR_NAME_REQUIRED : 68
if parsed.errors.any? {|e| e.code == 68 } || template =~ /CDATA/
return Nokogiri::XML.fragment(template)
end
parsed
end
# Processes the document and returns the result as a string
# containing the Haml template.
def render
@template.to_haml(0, @options)
end
alias_method :to_haml, :render
TEXT_REGEXP = /^(\s*).*$/
# @see Nokogiri
# @private
class ::Nokogiri::XML::Document
# @see Html2haml::HTML::Node#to_haml
def to_haml(tabs, options)
(children || []).inject('') {|s, c| s << c.to_haml(0, options)}
end
end
class ::Nokogiri::XML::DocumentFragment
# @see Html2haml::HTML::Node#to_haml
def to_haml(tabs, options)
(children || []).inject('') {|s, c| s << c.to_haml(0, options)}
end
end
class ::Nokogiri::XML::NodeSet
# @see Html2haml::HTML::Node#to_haml
def to_haml(tabs, options)
self.inject('') {|s, c| s << c.to_haml(tabs, options)}
end
end
# @see Nokogiri
# @private
class ::Nokogiri::XML::ProcessingInstruction
# @see Html2haml::HTML::Node#to_haml
def to_haml(tabs, options)
"#{tabulate(tabs)}!!! XML\n"
end
end
# @see Nokogiri
# @private
class ::Nokogiri::XML::CDATA
# @see Html2haml::HTML::Node#to_haml
def to_haml(tabs, options)
content = parse_text_with_interpolation(
erb_to_interpolation(self.content, options), tabs + 1)
"#{tabulate(tabs)}:cdata\n#{content}"
end
# removes the start and stop markers for cdata
def content_without_cdata_tokens
content.
gsub(/^\s*<!\[CDATA\[\n/,"").
gsub(/^\s*\]\]>\n/, "")
end
end
# @see Nokogiri
# @private
class ::Nokogiri::XML::DTD
# @see Html2haml::HTML::Node#to_haml
def to_haml(tabs, options)
attrs = external_id.nil? ? ["", "", ""] :
external_id.scan(/DTD\s+([^\s]+)\s*([^\s]*)\s*([^\s]*)\s*\/\//)[0]
raise Haml::SyntaxError.new("Invalid doctype") if attrs == nil
type, version, strictness = attrs.map { |a| a.downcase }
if type == "html"
version = ""
strictness = "strict" if strictness == ""
end
if version == "1.0" || version.empty?
version = nil
end
if strictness == 'transitional' || strictness.empty?
strictness = nil
end
version = " #{version.capitalize}" if version
strictness = " #{strictness.capitalize}" if strictness
"#{tabulate(tabs)}!!!#{version}#{strictness}\n"
end
end
# @see Nokogiri
# @private
class ::Nokogiri::XML::Comment
# @see Html2haml::HTML::Node#to_haml
def to_haml(tabs, options)
content = self.content
if content =~ /\A(\[[^\]]+\])>(.*)<!\[endif\]\z/m
condition = $1
content = $2
end
if content.include?("\n")
"#{tabulate(tabs)}/#{condition}\n#{parse_text(content, tabs + 1)}"
else
"#{tabulate(tabs)}/#{condition} #{content.strip}\n"
end
end
end
# @see Nokogiri
# @private
class ::Nokogiri::XML::Element
# @see Html2haml::HTML::Node#to_haml
def to_haml(tabs, options)
return "" if converted_to_haml
if name == "script" &&
(attr_hash['type'].nil? || attr_hash['type'].to_s == "text/javascript") &&
(attr_hash.keys - ['type']).empty?
return to_haml_filter(:javascript, tabs, options)
elsif name == "style" &&
(attr_hash['type'].nil? || attr_hash['type'].to_s == "text/css") &&
(attr_hash.keys - ['type']).empty?
return to_haml_filter(:css, tabs, options)
end
output = tabulate(tabs)
if options[:erb] && HAML_TAGS.include?(name)
case name
when "haml_loud"
lines = CGI.unescapeHTML(inner_text).split("\n").
map {|s| s.rstrip}.reject {|s| s.strip.empty?}
if attribute("raw")
lines.first.gsub!(/^[ \t]*/, "!= ")
else
lines.first.gsub!(/^[ \t]*/, "= ")
end
if lines.size > 1 # Multiline script block
# Normalize the indentation so that the last line is the base
indent_str = lines.last[/^[ \t]*/]
indent_re = /^[ \t]{0,#{indent_str.count(" ") + 8 * indent_str.count("\t")}}/
lines.map! {|s| s.gsub!(indent_re, '')}
# Add an extra " " to make it indented relative to "= "
lines[1..-1].each {|s| s.gsub!(/^/, " ")}
# Add | at the end, properly aligned
length = lines.map {|s| s.size}.max + 1
lines.map! {|s| "%#{-length}s|" % s}
if next_sibling && next_sibling.is_a?(Nokogiri::XML::Element) && next_sibling.name == "haml_loud" &&
next_sibling.inner_text.split("\n").reject {|s| s.strip.empty?}.size > 1
lines << "-#"
end
end
return lines.map {|s| output + s + "\n"}.join
when "haml_silent"
return CGI.unescapeHTML(inner_text).split("\n").map do |line|
next "" if line.strip.empty?
"#{output}- #{line.strip}\n"
end.join
when "haml_block"
return render_children("", tabs, options)
end
end
if self.next && self.next.text? && self.next.content =~ /\A[^\s]/
if self.previous.nil? || self.previous.text? &&
(self.previous.content =~ /[^\s]\Z/ ||
self.previous.content =~ /\A\s*\Z/ && self.previous.previous.nil?)
nuke_outer_whitespace = true
else
output << "= succeed #{self.next.content.slice!(/\A[^\s]+/).dump} do\n"
tabs += 1
output << tabulate(tabs)
#empty the text node since it was inserted into the block
self.next.content = ""
end
end
output << "%#{name}" unless name.to_s == 'div' &&
(static_id?(options) ||
static_classname?(options) &&
attr_hash['class'].to_s.split(' ').any?(&method(:haml_css_attr?)))
if attr_hash
if static_id?(options)
output << "##{attr_hash['id'].to_s}"
remove_attribute('id')
end
if static_classname?(options)
leftover = attr_hash['class'].to_s.split(' ').reject do |c|
next unless haml_css_attr?(c)
output << ".#{c}"
end
remove_attribute('class')
set_attribute('class', leftover.join(' ')) unless leftover.empty?
end
output << haml_attributes(options) if attr_hash.length > 0
end
output << ">" if nuke_outer_whitespace
output << "/" if to_xhtml.end_with?("/>")
if children && children.size == 1
child = children.first
if child.is_a?(::Nokogiri::XML::Text)
if !child.to_s.include?("\n")
text = child.to_haml(tabs + 1, options)
return output + " " + text.lstrip.gsub(/^\\/, '') unless text.chomp.include?("\n") || text.empty?
return output + "\n" + text
elsif ["pre", "textarea"].include?(name) ||
(name == "code" && parent.is_a?(::Nokogiri::XML::Element) && parent.name == "pre")
return output + "\n#{tabulate(tabs + 1)}:preserve\n" +
inner_text.gsub(/^/, tabulate(tabs + 2))
end
elsif child.is_a?(::Nokogiri::XML::Element) && child.name == "haml_loud"
return output + child.to_haml(tabs + 1, options).lstrip
end
end
render_children(output + "\n", tabs, options)
end
private
def render_children(so_far, tabs, options)
(self.children || []).inject(so_far) do |output, child|
output + child.to_haml(tabs + 1, options)
end
end
def dynamic_attributes
#reject any attrs without <haml>
@dynamic_attributes = attr_hash.select {|name, value| value =~ %r{<haml.*</haml} }
@dynamic_attributes.each do |name, value|
fragment = Nokogiri::XML.fragment(CGI.unescapeHTML(value))
# unwrap interpolation if we can:
if fragment.children.size == 1 && fragment.child.name == 'haml_loud'
if attribute_value_can_be_bare_ruby?(fragment.text)
value.replace(fragment.text.strip)
next
end
end
# turn erb into interpolations
fragment.css('haml_loud').each do |el|
inner_text = el.text.strip
next if inner_text == ""
el.replace('#{' + inner_text + '}')
end
# put the resulting text in a string
value.replace('"' + fragment.text.strip + '"')
end
end
def attribute_value_can_be_bare_ruby?(value)
begin
ruby = RubyParser.for_current_ruby.parse(value)
rescue Racc::ParseError, RubyParser::SyntaxError
return false
end
return false if ruby.nil?
(ruby.sexp_type == :str) || #regular string
(ruby.sexp_type == :dstr) || #string with interpolation
(ruby.sexp_type == :lit) || #symbol
(ruby.sexp_type == :call && ruby.mass == 1) #local var or method with no params
end
def to_haml_filter(filter, tabs, options)
content =
if children.first && children.first.cdata?
decode_entities(children.first.content_without_cdata_tokens)
else
decode_entities(self.inner_text)
end
content = erb_to_interpolation(content, options)
content.gsub!(/\A\s*\n(\s*)/, '\1')
original_indent = content[/\A(\s*)/, 1]
if content.split("\n").all? {|l| l.strip.empty? || l =~ /^#{original_indent}/}
content.gsub!(/^#{original_indent}/, tabulate(tabs + 1))
else
# Indentation is inconsistent. Strip whitespace from start and indent all
# to ensure valid Haml.
content.lstrip!
content.gsub!(/^/, tabulate(tabs + 1))
end
content.rstrip!
content << "\n"
"#{tabulate(tabs)}:#{filter}\n#{content}"
end
# TODO: this method is utterly awful, find a better way to decode HTML entities.
def decode_entities(str)
str.gsub(/&[\S]+;/) do |entity|
begin
[Nokogiri::HTML::NamedCharacters[entity[1..-2]]].pack("C")
rescue TypeError
entity
end
end
end
def static_attribute?(name, options)
attr_hash[name] && !dynamic_attribute?(name, options)
end
def dynamic_attribute?(name, options)
options[:erb] and dynamic_attributes.key?(name)
end
def static_id?(options)
static_attribute?('id', options) && haml_css_attr?(attr_hash['id'])
end
def static_classname?(options)
static_attribute?('class', options)
end
def haml_css_attr?(attr)
attr =~ /^[-:\w]+$/
end
# Returns a string representation of an attributes hash
# that's prettier than that produced by Hash#inspect
def haml_attributes(options)
attrs = attr_hash.sort.map do |name, value|
haml_attribute_pair(name, value.to_s, options)
end
if options[:html_style_attributes]
"(#{attrs.join(' ')})"
else
"{#{attrs.join(', ')}}"
end
end
# Returns the string representation of a single attribute key value pair
def haml_attribute_pair(name, value, options)
value = dynamic_attribute?(name, options) ? dynamic_attributes[name] : value.inspect
if options[:html_style_attributes]
return "#{name}=#{value}"
end
if name.index(/\W/)
return "#{name.inspect} => #{value}"
end
if options[:ruby19_style_attributes]
return "#{name}: #{value}"
end
":#{name} => #{value}"
end
end
end
end