lib/maruku/input/html_helper.rb
module MaRuKu::In::Markdown::SpanLevelParser
# This class helps me read and sanitize HTML blocks
class HTMLHelper
Tag = %r{^<(/)?(\w+)\s*([^>]*?)>}m
PartialTag = %r{^<.*}m
CData = %r{^\s*<!\[CDATA\[}m
CDataEnd = %r{\]\]>}m
EverythingElse = %r{^[^<]+}m
CommentStart = %r{^<!--}x
CommentEnd = %r{-->}
TO_SANITIZE = ['img', 'hr', 'br']
attr_reader :rest, :first_tag
def initialize
@rest = ""
@tag_stack = []
@m = nil
@already = ""
self.state = :inside_element
end
attr_accessor :state # = :inside_element, :inside_tag, :inside_comment, :inside_cdata
def eat_this(line)
@rest = line + @rest
things_read = 0
until @rest.empty?
case self.state
when :inside_comment
if @m = CommentEnd.match(@rest)
debug_state 'Comment End'
# Workaround for https://bugs.ruby-lang.org/issues/9277 and another bug in 1.9.2 where even a
# single dash in a comment will cause REXML to error.
@already << @m.pre_match.gsub(/-(?![^\-])/, '- ') << @m.to_s
@rest = @m.post_match
self.state = :inside_element
else
@already << @rest.gsub(/-(?![^\-])/, '- ') # Workaround for https://bugs.ruby-lang.org/issues/9277
@rest = ""
self.state = :inside_comment
end
when :inside_element
if @m = CommentStart.match(@rest)
debug_state 'Comment'
things_read += 1
@already << @m.pre_match << @m.to_s
@rest = @m.post_match
self.state = :inside_comment
elsif @m = Tag.match(@rest)
debug_state 'Tag'
things_read += 1
self.state = :inside_element
handle_tag
elsif @m = CData.match(@rest)
debug_state 'CDATA'
@already << @m.pre_match
close_script_style if script_style?
@already << @m.to_s
@rest = @m.post_match
self.state = :inside_cdata
elsif @m = PartialTag.match(@rest)
debug_state 'PartialTag'
@already << @m.pre_match
@rest = @m.post_match
@partial_tag = @m.to_s
self.state = :inside_tag
elsif @m = EverythingElse.match(@rest)
debug_state 'EverythingElse'
@already << @m.pre_match << @m.to_s
@rest = @m.post_match
self.state = :inside_element
else
error "Malformed HTML: not complete: #{@rest.inspect}"
end
when :inside_tag
if @m = /^[^>]*>/.match(@rest)
@partial_tag << @m.to_s
@rest = @partial_tag + @m.post_match
@partial_tag = nil
self.state = :inside_element
if @m = Tag.match(@rest)
things_read += 1
handle_tag
end
else
@partial_tag << @rest
@rest = ""
self.state = :inside_tag
end
when :inside_cdata
if @m = CDataEnd.match(@rest)
self.state = :inside_element
@already << @m.pre_match << @m.to_s
@rest = @m.post_match
start_script_style if script_style?
else
@already << @rest
@rest = ""
self.state = :inside_cdata
end
else
raise "Bug bug: state = #{self.state.inspect}"
end
break if is_finished? && things_read > 0
end
end
def handle_tag
@already << @m.pre_match
@rest = @m.post_match
is_closing = !!@m[1]
tag = @m[2]
@first_tag ||= tag
attributes = @m[3].to_s
is_single = false
if attributes[-1, 1] == '/'
attributes = attributes[0, attributes.size - 1]
is_single = true
end
if TO_SANITIZE.include? tag
attributes.strip!
if attributes.size > 0
@already << '<%s %s />' % [tag, attributes]
else
@already << '<%s />' % [tag]
end
elsif is_closing
if @tag_stack.empty?
error "Malformed: closing tag #{tag.inspect} in empty list"
elsif @tag_stack.last != tag
error "Malformed: tag <#{tag}> closes <#{@tag_stack.last}>"
end
close_script_style if script_style?
@already << @m.to_s
@tag_stack.pop
else
@already << @m.to_s
@tag_stack.push(tag) unless is_single
start_script_style if script_style?
end
end
def stuff_you_read
@already
end
def is_finished?
self.state == :inside_element && @tag_stack.empty?
end
private
def debug_state(note)
my_debug "#{@state}: #{note}: #{@m.to_s.inspect}"
end
def my_debug(s)
# puts "---" * 10 + "\n" + inspect + "\t>>>\t" + s
end
def error(s)
raise "Error: #{s} \n" + inspect, caller
end
def inspect
"HTML READER\n state=#{self.state} " +
"match=#{@m.to_s.inspect}\n" +
"Tag stack = #{@tag_stack.inspect} \n" +
"Before:\n" +
@already.gsub(/^/, '|') + "\n" +
"After:\n" +
@rest.gsub(/^/, '|') + "\n"
end
# Script and style tag handling
# -----------------------------
#
# XHTML, and XML parsers like REXML, require that certain characters be
# escaped within script or style tags. However, there are conflicts between
# documents served as XHTML vs HTML. So we need to be extra careful about
# how we escape these tags so they will even parse correctly. However, we
# also try to avoid adding that escaping unnecessarily.
#
# See http://dorward.me.uk/www/comments-cdata/ for a good explanation.
# Are we within a script or style tag?
def script_style?
%w(script style).include?(@tag_stack.last)
end
# Save our @already buffer elsewhere, and switch to using @already for the
# contents of this script or style tag.
def start_script_style
@before_already, @already = @already, ""
end
# Finish script or style tag content, wrapping it in CDATA if necessary,
# and add it to our original @already buffer.
def close_script_style
tag = @tag_stack.last
# See http://www.w3.org/TR/xhtml1/#C_4 for character sequences not allowed within an element body.
if @already =~ /<|&|\]\]>|--/
new_already = script_style_cdata_start(tag)
new_already << "\n" unless @already.start_with?("\n")
new_already << @already
new_already << "\n" unless @already.end_with?("\n")
new_already << script_style_cdata_end(tag)
@already = new_already
end
@before_already << @already
@already = @before_already
end
def script_style_cdata_start(tag)
(tag == 'script') ? "//<![CDATA[" : "/*<![CDATA[*/"
end
def script_style_cdata_end(tag)
(tag == 'script') ? "//]]>" : "/*]]>*/"
end
end
end