# lib/nokogiri/xml/sax/parser.rb
# frozen_string_literal: true
module Nokogiri
module XML
module SAX
###
# This parser is a SAX style parser that reads its input as it
# deems necessary. The parser takes a Nokogiri::XML::SAX::Document,
# an optional encoding, then given an XML input, sends messages to
# the Nokogiri::XML::SAX::Document.
#
# Here is an example of using this parser:
#
#   # Create a subclass of Nokogiri::XML::SAX::Document and implement
#   # the events we care about:
#   class MyDoc < Nokogiri::XML::SAX::Document
#     def start_element name, attrs = []
#       puts "starting: #{name}"
#     end
#
#     def end_element name
#       puts "ending: #{name}"
#     end
#   end
#
#   # Create our parser
#   parser = Nokogiri::XML::SAX::Parser.new(MyDoc.new)
#
#   # Send some XML to the parser
#   parser.parse(File.open(ARGV[0]))
#
# For more information about SAX parsers, see Nokogiri::XML::SAX. Also
# see Nokogiri::XML::SAX::Document for the available events.
class Parser
  # Value object with four members: +localname+, +prefix+, +uri+ and +value+.
  class Attribute < Struct.new(:localname, :prefix, :uri, :value)
  end

  # Encodings this parser supports. Keys are the upper-case names accepted
  # by #initialize and #parse_io; values are the integer codes handed to
  # ParserContext.io. Frozen so the lookup table cannot be mutated by
  # accident.
  ENCODINGS = {
    "NONE" => 0, # No char encoding detected
    "UTF-8" => 1, # UTF-8
    "UTF16LE" => 2, # UTF-16 little endian
    "UTF16BE" => 3, # UTF-16 big endian
    "UCS4LE" => 4, # UCS-4 little endian
    "UCS4BE" => 5, # UCS-4 big endian
    "EBCDIC" => 6, # EBCDIC uh!
    "UCS4-2143" => 7, # UCS-4 unusual ordering
    "UCS4-3412" => 8, # UCS-4 unusual ordering
    "UCS2" => 9, # UCS-2
    "ISO-8859-1" => 10, # ISO-8859-1 ISO Latin 1
    "ISO-8859-2" => 11, # ISO-8859-2 ISO Latin 2
    "ISO-8859-3" => 12, # ISO-8859-3
    "ISO-8859-4" => 13, # ISO-8859-4
    "ISO-8859-5" => 14, # ISO-8859-5
    "ISO-8859-6" => 15, # ISO-8859-6
    "ISO-8859-7" => 16, # ISO-8859-7
    "ISO-8859-8" => 17, # ISO-8859-8
    "ISO-8859-9" => 18, # ISO-8859-9
    "ISO-2022-JP" => 19, # ISO-2022-JP
    "SHIFT-JIS" => 20, # Shift_JIS
    "EUC-JP" => 21, # EUC-JP
    "ASCII" => 22, # pure ASCII
  }.freeze

  # The Nokogiri::XML::SAX::Document where events will be sent.
  attr_accessor :document

  # The encoding being used for this document.
  attr_accessor :encoding

  ###
  # Create a new Parser with +doc+ and +encoding+.
  #
  # +encoding+ is matched case-insensitively against ENCODINGS and stored
  # in upper case. Raises ArgumentError when it is not listed there.
  def initialize(doc = Nokogiri::XML::SAX::Document.new, encoding = "UTF-8")
    @encoding = check_encoding(encoding)
    @document = doc
    # NOTE(review): @warned is never read inside this class; it is kept in
    # case code outside this file relies on it.
    @warned = false
  end

  ###
  # Parse given +thing+ which may be a string containing xml, or an
  # IO object.
  #
  # Anything responding to both +read+ and +close+ is handed to #parse_io;
  # everything else goes to #parse_memory. A given block is forwarded and
  # receives the parser context before parsing starts.
  def parse(thing, &block)
    if thing.respond_to?(:read) && thing.respond_to?(:close)
      parse_io(thing, &block)
    else
      parse_memory(thing, &block)
    end
  end

  ###
  # Parse given +io+ using +encoding+, which defaults to this parser's
  # current encoding.
  #
  # Raises ArgumentError when +encoding+ is not listed in ENCODINGS.
  # Yields the parser context to the block, if one is given, before
  # parsing. Returns the result of ParserContext#parse_with.
  def parse_io(io, encoding = @encoding)
    ctx = ParserContext.io(io, ENCODINGS[check_encoding(encoding)])
    yield ctx if block_given?
    ctx.parse_with(self)
  end

  ###
  # Parse a file with +filename+
  #
  # Raises ArgumentError when +filename+ is nil or false, Errno::ENOENT
  # when no such file exists and Errno::EISDIR when it is a directory.
  # Yields the parser context to the block, if one is given, before
  # parsing. Returns the result of ParserContext#parse_with.
  def parse_file(filename)
    raise ArgumentError, "no filename provided" unless filename
    # Name the offending path in the error. +to_s+ keeps Pathname arguments
    # working, since SystemCallError requires a String message.
    raise Errno::ENOENT, filename.to_s unless File.exist?(filename)
    raise Errno::EISDIR, filename.to_s if File.directory?(filename)

    ctx = ParserContext.file(filename)
    yield ctx if block_given?
    ctx.parse_with(self)
  end

  ###
  # Parse the in-memory +data+
  #
  # Yields the parser context to the block, if one is given, before
  # parsing. Returns the result of ParserContext#parse_with.
  def parse_memory(data)
    ctx = ParserContext.memory(data)
    yield ctx if block_given?
    ctx.parse_with(self)
  end

  private

  # Normalise +encoding+ to its upper-case name and make sure it is a key
  # of ENCODINGS.
  #
  # +to_s+ lets Symbols and Encoding objects through and turns +nil+ into
  # an ArgumentError rather than a NoMethodError. Returns the upper-case
  # name.
  def check_encoding(encoding)
    name = encoding.to_s.upcase
    raise ArgumentError, "'#{name}' is not a valid encoding" unless ENCODINGS.key?(name)

    name
  end
end
end
end
end