sparklemotion/nokogiri

View on GitHub
lib/nokogiri/xml/sax/parser.rb

Summary

Maintainability
A
25 mins
Test Coverage
A
100%
# frozen_string_literal: true

module Nokogiri
  module XML
    module SAX
      ###
      # This parser is a SAX style parser that reads its input as it
      # deems necessary.  The parser takes a Nokogiri::XML::SAX::Document,
      # an optional encoding, then given an XML input, sends messages to
      # the Nokogiri::XML::SAX::Document.
      #
      # Here is an example of using this parser:
      #
      #   # Create a subclass of Nokogiri::XML::SAX::Document and implement
      #   # the events we care about:
      #   class MyDoc < Nokogiri::XML::SAX::Document
      #     def start_element name, attrs = []
      #       puts "starting: #{name}"
      #     end
      #
      #     def end_element name
      #       puts "ending: #{name}"
      #     end
      #   end
      #
      #   # Create our parser
      #   parser = Nokogiri::XML::SAX::Parser.new(MyDoc.new)
      #
      #   # Send some XML to the parser
      #   parser.parse(File.open(ARGV[0]))
      #
      # For more information about SAX parsers, see Nokogiri::XML::SAX.  Also
      # see Nokogiri::XML::SAX::Document for the available events.
      class Parser
        # Simple struct with +localname+, +prefix+, +uri+ and +value+ members.
        class Attribute < Struct.new(:localname, :prefix, :uri, :value)
        end

        # Encodings this parser supports, keyed by upper-case name.  The
        # integer value is what #parse_io hands to ParserContext.io.
        # Frozen so the lookup table cannot be altered at runtime.
        ENCODINGS = {
          "NONE" => 0, # No char encoding detected
          "UTF-8" => 1, # UTF-8
          "UTF16LE" => 2, # UTF-16 little endian
          "UTF16BE" => 3, # UTF-16 big endian
          "UCS4LE" => 4, # UCS-4 little endian
          "UCS4BE" => 5, # UCS-4 big endian
          "EBCDIC" => 6, # EBCDIC uh!
          "UCS4-2143" => 7, # UCS-4 unusual ordering
          "UCS4-3412" => 8, # UCS-4 unusual ordering
          "UCS2" => 9, # UCS-2
          "ISO-8859-1" => 10, # ISO-8859-1 ISO Latin 1
          "ISO-8859-2" => 11, # ISO-8859-2 ISO Latin 2
          "ISO-8859-3" => 12, # ISO-8859-3
          "ISO-8859-4" => 13, # ISO-8859-4
          "ISO-8859-5" => 14, # ISO-8859-5
          "ISO-8859-6" => 15, # ISO-8859-6
          "ISO-8859-7" => 16, # ISO-8859-7
          "ISO-8859-8" => 17, # ISO-8859-8
          "ISO-8859-9" => 18, # ISO-8859-9
          "ISO-2022-JP" => 19, # ISO-2022-JP
          "SHIFT-JIS" => 20, # Shift_JIS
          "EUC-JP" => 21, # EUC-JP
          "ASCII" => 22, # pure ASCII
        }.freeze

        # The Nokogiri::XML::SAX::Document where events will be sent.
        attr_accessor :document

        # The encoding being used for this document.
        attr_accessor :encoding

        ###
        # Create a new Parser with +doc+ and +encoding+.
        #
        # +encoding+ is matched case-insensitively against the keys of
        # ENCODINGS and stored in its upper-case form.  Raises ArgumentError
        # when +encoding+ is not one of those keys.
        def initialize(doc = Nokogiri::XML::SAX::Document.new, encoding = "UTF-8")
          @encoding = check_encoding(encoding)
          @document = doc
          @warned   = false
        end

        ###
        # Parse given +thing+ which may be a string containing xml, or an
        # IO object.
        #
        # Anything responding to both +read+ and +close+ is handed to
        # #parse_io; everything else goes to #parse_memory.  An optional
        # block is forwarded to whichever method is chosen.
        def parse(thing, &block)
          if thing.respond_to?(:read) && thing.respond_to?(:close)
            parse_io(thing, &block)
          else
            parse_memory(thing, &block)
          end
        end

        ###
        # Parse given +io+
        #
        # +encoding+ defaults to the parser's own encoding and is validated
        # the same way as in #initialize (ArgumentError when unsupported).
        # The ParserContext is yielded to the block, if one is given, before
        # parsing starts.  Returns the result of ParserContext#parse_with.
        def parse_io(io, encoding = @encoding)
          ctx = ParserContext.io(io, ENCODINGS[check_encoding(encoding)])
          yield ctx if block_given?
          ctx.parse_with(self)
        end

        ###
        # Parse a file with +filename+
        #
        # Raises ArgumentError when +filename+ is nil or false,
        # Errno::ENOENT when it does not exist and Errno::EISDIR when it is
        # a directory.  The ParserContext is yielded to the block, if one is
        # given, before parsing starts.  Returns the result of
        # ParserContext#parse_with.
        def parse_file(filename)
          raise ArgumentError, "a filename is required" unless filename
          # Attach the offending path so the error is actionable.
          raise Errno::ENOENT, filename.to_s unless File.exist?(filename)
          raise Errno::EISDIR, filename.to_s if File.directory?(filename)

          ctx = ParserContext.file(filename)
          yield ctx if block_given?
          ctx.parse_with(self)
        end

        ###
        # Parse the in-memory +data+
        #
        # The ParserContext is yielded to the block, if one is given, before
        # parsing starts.  Returns the result of ParserContext#parse_with.
        def parse_memory(data)
          ctx = ParserContext.memory(data)
          yield ctx if block_given?
          ctx.parse_with(self)
        end

        private

        # Upcase +encoding+ and return it, raising ArgumentError unless the
        # result is a key of ENCODINGS.  Values that cannot be upcased
        # (e.g. +nil+) also raise ArgumentError rather than NoMethodError.
        def check_encoding(encoding)
          unless encoding.respond_to?(:upcase)
            raise ArgumentError, "#{encoding.inspect} is not a valid encoding"
          end

          encoding.upcase.tap do |enc|
            raise ArgumentError, "'#{enc}' is not a valid encoding" unless ENCODINGS.key?(enc)
          end
        end
      end
    end
  end
end