fidothe/saxon-rb

View on GitHub
lib/saxon/document_builder.rb

Summary

Maintainability
A
0 mins
Test Coverage
A
94%
require 'saxon/xdm'

module Saxon
  # Builds XDM objects from XML sources, for use in XSLT or for query and
  # access
  class DocumentBuilder
    # Provides a simple configuration DSL for DocumentBuilders.
    # @see DocumentBuilder.create
    class ConfigurationDSL
      # @api private
      #
      # Wrap +document_builder+ in a new DSL instance and +instance_exec+ the
      # passed-in block against that instance, so that bare calls inside the
      # block (e.g. +line_numbering true+) are forwarded to the builder.
      #
      # @param document_builder [DocumentBuilder] the builder being configured
      # @param block [Proc] the configuration block to evaluate
      # @return [Object] whatever the block itself returns
      def self.define(document_builder, block)
        dsl = new(document_builder)
        dsl.instance_exec(&block)
      end

      # @api private
      #
      # @param document_builder [DocumentBuilder] the builder every DSL call is
      #   forwarded to
      def initialize(document_builder)
        @document_builder = document_builder
      end

      # Sets line numbering on or off
      #
      # @see DocumentBuilder#line_numbering=
      #
      # @param value [Boolean] on (true) or off (false)
      def line_numbering(value)
        @document_builder.line_numbering = value
      end

      # Sets the base URI of documents created using this instance.
      #
      # @see DocumentBuilder#base_uri=
      #
      # @param value [String, URI::File, URI::HTTP] The (absolute) base URI to use
      def base_uri(value)
        @document_builder.base_uri = value
      end

      # Sets the whitespace stripping policy used for documents created using
      # this instance.
      #
      # @see DocumentBuilder#whitespace_stripping_policy=
      #
      # @param value [Symbol, Proc, Saxon::S9API::WhitespaceStrippingPolicy] the
      #   policy to use
      def whitespace_stripping_policy(value)
        @document_builder.whitespace_stripping_policy = value
      end

      # Switches DTD validation on or off for documents created using this
      # instance.
      #
      # @see DocumentBuilder#dtd_validation=
      #
      # @param value [Boolean] on (true) or off (false)
      def dtd_validation(value)
        @document_builder.dtd_validation = value
      end
    end


    # Create a new DocumentBuilder that can be used to build new XML documents
    # with the passed-in {Saxon::Processor}. If a block is passed in it's
    # executed as a DSL for configuring the builder instance.
    #
    # @param processor [Saxon::Processor] the Processor
    # @yield A DocumentBuilder configuration DSL block
    # @return [Saxon::DocumentBuilder] the new instance
    def self.create(processor, &block)
      # Ask the processor's underlying Java object for an s9api builder, then
      # pass any configuration block on to #initialize unchanged.
      s9_builder = processor.to_java.newDocumentBuilder
      new(s9_builder, &block)
    end

    # Reader for the wrapped s9api DocumentBuilder. It is made private on the
    # line below, so code outside this class goes through the public methods
    # instead.
    attr_reader :s9_document_builder
    private :s9_document_builder

    # @api private
    # @param [net.sf.saxon.s9api.DocumentBuilder] s9_document_builder The
    #   Saxon DocumentBuilder instance to wrap
    def initialize(s9_document_builder, &block)
      @s9_document_builder = s9_document_builder
      # Configuration is optional: the DSL only runs when a block was supplied.
      ConfigurationDSL.define(self, block) if block_given?
    end

    # Report whether documents created using this instance will keep track of
    # the line and column numbers of elements.
    #
    # @return [Boolean] whether line numbering will be tracked
    def line_numbering?
      # The flag lives on the wrapped s9api builder; nothing is cached here.
      builder = s9_document_builder
      builder.isLineNumbering
    end


    # Switch tracking of line and column numbers for elements in documents
    # created by this instance on or off
    #
    # @see https://www.saxonica.com/documentation9.9/index.html#!javadoc/net.sf.saxon.s9api/DocumentBuilder@setLineNumbering
    #
    # @param on_or_not [Boolean] whether or not to track line numbering
    def line_numbering=(on_or_not)
      # Forward the flag to the wrapped s9api builder.
      builder = s9_document_builder
      builder.setLineNumbering(on_or_not)
    end

    # Return the default base URI to be used when building documents using this
    # instance. This value will be ignored if the source being parsed has an
    # intrinsic base URI (e.g. a File).
    #
    # Returns +nil+ if no URI is set (the default).
    #
    # @return [nil, URI::File, URI::HTTP] the default base URI (or nil)
    def base_uri
      java_uri = s9_document_builder.getBaseURI
      # Nothing configured on the Java side: report nil rather than parsing "".
      return if java_uri.nil?

      # Round-trip through the string form to get a Ruby URI object.
      URI(java_uri.to_s)
    end

    # Set the base URI of documents created using this instance. This value will
    # be ignored if the source being parsed has an intrinsic base URI (e.g. a
    # File)
    #
    # @see https://www.saxonica.com/documentation9.9/index.html#!javadoc/net.sf.saxon.s9api/DocumentBuilder@setBaseURI
    #
    # @param uri [String, URI::File, URI::HTTP] The (absolute) base URI to use
    def base_uri=(uri)
      # Stringify first so String and Ruby URI arguments take the same path
      # into java.net.URI.
      java_uri = java.net.URI.new(uri.to_s)
      s9_document_builder.setBaseURI(java_uri)
    end

    # Return the Whitespace stripping policy for this instance. Returns one of
    # the standard policy names as a symbol, or the custom Java
    # WhitespaceStrippingPolicy if one was defined using
    # +#whitespace_stripping_policy = ->(qname) { ... }+. (See
    # {#whitespace_stripping_policy=} for more.)
    #
    # +:all+: All whitespace-only nodes will be discarded
    #
    # +:none+: No whitespace-only nodes will be discarded (the default if DTD or
    # schema validation is not in effect)
    #
    # +:ignorable+: Whitespace-only nodes inside elements defined as
    # element-only in the DTD or schema being used will be discarded (the
    # default if DTD or schema validation is in effect)
    #
    # +:unspecified+: the default, which in practice means :ignorable if DTD or
    # schema validation is in effect, and :none otherwise.
    #
    # @return [:all, :none, :ignorable, :unspecified, Saxon::S9API::WhitespaceStrippingPolicy]
    def whitespace_stripping_policy
      policies = Saxon::S9API::WhitespaceStrippingPolicy
      current = s9_document_builder.getWhitespaceStrippingPolicy
      # Standard policies are reported by name; anything else (a custom policy)
      # is handed back as the Java object itself.
      case current
      when policies::UNSPECIFIED then :unspecified
      when policies::NONE then :none
      when policies::IGNORABLE then :ignorable
      when policies::ALL then :all
      else current
      end
    end

    # Set the whitespace stripping policy to be used for documents built with
    # this instance.
    #
    # Possible values are:
    #
    # * One of the standard policies, as a symbol (+:all+, +:none+,
    #   +:ignorable+, +:unspecified+, see {#whitespace_stripping_policy}).
    # * A Java +net.sf.saxon.s9api.WhitespaceStrippingPolicy+ instance
    # * A Proc/lambda that is handed an element name as a {Saxon::QName}, and
    #   should return true (if whitespace should be stripped for this element)
    #   or false (it should not).
    # @example
    #   document_builder.whitespace_stripping_policy = ->(element_qname) {
    #     element_qname == Saxon::QName.clark("{http://example.org/}element-name")
    #   }
    #
    # @see https://www.saxonica.com/documentation9.9/index.html#!javadoc/net.sf.saxon.s9api/DocumentBuilder@setWhitespaceStrippingPolicy
    # @see https://www.saxonica.com/documentation9.9/index.html#!javadoc/net.sf.saxon.s9api/WhitespaceStrippingPolicy
    # @param policy [Symbol, Proc, Saxon::S9API::WhitespaceStrippingPolicy] the
    #   policy to use
    def whitespace_stripping_policy=(policy)
      # Resolve the argument to an s9api policy object, then install it on the
      # wrapped builder. Raises InvalidWhitespaceStrippingPolicyError for
      # anything that is not a known Symbol, a Proc, or a Java policy instance.
      s9_policy =
        case policy
        when :unspecified, :none, :ignorable, :all
          # Standard policies are looked up as upper-case constants on the
          # Java class.
          Saxon::S9API::WhitespaceStrippingPolicy.const_get(policy.to_s.upcase.to_sym)
        when Proc
          # Hand the caller's Proc a Saxon::QName rather than the raw s9api
          # QName, and collapse its result to a strict true/false so that any
          # truthy or nil return value reaches the Java side as a real boolean.
          predicate = ->(s9_qname) { policy.call(Saxon::QName.new(s9_qname)) ? true : false }
          Saxon::S9API::WhitespaceStrippingPolicy.makeCustomPolicy(predicate)
        when Saxon::S9API::WhitespaceStrippingPolicy
          # Already a Java policy object: pass it through untouched.
          policy
        else
          raise InvalidWhitespaceStrippingPolicyError, "#{policy.inspect} is not one of the allowed Symbols, or a custom policy"
        end
      s9_document_builder.setWhitespaceStrippingPolicy(s9_policy)
    end

    # @return [Boolean] whether DTD Validation is enabled
    def dtd_validation?
      # The setting lives on the wrapped s9api builder; nothing is cached here.
      builder = s9_document_builder
      builder.isDTDValidation
    end

    # Switches DTD validation on or off.
    #
    # It's important to note that DTD validation only applies to documents that
    # contain a +<!DOCTYPE>+, but switching DTD validation off doesn't stop the
    # XML parser Saxon uses from trying to retrieve the DTD that's referenced,
    # which can mean network requests. By default, the SAX parser Saxon uses
    # (Xerces) doesn't make use of XML catalogs, which causes problems when documents reference a DTD with a relative path as in:
    #   <!DOCTYPE root-element SYSTEM "example.dtd">
    # This can be controlled through a configuration option, however.
    #
    # @see https://www.saxonica.com/documentation9.9/index.html#!javadoc/net.sf.saxon.s9api/DocumentBuilder@setDTDValidation
    # @see https://www.saxonica.com/documentation9.9/index.html#!sourcedocs/controlling-parsing
    # @param on [Boolean] whether DTD Validation should be enabled
    def dtd_validation=(on)
      # Forward the flag to the wrapped s9api builder.
      builder = s9_document_builder
      builder.setDTDValidation(on)
    end

    # @param [Saxon::Source] source The Saxon::Source containing the source
    #   IO/string
    # @return [Saxon::XDM::Node] The Saxon::XDM::Node representing the root of the
    #   document tree
    def build(source)
      # Let the s9api builder build from the source's Java form, then wrap the
      # resulting Java node in the Ruby XDM node class.
      s9_node = s9_document_builder.build(source.to_java)
      XDM::Node.new(s9_node)
    end

    # @return [Java::NetSfSaxonS9api::DocumentBuilder] The underlying Java Saxon
    #   DocumentBuilder instance
    def to_java
      # Returns the value of the private s9_document_builder reader unchanged.
      s9_document_builder
    end
  end

  # Error raised when someone tries to set an invalid whitespace stripping
  # policy on a {DocumentBuilder}
  class InvalidWhitespaceStrippingPolicyError < RuntimeError
    # No body: adds nothing to RuntimeError beyond a distinct class name.
  end
end