ruby-rdf/rdf

View on GitHub
lib/rdf/writer.rb

Summary

Maintainability
D
1 day
Test Coverage
# -*- encoding: utf-8 -*-
module RDF
  ##
  # The base class for RDF serializers.
  #
  # @example Loading an RDF writer implementation
  #   require 'rdf/ntriples'
  #
  # @example Iterating over known RDF writer classes
  #   RDF::Writer.each { |klass| puts klass.name }
  #
  # @example Obtaining an RDF writer class
  #   RDF::Writer.for(:ntriples)     #=> RDF::NTriples::Writer
  #   RDF::Writer.for("spec/data/output.nt")
  #   RDF::Writer.for(file_name:      "spec/data/output.nt")
  #   RDF::Writer.for(file_extension: "nt")
  #   RDF::Writer.for(content_type:   "application/n-triples")
  #
  # @example Instantiating an RDF writer class
  #   RDF::Writer.for(:ntriples).new($stdout) { |writer| ... }
  #
  # @example Serializing RDF statements into a file
  #   RDF::Writer.open("spec/data/output.nt") do |writer|
  #     graph.each_statement do |statement|
  #       writer << statement
  #     end
  #   end
  #
  # @example Serializing RDF statements into a string
  #   RDF::Writer.for(:ntriples).buffer do |writer|
  #     graph.each_statement do |statement|
  #       writer << statement
  #     end
  #   end
  #
  # @example Detecting invalid output
  #   logger = Logger.new([])
  #   RDF::Writer.for(:ntriples).buffer(logger: logger) do |writer|
  #     statement = RDF::Statement.new(
  #       RDF::URI("https://rubygems.org/gems/rdf"),
  #       RDF::URI("http://purl.org/dc/terms/creator"),
  #       nil)
  #     writer << statement
  #   end # => RDF::WriterError
  #   logger.empty? => false
  #
  # @abstract
  # @see RDF::Format
  # @see RDF::Reader
  class Writer
    extend  ::Enumerable
    extend  RDF::Util::Aliasing::LateBound
    include RDF::Util::Logger
    include RDF::Writable

    ##
    # Iterates over the writer classes provided by the registered formats.
    #
    # Every entry of {RDF::Format} is asked for its writer; entries that have
    # none (`nil`) are skipped.
    #
    # @yield  [klass]
    # @yieldparam  [Class] klass
    # @yieldreturn [void] ignored
    # @return [Enumerator]
    def self.each(&block)
      writers = RDF::Format.map(&:writer)
      writers.compact.each(&block)
    end

    ##
    # Finds an RDF writer class based on the given criteria.
    #
    # When called on a writer class that is itself registered with a format,
    # that format's writer is returned; otherwise the criteria are handed to
    # `Format.for`.
    #
    # @overload for(format)
    #   Finds an RDF writer class based on a symbolic name.
    #
    #   @param  [Symbol] format
    #   @return [Class]
    #
    # @overload for(filename)
    #   Finds an RDF writer class based on a file name.
    #
    #   @param  [String] filename
    #   @return [Class]
    #
    # @overload for(options = {})
    #   Finds an RDF writer class based on various options.
    #
    #   @param  [Hash{Symbol => Object}] options
    #   @option options [String, #to_s]   :file_name      (nil)
    #   @option options [Symbol, #to_sym] :file_extension (nil)
    #   @option options [String, #to_s]   :content_type   (nil)
    #   @return [Class]
    #
    # @return [Class, nil] `nil` when no matching format is found
    # @raise  [ArgumentError] if more than one positional argument is given
    def self.for(*arg, &block)
      if arg.length > 1
        raise ArgumentError, "Writer.for accepts zero or one argument, got #{arg.length}."
      end

      criteria = arg.first # nil when called without arguments
      # Hash criteria are narrowed to formats that actually provide a writer.
      criteria = criteria.merge(has_writer: true) if criteria.is_a?(Hash)

      format = self.format || Format.for(criteria)
      format.writer if format
    end

    ##
    # Looks up the RDF serialization format class whose writer is this class.
    #
    # The lookup only happens when `klass` is `nil`; when a `klass` is passed
    # this implementation returns `nil` without doing anything.
    #
    # @param  [Class, nil] klass
    # @return [Class, nil] the matching format, or `nil` if none is registered
    def self.format(klass = nil)
      return nil unless klass.nil?

      Format.each do |candidate|
        return candidate if candidate.writer == self
      end
      nil # no registered format uses this writer
    end

    ##
    # Describes the command-line options this writer understands, for use in
    # automatic Writer provisioning.
    #
    # Each option carries a block that converts the raw argument into the
    # option value.
    #
    # @return [Array<RDF::CLI::Option>]
    def self.options
      canonicalize = RDF::CLI::Option.new(
        symbol: :canonicalize,
        datatype: TrueClass,
        control: :checkbox,
        on: ["--canonicalize"],
        description: "Canonicalize input/output."
      ) { true }

      encoding = RDF::CLI::Option.new(
        symbol: :encoding,
        datatype: Encoding,
        control: :text,
        on: ["--encoding ENCODING"],
        description: "The encoding of the input stream."
      ) { |name| Encoding.find(name) }

      # "pfx:uri,pfx:uri" => {pfx: RDF::URI(uri), ...}; only the first colon
      # of each pair separates the prefix from the URI.
      prefixes = RDF::CLI::Option.new(
        symbol: :prefixes,
        datatype: Hash,
        multiple: true,
        control: :none,
        on: ["--prefixes PREFIX,PREFIX"],
        description: "A comma-separated list of prefix:uri pairs."
      ) do |list|
        list.split(',').each_with_object({}) do |pair, mapping|
          name, uri = pair.split(':', 2)
          mapping[name.to_sym] = RDF::URI(uri)
        end
      end

      unique_bnodes = RDF::CLI::Option.new(
        symbol: :unique_bnodes,
        datatype: TrueClass,
        control: :checkbox,
        on: ["--unique-bnodes"],
        description: "Use unique Node identifiers."
      ) { true }

      [canonicalize, encoding, prefixes, unique_bnodes]
    end

    class << self
      alias_method :format_class, :format

      ##
      # Decides whether the given accept-params are acceptable for invoking this writer. The same `accept_params` will subsequently be provided to the writer instance.
      #
      # Without a block every set of parameters is accepted; with a block the block's result is returned.
      #
      # @example rejecting a writer based on a profile
      #   JSON::LD::Writer.accept?(profile: "http://www.w3.org/ns/json-ld#compacted http://example.org/black-listed")
      #     # => false
      #
      # @param [Hash{Symbol => String}] accept_params
      # @yield [accept_params] if a block is given, returns the result of evaluating that block
      # @yieldparam [Hash{Symbol => String}] accept_params
      # @return [Boolean]
      # @see    http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.1
      def accept?(accept_params)
        return true unless block_given?

        yield accept_params
      end
    end

    ##
    # Serializes `data` either to the given stream/file or, when no `io` is
    # given, into a string buffer.
    #
    # When `io` is a String it is treated as a file path: the file is opened
    # for writing and closed again once serialization has finished (or failed).
    # A stream passed in by the caller is left open.
    #
    # @param  [RDF::Enumerable, #each] data
    #   the graph or repository to dump
    # @param  [IO, File, String] io
    #   the output stream or file to write to
    # @param  [Hash{Symbol => Object}] options
    #   passed to {RDF::Writer#initialize} or {RDF::Writer.buffer}
    # @return [void]
    def self.dump(data, io = nil, **options)
      # Only a file opened here is ours to close.
      owns_io = io.is_a?(String)
      io = File.open(io, 'w') if owns_io
      method = data.respond_to?(:each_statement) ? :each_statement : :each
      if io
        begin
          new(io, **options) do |writer|
            io.set_encoding(writer.encoding) if io.respond_to?(:set_encoding)
            data.send(method) do |statement|
              writer << statement
            end
            writer.flush
          end
        ensure
          io.close if owns_io
        end
      else
        buffer(**options) do |writer|
          data.send(method) do |statement|
            writer << statement
          end
        end
      end
    end

    ##
    # Runs a writer against an in-memory `StringIO` and returns what was
    # written.
    #
    # The buffer's encoding is set to the writer's encoding before the block
    # is invoked.
    #
    # @param  [Hash{Symbol => Object}] options
    #   passed to {RDF::Writer#initialize}
    # @yield  [writer]
    # @yieldparam  [RDF::Writer] writer
    # @yieldreturn [void]
    # @return [String]
    # @raise [ArgumentError] if no block is provided
    def self.buffer(*args, **options, &block)
      raise ArgumentError, "block expected" if block.nil?

      StringIO.open do |sink|
        new(sink, *args, **options) do |writer|
          sink.set_encoding(writer.encoding)
          block.call(writer)
        end
        sink.string
      end
    end

    ##
    # Opens `filename` for binary writing and serializes into it.
    #
    # The writer class is looked up through {RDF::Writer.for}, using `format`
    # when given, otherwise the options with `:file_name` defaulting to
    # `filename`. The file's encoding is set to the writer's encoding before
    # the block is called with the writer.
    #
    # @param  [String, #to_s] filename
    # @param [Symbol] format (nil)
    # @param  [Hash{Symbol => Object}] options
    #   any additional options (see {RDF::Writer#initialize} and {RDF::Format.for})
    # @return [RDF::Writer]
    def self.open(filename, format: nil, **options, &block)
      File.open(filename, 'wb') do |file|
        criteria = format || options.merge(file_name: options[:file_name] || filename)
        writer_class = self.for(criteria)
        writer_class.new(file, **options) do |writer|
          file.set_encoding(writer.encoding)
          block.call(writer)
        end
      end
    end

    ##
    # Returns a symbol appropriate to use with RDF::Writer.for().
    #
    # The symbol is obtained by calling `to_sym` on the format returned by
    # {RDF::Writer.format}.
    #
    # @return [Symbol]
    def self.to_sym
      self.format.to_sym
    end

    ##
    # Returns a symbol appropriate to use with RDF::Writer.for().
    #
    # Instance-level convenience that delegates to the writer's class.
    #
    # @return [Symbol]
    def to_sym
      self.class.to_sym
    end
    
    ##
    # Initializes the writer.
    #
    # When a block is given, the prologue is written, the block is run, and
    # the epilogue is written. A block taking one argument receives the
    # writer; any other block is evaluated in the context of the writer.
    #
    # @param  [IO, File] output
    #   the output stream
    # @param  [Hash{Symbol => Object}] options
    #   any additional options
    # @option options [Encoding, String, Symbol] :encoding
    #   the encoding to use on the output stream.
    #   Defaults to the format associated with `content_encoding`.
    # @option options [Boolean]  :canonicalize (false)
    #   whether to canonicalize terms when serializing
    # @option options [Boolean]  :validate (false)
    #   whether to validate terms when serializing
    # @option options [Hash]     :prefixes     (Hash.new)
    #   the prefix mappings to use (not supported by all writers)
    # @option options [#to_s]    :base_uri     (nil)
    #   the base URI to use when constructing relative URIs (not supported
    #   by all writers)
    # @option options [Boolean]  :unique_bnodes   (false)
    #   Use unique {Node} identifiers, defaults to using the identifier which the node was originally initialized with (if any). Implementations should ensure that Nodes are serialized using a unique representation independent of any identifier used when creating the node. See {NTriples::Writer#format_node}
    # @option options [Hash{Symbol => String}] :accept_params
    #   Parameters from ACCEPT header entry for the media-range matching this writer.
    # @yield  [writer] `self`
    # @yieldparam  [RDF::Writer] writer
    # @yieldreturn [void]
    def initialize(output = $stdout, **options, &block)
      @output      = output
      @options     = options.dup
      @nodes       = {}
      @node_id     = 0
      @node_id_map = {}

      return unless block_given?

      write_prologue
      if block.arity == 1
        block.call(self)
      else
        instance_eval(&block)
      end
      write_epilogue
    end

    ##
    # Any additional options for this writer.
    #
    # This is the hash stored by {#initialize}, which keeps a duplicate of the
    # keyword options it was given.
    #
    # @return [Hash]
    # @since  0.2.2
    attr_reader :options

    ##
    # Returns the base URI used for this writer, built from the `:base_uri`
    # option.
    #
    # @example
    #   writer.base_uri  #=> RDF::URI('http://example.org/')
    #
    # @return [RDF::URI, nil] `nil` when no `:base_uri` option is set
    # @since  0.3.4
    def base_uri
      configured = @options[:base_uri]
      return unless configured

      RDF::URI(configured)
    end

    ##
    # Returns the URI prefixes currently defined for this writer.
    #
    # The mapping lives in the `:prefixes` entry of the writer's options; an
    # empty hash is stored there on first access when none has been set.
    #
    # @example
    #   writer.prefixes[:dc]  #=> RDF::URI('http://purl.org/dc/terms/')
    #
    # @return [Hash{Symbol => RDF::URI}]
    # @since  0.2.2
    def prefixes
      @options[:prefixes] ||= {}
    end

    ##
    # Defines the given URI prefixes for this writer.
    #
    # The given hash replaces the `:prefixes` entry of the writer's options;
    # it is stored as-is, not merged or copied.
    #
    # @example
    #   writer.prefixes = {
    #     dc: RDF::URI('http://purl.org/dc/terms/'),
    #   }
    #
    # @param  [Hash{Symbol => RDF::URI}] prefixes
    # @return [Hash{Symbol => RDF::URI}]
    # @since  0.3.0
    def prefixes=(prefixes)
      @options[:prefixes] = prefixes
    end

    ##
    # Defines or looks up a named URI prefix for this writer.
    #
    # With a `uri` the prefix is (re)defined; without one the current mapping
    # is returned. A name whose string form is empty is stored under the `nil`
    # key; any other name is converted to a Symbol.
    #
    # @example Defining a URI prefix
    #   writer.prefix :dc, RDF::URI('http://purl.org/dc/terms/')
    #
    # @example Returning a URI prefix
    #   writer.prefix(:dc)    #=> RDF::URI('http://purl.org/dc/terms/')
    #
    # @overload prefix(name, uri)
    #   @param  [Symbol, #to_s]   name
    #   @param  [RDF::URI, #to_s] uri
    #
    # @overload prefix(name)
    #   @param  [Symbol, #to_s]   name
    #
    # @return [RDF::URI]
    def prefix(name, uri = nil)
      key =
        if name.to_s.empty?
          nil
        elsif name.respond_to?(:to_sym)
          name.to_sym
        else
          name.to_s.to_sym
        end

      return prefixes[key] if uri.nil?

      prefixes[key] = uri
    end
    alias_method :prefix!, :prefix

    ##
    # Returns the encoding of the output stream.
    #
    # A String or Symbol `:encoding` option is resolved with `Encoding.find`;
    # an Encoding is returned unchanged. Otherwise the option falls back to,
    # and is filled in with, the `content_encoding` of this writer's format.
    #
    # @return [Encoding]
    def encoding
      requested = @options[:encoding]
      case requested
      when Encoding
        requested
      when String, Symbol
        Encoding.find(requested.to_s)
      else
        @options[:encoding] ||= Encoding.find(self.class.format.content_encoding.to_s)
      end
    end

    ##
    # Returns `true` if statements and terms should be validated.
    #
    # The result is the value of the `:validate` option as given, so it is
    # `nil` when the option was not supplied.
    #
    # @return [Boolean] `true` or `false`
    # @since  1.0.8
    def validate?
      @options[:validate]
    end

    ##
    # Returns `true` if terms should be in canonical form.
    #
    # The result is the value of the `:canonicalize` option as given, so it is
    # `nil` when the option was not supplied.
    #
    # @note This is for term canonicalization, for graph/dataset canonicalization use `RDF::Normalize`.
    #
    # @return [Boolean] `true` or `false`
    # @since  1.0.8
    def canonicalize?
      @options[:canonicalize]
    end

    ##
    # Flushes the underlying output stream, if it supports flushing.
    #
    # @return [self]
    def flush
      sink = @output
      sink.flush if sink.respond_to?(:flush)
      self
    end
    alias_method :flush!, :flush

    ##
    # Records how many errors have been logged so far, so that
    # {#write_epilogue} can detect errors logged while writing.
    #
    # @return [self]
    # @abstract
    def write_prologue
      errors_so_far = log_statistics[:error]
      @logged_errors_at_prolog = errors_so_far.to_i
      self
    end

    ##
    # Raises if more errors have been logged than were recorded by
    # {#write_prologue}.
    #
    # @return [self]
    # @raise [RDF::WriterError] if errors logged during processing.
    # @abstract
    def write_epilogue
      # `to_i` turns a missing baseline (the prologue never recorded one) into
      # zero instead of failing on an Integer-to-nil comparison.
      baseline = @logged_errors_at_prolog.to_i
      if log_statistics[:error].to_i > baseline
        raise RDF::WriterError, "Errors found during processing"
      end
      self
    end

    ##
    # Writes a comment to the output.
    #
    # This base implementation ignores `text` and writes nothing.
    #
    # @param  [String] text
    # @return [self]
    # @abstract
    def write_comment(text)
      self
    end

    ##
    # Add a statement to the writer. This will check to ensure that the statement is complete (no nil terms) and is valid, if the `:validate` option is set.
    #
    # Additionally, it will de-duplicate BNode terms sharing a common identifier.
    #
    # The statement is written through `write_quad` when the writer responds to it, otherwise through {#write_triple}.
    #
    # @param  [RDF::Statement] statement
    # @return [self]
    # @note logs error if attempting to write an invalid {RDF::Statement} or if canonicalizing a statement which cannot be canonicalized.
    def write_statement(statement)
      statement = statement.canonicalize! if canonicalize?

      # Make sure BNodes in statement use unique identifiers
      if statement.node?
        # Run for its side effects only (nodes are renamed in place).
        statement.to_quad.each do |term|
          next unless term.is_a?(RDF::Node)

          term = term.original while term.original
          @nodes[term] ||= begin
            # Account for duplicated nodes
            @node_id_map[term.to_s] ||= term
            if !@node_id_map[term.to_s].equal?(term)
              # Rename node
              term.make_unique!
              @node_id_map[term.to_s] = term
            end
          end
        end
        statement = RDF::Statement.from(statement.to_quad)
      end

      if statement.incomplete?
        log_error "Statement #{statement.inspect} is incomplete"
      elsif validate? && statement.invalid?
        log_error "Statement #{statement.inspect} is invalid"
      elsif respond_to?(:write_quad)
        write_quad(*statement.to_quad)
      else
        write_triple(*statement.to_triple)
      end
      self
    rescue ArgumentError => e
      log_error e.message
      # Keep the documented return value on the error path as well.
      self
    end
    alias_method :insert_statement, :write_statement # support the RDF::Writable interface

    ##
    # Writes each of the given triples by passing its elements to
    # {#write_triple}.
    #
    # @param  [Array<Array(RDF::Resource, RDF::URI, RDF::Term)>] triples
    # @return [self]
    # @note logs error if attempting to write an invalid {RDF::Statement} or if canonicalizing a statement which cannot be canonicalized.
    def write_triples(*triples)
      triples.each { |triple| write_triple(*triple) }
      self
    end

    ##
    # Writes a single triple. The base class provides no implementation.
    #
    # @param  [RDF::Resource] subject
    # @param  [RDF::URI]      predicate
    # @param  [RDF::Term]     object
    # @return [self]
    # @raise  [NotImplementedError] unless implemented in subclass
    # @note logs error if attempting to write an invalid {RDF::Statement} or if canonicalizing a statement which cannot be canonicalized.
    # @abstract
    def write_triple(subject, predicate, object)
      # Concrete writers must override this method.
      raise NotImplementedError, "#{self.class}#write_triple"
    end

    ##
    # Formats a term by dispatching on its type to the matching `format_*`
    # method. A plain String is first wrapped in an `RDF::Literal`.
    #
    # @param  [RDF::Term] term
    # @return [String, nil] `nil` for any other kind of object
    # @since  0.3.0
    def format_term(term, **options)
      case term
      when String
        literal = RDF::Literal(term, **options)
        format_literal(literal, **options)
      when RDF::List
        format_list(term, **options)
      when RDF::Literal
        format_literal(term, **options)
      when RDF::URI
        format_uri(term, **options)
      when RDF::Node
        format_node(term, **options)
      when RDF::Statement
        format_quotedTriple(term, **options)
      end
    end

    ##
    # Formats a blank node. The base class provides no implementation.
    #
    # @param  [RDF::Node] value
    # @param  [Hash{Symbol => Object}] options = ({})
    # @option options [Boolean] :unique_bnodes (false)
    #   Serialize node using unique identifier, rather than any used to create the node.
    # @return [String]
    # @raise  [NotImplementedError] unless implemented in subclass
    # @abstract
    def format_node(value, **options)
      # Concrete writers must override this method.
      raise NotImplementedError, "#{self.class}#format_node"
    end

    ##
    # Formats a URI. The base class provides no implementation.
    #
    # @param  [RDF::URI] value
    # @param  [Hash{Symbol => Object}] options = ({})
    # @return [String]
    # @raise  [NotImplementedError] unless implemented in subclass
    # @abstract
    def format_uri(value, **options)
      # Concrete writers must override this method.
      raise NotImplementedError, "#{self.class}#format_uri"
    end

    ##
    # Formats a literal. The base class provides no implementation.
    #
    # @param  [RDF::Literal, String, #to_s] value
    # @param  [Hash{Symbol => Object}] options = ({})
    # @return [String]
    # @raise  [NotImplementedError] unless implemented in subclass
    # @abstract
    def format_literal(value, **options)
      # Concrete writers must override this method.
      raise NotImplementedError, "#{self.class}#format_literal"
    end

    ##
    # Formats a list by formatting its subject term with {#format_term},
    # passing the options through.
    #
    # @param  [RDF::List] value
    # @param  [Hash{Symbol => Object}] options = ({})
    # @return [String]
    # @abstract
    # @since  0.2.3
    def format_list(value, **options)
      format_term(value.subject, **options)
    end

    ##
    # Formats a referenced triple. The base class provides no implementation.
    #
    # @example
    #     <<<s> <p> <o>>> <p> <o> .
    #
    # @param  [RDF::Statement] value
    # @param  [Hash{Symbol => Object}] options = ({})
    # @return [String]
    # @raise  [NotImplementedError] unless implemented in subclass
    # @abstract
    def format_quotedTriple(value, **options)
      # Name this method in the message, matching the other abstract format_* methods.
      raise NotImplementedError, "#{self.class}#format_quotedTriple"
    end

  protected

    ##
    # Writes the given strings to the output stream, each converted to the
    # writer's encoding first.
    #
    # @return [void]
    def puts(*args)
      encoded = args.map { |line| line.encode(encoding) }
      @output.puts(*encoded)
    end

    ##
    # Returns a string identifying the given term: the cached (or freshly
    # computed) `to_base` form for a node, the string form of `to_uri` for
    # anything that responds to it, and `to_s` otherwise.
    #
    # @param  [RDF::Resource] term
    # @return [String]
    def uri_for(term)
      if term.is_a?(RDF::Node)
        @nodes[term] ||= term.to_base
      elsif term.respond_to?(:to_uri)
        term.to_uri.to_s
      else
        term.to_s
      end
    end

    ##
    # Advances the writer's node counter and returns a blank node label built
    # from the new value.
    #
    # @return [String]
    def node_id
      @node_id += 1
      "_:n#{@node_id}"
    end

    ##
    # Backslash-escapes backslashes, backspace, form feed, tab, newline,
    # carriage return and double quotes in the given string.
    #
    # @param  [String] string
    # @return [String]
    def escaped(string)
      # Applied in order: the backslash rule must run first so that the
      # backslashes added by the later rules are not escaped again.
      replacements = [
        ['\\', '\\\\\\\\'],
        ["\b", '\\b'],
        ["\f", '\\f'],
        ["\t", '\\t'],
        ["\n", '\\n'],
        ["\r", '\\r'],
        ['"',  '\\"']
      ]
      replacements.inject(string) do |text, (raw, replacement)|
        text.gsub(raw, replacement)
      end
    end

    ##
    # Wraps the given string in double quotes. No escaping is performed.
    #
    # @param  [String] string
    # @return [String]
    def quoted(string)
      %("#{string}")
    end

  private

    @@subclasses = [] # @private -- every class that has inherited from Writer

    ##
    # Inheritance hook: records the new subclass in `@@subclasses` and then
    # passes the notification on via `super`.
    #
    # @private
    # @param  [Class] child the class that inherited from this one
    # @return [void]
    def self.inherited(child)
      @@subclasses << child
      super
    end
  end # Writer

  ##
  # The base class for RDF serialization errors.
  #
  # Raised by {RDF::Writer#write_epilogue} when errors were logged while
  # writing. It is an `IOError`, so code rescuing `IOError` also catches it.
  class WriterError < IOError
  end # WriterError
end # RDF