ruby-rdf/rdf

View on GitHub
lib/rdf/reader.rb

Summary

Maintainability
D
2 days
Test Coverage
# frozen_string_literal: true
module RDF
  ##
  # The base class for RDF parsers.
  #
  # @example Loading an RDF reader implementation
  #   require 'rdf/ntriples'
  #
  # @example Iterating over known RDF reader classes
  #   RDF::Reader.each { |klass| puts klass.name }
  #
  # @example Obtaining an RDF reader class
  #   RDF::Reader.for(:ntriples)     #=> RDF::NTriples::Reader
  #   RDF::Reader.for("etc/doap.nt")
  #   RDF::Reader.for(file_name:      "etc/doap.nt")
  #   RDF::Reader.for(file_extension: "nt")
  #   RDF::Reader.for(content_type:   "application/n-triples")
  #
  # @example Instantiating an RDF reader class
  #   RDF::Reader.for(:ntriples).new($stdin) { |reader| ... }
  #
  # @example Parsing RDF statements from a file
  #   RDF::Reader.open("etc/doap.nt") do |reader|
  #     reader.each_statement do |statement|
  #       puts statement.inspect
  #     end
  #   end
  #
  # @example Parsing RDF statements from a string
  #   data = StringIO.new(File.read("etc/doap.nt"))
  #   RDF::Reader.for(:ntriples).new(data) do |reader|
  #     reader.each_statement do |statement|
  #       puts statement.inspect
  #     end
  #   end
  #
  # @abstract
  # @see RDF::Format
  # @see RDF::Writer
  class Reader
    extend  ::Enumerable
    extend  RDF::Util::Aliasing::LateBound
    include RDF::Util::Logger
    include RDF::Readable
    include RDF::Enumerable # @since 0.3.0

    ##
    # Enumerates the reader classes of the formats enumerated by RDF::Format.
    #
    # Formats whose `reader` is `nil` are skipped.
    #
    # @yield  [klass]
    # @yieldparam [Class] klass
    # @return [Enumerator] when no block is given
    def self.each(&block)
      readers = RDF::Format.map { |format| format.reader }
      readers.reject { |reader| reader.nil? }.each(&block)
    end

    ##
    # Finds an RDF reader class based on the given criteria.
    #
    # If the reader class has a defined format, use that.
    #
    # @overload for(format)
    #   Finds an RDF reader class based on a symbolic name.
    #
    #   @param  [Symbol] format
    #   @return [Class]
    #
    # @overload for(filename)
    #   Finds an RDF reader class based on a file name.
    #
    #   @param  [String] filename
    #   @return [Class]
    #
    # @overload for(options = {})
    #   Finds an RDF reader class based on various options.
    #
    #   @param  [Hash{Symbol => Object}] options
    #   @option options [String, #to_s]   :file_name      (nil)
    #   @option options [Symbol, #to_sym] :file_extension (nil)
    #   @option options [String, #to_s]   :content_type   (nil)
    #   @option options [String]          :sample (nil)
    #     A sample of input used for performing format detection.
    #     If we find no formats, or we find more than one, and we have a sample, we can
    #     perform format detection to find a specific format to use, in which case
    #     we pick the first one we find
    #   @return [Class]
    #   @yieldreturn [String] another way to provide a sample; the block allows the
    #     sample to be retrieved lazily.
    #
    # @return [Class, nil] the reader of the matching format, or `nil` when no format is found
    # @raise  [ArgumentError] if more than one positional argument is given
    def self.for(*arg, &block)
      case arg.length
      when 0 then arg = nil
      when 1 then arg = arg.first
      else
        raise ArgumentError, "Reader.for accepts zero or one argument, got #{arg.length}."
      end
      # Hash criteria are extended with `has_reader: true` before being handed to Format.for.
      arg = arg.merge(has_reader: true) if arg.is_a?(Hash)
      # The format of this reader class, when it has one, wins over a Format.for lookup.
      if format = self.format || Format.for(arg, &block)
        format.reader
      end
    end

    ##
    # Retrieves the RDF serialization format class for this reader class.
    #
    # The formats enumerated by `Format.each` are searched for the first one
    # whose `reader` is this class.
    #
    # @param  [Object] klass (nil)
    #   when anything other than `nil` is passed, no lookup is performed and
    #   `nil` is returned
    # @return [Class, nil] the matching format, or `nil` if none is found
    def self.format(klass = nil)
      return unless klass.nil?

      Format.each { |candidate| return candidate if candidate.reader == self }
      nil # not found
    end

    ##
    # Options suitable for automatic Reader provisioning.
    #
    # Each entry is built with `RDF::CLI::Option.new` and describes one reader
    # option: its `symbol`, `datatype`, a `control` hint, the command-line
    # switch(es) in `on`, a `description` and, for some entries, a `default`
    # and a block (presumably used by RDF::CLI::Option to convert the raw
    # argument -- confirm against that class).
    #
    # @return [Array<RDF::CLI::Option>]
    def self.options
      [
        RDF::CLI::Option.new(
          symbol: :base_uri,
          control: :url,
          datatype: RDF::URI,
          on: ["--uri URI"],
          description: "Base URI of input file, defaults to the filename.") {|arg| RDF::URI(arg)},
        RDF::CLI::Option.new(
          symbol: :canonicalize,
          datatype: TrueClass,
          on: ["--canonicalize"],
          control: :checkbox,
          default: false,
          description: "Canonicalize URI/literal forms") {true},
        RDF::CLI::Option.new(
          symbol: :encoding,
          datatype: Encoding,
          control: :text,
          on: ["--encoding ENCODING"],
          description: "The encoding of the input stream.") {|arg| Encoding.find arg},
        RDF::CLI::Option.new(
          symbol: :intern,
          datatype: TrueClass,
          control: :none,
          on: ["--intern"],
          description: "Intern all parsed URIs."),
        RDF::CLI::Option.new(
          symbol: :prefixes,
          datatype: Hash,
          control: :none,
          multiple: true,
          on: ["--prefixes PREFIX:URI,PREFIX:URI"],
          description: "A comma-separated list of prefix:uri pairs.") do |arg|
            # Builds {prefix_symbol => RDF::URI} from "a:uri,b:uri". Each pair is
            # split on its first ':' only, so colons inside the URI are preserved.
            arg.split(',').inject({}) do |memo, pfxuri|
              pfx,uri = pfxuri.split(':', 2)
              memo.merge(pfx.to_sym => RDF::URI(uri))
            end
        end,
        RDF::CLI::Option.new(
          symbol: :rdfstar,
          datatype: TrueClass,
          control: :checkbox,
          on: ["--rdfstar"],
          description: "Parse RDF-star for preliminary RDF 1.2 support."),
        RDF::CLI::Option.new(
          symbol: :validate,
          datatype: TrueClass,
          control: :checkbox,
          on: ["--validate"],
          description: "Validate input file."),
        RDF::CLI::Option.new(
          symbol: :verifySSL,
          datatype: TrueClass,
          default: true,
          control: :checkbox,
          on: ["--[no-]verifySSL"],
          description: "Verify SSL results on HTTP GET")
      ]
    end

    ##
    # Makes `format_class` available as a class-level alias of `format`.
    class << self
      alias_method :format_class, :format
    end

    ##
    # Parses input from the given file name or URL.
    #
    # @note A reader returned via this method may not be readable depending on the processing model of the specific reader, as the file is only open during the scope of `open`. The reader is intended to be accessed through a block.
    #
    # @example Parsing RDF statements from a file
    #   RDF::Reader.open("etc/doap.nt") do |reader|
    #     reader.each_statement do |statement|
    #       puts statement.inspect
    #     end
    #   end
    #
    # @param  [String, #to_s] filename
    # @param [Symbol] format
    # @param  [Hash{Symbol => Object}] options
    #   any additional options (see {RDF::Util::File.open_file}, {RDF::Reader#initialize} and {RDF::Format.for})
    # @yield  [reader]
    # @yieldparam  [RDF::Reader] reader
    # @yieldreturn [void] ignored
    # @raise  [RDF::FormatError] if no reader found for the specified format
    def self.open(filename, format: nil, **options, &block)
      # If we're the abstract reader, and we can figure out a concrete reader from format, use that.
      if self == RDF::Reader && format && reader = self.for(format)
        return reader.open(filename, format: format, **options, &block)
      end

      # If we are a concrete reader class, default the Accept header to our format's
      # `accept_type` entries followed by a low-priority wildcard. An 'Accept' header
      # already present in `options[:headers]` is left as it is.
      unless self == RDF::Reader
        headers = (options[:headers] ||= {})
        headers['Accept'] ||= (self.format.accept_type + %w(*/*;q=0.1)).join(", ")
      end

      Util::File.open_file(filename, **options) do |file|
        # Criteria used only for format detection; `options` itself is passed to the reader.
        format_options = options.dup
        # Use the content type reported by the opened file, unless it is text/plain.
        format_options[:content_type] ||= file.content_type if
          file.respond_to?(:content_type) &&
          !file.content_type.to_s.include?('text/plain')
        format_options[:file_name] ||= filename
        reader = if self == RDF::Reader
          # We are the abstract reader class, find an appropriate reader
          self.for(format || format_options) do
            # Return a sample from the input file
            sample = file.read(1000)
            # Rewind so the selected reader starts from the beginning of the file.
            file.rewind
            sample
          end
        else
          # We are a concrete reader class
          self
        end

        # Options given by the caller take precedence over values taken from the file.
        options[:encoding] ||= file.encoding if file.respond_to?(:encoding)
        options[:filename] ||= filename

        if reader
          reader.new(file, **options, &block)
        else
          raise FormatError, "unknown RDF format: #{format_options.inspect}#{"\nThis may be resolved with a require of the 'linkeddata' gem." unless Object.const_defined?(:LinkedData)}"
        end
      end
    end

    ##
    # Returns a symbol appropriate to use with RDF::Reader.for()
    #
    # Delegates to `to_sym` on the value returned by `format`.
    #
    # @return [Symbol]
    def self.to_sym
      self.format.to_sym
    end

    ##
    # Returns a symbol appropriate to use with RDF::Reader.for()
    #
    # Instance-level shortcut for the `to_sym` of the reader's class.
    #
    # @return [Symbol]
    def to_sym
      self.class.to_sym
    end
    
    ##
    # Initializes the reader.
    #
    # The named options and any additional options are merged into a single
    # hash, available afterwards through `options`.
    #
    # @param  [IO, File, String] input
    #   the input stream to read; a String is wrapped in a StringIO
    # @param [#to_s]    base_uri     (nil)
    #   the base URI to use when resolving relative URIs (not supported by
    #   all readers). When not given and `input` responds to `base_uri`,
    #   the input's value is used.
    # @param [Boolean]  canonicalize (false)
    #   whether to canonicalize parsed URIs and Literals.
    # @param [Encoding] encoding     (Encoding::UTF_8)
    #   the encoding of the input stream
    # @param [Boolean]  intern       (true)
    #   whether to intern all parsed URIs
    # @param [Hash]     prefixes     (Hash.new)
    #   the prefix mappings to use (not supported by all readers)
    # @param [Boolean] rdfstar      (false)
    #   Preliminary support for RDF 1.2.
    # @param [Boolean]  validate     (false)
    #   whether to validate the parsed statements and values
    # @param  [Hash{Symbol => Object}] options
    #   any additional options
    # @yield  [reader] `self`; a block that takes no parameters is evaluated
    #   in the context of the reader instead
    # @yieldparam  [RDF::Reader] reader
    # @yieldreturn [void] ignored
    def initialize(input = $stdin,
                   base_uri:      nil,
                   canonicalize:  false,
                   encoding:      Encoding::UTF_8,
                   intern:        true,
                   prefixes:      Hash.new,
                   rdfstar:       false,
                   validate:      false,
                   **options,
                   &block)

      # Fall back to the base URI advertised by the input itself.
      if input.respond_to?(:base_uri)
        base_uri ||= input.base_uri
      end

      named_options = {
        base_uri:     base_uri,
        canonicalize: canonicalize,
        encoding:     encoding,
        intern:       intern,
        prefixes:     prefixes,
        rdfstar:      rdfstar,
        validate:     validate
      }
      @options = options.merge(named_options)

      @input = input.is_a?(String) ? StringIO.new(input) : input

      return unless block_given?

      if block.arity.zero?
        instance_eval(&block)
      else
        block.call(self)
      end
    end

    ##
    # The options for this reader.
    #
    # This is the hash assembled by `initialize`: any additional options
    # merged with the named ones (`:base_uri`, `:canonicalize`, `:encoding`,
    # `:intern`, `:prefixes`, `:rdfstar` and `:validate`).
    #
    # @return [Hash]
    # @since  0.3.0
    attr_reader :options

    ##
    # Returns the base URI determined by this reader.
    #
    # @example
    #   reader.base_uri  #=> RDF::URI('http://example.com/')
    #
    # @return [RDF::URI, nil] `nil` when the `:base_uri` option is unset
    # @since  0.3.0
    def base_uri
      configured = @options[:base_uri]
      RDF::URI(configured) if configured
    end

    ##
    # Returns the URI prefixes currently defined for this reader.
    #
    # The `:prefixes` option is initialized to an empty hash on first access
    # if it is unset.
    #
    # @example
    #   reader.prefixes[:dc]  #=> RDF::URI('http://purl.org/dc/terms/')
    #
    # @return [Hash{Symbol => RDF::URI}]
    # @since  0.3.0
    def prefixes
      @options[:prefixes] ||= {}
    end

    ##
    # Defines the given URI prefixes for this reader.
    #
    # The whole mapping stored in the `:prefixes` option is replaced.
    #
    # @example
    #   reader.prefixes = {
    #     dc: RDF::URI('http://purl.org/dc/terms/'),
    #   }
    #
    # @param  [Hash{Symbol => RDF::URI}] prefixes
    # @return [Hash{Symbol => RDF::URI}]
    # @since  0.3.0
    def prefixes=(prefixes)
      @options[:prefixes] = prefixes
    end

    ##
    # Defines or returns the given named URI prefix for this reader.
    #
    # The name is normalized to a Symbol; a name whose string form is empty
    # is stored under the `nil` key.
    #
    # @example Defining a URI prefix
    #   reader.prefix :dc, RDF::URI('http://purl.org/dc/terms/')
    #
    # @example Returning a URI prefix
    #   reader.prefix(:dc)    #=> RDF::URI('http://purl.org/dc/terms/')
    #
    # @overload prefix(name, uri)
    #   @param  [Symbol, #to_s]   name
    #   @param  [RDF::URI, #to_s] uri
    #
    # @overload prefix(name)
    #   @param  [Symbol, #to_s]   name
    #
    # @return [RDF::URI]
    def prefix(name, uri = nil)
      key =
        if name.to_s.empty?
          nil
        elsif name.respond_to?(:to_sym)
          name.to_sym
        else
          name.to_s.to_sym
        end

      return prefixes[key] if uri.nil?

      prefixes[key] = uri
    end
    alias_method :prefix!, :prefix

    ##
    # Iterates the given block for each RDF statement.
    #
    # Statements are yielded in the order that they are read from the input
    # stream. Reading stops at the first `EOFError`, after which the input is
    # rewound on a best-effort basis (a failing rewind is ignored).
    #
    # An enumerator for this method is returned whether or not a block was
    # given.
    #
    # @yield  [statement]
    #   each statement
    # @yieldparam  [RDF::Statement] statement
    # @yieldreturn [void] ignored
    # @return [Enumerator]
    # @raise  [RDF::ReaderError] on invalid data
    # @see    RDF::Enumerable#each_statement
    def each_statement(&block)
      if block
        begin
          loop { block.call(read_statement) }
        rescue EOFError
          # Not every input can be rewound; ignore a failure here.
          begin
            rewind
          rescue StandardError
            nil
          end
        end
      end
      enum_for(:each_statement)
    end
    alias_method :each, :each_statement

    ##
    # Iterates the given block for each RDF triple.
    #
    # Triples are yielded in the order that they are read from the input
    # stream. Reading stops at the first `EOFError`, after which the input is
    # rewound on a best-effort basis (a failing rewind is ignored).
    #
    # An enumerator for this method is returned whether or not a block was
    # given.
    #
    # @yield  [subject, predicate, object]
    #   each triple
    # @yieldparam  [RDF::Resource] subject
    # @yieldparam  [RDF::URI]      predicate
    # @yieldparam  [RDF::Term]     object
    # @yieldreturn [void] ignored
    # @return [Enumerator]
    # @see    RDF::Enumerable#each_triple
    def each_triple(&block)
      if block
        begin
          loop { block.call(*read_triple) }
        rescue EOFError
          # Not every input can be rewound; ignore a failure here.
          begin
            rewind
          rescue StandardError
            nil
          end
        end
      end
      enum_for(:each_triple)
    end

    ##
    # Rewinds the input stream to the beginning of input.
    #
    # Delegates to `rewind` on the underlying input object. Also available
    # as `rewind!`.
    #
    # @return [void]
    # @since  0.2.3
    # @see    http://ruby-doc.org/core-2.2.2/IO.html#method-i-rewind
    def rewind
      @input.rewind
    end
    alias_method :rewind!, :rewind

    ##
    # Closes the input stream, after which an `IOError` will be raised for
    # further read attempts.
    #
    # If the input stream is already closed, does nothing. Also available as
    # `close!`.
    #
    # @return [void]
    # @since  0.2.2
    # @see    http://ruby-doc.org/core-2.2.2/IO.html#method-i-close
    def close
      @input.close unless @input.closed?
    end
    alias_method :close!, :close

    ##
    # Current line number being processed. For formats that can associate generated {Statement} with a particular line number from input, this value reflects that line number.
    #
    # The value is taken from `lineno` on the underlying input object.
    #
    # @return [Integer]
    def lineno
      @input.lineno
    end

    ##
    # Returns `true` when the inherited `valid?` check passes and no error has
    # been recorded in the log statistics.
    #
    # An `ArgumentError` or `RDF::ReaderError` raised during the check is
    # logged through `log_error` and reported as `false`.
    #
    # @return [Boolean]
    #
    # @note this parses the full input and is valid only in the reader block.
    #   Use `Reader.new(input, validate: true)` if you intend to capture the 
    #   result.
    #
    # @example Parsing RDF statements from a file
    #   RDF::NTriples::Reader.new("!!invalid input??") do |reader|
    #     reader.valid? # => false
    #   end
    #
    # @see RDF::Value#validate! for Literal & URI validation relevant to 
    #   error handling.
    # @see Enumerable#valid?
    def valid?
      super && !log_statistics[:error]
    rescue ArgumentError, RDF::ReaderError => e
      log_error(e.message)
      false
    end

  protected

    ##
    # Reads a statement from the input stream.
    #
    # The default implementation builds a Statement from the terms returned
    # by {#read_triple}.
    #
    # @return [RDF::Statement] a statement
    # @raise  [NotImplementedError] unless `read_triple` is implemented in a subclass
    def read_statement
      Statement.new(*read_triple)
    end

    ##
    # Reads a triple from the input stream.
    #
    # The base implementation always raises; the error message names the
    # reader's class and this method.
    #
    # @return [Array(RDF::Term)] a triple
    # @raise  [NotImplementedError] unless implemented in subclass
    # @abstract
    def read_triple
      raise NotImplementedError, "#{self.class}#read_triple" # override in subclasses
    end

    ##
    # Raises an "expected subject" parsing error on the current line.
    #
    # @return [void]
    # @raise  [RDF::ReaderError]
    def fail_subject
      fail_expected("subject")
    end

    ##
    # Raises an "expected predicate" parsing error on the current line.
    #
    # @return [void]
    # @raise  [RDF::ReaderError]
    def fail_predicate
      fail_expected("predicate")
    end

    ##
    # Raises an "expected object" parsing error on the current line.
    #
    # @return [void]
    # @raise  [RDF::ReaderError]
    def fail_object
      fail_expected("object")
    end

    ##
    # Shared implementation of the `fail_*` helpers: reports that `part` was
    # expected, quoting the current line, and passes the current line number
    # and `RDF::ReaderError` as the exception class to `log_error`.
    #
    # @param  [String] part the name of the expected component
    # @return [void]
    def fail_expected(part)
      log_error("Expected #{part} (found: #{current_line.inspect})", lineno: lineno, exception: RDF::ReaderError)
    end

    ##
    # Recursively emit embedded statements in Property Graph mode
    #
    # The subject position is examined first, then the object position. A
    # term that is itself a Statement is passed to the block and then
    # searched in the same way.
    #
    # @param [RDF::Statement] statement
    # @yieldparam [RDF::Statement] embedded an embedded statement
    # @return [nil]
    def each_pg_statement(statement, &block)
      %i[subject object].each do |position|
        next unless statement.public_send(position).is_a?(Statement)

        block.call(statement.public_send(position))
        each_pg_statement(statement.public_send(position), &block)
      end
      nil
    end

  public
    ##
    # Returns the encoding of the input stream.
    #
    # A String or Symbol in the `:encoding` option is resolved through
    # `Encoding.find`; an Encoding is returned as is. When the option is
    # unset, the `content_encoding` of the reader's format is looked up and
    # stored back into the option.
    #
    # @return [Encoding]
    def encoding
      configured = @options[:encoding]
      return Encoding.find(configured.to_s) if configured.is_a?(String) || configured.is_a?(Symbol)
      return configured if configured.is_a?(Encoding)

      @options[:encoding] ||= Encoding.find(self.class.format.content_encoding.to_s)
    end

    ##
    # Returns `true` if parsed statements and values should be validated.
    #
    # Reads the `:validate` option.
    #
    # @return [Boolean] `true` or `false`
    # @since  0.3.0
    def validate?
      @options[:validate]
    end

    ##
    # Returns `true` if parsed values should be in canonical form.
    #
    # Reads the `:canonicalize` option.
    #
    # @note This is for term canonicalization, for graph/dataset canonicalization use `RDF::Normalize`.
    #
    # @return [Boolean] `true` or `false`
    # @since  0.3.0
    def canonicalize?
      @options[:canonicalize]
    end

    ##
    # Returns `true` if parsed URIs should be interned.
    #
    # Reads the `:intern` option.
    #
    # @return [Boolean] `true` or `false`
    # @since  0.3.0
    def intern?
      @options[:intern]
    end

  private

    # Classes that have inherited from Reader, appended as they are defined.
    @@subclasses = [] # @private

    ##
    # Inheritance hook: records the new subclass in `@@subclasses` and then
    # passes the notification on with `super`.
    #
    # @private
    # @param  [Class] child the class being defined
    # @return [void]
    def self.inherited(child)
      @@subclasses << child
      super
    end

    ##
    # Returns the contents of `@line`, which `readline` sets and `match`
    # shortens as tokens are consumed.
    #
    # @private
    # @return [String] The most recently read line of the input
    def current_line
      @line
    end

    ##
    # Reads the next line into `@line` and returns it.
    #
    # The text is split at its first carriage return; whatever follows is
    # kept in `@line_rest` and is served by the next call before the input is
    # read again. The line is chomped and converted to {#encoding}.
    #
    # @return [String]
    def readline
      # Use the remainder left over from the previous call, if there is one.
      @line = instance_variable_defined?(:@line_rest) && @line_rest || @input.readline
      @line, @line_rest = @line.split("\r", 2)
      # Splitting an empty string yields no parts, leaving @line nil.
      @line = String.new if @line.nil? # not frozen
      @line.chomp!
      begin
        @line.encode!(encoding)
      rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError, Encoding::ConverterNotFoundError
        # It is likely the persisted line was not encoded on initial write
        # (i.e. persisted via RDF <= 1.0.9 and read via RDF >= 1.0.10)
        #
        # Encoding::UndefinedConversionError is raised by MRI.
        # Encoding::InvalidByteSequenceError is raised by jruby >= 1.7.5
        # Encoding::ConverterNotFoundError is raised by jruby < 1.7.5
        #
        # Fall back to relabelling the bytes without converting them.
        @line.force_encoding(encoding)
      end
      @line
    end

    ##
    # Removes leading and trailing whitespace from the current line in place.
    #
    # @return [void]
    def strip!
      @line.strip!
    end

    ##
    # Returns `true` when there is no current line or the current line is empty.
    #
    # @return [Boolean]
    def blank?
      @line.nil? || @line.empty?
    end

    ##
    # Matches the current line against `pattern` and consumes the match.
    #
    # On success the current line is replaced by the text following the
    # match, with leading whitespace removed.
    #
    # @param  [Regexp] pattern
    # @return [Object] the first capture group, `true` when the pattern
    #   matched without capturing anything, or `nil` when it did not match
    def match(pattern)
      return unless @line =~ pattern

      captured = Regexp.last_match(1)
      @line = Regexp.last_match.post_match.lstrip
      captured || true
    end
  end # Reader

  ##
  # The base class for RDF parsing errors.
  class ReaderError < IOError
    ##
    # The invalid token which triggered the error.
    #
    # @return [String]
    attr_reader :token

    ##
    # The line number where the error occurred.
    #
    # @return [Integer]
    attr_reader :lineno

    ##
    # Initializes a new reader error instance.
    #
    # @param  [String, #to_s]  message
    # @param  [String]         token  (nil)
    # @param  [Integer]        lineno (nil)
    #   when omitted, the line number is taken from `token` if it responds
    #   to `lineno`
    def initialize(message, token: nil, lineno: nil)
      @token = token
      @lineno = lineno
      @lineno ||= token.lineno if token.respond_to?(:lineno)
      super(message.to_s)
    end
  end # ReaderError
end # RDF