ruby-rdf/rdf

View on GitHub
lib/rdf/query.rb

Summary

Maintainability
C
1 day
Test Coverage
module RDF
  ##
  # An RDF basic graph pattern (BGP) query.
  #
  # Named queries either match against a specifically named
  # graph if the name is an RDF::Resource or bound RDF::Query::Variable.
  # Names that are against unbound variables match either default
  # or named graphs.
  # The name of `false` will only match against the default graph.
  #
  # Variable names cause the variable to be added to the solution set
  # elements.
  #
  # @example Constructing a basic graph pattern query (1)
  #   query = RDF::Query.new do
  #     pattern [:person, RDF.type,  FOAF.Person]
  #     pattern [:person, FOAF.name, :name]
  #     pattern [:person, FOAF.mbox, :email]
  #   end
  #
  # @example Constructing a basic graph pattern query (2)
  #   query = RDF::Query.new({
  #     person: {
  #       RDF.type  => FOAF.Person,
  #       FOAF.name => :name,
  #       FOAF.mbox => :email,
  #     }
  #   })
  #
  # @example Executing a basic graph pattern query
  #   graph = RDF::Graph.load('etc/doap.nt')
  #   query.execute(graph).each do |solution|
  #     puts solution.inspect
  #   end
  #
  # @example Constructing and executing a query in one go (1)
  #   solutions = RDF::Query.execute(graph) do
  #     pattern [:person, RDF.type, FOAF.Person]
  #   end
  #
  # @example Constructing and executing a query in one go (2)
  #   solutions = RDF::Query.execute(graph, {
  #     person: {
  #       RDF.type => FOAF.Person,
  #     }
  #   })
  #
  # @example In this example, the default graph contains the names of the publishers of two named graphs. The triples in the named graphs are not visible in the default graph in this example.
  #   # default graph
  #   @prefix dc: <http://purl.org/dc/elements/1.1/> .
  #
  #   <http://example.org/bob>    dc:publisher  "Bob" .
  #   <http://example.org/alice>  dc:publisher  "Alice" .
  #
  #   # Named graph: http://example.org/bob
  #   @prefix foaf: <http://xmlns.com/foaf/0.1/> .
  #
  #   _:a foaf:name "Bob" .
  #   _:a foaf:mbox <mailto:bob@oldcorp.example.org> .
  #
  #   # Named graph: http://example.org/alice
  #   @prefix foaf: <http://xmlns.com/foaf/0.1/> .
  #
  #   _:a foaf:name "Alice" .
  #   _:a foaf:mbox <mailto:alice@work.example.org> .
  #
  # @see http://www.w3.org/TR/rdf-sparql-query/#rdfDataset
  # @since 0.3.0
  class Query
    include Enumerable
    autoload :Pattern,   'rdf/query/pattern'
    autoload :Solution,  'rdf/query/solution'
    autoload :Solutions, 'rdf/query/solutions'
    autoload :Variable,  'rdf/query/variable'
    autoload :HashPatternNormalizer, 'rdf/query/hash_pattern_normalizer'

    ##
    # Executes a query on the given `queryable` graph or repository.
    #
    # @param  [RDF::Queryable] queryable
    #   the graph or repository to query
    # @param  [Hash{Object => Object}] patterns
    #   optional hash patterns to initialize the query with
    # @param  [Hash{Symbol => Object}] options
    #   any additional keyword options (see {RDF::Query#initialize})
    # @yield  [query]
    # @yieldparam  [RDF::Query] query
    # @yieldreturn [void] ignored
    # @return [RDF::Query::Solutions]
    #   the resulting solution sequence
    # @see    RDF::Query#execute
    def self.execute(queryable, patterns = {}, options = {}, &block)
      # Build the query first; the same options are handed to both the
      # constructor and the instance-level #execute.
      query = new(patterns, **options, &block)
      query.execute(queryable, **options)
    end

    ##
    # Cast values as Solutions
    # @overload Solutions()
    #   @return [Solutions] returns Solutions.new()
    #
    # @overload Solutions(solutions)
    #   @return [Solutions] returns the argument
    #
    # @overload Solutions(array)
    #   @param [Array] array
    #   @return [Solutions] returns the array extended with solutions
    #
    # @overload Solutions(*args)
    #   @param [Array<Solution>] args
    #   @return [Solutions] returns new solutions including the arguments, which must each be a {Solution}
    def self.Solutions(*args)
      # Zero or several arguments are always wrapped as-is.
      return Solutions.new(args) unless args.size == 1

      only = args.first
      case only
      when Solutions then only                 # already a solution sequence: hand it back untouched
      when Array     then Solutions.new(only)  # a plain array supplies the elements
      else Solutions.new(args)                 # any other single value becomes the sole element
      end
    end

    ##
    # The patterns that constitute this query.
    #
    # @return [Array<RDF::Query::Pattern>]
    attr_reader :patterns

    ##
    # The solution sequence for this query.
    #
    # @return [RDF::Query::Solutions]
    attr_reader :solutions

    ##
    # Any additional options for this query.
    #
    # @return [Hash]
    attr_reader :options

    ##
    # Scope the query to named graphs matching value
    #
    # @return [RDF::Resource, RDF::Query::Variable, false] graph_name
    attr_accessor :graph_name

    ##
    # Initializes a new basic graph pattern query.
    #
    # @overload initialize(patterns = [], **options)
    #   @param  [Array<RDF::Query::Pattern>] patterns
    #     ...
    #   @param  [Hash{Symbol => Object}] options
    #     any additional keyword options
    #   @option options [RDF::Query::Solutions] :solutions (Solutions.new)
    #   @option options [RDF::Resource, RDF::Query::Variable, false] :graph_name (nil)
    #     Default graph name for matching against queryable.
    #     Named queries either match against a specifically named
    #     graphs if the name is an {RDF::Resource} or bound {RDF::Query::Variable}.
    #     Names that are against unbound variables match either default
    #     or named graphs.
    #     The name of `false` will only match against the default graph.
    #   @option options [RDF::Resource, RDF::Query::Variable, false] :name (nil)
    #     Alias for `:graph_name`.
    #   @yield  [query]
    #   @yieldparam  [RDF::Query] query
    #   @yieldreturn [void] ignored
    #
    # @overload initialize(patterns, **options)
    #   @param  [Hash{Object => Object}] patterns
    #     ...
    #   @param [RDF::Query::Solutions] solutions (Solutions.new)
    #   @param [RDF::Resource, RDF::Query::Variable, false] graph_name (false)
    #     Default graph name for matching against queryable.
    #     Named queries either match against a specifically named
    #     graphs if the name is an {RDF::Resource} or bound {RDF::Query::Variable}.
    #     Names that are against unbound variables match either default
    #     or named graphs.
    #     The name of `false` will only match against the default graph.
    #   @param [RDF::Resource, RDF::Query::Variable, false] name (false)
    #     Alias for `:graph_name`.
    #   @param  [Hash{Symbol => Object}] options
    #     any additional keyword options
    # @option options [Boolean] validate (false)
    #   validate patterns
    #   @yield  [query]
    #   @yieldparam  [RDF::Query] query
    #   @yieldreturn [void] ignored
    def initialize(*patterns, solutions: nil, graph_name: nil, name: nil, validate: false, **options, &block)
      @options   = options.dup
      @solutions = Query::Solutions(solutions)
      # `name` is only an alias: any non-nil `graph_name` (including `false`) wins.
      @graph_name = graph_name.nil? ? name : graph_name

      # With no positional patterns, the leftover keyword options are treated
      # as the (hash) pattern source.
      patterns << @options if patterns.empty?

      head = patterns.first
      @patterns =
        if head.is_a?(Hash)
          # Hash form: normalize a copy, then compile into Pattern objects.
          compile_hash_patterns(HashPatternNormalizer.normalize!(head.dup, @options))
        elsif head.is_a?(Array)
          # A single array argument is used directly as the pattern list.
          head
        else
          patterns
        end

      if block_given?
        # One-argument blocks receive the query; any other arity is evaluated
        # in the context of the query itself (DSL style).
        if block.arity == 1
          block.call(self)
        else
          instance_eval(&block)
        end
      end

      validate! if validate
    end

    ##
    # Appends the given query `pattern` to this query.
    #
    # @param  [RDF::Query::Pattern] pattern
    #   a triple query pattern
    # @return [RDF::Query] self
    def <<(pattern)
      # Coerce whatever was given into a Pattern before storing it.
      @patterns.push(Pattern.from(pattern))
      self
    end

    ##
    # Appends the given query `pattern` to this query.
    #
    # @param  [RDF::Query::Pattern] pattern
    #   a triple query pattern
    # @param  [Hash{Symbol => Object}] options
    #   any additional keyword options
    # @option options [Boolean] :optional (false)
    #   whether this is an optional pattern
    # @return [RDF::Query] self
    def pattern(pattern, **options)
      # Options (e.g. `optional: true`) are passed through to Pattern.from.
      compiled = Pattern.from(pattern, **options)
      @patterns.push(compiled)
      self
    end

    ##
    # Returns an optimized copy of this query.
    #
    # @param  [Hash{Symbol => Object}] options
    #   any additional options for optimization
    # @return [RDF::Query] a copy of `self`
    # @since  0.3.0
    def optimize(**options)
      # Leave the receiver untouched: reorder a duplicate instead.
      copy = dup
      copy.optimize!(**options)
    end

    ##
    # Optimizes this query by reordering its constituent triple patterns
    # according to their cost estimates.
    #
    # Optional patterns have greater cost than non-optional patterns so they will always come after non-optional patterns
    #
    # @param  [Hash{Symbol => Object}] options
    #   any additional options for optimization
    # @return [self]
    # @see    RDF::Query::Pattern#cost
    # @since  0.3.0
    def optimize!(**options)
      # Ascending cost; a pattern without a cost estimate is treated as cost 0.
      by_cost = proc { |left, right| (left.cost || 0) <=> (right.cost || 0) }

      # Drop duplicate patterns, then keep the two groups separate so that
      # optional patterns always end up after the required ones.
      optional, required = @patterns.uniq.partition(&:optional?)
      @patterns = required.sort(&by_cost) + optional.sort(&by_cost)
      self
    end

    ##
    # Executes this query on the given `queryable` graph or repository.
    #
    # Named queries either match against a specifically named
    # graphs if the name is an RDF::Resource or bound RDF::Query::Variable.
    # Names that are against unbound variables match either default
    # or named graphs.
    # The name of `false` will only match against the default graph.
    #
    # If the query has no patterns, it returns a single empty solution as
    # per SPARQL 1.1 _Empty Group Pattern_.
    #
    # @note solutions could be an Iterator, but this algorithm cycles over solutions, which requires them to be an array internally.
    #
    # @param  [RDF::Queryable] queryable
    #   the graph or repository to query
    # @param [RDF::Query::Solutions] solutions (Solutions.new)
    # @param [RDF::Resource, RDF::Query::Variable, false] graph_name (nil)
    #   Default graph name for matching against queryable.
    #   Named queries either match against a specifically named
    #   graphs if the name is an {RDF::Resource} or bound {RDF::Query::Variable}.
    #   Names that are against unbound variables match either default
    #   or named graphs.
    #   The name of `false` will only match against the default graph.
    # @param [RDF::Resource, RDF::Query::Variable, false] name (nil)
    #   Alias for `:graph_name`.
    # @param  [Hash{Symbol => Object}] options
    #   any additional keyword options
    # @option options [Hash{Symbol => RDF::Term}, RDF::Query::Solution] bindings
    #   optional variable bindings to use
    # @option options [Boolean] :optimize
    #   Optimize query before execution.
    # @option options [RDF::Query::Solutions] solutions
    #   optional initial solutions for chained queries
    # @yield  [solution]
    #   each matching solution
    # @yieldparam  [RDF::Query::Solution] solution
    # @yieldreturn [void] ignored
    # @return [RDF::Query::Solutions]
    #   the resulting solution sequence
    # @see    http://www.holygoat.co.uk/blog/entry/2005-10-25-1
    # @see    http://www.w3.org/TR/sparql11-query/#emptyGroupPattern
    def execute(queryable, bindings: {}, solutions: Solution.new, graph_name: nil, name: nil, **options, &block)
      # Use provided solutions to allow for query chaining
      # Otherwise, a quick empty solution simplifies the logic below; no special case for
      # the first pattern
      @solutions = Query::Solutions(solutions)
      # A Solution is converted with `to_h`; any other value is used as given.
      # NOTE(review): a Hash passed by the caller is used directly and has keys
      # removed by `bindings.delete` below, so the caller's hash is mutated.
      bindings = bindings.to_h if bindings.is_a?(Solution)

      # If there are no patterns, just return the empty solution
      if empty?
        @solutions.each(&block) if block_given?
        return @solutions
      end

      # Optional cost-based reordering; this replaces @patterns in place.
      self.optimize! if options[:optimize]
      patterns = @patterns
      # `name` is only consulted when `graph_name` was not given. A non-nil
      # result also overwrites the graph name stored on the query.
      graph_name = name if graph_name.nil?
      @graph_name = graph_name unless graph_name.nil?

      # Add graph_name to pattern, if necessary
      unless @graph_name.nil?
        # NOTE(review): `empty?` already returned above when there are no
        # patterns, so this first branch looks unreachable unless `empty?` is
        # overridden — confirm before relying on it.
        if patterns.empty?
          patterns = [Pattern.new(nil, nil, nil, graph_name: @graph_name)]
        else
          apply_graph_name(@graph_name)
        end
      end

      # Each pattern is evaluated against the solutions produced by the
      # patterns before it; the survivors become the input for the next one.
      patterns.each do |pattern|

        old_solutions, @solutions = @solutions, Query::Solutions()

        # Apply caller-supplied bindings the first time a pattern mentions the
        # bound variable: every prior solution is extended once per candidate
        # value (`Array()` accepts a single value or a list), then the binding
        # is removed so it is not applied again for later patterns.
        bindings.each_key do |variable|
          if pattern.variables.include?(variable)
            unbound_solutions, old_solutions = old_solutions, Query::Solutions()
            Array(bindings[variable]).each do |binding|
              unbound_solutions.each do |solution|
                old_solutions << solution.merge(variable => binding)
              end
            end
            bindings.delete(variable)
          end
        end

        # Extend each incoming solution with every statement the pattern
        # matches for it.
        old_solutions.each do |solution|
          found_match = false
          pattern.execute(queryable, solution) do |statement|
            found_match = true
            @solutions << solution.merge(pattern.solution(statement))
          end
          # If this pattern was optional, and we didn't find any matches,
          # just copy it over as-is.
          if !found_match && pattern.optional?
            @solutions << solution
          end
        end

        #puts "solutions after #{pattern} are #{@solutions.to_a.inspect}"

        # It's important to abort failed queries quickly because later patterns
        # that can have constraints are often broad without them.
        # We have no solutions at all:
        return @solutions if @solutions.empty?

        if !pattern.optional?
          # We have no solutions for variables we should have solutions for
          # (excludes non-distinguished variables):
          need_vars = pattern.variables.select {|k,v| v.distinguished?}.keys
          # Cross off every variable that at least one solution binds; stop
          # scanning as soon as none are left.
          @solutions.each do |solution|
            break if need_vars.empty?
            need_vars -= solution.bindings.keys
          end
          # Note: this early exit returns a fresh empty sequence while
          # @solutions keeps its current contents.
          return Query::Solutions() unless need_vars.empty?
        end
      end
      @solutions.each(&block) if block_given?
      @solutions
    end

    ##
    # Returns `true` if this query did not match when last executed.
    #
    # When the solution sequence is empty, this method can be used to
    # determine whether the query failed to match or not.
    #
    # @return [Boolean]
    # @see    #matched?
    def failed?
      # @solutions is seeded by #initialize and replaced by #execute, so this
      # reflects the stored solution sequence: empty means "no match".
      @solutions.empty?
    end

    ##
    # Returns `true` if this query matched when last executed.
    #
    # When the solution sequence is empty, this method can be used to
    # determine whether the query matched successfully or not.
    #
    # @return [Boolean]
    # @see    #failed?
    def matched?
      # Defined strictly as the negation of #failed?.
      !failed?
    end

    # Add patterns from another query to form a new Query
    # @param [RDF::Query] other
    # @return [RDF::Query]
    def +(other)
      # Only the pattern lists are combined; the new query is built from them alone.
      combined = patterns + other.patterns
      Query.new(combined)
    end
    
    # Is this query scoped to a named graph?
    # @return [Boolean]
    def named?
      # Both `nil` (unscoped) and `false` (default graph only) count as not named.
      graph_name ? true : false
    end
    
    # Is this query scoped to the default graph?
    # @return [Boolean]
    def default?
      # Only the literal `false` selects the default graph. `nil` means the
      # query is unscoped, so a plain truthiness test would be wrong here.
      graph_name == false
    end
    
    # Is this query unscoped? This indicates that it can return results from
    # either a named graph or the default graph.
    # @return [Boolean]
    def unnamed?
      # `nil` is the "no graph name given" state; `false` (default graph only)
      # is a distinct value and is not treated as unnamed.
      graph_name.nil?
    end

    # Apply the graph name specified (or configured) to all patterns that have no graph name
    # @param [RDF::IRI, RDF::Query::Variable] graph_name (self.graph_name)
    def apply_graph_name(graph_name = nil)
      # A nil or false argument falls back to the graph name stored on the query.
      graph_name ||= self.graph_name
      return if graph_name.nil?

      # Only fill in patterns that carry no graph name of their own.
      patterns.each do |pat|
        pat.graph_name = graph_name if pat.graph_name.nil?
      end
    end

    ##
    # @overload variable?
    #   Returns `true` if any pattern contains a variable.
    #
    #   @return [Boolean]
    # @overload variable?(variables)
    #   Returns `true` if any pattern contains any of the variables.
    #
    #   @param  [Array<Symbol, #to_sym>] variables
    #   @return [Boolean]
    def variable?(*args)
      case args.length
      # No argument: does the query use any variable at all?
      when 0 then !variables.empty?
      # One argument: delegate the check to each pattern.
      when 1 then patterns.any? { |pattern| pattern.variable?(*args) }
      else
        # `raise ArgumentError("...")` would call a non-existent method named
        # ArgumentError and surface as NoMethodError; raise the class properly.
        raise ArgumentError, "wrong number of arguments (given #{args.length}, expected 0 or 1)"
      end
    end
    alias_method :variables?, :variable?
    alias_method :has_variables?, :variable?

    ##
    # The variables used in this query. This includes variables used in patterns along with the graph_name itself, if it is a variable.
    #
    # @return [Hash{Symbol => RDF::Query::Variable}]
    def variables
      # Collect the variables of every pattern; later patterns win on key clashes.
      vars = patterns.each_with_object({}) do |pattern, collected|
        collected.merge!(pattern.variables)
      end
      # A variable graph name is itself part of the query's variables.
      vars[graph_name.to_sym] = graph_name if graph_name.is_a?(Variable)
      vars
    end

    ##
    # Returns the number of variables in this query.
    #
    # @return [Integer] the number of distinct variables across all patterns (plus the graph name, when it is a variable)
    def variable_count
      # One hash entry per distinct variable.
      variables.size
    end

    ##
    # Returns `true` if any pattern contains a blank node.
    #
    # @return [Boolean]
    # @since 2.0
    def node?
      return true if patterns.any?(&:node?)

      # Otherwise the answer depends on the graph name alone; a nil or false
      # graph name is returned as-is (both falsy).
      graph_name && graph_name.node?
    end

    # Query has no patterns
    # @return [Boolean]
    def empty?
      # Emptiness refers to the pattern list, not to the stored solutions.
      patterns.empty?
    end
    alias_method :has_blank_nodes?, :node?

    ##
    # Enumerates over each matching query solution.
    #
    # @yield  [solution]
    # @yieldparam [RDF::Query::Solution] solution
    # @return [Enumerator]
    def each_solution(&block)
      # Iterates the solutions currently stored on the query (set by
      # #initialize or by the last #execute); it does not run the query.
      @solutions.each(&block)
    end
    alias_method :each, :each_solution

    ##
    # Enumerates over each statement (pattern).
    #
    # @yield  [RDF::Query::Pattern]
    # @yieldparam [RDF::Query::Pattern] pattern
    # @return [Enumerator]
    def each_statement(&block)
      # Stamp the query-level graph name onto patterns that lack one before
      # they are yielded. This updates the stored patterns in place.
      apply_graph_name
      patterns.each(&block)
    end

    ##
    # Duplicate query, including patterns and solutions
    # @return [RDF::Query]
    def dup
      # Patterns and solutions are copied individually so the duplicate can be
      # changed (e.g. by #optimize!) without touching the original.
      copied_patterns = @patterns.map(&:dup)
      Query.new(copied_patterns, graph_name: graph_name, solutions: @solutions.dup, **options)
    end

    ##
    # Determine if the query contains valid patterns
    #
    # @return [Boolean] `true` or `false`
    # @since 0.3.9
    def valid?
      # #validate! returns self when the query is executable and raises
      # otherwise. The former `rescue raise false` only worked because
      # `raise false` itself raised a TypeError that the outer rescue caught;
      # a single explicit rescue gives the same result directly.
      !!validate!
    rescue StandardError
      false
    end

    ##
    # Validate this query, making sure it can be executed by our query engine.
    # This method is public so that it may be called by implementations of
    # RDF::Queryable#query_execute that bypass our built-in query engine.
    #
    # @return [RDF::Query] `self`
    # @raise [ArgumentError] This query cannot be executed.
    def validate!
      # Every pattern has to pass its own validation first.
      @patterns.each(&:validate!)

      # From the first optional pattern onwards, nothing but optional patterns
      # may follow.
      trailing = @patterns.drop_while { |pattern| !pattern.optional? }
      unless trailing.all?(&:optional?)
        raise ArgumentError, "Optional patterns must appear at end of query"
      end

      self
    end

  protected

    ##
    # @private
    def compile_hash_patterns(hash_patterns)
      # Expand `{subject => {predicate => objects}}` into a flat list of triples.
      triples = hash_patterns.flat_map do |subject, predicate_objects|
        unless predicate_objects.is_a?(Hash)
          raise ArgumentError, "invalid hash pattern: #{hash_patterns.inspect}"
        end

        predicate_objects.flat_map do |predicate, objects|
          case objects
          when Hash
            # A nested hash: each key is an object of this predicate and, in
            # turn, the subject of its own nested patterns.
            objects.keys.map { |object| [subject, predicate, object] } + compile_hash_patterns(objects)
          when Array
            # Several objects for the same subject/predicate pair.
            objects.map { |object| [subject, predicate, object] }
          else
            [[subject, predicate, objects]]
          end
        end
      end
      triples.map { |triple| Pattern.from(triple) }
    end
  end # Query
end # RDF