ruby-rdf/rdf

View on GitHub
lib/rdf/cli.rb

Summary

Maintainability
F
3 days
Test Coverage
require 'rdf'
require 'rdf/ntriples'
require 'rdf/nquads'
require 'rdf/vocab/writer'
require 'logger'
require 'optparse'
begin
  require 'linkeddata'
rescue LoadError
  # Silently load without linkeddata, but try some others
  %w(
     json/ld
     ld/patch
     rdf/microdata
     rdf/n3
     rdf/ordered_repo
     rdf/rdfa
     rdf/rdfxml
     rdf/reasoner
     rdf/tabular
     rdf/trig
     rdf/trix
     rdf/turtle
     rdf/vocab
     rdf/xsd
     shacl
     shex
     yaml_ld
  ).each do |ser|
    begin
      require ser
    rescue LoadError
    end
  end
end

class OptionParser
  # Writers for the values stashed on the parser by RDF::CLI
  attr_writer :options, :args

  # Actual parsed options; a fresh empty Hash until something is assigned
  # @return [Hash]
  def options
    @options || {}
  end

  # Arguments remaining after extracting options; a fresh empty Array until assigned
  # @return [Array]
  def args
    @args || []
  end
end

module RDF
  # Individual formats can modify options by updating {Reader.options} or {Writer.options}. Format-specific commands are taken from {Format.cli_commands} for each loaded format, which returns an array of lambdas taking arguments and options.
  #
  # Status updates should be logged to `opts[:logger].info`. More complicated information can be added to `:messages` key within `opts`, if present.
  #
  # Other than `help`, all commands parse an input file.
  #
  # Multiple commands may be added in sequence to execute a pipeline.
  #
  # @example Creating Reader-specific options:
  #   class Reader
  #     def self.options
  #       [
  #         RDF::CLI::Option.new(
  #           symbol: :canonicalize,
  #           on: ["--canonicalize"],
  #           description: "Canonicalize URI/literal forms.") {true},
  #         RDF::CLI::Option.new(
  #           symbol: :uri,
  #           on: ["--uri STRING"],
  #           description: "URI.") {|v| RDF::URI(v)},
  #       ]
  #     end
  #
  # @example Creating Format-specific commands:
  #   class Format
  #     def self.cli_commands
  #       {
  #         count: {
  #           description: "",
  #           parse: true,
  #           lambda: ->(argv, opts) {}
  #         },
  #       }
  #     end
  #
  # @example Adding a command manually
  #   class MyCommand
  #     RDF::CLI.add_command(:count, description: "Count statements") do |argv, opts|
  #       count = 0
  #       RDF::CLI.parse(argv, **opts) do |reader|
  #         reader.each_statement do |statement|
  #           count += 1
  #         end
  #       end
  #       opts[:logger].info "Parsed #{count} statements"
  #     end
  #   end
  #     
  # Format-specific commands should verify that the reader and/or output format are appropriate for the command.
  class CLI

    # Option description for use within Readers/Writers. See {RDF::Reader.options} and {RDF::Writer.options} for example usage.
    class Option
      # Symbol used for this option when calling `Reader.new`
      # @return [Symbol]
      attr_reader :symbol

      # Arguments passed to OptionParser#on
      # @return [Array<String>]
      attr_reader :on

      # Description of this option (optional)
      # @return [String]
      attr_reader :description

      # Default value for this option
      # @return [Object]
      attr_reader :default

      # Potential values (for select or radio) or Ruby datatype
      # @return  [Class, Array<String>]
      attr_reader :datatype

      # Associated HTML form control
      # @return [:text, :textarea, :radio, :checkbox, :select, :url, :url2, :none]
      attr_reader :control

      # Use of this option
      # @return [:optional, :disabled, :removed, :required]
      attr_accessor :use

      ##
      # Create a new option with optional callback.
      #
      # @param [Symbol] symbol
      # @param [Array<String>] on
      # @param [String] datatype
      # @param [Object] default
      # @param [String] control
      # @param [String] description
      # @param [[:optional, :disabled, :removed, :required]] use
      # @param [Hash{Symbol => Object}] options additional keywords are accepted and ignored
      # @yield value which may be used within `OptionParser#on`
      # @yieldparam [Object] value The option value as parsed using `on` argument
      # @yieldparam [OptionParser] options (nil) optional OptionParser
      # @yieldreturn [Object] a possibly modified input value
      # @raise [ArgumentError] if `symbol` or `on` is missing
      def initialize(symbol: nil, on: nil, datatype: nil, control: nil,
                     description: nil, use: :optional, default: nil, **options, &block)
        raise ArgumentError, "symbol is a required argument" unless symbol
        raise ArgumentError, "on is a required argument" unless on
        @symbol, @on, @datatype, @control, @description, @use, @default, @callback = symbol.to_sym, Array(on), datatype, control, description, use, default, block
      end

      ##
      # Give copies made with `dup`/`clone` their own `on` array.
      #
      # A plain shallow copy shares `@on` with the original, so appending to
      # the copy's `on` (e.g. to add the description before calling
      # `OptionParser#on`) would also grow the original's argument list.
      #
      # @param [RDF::CLI::Option] other the option being copied
      # @return [void]
      def initialize_copy(other)
        super
        @on = other.on.dup
      end

      ##
      # Run the callback (if any) on a parsed option value.
      #
      # The callback is invoked according to its arity: with no arguments,
      # with the value, or with the value and `options`. A callback with any
      # other arity (including optional/splat parameters, which report a
      # negative arity) is not invoked and `arg` is returned unchanged.
      #
      # @param [Object] arg value produced by OptionParser
      # @param [Object] options passed as second argument to two-argument callbacks
      # @return [Object] callback result, or `arg` when there is no applicable callback
      def call(arg, options = {})
        return arg unless @callback

        case @callback.arity
        when 0 then @callback.call
        when 1 then @callback.call(arg)
        when 2 then @callback.call(arg, options)
        else arg
        end
      end

      # Return version of commands appropriate for use in JSON
      # @return [Hash{Symbol => Object}] `datatype` is given by name when it is a Class
      def to_hash
        {
          symbol:       symbol,
          datatype:     (datatype.is_a?(Class) ? datatype.name : datatype),
          default:      default,
          control:      control,
          description:  description,
          use:          use
        }
      end
    end

    # Built-in commands. Other commands are imported from the Format class of different readers/writers using {RDF::Format#cli_commands}. `COMMANDS` is a Hash whose keys are commands that may be executed by {RDF::CLI.exec}. The value is a hash containing the following keys:
    # * `description` used for providing information about the command.
    # * `parse` Boolean value to determine if input files should automatically be parsed into `repository`.
    # * `help` used for the CLI help output.
    # * `lambda` code run to execute command.
    # * `filter` value is a Hash whose keys are matched against selected command options. All specified `key/value` pairs are compared against the equivalent key in the current invocation.
    #            If an Array, option value (as a string) must match any value of the array (as a string)
    #            If a Proc, it is passed the option value and must return `true`.
    #            Otherwise, the option value (as a string) must equal the  `value` (as a string).
    # * `control` Used to indicate how (if) command is displayed
    # * `repository` Use this repository, if set
    # * `options` an optional array of `RDF::CLI::Option` describing command-specific options.
    # * `option_use`: A hash of option symbol to option usage, used for overriding the default status of an option for this command.
    # @return [Hash{Symbol => Hash{Symbol => Object}}]
    # NOTE: the lambdas below are created in the body of RDF::CLI, so bare
    # calls such as `repository`, `self.parse` and `self.usage`, and the
    # `@readers` instance variable, are resolved against the class itself.
    # The hash is intentionally left unfrozen: `add_command` adds entries to it.
    COMMANDS = {
      # Counts statements by streaming the input itself (`parse: false`), and
      # only when the shared repository is still empty.
      count: {
        description: "Count statements in parsed input",
        parse: false,
        control: :none,
        help: "count [options] [args...]\nreturns number of parsed statements",
        lambda: ->(argv, opts) do
          unless repository.count > 0
            start = Time.new
            count = 0
            self.parse(argv, **opts) do |reader|
              reader.each_statement do |statement|
                count += 1
              end
            end
            secs = Time.new - start
            opts[:output].puts "Parsed #{count} statements with #{@readers.join(', ')} in #{secs} seconds @ #{count/secs} statements/second."
          end
        end,
        option_use: {output_format: :disabled}
      },
      # Prints the usage message; has no `help` text of its own.
      help: {
        description: "This message",
        parse: false,
        control: :none,
        lambda: ->(argv, opts) {self.usage(self.options)}
      },
      # Prints a "Lengths" header, then the string length of every statement in the repository.
      lengths: {
        description: "Lengths of each parsed statement",
        parse: true,
        control: :none,
        help: "lengths [options] [args...]\nreturns lengths of each parsed statement",
        lambda: ->(argv, opts) do
          opts[:output].puts "Lengths"
          repository.each_statement do |statement|
            opts[:output].puts statement.to_s.size
          end
        end,
        option_use: {output_format: :disabled}
      },
      # Prints an "Objects" header, then each object yielded by `each_object` as N-Triples.
      objects: {
        description: "Serialize each parsed object to N-Triples",
        parse: true,
        control: :none,
        help: "objects [options] [args...]\nreturns unique objects serialized in N-Triples format",
        lambda: ->(argv, opts) do
          opts[:output].puts "Objects"
          repository.each_object do |object|
            opts[:output].puts object.to_ntriples
          end
        end,
        option_use: {output_format: :disabled}
      },
      # Prints a "Predicates" header, then each predicate yielded by `each_predicate` as N-Triples.
      predicates: {
        parse: true,
        description: "Serialize each parsed predicate to N-Triples",
        control: :none,
        help: "predicates [options] [args...]\nreturns unique predicates serialized in N-Triples format",
        lambda: ->(argv, opts) do
          opts[:output].puts "Predicates"
          repository.each_predicate do |predicate|
            opts[:output].puts predicate.to_ntriples
          end
        end,
        option_use: {output_format: :disabled}
      },
      # Writes the repository with the writer for `opts[:output_format]`,
      # falling back to N-Triples when no writer is found. This is the only
      # built-in command without `control: :none`.
      serialize: {
        description: "Serialize using output-format (or N-Triples)",
        parse: true,
        help: "serialize [options] [args...]\nserialize output using specified format (or N-Triples if not specified)",
        lambda: ->(argv, opts) do
          writer_class = RDF::Writer.for(opts[:output_format]) || RDF::NTriples::Writer
          out = opts[:output]
          # Caller-supplied options win over these defaults
          writer_opts = {prefixes: {}, standard_prefixes: true}.merge(opts)
          writer_class.new(out, **writer_opts) do |writer|
            writer << repository
          end
        end
      },
      # Prints a "Subjects" header, then each subject yielded by `each_subject` as N-Triples.
      subjects: {
        parse: true,
        control: :none,
        description: "Serialize each parsed subject to N-Triples",
        help: "subjects [options] [args...]\nreturns unique subjects serialized in N-Triples format",
        lambda: ->(argv, opts) do
          opts[:output].puts "Subjects"
          repository.each_subject do |subject|
            opts[:output].puts subject.to_ntriples
          end
        end,
        option_use: {output_format: :disabled}
      },
      # Prints "Input is valid" or "Input is invalid" according to `repository.valid?`.
      validate: {
        description: "Validate parsed input",
        control: :none,
        parse: true,
        help: "validate [options] [args...]\nvalidates resulting repository (may also be used with --validate to check for parse-time errors)",
        lambda: ->(argv, opts) do
          opts[:output].puts "Input is " + (repository.valid? ? "" : "in") + "valid"
        end,
        option_use: {output_format: :disabled}
      }
    }

    # Options to setup, may be modified by selected command. Options are also read from {RDF::Reader#options} and {RDF::Writer#options}. When a specific input- or output-format is selected, options are also discovered from the associated subclass reader or writer.
    # @return [Array<RDF::CLI::Option>]
    # Built-in options come first so that, after `uniq(&:symbol)`, they take
    # precedence over same-named options from RDF::Reader / RDF::Writer.
    OPTIONS = ([
      RDF::CLI::Option.new(
        symbol: :debug,
        control: :checkbox,
        datatype: TrueClass,
        on: ["-d", "--debug"],
        description: 'Enable debug output for troubleshooting.'),
      RDF::CLI::Option.new(
        symbol: :verbose,
        control: :checkbox,
        datatype: TrueClass,
        on: ['-v', '--verbose'],
        description: 'Enable verbose output. May be given more than once.'),
      # NOTE(review): the switch takes a STRING but `datatype` is TrueClass;
      # left unchanged because the value is exposed through `to_hash` -- confirm intent.
      RDF::CLI::Option.new(
        symbol: :evaluate,
        control: :none,
        datatype: TrueClass,
        on: ["-e", "--evaluate STRING"],
        description: "Evaluate argument as RDF input, if no files are specified"),
      RDF::CLI::Option.new(
        symbol: :output,
        control: :none,
        on: ["-o", "--output FILE"],
        description: "File to write output, defaults to STDOUT") {|arg| File.open(arg, "w")},
      RDF::CLI::Option.new(
        symbol: :ordered,
        control: :checkbox,
        datatype: TrueClass,
        on: ["--ordered"],
        description: "Use order preserving repository"),
      RDF::CLI::Option.new(
        symbol: :format,
        control: :select,
        datatype: RDF::Format.select {|ft| ft.reader}.map(&:to_sym).sort,
        on: ["--input-format FORMAT", "--format FORMAT"],
        description: "Format of input file, uses heuristic if not specified"
      ) do |arg, options|
          unless reader = RDF::Reader.for(arg.downcase.to_sym)
            RDF::CLI.abort "No reader found for #{arg.downcase.to_sym}. Available readers:\n  #{RDF::CLI.formats(reader: true).join("\n  ")}"
          end

          # Add format-specific reader options
          reader.options.each do |cli_opt|
            next if options.options.key?(cli_opt.symbol)
            # Work on a copy so the description is not appended to the
            # Option's own `on` array each time this callback runs.
            on_args = Array(cli_opt.on).dup
            on_args << cli_opt.description if cli_opt.description
            options.on(*on_args) do |opt_arg|
              options.options[cli_opt.symbol] = cli_opt.call(opt_arg, options)
            end
          end if reader
          arg.downcase.to_sym
        end,
      RDF::CLI::Option.new(
        symbol: :output_format,
        control: :select,
        datatype: RDF::Format.select {|ft| ft.writer}.map(&:to_sym).sort,
        on: ["--output-format FORMAT"],
        description: "Format of output file, defaults to NTriples"
      ) do |arg, options|
          unless writer = RDF::Writer.for(arg.downcase.to_sym)
            RDF::CLI.abort "No writer found for #{arg.downcase.to_sym}. Available writers:\n  #{RDF::CLI.formats(writer: true).join("\n  ")}"
          end

          # Add format-specific writer options
          writer.options.each do |cli_opt|
            next if options.options.key?(cli_opt.symbol)
            # Work on a copy so the description is not appended to the
            # Option's own `on` array each time this callback runs.
            on_args = Array(cli_opt.on).dup
            on_args << cli_opt.description if cli_opt.description
            options.on(*on_args) do |opt_arg|
              options.options[cli_opt.symbol] = cli_opt.call(opt_arg, options)
            end
          end if writer
          arg.downcase.to_sym
        end,
    ] + RDF::Reader.options + RDF::Writer.options).uniq(&:symbol)

    class << self
      # Repository holding the statements parsed for the current invocation
      # @return [RDF::Repository]
      attr_reader :repository
      attr_writer :repository
    end

    ##
    # @return [String]
    def self.basename
      # Name of the running script, without any directory part
      File.basename($PROGRAM_NAME)
    end

    ##
    # Return OptionParser set with appropriate options
    #
    # The yield return should provide one or more commands from which additional options will be extracted.
    # @overload options(argv)
    #   @param [Array<String>] argv
    #   @return [OptionParser]
    # @overload options(argv, format: :json)
    #   @param [Array<String>] argv
    #   @param [:json] format (:json)
    #   @return [Array<RDF::CLI::Option>]
    #     Returns discovered options
    def self.options(argv, format: nil)
      options = OptionParser.new
      # Copies, so that per-command `use` overrides do not leak into OPTIONS
      cli_opts = OPTIONS.map(&:dup)
      logger = Logger.new($stderr)
      logger.level = Logger::WARN
      logger.formatter = lambda {|severity, datetime, progname, msg| "#{severity} #{msg}\n"}
      opts = options.options = {logger: logger}

      # Pre-load commands
      load_commands

      # Add options for the specified command(s)
      cmds, args = argv.partition {|e| COMMANDS.include?(e.to_sym)}
      cmds.each do |cmd|
        Array(COMMANDS[cmd.to_sym][:options]).each do |option|
          # Replace any existing option with the same symbol
          cli_opts.delete_if {|cli_opt| cli_opt.symbol == option.symbol}

          # Command-specific options go first; they are added regardless of their `use`
          cli_opts.unshift(option)
        end

        # Update usage of options for this command
        COMMANDS[cmd.to_sym].fetch(:option_use, {}).each do |sym, use|
          if opt = cli_opts.find {|cli_opt| cli_opt.symbol == sym}
            opt.use = use
          end
        end
      end

      cli_opts.each do |cli_opt|
        next if opts.key?(cli_opt.symbol)
        # Build the OptionParser#on arguments on a copy. The `dup` above is
        # shallow, so appending to `cli_opt.on` itself would grow the array
        # shared with OPTIONS and repeat the description on every call.
        on_args = Array(cli_opt.on).dup
        on_args << cli_opt.description if cli_opt.description
        options.on(*on_args) do |arg|
          opts[cli_opt.symbol] = cli_opt.call(arg, options)
        end
      end

      if format == :json
        # Return options
        cli_opts.map(&:to_hash)
      else
        options.banner = "Usage: #{self.basename} command+ [options] [args...]"

        options.on_tail('-V', '--version', 'Display the RDF.rb version and exit.') do
          puts RDF::VERSION; exit(0)
        end

        show_help = false
        options.on_tail("-h", "--help", "Show this message") do
          show_help = true
        end

        begin
          args = options.parse!(args)
        rescue OptionParser::InvalidOption, OptionParser::InvalidArgument, ArgumentError => e
          # Resolves to RDF::CLI.abort, which prefixes the program name and exits
          abort e
        end

        # Make sure options are processed first
        if show_help
          self.usage(options); exit(0)
        end

        # Commands first, then whatever arguments remain after option parsing
        options.args = cmds + args
        options
      end
    end

    ##
    # Output usage message
    def self.usage(options, cmd_opts: {}, banner: nil)
      # `cmd_opts` is accepted for callers but not used when printing
      options.banner = banner if banner

      out = $stdout
      # The parser renders its own banner and option summary
      out.puts options
      out.puts "Note: available commands and options may be different depending on selected --input-format and/or --output-format."

      # Commands are filtered by the options parsed so far
      command_lines = commands(**options.options)
      out.puts "Available commands:\n\t#{command_lines.join("\n\t")}"

      format_lines = formats
      out.puts "Available formats:\n\t#{format_lines.join("\n\t")}"
    end

    ##
    # Execute one or more commands, parsing input as necessary
    #
    # @param  [Array<String>] args
    # @param  [IO] output
    # @param  [OptionParser] option_parser
    # @param [Hash{Symbol => Hash{Symbol => Array[String]}}] messages used for conveying non primary-output which is structured.
    # @param  [Hash{Symbol => Object}] options
    # @return [Boolean]
    def self.exec(args, output: $stdout, option_parser: nil, messages: {}, **options)
      option_parser ||= self.options(args)
      options[:logger] ||= option_parser.options[:logger]
      output.set_encoding(Encoding::UTF_8) if output.respond_to?(:set_encoding) && RUBY_PLATFORM == "java"

      # Separate commands from file options; arguments already extracted
      cmds, args = args.partition {|e| COMMANDS.include?(e.to_sym)}

      if cmds.empty?
        usage(option_parser)
        raise ArgumentError, "No command given"
      end

      # `help [command]` only prints usage; nothing is parsed or executed
      if cmds.first == 'help'
        on_cmd = cmds[1]
        cmd_opts = COMMANDS.fetch(on_cmd.to_s.to_sym, {})
        if on_cmd && cmd_opts[:help]
          usage(option_parser, cmd_opts: cmd_opts, banner: "Usage: #{self.basename.split('/').last} #{COMMANDS[on_cmd.to_sym][:help]}")
        elsif on_cmd
          usage(option_parser, cmd_opts: cmd_opts)
        else
          usage(option_parser)
        end
        return
      end

      # Make sure any selected command isn't filtered out
      cmds.each do |c|
        COMMANDS[c.to_sym].fetch(:filter, {}).each do |opt, val|
          case val
          when Array
            unless val.map(&:to_s).include?(options[opt].to_s)
              usage(option_parser, banner: "Command #{c.inspect} requires #{opt} in #{val.map(&:to_s).inspect}, not #{options.fetch(opt, 'null')}")
              raise ArgumentError, "Incompatible command #{c} used with option #{opt}=#{options[opt]}"
            end
          when Proc
            unless val.call(options[opt])
              usage(option_parser, banner: "Command #{c.inspect} #{opt} inconsistent with #{options.fetch(opt, 'null')}")
              raise ArgumentError, "Incompatible command #{c} used with option #{opt}=#{options[opt]}"
            end
          else
            unless val.to_s == options[opt].to_s
              usage(option_parser, banner: "Command #{c.inspect} requires compatible value for #{opt}, not #{options.fetch(opt, 'null')}")
              raise ArgumentError, "Incompatible command #{c} used with option #{opt}=#{options[opt]}"
            end
          end
        end

        # The command may specify a repository instance to use
        options[:repository] ||= COMMANDS[c.to_sym][:repository]
      end

      # Hacks for specific options
      options[:logger].level = Logger::INFO if options[:verbose]
      options[:logger].level = Logger::DEBUG if options[:debug]
      options[:format] = options[:format].to_sym if options[:format]
      options[:output_format] = options[:output_format].to_sym if options[:output_format]

      # Allow repository to be set via option.
      # Otherwise RDF::OrderedRepo is used whenever it is defined (whether or
      # not `ordered` was given); without it, `ordered` selects an Array
      # extended as an RDF enumerable, and the default is RDF::Repository.
      @repository = options[:repository] || case
        when RDF.const_defined?(:OrderedRepo) then RDF::OrderedRepo.new
        when options[:ordered] then [].extend(RDF::Enumerable, RDF::Queryable)
        else RDF::Repository.new
      end

      # Parse input files if any command requires it
      if cmds.any? {|c| COMMANDS[c.to_sym][:parse]}
        start = Time.new
        self.parse(args, **options) do |reader|
          reader.each_statement {|st| @repository << st}
          # Remember prefixes from reading
          options[:prefixes] ||= reader.prefixes
        end
        secs = Time.new - start
        # Rate is derived from the statements actually loaded
        parsed = repository.count
        options[:logger].info "Parsed #{parsed} statements with #{@readers.join(', ')} in #{secs} seconds @ #{parsed/secs} statements/second."
      end

      # Run each command in sequence
      cmds.each do |command|
        COMMANDS[command.to_sym][:lambda].call(args,
          output: output,
          messages: messages,
          **options.merge(repository: repository))
      end

      # Normalize messages so every kind maps to a Hash (bare values go under :result)
      messages.each do |kind, term_messages|
        case term_messages
        when Hash
        when Array
          messages[kind] = {result: term_messages}
        else
          messages[kind] = {result: [term_messages]}
        end
      end

      # Caller-supplied statistics hash is filled in place
      if options[:statistics]
        options[:statistics][:reader] = @readers.first unless (@readers || []).empty?
        options[:statistics][:count] = @repository.count
      end
    end

    ##
    # @overload commands(**options)
    #   @param [Hash{Symbol => Object}] options already set
    #   @return [Array<String>] list of executable commands
    # @overload commands(format: :json, **options)
    #   Returns commands as JSON, for API usage.
    #   @param [:json] format
    #   @param [Hash{Symbol => Object}] options already set
    #   @return [Array{Object}]
    #     Returns an array of commands including the command symbol
    def self.commands(format: nil, **options)
      # First, load commands from other formats
      load_commands

      case format
      when :json
        COMMANDS.map do |k, v|
          v = v.merge(symbol: k, options: v.fetch(:options, []).map(&:to_hash))
          v.delete(:lambda)
          v.delete(:help)
          v.delete(:options) if v[:options].empty?
          v[:control] == :none ? nil : v
        end.compact
      else
        # Subset commands based on filter options
        cmds = COMMANDS.reject do |k, c|
          c.fetch(:filter, {}).any? do |opt, val|
            case val
            when Array
              !val.map(&:to_s).include?(options[opt].to_s)
            when Proc
              !val.call(options[opt])
            else
              val.to_s != options[opt].to_s
            end
          end
        end

        sym_len = cmds.keys.map {|k| k.to_s.length}.max
        cmds.keys.sort.map do |k|
          "%*s: %s" % [sym_len, k, cmds[k][:description]]
        end
      end
    end

    ##
    # Load commands from formats
    # @return [Hash{Symbol => Hash{Symbol => Object}}]
    def self.load_commands
      # Formats are only scanned once per process
      return COMMANDS if @commands_loaded

      RDF::Format.each do |format_class|
        format_class.cli_commands.each do |name, spec|
          # Anything that is not a Hash is taken to be the command's lambda
          spec = {lambda: spec} unless spec.is_a?(Hash)
          add_command(name, **spec)
        end
      end
      @commands_loaded = true

      COMMANDS
    end

    ##
    # Add a command.
    #
    # @param [#to_sym] command
    # @param [Hash{Symbol => String}] options
    # @option options [String] description
    # @option options [String] help string to display for help
    # @option options [Boolean] parse parse input files in to Repository, or not.
    # @option options [Array<RDF::CLI::Option>] options specific to this command
    # @yield argv, opts
    # @yieldparam [Array<String>] argv
    # @yieldparam [Hash] opts
    # @yieldreturn [void]
    def self.add_command(command, **options, &block)
      # A block, when supplied, becomes (or replaces) the command's :lambda
      options[:lambda] = block unless block.nil?

      # First registration wins: an existing definition is never overwritten
      name = command.to_sym
      COMMANDS[name] ||= options
    end

    ##
    # @return [Array<String>] list of available formats
    def self.formats(reader: false, writer: false)
      # `reader` takes precedence over `writer`; with neither, any format
      # having a reader or a writer is listed
      chosen = RDF::Format.sort_by(&:to_sym).select do |ft|
        if reader
          ft.reader
        elsif writer
          ft.writer
        else
          ft.reader || ft.writer
        end
      end

      # One entry per format symbol, keeping sorted order
      names = chosen.each_with_object({}) {|ft, acc| acc[ft.to_sym] = ft.name}

      # Right-align symbols to the longest one
      width = names.keys.map {|sym| sym.to_s.length}.max
      names.map {|sym, title| "#{sym.to_s.rjust(width)}: #{title}"}
    end

    ##
    # Parse each file, $stdin or specified string in `options[:evaluate]`
    # yielding a reader
    #
    # @param  [Array<String>] files
    # @param  [String] evaluate from command-line, rather than referenced file
    # @param  [Symbol] format (:ntriples) Reader symbol for finding reader
    # @param  [Encoding] encoding set on the input
    # @param  [Hash{Symbol => Object}] options sent to reader
    # @yield  [reader]
    # @yieldparam [RDF::Reader]
    # @return [nil]
    def self.parse(files, evaluate: nil, format: nil, encoding: Encoding::UTF_8, **options, &block)
      if files.empty?
        # No files given: read the `evaluate` string if present, otherwise $stdin
        input = evaluate ? StringIO.new(evaluate) : $stdin
        input.set_encoding(encoding)
        unless format
          # Sniff the format from the content. The content is buffered rather
          # than rewound: $stdin attached to a pipe is not seekable, and
          # `rewind` on it raises Errno::ESPIPE.
          sample = input.read
          input = StringIO.new(sample)
        end
        r = RDF::Reader.for(format || {sample: sample})
        raise ArgumentError, "Unknown format for evaluated input" unless r
        # Record the reader by name, as the file branch below does
        (@readers ||= []) << r.to_s
        r.new(input, **options) do |reader|
          yield(reader)
        end
      else
        # Let RDF::Reader.open pick the reader, honouring an explicit format
        options[:format] = format if format
        files.each do |file|
          RDF::Reader.open(file, **options) do |reader|
            (@readers ||= []) << reader.class.to_s
            yield(reader)
          end
        end
      end
    end

    ##
    # @param  [String] msg
    # @return [void]
    def self.abort(msg)
      # Prefix with the program name, then let Kernel print it and exit
      prefixed = "#{basename}: #{msg}"
      Kernel.abort(prefixed)
    end
  end
end