ruby-rdf/rdf

View on GitHub
lib/rdf/util/file.rb

Summary

Maintainability
D
1 day
Test Coverage
require 'net/http'
require 'link_header'
require 'time'
require 'openssl'

module RDF; module Util
  ##
  # Wrapper for retrieving RDF resources from HTTP(S) and file: scheme locations.
  # 
  # By default, HTTP(S) resources are retrieved using Net::HTTP. However, 
  # If the [Rest Client](https://rubygems.org/gems/rest-client) gem is included,
  # it will be used for retrieving resources, allowing for
  # sophisticated HTTP caching using [REST Client Components](https://rubygems.org/gems/rest-client-components)
  # allowing the use of `Rack::Cache` to avoid network access.
  #
  # To use other HTTP clients, consumers can subclass 
  # {RDF::Util::File::HttpAdapter} and set the {RDF::Util::File.}.
  #
  # Also supports the file: scheme for access to local files.
  #
  #
  # @since 0.2.4
  module File

    ##
    # @abstract Subclass and override {.open_url} to implement a custom adapter
    # @since 1.2
    class HttpAdapter
      ##
      # @param [Array, String] headers
      #   HTTP Request headers
      # @return [Hash] A hash of HTTP request headers
      def self.headers headers: {}
        headers['Accept'] ||= default_accept_header
        headers['User-Agent'] ||= default_user_agent
        headers
      end

      ##
      # @return [String] the value for an Accept header
      def self.default_accept_header
        (RDF::Format.accept_types + %w(*/*;q=0.1)).join(", ")
      end

      ##
      # @return [String] the default User-Agent used when fetching resources.
      def self.default_user_agent
        "Ruby RDF.rb/#{RDF::VERSION}"
      end

      ##
      # @abstract
      # @param [String] base_uri to open
      # @param [String] proxy
      #   HTTP Proxy to use for requests.
      # @param [Array, String] headers ({})
      #   HTTP Request headers
      #
      #   Defaults `Accept` header based on available reader content types to allow for content negotiation based on available readers.
      #
      #   Defaults  `User-Agent` header, unless one is specified.
      # @param [Boolean] verify_none (false)
      #   Don't verify SSL certificates
      # @param  [Hash{Symbol => Object}] options
      #   options are ignored in this implementation. Applications are encouraged
      #   to override this implementation to provide more control over HTTP
      #   headers and redirect following.
      # @return [RemoteDocument, Object] A {RemoteDocument}. If a block is given, the result of evaluating the block is returned.
      # @raise [IOError] if not found
      def self.open_url(base_uri, proxy: nil, headers: {}, verify_none: false, **options)
        raise NoMethodError.new("#{self.inspect} does not implement required method `open_url` for ", "open_url")
      end
    end

    ##
    # If the [Rest Client](https://rubygems.org/gems/rest-client) gem is included,
    # it will be used for retrieving resources allowing for
    # sophisticated HTTP caching using [REST Client Components](https://rubygems.org/gems/rest-client-components)
    # allowing the use of `Rack::Cache` to avoid network access.
    # @since 1.2
    class RestClientAdapter < HttpAdapter
      # (see HttpAdapter.open_url)
      def self.open_url(base_uri, proxy: nil, headers: {}, verify_none: false, **options)
        ssl_verify = verify_none ? OpenSSL::SSL::VERIFY_NONE : OpenSSL::SSL::VERIFY_PEER

        # If RestClient is loaded, prefer it
        RestClient.proxy = proxy.to_s if proxy
        client = RestClient::Resource.new(base_uri, verify_ssl: ssl_verify)
        client.get(headers(headers: headers)) do |response, request, res, &blk|
          case response.code
          when 200..299
            # found object

            # If a Location is returned, it defines the base resource for this file, not it's actual ending location
            document_options = {
              base_uri:     RDF::URI(response.headers.fetch(:location, base_uri)),
              code:         response.code.to_i,
              headers:      response.headers
            }

            RemoteDocument.new(response.body, document_options)
          when 300..399
            # Document base is redirected location
            # Location may be relative
            base_uri = ::URI.join(base_uri, response.headers[:location].to_s).to_s
            response.follow_redirection(&blk)
          else
            raise IOError, "<#{base_uri}>: #{response.code}"
          end
        end
      end
    end

    ##
    # Net::HTTP adapter to retrieve resources without additional dependencies
    # @since 1.2
    class NetHttpAdapter < HttpAdapter
      # (see HttpAdapter.open_url)
      def self.open_url(base_uri, proxy: nil, headers: {}, verify_none: false, **options)
        ssl_verify = verify_none ? OpenSSL::SSL::VERIFY_NONE : OpenSSL::SSL::VERIFY_PEER

        redirect_count = 0
        max_redirects = 5
        parsed_url = RDF::URI.parse(base_uri)
        parsed_proxy = RDF::URI.parse(proxy.to_s)
        base_uri = parsed_url.to_s
        remote_document = nil

        until remote_document do
          Net::HTTP::start(parsed_url.host, parsed_url.port,
                          parsed_proxy.host, parsed_proxy.port,
                          open_timeout: 60 * 1000,
                          use_ssl: parsed_url.scheme == 'https',
                          verify_mode: ssl_verify
          ) do |http|
            request = Net::HTTP::Get.new(parsed_url.request_uri, headers(headers: headers))
            http.request(request) do |response|
              case response
              when Net::HTTPSuccess
                # found object

                # Normalize headers using symbols
                response_headers = response.to_hash.inject({}) do |out, (key, value)|
                  out[key.gsub(/-/, '_').downcase.to_sym] = %w{ set-cookie }.include?(key.downcase) ? value : value.first
                  out
                end

                # If a Location is returned, it defines the base resource for this file, not it's actual ending location
                document_options = {
                  base_uri:     RDF::URI(response["Location"] ? response["Location"] : base_uri),
                  code:         response.code.to_i,
                  content_type: response.content_type,
                  headers:      response_headers
                }.merge(response.type_params)
                document_options[:last_modified] = DateTime.parse(response["Last-Modified"]) if response["Last-Modified"]

                remote_document = RemoteDocument.new(response.body, document_options)
              when Net::HTTPRedirection
                # Follow redirection
                raise IOError, "Too many redirects" if (redirect_count += 1) > max_redirects

                # Location may be relative
                parsed_url = ::URI.join(base_uri, response["Location"])

                base_uri = parsed_url.to_s
              else
                raise IOError, "<#{parsed_url}>: #{response.message}(#{response.code})"
              end
            end
          end
        end
        remote_document
      end
    end

    ##
    # Use Faraday for retrieving resources
    # @since 1.2
    class FaradayAdapter < HttpAdapter
      class <<self
        ##
        # Set the Faraday::Connection to use for retrieving RDF resources
        def conn= conn
          @conn = conn
        end

        ##
        # Get the Faraday::Connection to use for retrieving RDF resources,
        # or a default connect that follows redirects.
        def conn
          @conn ||= Faraday.new do |conn|
            conn.use FaradayMiddleware::FollowRedirects
            conn.adapter Faraday.default_adapter
          end
        end
      end

      # (see HttpAdapter.open_url)
      def self.open_url(base_uri, proxy: nil, headers: {}, verify_none: false, **options)
        response = conn.get do |req|
          req.url base_uri
          headers(headers: headers).each do |k,v|
            req.headers[k] = v
          end
        end

        case response.status
        when 200..299
          # found object

          # If a Location is returned, it defines the base resource for this file, not it's actual ending location
          document_options = {
            base_uri:     RDF::URI(response.headers.fetch(:location, response.env.url)),
            code:         response.status,
            headers:      response.headers
          }

          RemoteDocument.new(response.body, document_options)
        else
          raise IOError, "<#{base_uri}>: #{response.status}"
        end
      end
    end

    class << self
      ##
      # Set the HTTP adapter
      # @see .http_adapter
      # @param [HttpAdapter] http_adapter
      # @return [HttpAdapter]
      # @since 1.2
      def http_adapter= http_adapter
        @http_adapter = http_adapter
      end

      ##
      # Get current HTTP adapter. If no adapter has been explicitly set,
      # use RestClientAdapter (if RestClient is loaded), or the NetHttpAdapter
      #
      # @param [Boolean] use_net_http use the NetHttpAdapter, even if other
      #      adapters have been configured
      # @return [HttpAdapter]
      # @since 1.2
      def http_adapter(use_net_http = false)
        if use_net_http
          NetHttpAdapter
        else
          @http_adapter ||= begin
            # Otherwise, fallback to Net::HTTP
            if defined?(RestClient)
              RestClientAdapter
            else
              NetHttpAdapter
            end
          end
        end
      end
    end

    ##
    # Open the file, returning or yielding {RemoteDocument}.
    #
    # Input received as non-unicode, is transformed to UTF-8. With Ruby >= 2.2, all UTF is normalized to [Unicode Normalization Form C (NFC)](http://unicode.org/reports/tr15/#Norm_Forms).
    #
    # HTTP resources may be retrieved via proxy using the `proxy` option. If `RestClient` is loaded, they will use the proxy globally by setting something like the following:
    #     `RestClient.proxy = "http://proxy.example.com/"`.
    # When retrieving documents over HTTP(S), use the mechanism described in [Providing and Discovering URI Documentation](http://www.w3.org/2001/tag/awwsw/issue57/latest/) to pass the appropriate `base_uri` to the block or as the return.
    #
    # Applications needing HTTP caching may consider [Rest Client](https://rubygems.org/gems/rest-client) and [REST Client Components](https://rubygems.org/gems/rest-client-components) allowing the use of `Rack::Cache` as a local file cache.
    #
    # @example using a local HTTP cache
    #    require 'restclient/components'
    #    require 'rack/cache'
    #    RestClient.enable Rack::Cache
    #    RDF::Util::File.open_file("http://example.org/some/resource")
    #      # => Cached resource if current, otherwise returned resource
    #
    # @param [String] filename_or_url to open
    # @param [String] proxy
    #   HTTP Proxy to use for requests.
    # @param [Array, String] headers ({})
    #   HTTP Request headers
    #
    #   Defaults `Accept` header based on available reader content types to allow for content negotiation based on available readers.
    #
    #   Defaults  `User-Agent` header, unless one is specified.
    #
    # @param [Boolean] verify_none (false)
    #   Don't verify SSL certificates
    # @param  [Hash{Symbol => Object}] options
    #   options are ignored in this implementation. Applications are encouraged
    #   to override this implementation to provide more control over HTTP
    #   headers and redirect following. If opening as a file,
    #   options are passed to `Kernel.open`.
    # @return [RemoteDocument, Object] A {RemoteDocument}. If a block is given, the result of evaluating the block is returned.
    # @yield [ RemoteDocument] A {RemoteDocument} for local files
    # @yieldreturn [Object] returned from open_file
    # @raise [IOError] if not found
    def self.open_file(filename_or_url, proxy: nil, headers: {}, verify_none: false, **options, &block)
      filename_or_url = $1 if filename_or_url.to_s.match(/^file:(.*)$/)
      remote_document = nil

      if filename_or_url.to_s.match?(/^https?/)
        base_uri = filename_or_url.to_s

        remote_document = self.http_adapter(!!options[:use_net_http]).
          open_url(base_uri,
                   proxy:       proxy,
                   headers:     headers,
                   verify_none: verify_none,
                   **options)
      else
        # Fake content type based on found format
        format = RDF::Format.for(filename_or_url.to_s)
        content_type = format ? format.content_type.first : 'text/plain'
        # Open as a file, passing any options
        begin
          url_no_frag_or_query = RDF::URI(filename_or_url).dup
          url_no_frag_or_query.query = nil
          url_no_frag_or_query.fragment = nil
          options[:encoding] ||= Encoding::UTF_8
          Kernel.open(url_no_frag_or_query, "r", **options) do |file|
            document_options = {
              base_uri:     filename_or_url.to_s,
              charset:      file.external_encoding.to_s,
              code:         200,
              content_type: content_type,
              last_modified:file.mtime,
              headers:      {content_type: content_type, last_modified: file.mtime.xmlschema}
            }

            remote_document = RemoteDocument.new(file.read, document_options)
          end
        rescue Errno::ENOENT => e
          raise IOError, e.message
        end
      end

      if block_given?
        yield remote_document
      else
        remote_document
      end
    end

    ##
    # A RemoteDocument contains the body and headers of a remote resource.
    #
    # Link headers are parsed using the `LinkHeader` gem
    # @see https://github.com/asplake/link_header
    class RemoteDocument < StringIO
      # Base URI based on resource location or returned Location header.
      # @return [String]
      attr_reader :base_uri

      # Content-Type of the returned resource
      # @return [String]
      attr_reader :content_type

      # Encoding of resource (from Content-Type), downcased. Also applied to content if it is UTF
      # @return [String]
      attr_reader :charset

      # Parameters from Content-Type
      # @return {Symbol => String}]
      attr_reader :parameters

      # Response code
      # @return [Integer]
      attr_reader :code

      ##
      # ETag from headers
      # @return [String]
      attr_reader :etag

      # Last-Modified time from headers
      # @return [DateTime]
      attr_reader :last_modified

      # Raw headers from response
      # @return [Hash{Symbol => Object}]
      attr_reader :headers

      # Originally requested URL
      # @return [String]
      attr_reader :requested_url

      ##
      # Set content
      # @param [String] body entity content of request.
      def initialize(body, options = {})
        options.each do |key, value|
          # de-quote charset
          matchdata = value.match(/^["'](.*)["']$/.freeze) if key == "charset"
          value = matchdata[1] if matchdata
          value = value.downcase if value.is_a?(String)
          instance_variable_set(:"@#{key}", value)
        end
        @headers = options.fetch(:headers, {})
        @charset = options[:charset].to_s.downcase if options[:charset]
        @parameters = {}

        # Find Content-Type and extract other parameters
        if headers[:content_type]
          ct, *params = headers[:content_type].split(';').map(&:strip)
          @content_type ||= ct

          # Find charset
          params.each do |param|
            p, v = param.split('=')
            @parameters[p.downcase.to_sym] = v.sub(/^["']?([^"']*)["']?$/, '\1')
            @charset ||= @parameters[p.downcase.to_sym].downcase if p.downcase == 'charset'
          end
        end

        @etag = headers[:etag]
        @last_modified = DateTime.parse(headers[:last_modified]) if headers[:last_modified]
        encoding = @charset ||= "utf-8"

        unless encoding.start_with?("utf")
          body.force_encoding(Encoding::UTF_8)
          encoding = "utf-8"

          # Make sure Unicode is in NFC
          begin
            body.unicode_normalize! unless !body.unicode_normalized?
          rescue Encoding::CompatibilityError
            # Oh, well ...
          end if body.respond_to?(:unicode_normalized?)
        end

        super(body).set_encoding encoding
      end

      ##
      # Returns a list of encodings in Content-Encoding field as an array of strings.
      #
      # The encodings are downcased for canonicalization.
      # @return [Array<String>]
      def content_encoding
        headers.fetch(:content_encoding, "").split(',').map(&:strip).map(&:downcase)
      end

      ##
      # Return links from the Link header.
      #
      # Links can be returned in array form, or searched.
      #
      # @example
      #
      #     d = RemoteDocument.new(...)
      #     describedby = links.find_link(['rel', 'describedby']).href
      #
      # @return [::LinkHeader]
      def links
        @links ||= LinkHeader.parse(@headers[:link])
      end
    end
  end # File
end; end # RDF::Util