buren/site_mapper

View on GitHub
lib/site_mapper/request.rb

Summary

Maintainability
A
0 mins
Test Coverage
require 'url_resolver' # TODO: Allow users to use any resolver

module SiteMapper
  # Get webpage wrapper.
  class Request
    # Link to the gem's RubyGems page; embedded in USER_AGENT below.
    # Frozen so the shared constant cannot be mutated in place.
    INFO_LINK  = 'https://rubygems.org/gems/site_mapper'.freeze
    # User-Agent string made of the SiteMapper version and INFO_LINK.
    # Frozen so the shared constant cannot be mutated in place.
    USER_AGENT = "SiteMapper/#{SiteMapper::VERSION} (+#{INFO_LINK})".freeze

    class << self
      # Fetch the body for the given URL and hand it to Nokogiri::HTML.
      # @param [String] url
      # @param [Hash] options passed on unchanged to Request.response_body
      # @return [Nokogiri::HTML::Document] whatever Nokogiri::HTML builds from the body
      def document(url, options = {})
        html_source = Request.response_body(url, options)
        Nokogiri::HTML(html_source)
      end

      # Perform an HTTP GET request for the given URL and return the response.
      #
      # The response is returned as-is: no status check is done here, so
      # non-2xx responses are returned rather than raised.
      # SSL is switched on when the (possibly resolved) URL parses as an
      # https URI, regardless of the letter case of the scheme.
      # @param [String] url to request; unless +:resolve+ is set it is parsed
      #   exactly as given
      # @param [Hash] options
      # @option options [Boolean] :resolve (false) pass the URL through
      #   resolve_url before requesting it
      # @option options [String] :user_agent (USER_AGENT) value sent in the
      #   User-Agent request header
      # @return [Net::HTTPResponse] the value returned by Net::HTTP#request
      # @example get example.com and resolve the URL
      #    Request.response('example.com', resolve: true)
      # @example get example.com and do *not* resolve the URL
      #    Request.response('http://example.com')
      # @example get example.com and resolve the URL
      #    Request.response('http://example.com', resolve: true)
      # @example get example.com and resolve the URL and use a custom User-Agent
      #    Request.response('http://example.com', resolve: true, user_agent: 'MyUserAgent')
      def response(url, options = {})
        options = {
          resolve: false,
          # Unqualified on purpose: resolves to the USER_AGENT constant
          # defined on this class.
          user_agent: USER_AGENT
        }.merge(options)
        resolved_url = options[:resolve] ? resolve_url(url) : url
        uri          = URI.parse(resolved_url)
        http         = Net::HTTP.new(uri.host, uri.port)
        # Ask the parsed URI instead of matching a case-sensitive string
        # prefix, so 'HTTPS://...' also gets an SSL connection.
        http.use_ssl = true if uri.is_a?(URI::HTTPS)

        request = Net::HTTP::Get.new(uri.request_uri)
        request['User-Agent'] = options[:user_agent]
        http.request(request)
      end

      # Perform the request and return only the body of the response.
      # Arguments are forwarded untouched to Request.response; nothing is
      # rescued here, so any exception raised while requesting propagates.
      # @see Request.response
      def response_body(*args)
        http_response = response(*args)
        http_response.body
      end

      # Resolve an URL string through UrlResolver and make sure the result
      # carries a protocol: 'http://' is put in front of it when
      # has_protocol? reports that it has none.
      # The string returned by the resolver is never modified in place; a new
      # string is built when the protocol has to be added.
      # @param [String] url to resolve
      # @return [String] a URL string that potentially is a redirected URL
      # @example Resolve google.com
      #    resolve_url('google.com')
      #    # => 'https://www.google.com'
      def resolve_url(url)
        resolved = UrlResolver.resolve(url)
        return resolved if has_protocol?(resolved)

        # Interpolate instead of String#prepend: the resolver may hand back
        # the caller's own (possibly frozen) string.
        "http://#{resolved}"
      end

      private

      # Check whether the URL string begins with an http:// or https:// prefix.
      # The check ignores letter case, so 'HTTP://example.com' counts as
      # having a protocol.
      # @param [String] url
      # @return [Boolean] true if the string starts with http:// or https://
      def has_protocol?(url)
        # \A anchors at the very start of the string (not at each line).
        !(url =~ %r{\Ahttps?://}i).nil?
      end
    end
  end
end