rapid7/metasploit-framework

View on GitHub
lib/robots.rb

Summary

Maintainability
B
5 hrs
Test Coverage
#
#  Copyright (c) 2008 Kyle Maxwell, contributors
#
#  Permission is hereby granted, free of charge, to any person
#  obtaining a copy of this software and associated documentation
#  files (the "Software"), to deal in the Software without
#  restriction, including without limitation the rights to use,
#  copy, modify, merge, publish, distribute, sublicense, and/or sell
#  copies of the Software, and to permit persons to whom the
#  Software is furnished to do so, subject to the following
#  conditions:
#
#  The above copyright notice and this permission notice shall be
#  included in all copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
#  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
#  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
#  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
#  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
#  OTHER DEALINGS IN THE SOFTWARE.
#

require "open-uri"
require "uri"
require "timeout"

# https://github.com/fizx/robots
class Robots
  DEFAULT_TIMEOUT = 3

  # Represents a parsed robots.txt file
  class ParsedRobots
    def initialize(uri, user_agent)
      @last_accessed = Time.at(1)

      io = Robots.get_robots_txt(uri, user_agent)

      if !io || io.content_type != "text/plain" || io.status.first != "200"
        io = StringIO.new("User-agent: *\nAllow: /\n")
      end

      @other = {}
      @disallows = {}
      @allows = {}
      @delays = {} # added delays to make it work
      agent = /.*/
      io.each do |line|
        next if line =~ /^\s*(#.*|$)/
        arr = line.split(":")
        key = arr.shift.to_s.downcase
        value = arr.join(":").strip
        value.strip!
        case key
        when "user-agent"
          agent = to_regex(value)
        when "allow"
          @allows[agent] ||= []
          @allows[agent] << to_regex(value)
        when "disallow"
          @disallows[agent] ||= []
          @disallows[agent] << to_regex(value)
        when "crawl-delay"
          @delays[agent] = value.to_i
        else
          @other[key] ||= []
          @other[key] << value
        end
      end

      @parsed = true
    end

    def allowed?(uri, user_agent)
      return true unless @parsed
      allowed = true
      path = uri.request_uri

      @disallows.each do |key, value|
        if user_agent =~ key
          value.each do |rule|
            allowed = false if path =~ rule
          end
        end
      end

      @allows.each do |key, value|
        unless allowed
          if user_agent =~ key
            value.each do |rule|
              if path =~ rule
                allowed = true
              end
            end
          end
        end
      end

      if allowed && @delays[user_agent]
        sleep @delays[user_agent] - (Time.now - @last_accessed)
        @last_accessed = Time.now
      end

      return allowed
    end

    def other_values
      @other
    end

    protected

    def to_regex(pattern)
      return /should-not-match-anything-123456789/ if pattern.strip.empty?
      pattern = Regexp.escape(pattern)
      pattern.gsub!(Regexp.escape("*"), ".*")
      Regexp.compile("^#{pattern}")
    end
  end

  def self.get_robots_txt(uri, user_agent)
    begin
      Timeout.timeout(Robots.timeout) do
        begin
          URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent)
        rescue StandardError
          nil
        end
      end
    rescue Timeout::Error
      dlog("robots.txt request timed out")
    end
  end

  attr_writer :timeout

  def self.timeout
    @timeout || DEFAULT_TIMEOUT
  end

  def initialize(user_agent)
    @user_agent = user_agent
    @parsed = {}
  end

  def allowed?(uri)
    uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
    host = uri.host
    @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
    @parsed[host].allowed?(uri, @user_agent)
  end

  def other_values(uri)
    uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
    host = uri.host
    @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
    @parsed[host].other_values
  end
end