jaimeiniesta/metainspector

View on GitHub
examples/link_checker.rb

Summary

Maintainability
A
0 mins
Test Coverage
# A basic spider that will follow internal links, checking broken links
#
# Usage example:
#
#   ruby link_checker.rb example.com

# Load the library from this checkout. A plain `require` with a path starting
# with '../' is resolved against the current working directory, so it only
# works when the script is started from this directory; require_relative
# resolves against this file's location instead.
require_relative '../lib/metainspector'
puts "Using MetaInspector #{MetaInspector::VERSION}"

# Spider that starts at a URL, follows the internal links it finds and records
# every HTTP link whose target is not reachable.
#
# The whole crawl runs from the constructor; call #report afterwards to print
# the broken links together with the pages that link to them.
class BrokenLinkChecker
  # Crawls the site starting at +url+. Returns only once the queue is empty.
  #
  # @param url [String] starting URL
  def initialize(url)
    @url       = url
    @queue     = [] # internal URLs still waiting to be fetched
    @requested = [] # URLs already taken from the queue, exactly as they were linked
    @visited   = [] # URLs of the crawled pages, as reported by the page itself
    @ok        = [] # links already known to be reachable
    @broken    = {} # broken link => list of page URLs that link to it

    check
  end

  # Prints the number of broken links and, for each one, the pages linking to it.
  #
  # @return [void]
  def report
    puts "\n#{@broken.size} broken links found."

    @broken.each do |link, origins|
      puts "\n#{link} linked from"
      origins.each { |origin| puts " - #{origin}" }
    end
  end

  private

  # Seeds the queue and processes it until no page is left.
  def check
    # Resolves redirections of initial URL before placing it on the queue
    @queue.push(MetaInspector.new(@url).url)

    process_next_on_queue while @queue.any?
  end

  # Fetches the next queued URL, crawls the page behind it and prints progress.
  def process_next_on_queue
    requested_url = @queue.pop

    # The URL a page reports can differ from the one that was requested (the
    # initial URL is resolved through the same accessor), so the requested form
    # is remembered as well. Otherwise a link whose page reports another URL
    # would be queued again every time it is seen, possibly forever.
    @requested.push(requested_url)

    page = MetaInspector.new(requested_url)

    # Another link may already have led to this very page
    crawl(page) unless @visited.include?(page.url)

    show_stats
  end

  # Checks every HTTP link of the page and enqueues its unseen internal links.
  def crawl(page)
    @visited.push(page.url)

    page.links.http.each { |link| check_status(link, page.url) }

    # Runs after the status checks so that links just found broken are skipped
    page.links.internal.each do |link|
      @queue.push(link) if should_be_enqueued?(link)
    end
  end

  # Checks the response status of the linked_url and stores it on the ok or broken collections
  def check_status(linked_url, from_url)
    if @broken.key?(linked_url)
      # This was already known to be broken, we add another origin (only once per page)
      origins = @broken[linked_url]
      origins << from_url unless origins.include?(from_url)
    elsif !@ok.include?(linked_url)
      # We still don't know about this link status, so we check it now
      if reachable?(linked_url)
        @ok << linked_url
      else
        @broken[linked_url] = [from_url]
      end
    end
  end

  # A URL is enqueued only if it has not been processed, is not known to be
  # broken and is not already waiting on the queue.
  def should_be_enqueued?(url)
    return false if @requested.include?(url) || @visited.include?(url)
    return false if @broken.key?(url) || @queue.include?(url)

    true
  end

  # Prints a one-line progress summary.
  def show_stats
    puts format('%3s pages visited, %3s pages on queue, %2s broken links',
                @visited.size, @queue.size, @broken.size)
  end

  # A page is reachable if its response status is less than 400
  # In the case of exceptions, like timeouts or server connection errors,
  # we consider it unreachable
  def reachable?(url)
    MetaInspector.new(url).response.status < 400
  rescue StandardError
    false
  end
end

# Get the starting URL: first command-line argument, or ask for it on stdin
url = ARGV[0]

if url.nil?
  puts "Enter a starting url"
  # gets returns nil at end of input; to_s keeps strip from raising on it
  url = $stdin.gets.to_s.strip
end

# Stop with a clear message instead of handing an empty URL to the crawler
abort "No starting url given" if url.empty?

BrokenLinkChecker.new(url).report