# examples/link_checker.rb
# A basic spider that will follow internal links, checking broken links
#
# Usage example:
#
# ruby link_checker.rb example.com
# Load MetaInspector through a path resolved relative to this file rather than
# the current working directory, so the example can be started from anywhere.
require_relative '../lib/metainspector'

# Banner showing which MetaInspector version is in use
puts "Using MetaInspector #{MetaInspector::VERSION}"
# Crawls a site starting from a given URL, following its internal links and
# recording every HTTP link that cannot be reached.
#
# The whole crawl runs inside the constructor; call #report afterwards to
# print each broken link together with the pages that link to it.
class BrokenLinkChecker
  # @param url [String] URL where the crawl starts
  def initialize(url)
    @url       = url
    @queue     = [] # internal URLs still waiting to be crawled
    @visited   = {} # final (post-redirect) URLs of crawled pages, used as a set
    @redirects = {} # dequeued URL => final URL, for URLs that redirected
    @ok        = {} # URLs already verified as reachable, used as a set
    @broken    = {} # broken URL => array of pages that link to it
    check
  end

  # Prints the number of broken links and, for each one, the pages linking to it.
  def report
    puts "\n#{@broken.size} broken links found."
    @broken.each do |link, origins|
      puts "\n#{link} linked from"
      origins.each { |origin| puts " - #{origin}" }
    end
  end

  private

  # Runs the crawl until there are no more pages on the queue.
  def check
    # Resolves redirections of initial URL before placing it on the queue
    @queue.push(MetaInspector.new(@url).url)
    process_next_on_queue while @queue.any?
  end

  # Takes one URL off the queue, checks every HTTP link found on that page and
  # enqueues the internal links that have not been seen yet.
  def process_next_on_queue
    requested_url = @queue.pop
    page = MetaInspector.new(requested_url)
    final_url = page.url

    # Remember redirecting URLs as well as their targets. Without this, a page
    # that links to one of its own redirecting aliases would be enqueued again
    # on every visit and the crawl would never finish.
    @redirects[requested_url] = final_url unless requested_url == final_url

    # A page already crawled under another alias is not processed twice
    unless @visited.key?(final_url)
      page.links.http.each { |link| check_status(link, final_url) }
      @visited[final_url] = true
      page.links.internal.each do |link|
        @queue.push(link) if should_be_enqueued?(link)
      end
    end

    show_stats
  end

  # Checks the response status of the linked_url and stores it on the ok or
  # broken collections. URLs whose status is already known are not requested
  # again.
  def check_status(linked_url, from_url)
    if @broken.key?(linked_url)
      # This was already known to be broken, we add another origin (once)
      origins = @broken[linked_url]
      origins << from_url unless origins.include?(from_url)
    elsif !@ok.key?(linked_url)
      # We still don't know about this link status, so we check it now
      if reachable?(linked_url)
        @ok[linked_url] = true
      else
        @broken[linked_url] = [from_url]
      end
    end
  end

  # A URL is enqueued only if it has not been crawled, is not a known redirect
  # alias, is not known to be broken and is not already waiting on the queue.
  def should_be_enqueued?(url)
    [@visited, @redirects, @broken, @queue].none? { |known| known.include?(url) }
  end

  # Prints a one-line progress summary.
  def show_stats
    puts format('%3s pages visited, %3s pages on queue, %2s broken links',
                @visited.size, @queue.size, @broken.size)
  end

  # A page is reachable if its response status is less than 400
  # In the case of exceptions, like timeouts or server connection errors,
  # we consider it unreachable
  def reachable?(url)
    MetaInspector.new(url).response.status < 400
  rescue StandardError
    false
  end
end
# Get the starting URL: the first command-line argument, or, when none was
# given, a line read from standard input after a prompt.
url = ARGV[0]
if url.nil?
  puts "Enter a starting url"
  # gets returns nil at end of input; to_s keeps strip from raising on it
  url = $stdin.gets.to_s.strip
end

# Stop with a message instead of starting a crawl with an empty URL
abort "No starting url given" if url.empty?

BrokenLinkChecker.new(url).report