buren/wayback_archiver

lib/wayback_archiver/url_collector.rb

require 'spidr'
require 'robots'

require 'wayback_archiver/sitemapper'
require 'wayback_archiver/request'

module WaybackArchiver
  # Retrieve URLs from different sources
  class URLCollector
    # Retrieve URLs from Sitemap.
    # @return [Array<String>] of URLs defined in Sitemap.
    # @param [String] url domain to retrieve Sitemap from.
    # @example Get URLs defined in Sitemap for google.com
    #    URLCollector.sitemap('https://google.com/sitemap.xml')
    def self.sitemap(url)
      Sitemapper.urls(url: Request.build_uri(url))
    end

    # Retrieve URLs by crawling.
    # @return [Array<String>] of URLs found during crawl.
    # @param [String] url domain to crawl URLs from.
    # @param [Array<String, Regexp>] hosts to crawl.
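    # @param [Integer] limit maximum number of pages to visit, -1 for no limit.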
    # @example Crawl URLs defined on example.com
    #    URLCollector.crawl('http://example.com')
    # @example Crawl URLs defined on example.com and limit the number of visited pages to 100
    #    URLCollector.crawl('http://example.com', limit: 100)
    # @example Crawl URLs defined on example.com and explicitly set no upper limit on the number of visited pages
    #    URLCollector.crawl('http://example.com', limit: -1)
    # @example Crawl multiple hosts
    #    URLCollector.crawl(
    #      'http://example.com',
    #      hosts: [
    #        'example.com',
    #        /host[\d]+\.example\.com/
    #      ]
    #    )
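    # @example Crawl URLs on example.com and yield each URL as it is found
    #    URLCollector.crawl('http://example.com') do |found_url|
    #      puts found_url
    #    end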
    def self.crawl(url, hosts: [], limit: WaybackArchiver.max_limit)
      urls = []
      start_at_url = Request.build_uri(url).to_s
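      # Options forwarded to Spidr; robots.txt handling and the user agent
      # come from the WaybackArchiver configuration.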
      options = {
        robots: WaybackArchiver.respect_robots_txt,
        hosts: hosts,
        user_agent: WaybackArchiver.user_agent
      }
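      # A limit of -1 means unbounded, so only pass :limit to Spidr when a cap is set.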
      options[:limit] = limit unless limit == -1

      Spidr.site(start_at_url, options) do |spider|
        spider.every_page do |page|
          page_url = page.url.to_s
          urls << page_url
          WaybackArchiver.logger.debug "Found: #{page_url}"
          yield(page_url) if block_given?
        end
      end
      urls
    end
  end
end