lib/hijack/page_loader.rb

Summary

Maintainability
A
1 hr
Test Coverage
module Hijack

  #
  # Crawls a site depth-first starting from a root link and keeps every Page
  # it creates in +pages+, in the order they were created.
  #
  # Pages are built with <tt>Page.new(link, base_path)</tt>. The loader only
  # relies on a page responding to +uri+, +links+ and +linked_from+.
  #
  class PageLoader
    attr_reader :base_path, :root, :pages

    #
    # +r+ is the link the crawl starts from, +b+ is the base path handed to
    # every Page the loader creates.
    #
    def initialize(r = 'index.html', b = 'http://www.example.com')
      @base_path = b
      @root = r
      @pages = []
    end

    #
    # <tt>suck(limit = nil)</tt>
    #
    # sucks up an entire website, or up to +limit+ pages.
    #
    # The root page is always created and counts towards +limit+. The return
    # value is not meaningful; read the result from +pages+.
    #
    def suck(limit = nil)
      pages << Page.new(root, base_path)
      inner_suck(pages.first, limit)
    end

    #
    # The +uri+ of every page loaded so far, in load order.
    #
    def visited_links
      pages.map(&:uri)
    end

    #
    # Returns the first loaded page whose +uri+ equals +l+, or +nil+ when no
    # such page has been loaded.
    #
    def find_page(l)
      pages.find { |page| page.uri == l }
    end

  private

    #
    # Follows the links of +p+ depth-first. A link that resolves to a page
    # already loaded only gets +p+ appended to that page's +linked_from+;
    # any other link is loaded, recorded in +pages+ and crawled in turn.
    #
    # +limit+ is the cap on the total number of pages (+nil+ for no cap). It
    # is handed down unchanged because <tt>enough?</tt> compares it against
    # the total page count, not against a per-branch count.
    #
    def inner_suck(p, limit)
      return if enough?(limit)

      p.links.each do |l|
        begin
          if (known = find_page(l))
            known.linked_from << p
          else
            # A deeper branch may have used up the budget since the last
            # check, so test again before creating one more page.
            break if enough?(limit)

            fresh = Page.new(l, base_path)
            fresh.linked_from << p
            pages << fresh
            inner_suck(fresh, limit)
          end
        rescue OpenURI::HTTPError, URI::InvalidURIError => e
          # Log the failing link and carry on with the remaining links.
          Hijack::Log.warn("#{l}: #{e.message}")
        end
      end
    end

    #
    # Joins +l+ onto +base_path+ with a single '/' separator.
    #
    def full_uri(l)
      [base_path, l].join('/')
    end

    #
    # Truthy once a +limit+ is set and at least that many pages are loaded.
    #
    def enough?(limit)
      limit && pages.size >= limit
    end

  end

end