gilest/briefly

View on GitHub
lib/scrapers/article_scraper.rb

Summary

Maintainability
A
55 mins
Test Coverage
module Scrapers
  # Pulls a title, a summary and a lead image URL out of the HTML page at a URL.
  #
  # The page is fetched and parsed once, in the constructor. Subclasses can
  # override #exclude_images_matching to reject images they do not want.
  class ArticleScraper

    require 'nokogiri'
    require 'open-uri'

    # Starting (exclusive) lower bound on an image's declared width attribute.
    IDEAL_MINIMUM_WIDTH = 661
    # How much the width bound is relaxed on every retry.
    WIDTH_STEP = 20
    # Matches references that already start with an http(s) scheme.
    ABSOLUTE_URL = %r{\Ahttps?://}i.freeze

    # The parsed page; kept on the instance so every accessor reuses one parse.
    attr_reader :document

    # @param url [String] address of the page to scrape
    def initialize(url)
      @url = url
      # Kernel#open stopped opening URLs in Ruby 3.0, so go through URI.open.
      # The block form closes the downloaded stream once it has been parsed.
      @document = URI.open(@url) { |io| Nokogiri::HTML(io) }
    end

    # the first h1 and h2 tags are reasonable assumptions for the title and summary respectively

    # @return [String] stripped text of the first h1, or '' when the page has none
    def title
      first_text('h1')
    end

    # @return [String] stripped text of the first h2, or '' when the page has none
    def summary
      first_text('h2')
    end

    # Chooses the lead image for the page.
    #
    # Starts by looking for images whose width attribute exceeds
    # IDEAL_MINIMUM_WIDTH and relaxes that bound by WIDTH_STEP until it
    # reaches 1. At each bound the first image (in the order returned by
    # #images_wider_than) that has a src and is not excluded wins. Images
    # whose width attribute is missing or is 1 or less are never chosen.
    #
    # @return [String] fully qualified image URL, or '' when nothing qualifies
    def remote_image_url
      IDEAL_MINIMUM_WIDTH.step(1, -WIDTH_STEP) do |min_width|
        images_wider_than(min_width).each do |img|
          src = img.attr('src').to_s.strip
          # An <img> without a src has no URL to offer.
          next if src.empty?

          candidate = ensure_fully_qualified(src)
          # Keep scanning past excluded images so one cannot hide a later valid one.
          return candidate unless exclude_image_url?(candidate)
        end
      end

      ''
    end

    # @param image_url [String, nil] URL to check
    # @return [Boolean] true when the URL matches any pattern from #exclude_images_matching
    def exclude_image_url?(image_url)
      exclude_images_matching.any? { |pattern| image_url.to_s.match?(pattern) }
    end

    # Patterns for image URLs that must never be picked. Empty here.
    #
    # @return [Array<Regexp>]
    def exclude_images_matching
      []
    end

    # Turns an image reference into an absolute URL.
    #
    # References that already start with http:// or https:// are returned as
    # they are (minus surrounding whitespace). Everything else, including
    # root-relative, path-relative and protocol-relative ("//host/...")
    # references, is resolved against the page URL.
    #
    # @param url [String, nil] value of an src attribute
    # @return [String] absolute URL; the site root when url is blank
    def ensure_fully_qualified(url)
      reference = url.to_s.strip
      return base_url if reference.empty?
      return reference if reference.match?(ABSOLUTE_URL)

      URI.join(@url, reference).to_s
    rescue URI::Error
      # The reference (or the page URL) is not a parsable URI: fall back to
      # plain concatenation instead of failing the whole scrape.
      "#{base_url}#{reference}"
    end

    # @return [String] scheme and host of the page URL, plus the port when it
    #   is not the default one for the scheme
    def base_url
      uri = URI(@url)
      origin = "#{uri.scheme}://#{uri.host}"
      uri.port && uri.port != uri.default_port ? "#{origin}:#{uri.port}" : origin
    end

    # @return [Hash] the scraped fields under :title, :summary,
    #   :remote_image_url and :link (the URL given to the constructor)
    def scrape
      {
        title: title,
        summary: summary,
        remote_image_url: remote_image_url,
        link: @url
      }
    end

    # @param width [Integer] exclusive lower bound
    # @return [Array] img nodes whose width attribute, read with to_i
    #   (so a missing attribute counts as 0), is greater than width
    def images_wider_than(width)
      document.css('img').select { |img| img[:width].to_i > width }
    end

    private

    # Stripped text of the first node matching selector, or '' when none matches.
    def first_text(selector)
      node = document.css(selector).first
      node ? node.text.strip : ''
    end

  end
end