amatriain/feedbunch

View on GitHub
FeedBunch-app/lib/tumblr_feed_fetcher.rb

Summary

Maintainability
A
1 hr
Test Coverage
# frozen_string_literal: true

require 'rest_client'

##
# This class fetches Tumblr feeds. It gets around the interstitial GDPR compliance page that, since May 2018,
# keeps Tumblr feeds from being fetched with RestClient, by using a full-featured headless browser to load
# that page and click on its "OK" button to access the actual feed.

class TumblrFeedFetcher

  ##
  # Fetch a Tumblr URL. Attempts to fetch it with a full headless browser, and if the response is an interstitial
  # GDPR compliance page, click on the "OK" button to access the actual feed or website.
  #
  # Receives as argument the url to fetch.
  #
  # Returns the page source string of the loaded page, extended with a #headers method (returning an empty array)
  # so that it can be used where a RestClient::Response is expected. The content may be the feed XML or an HTML
  # document that will be handled by other methods.
  #
  # Raises RestClient::ServiceUnavailable if the browser times out waiting to leave the interstitial page.
  # Any error raised while starting the browser is propagated unchanged.

  def self.fetch_feed(url)
    Rails.logger.info "URL #{url} belongs to a Tumblr domain possibly with a GDPR interstitial page, using a full browser to fetch it"

    browser = start_browser
    browser.get url
    accept_gdpr_interstitial browser, url

    feed_response = browser.page_source
    # some methods necessary later, to emulate a RestClient response
    feed_response.define_singleton_method(:headers) { [] }

    feed_response
  rescue Selenium::WebDriver::Error::TimeOutError
    Rails.logger.info "Cannot access Tumblr URL #{url} even using a full browser to get around the interstitial GDPR page"
    # if after all the full browser cannot get the feed, raise a RestClient error
    raise RestClient::ServiceUnavailable
  ensure
    # Close the browser (if running locally) or the remote webdriver session (if using a remote browser).
    # The browser is nil if it could not be started; safe navigation keeps this cleanup from raising a
    # NoMethodError that would hide the original startup error.
    browser&.quit
  end

  ##
  # Start the Selenium-driven Chrome browser: a remote webdriver session if the application is configured with
  # headless_browser_location == 'remote', a local headless Chrome otherwise.
  #
  # Returns the Selenium driver instance.

  def self.start_browser
    config = Feedbunch::Application.config

    if config.headless_browser_location == 'remote'
      Rails.logger.info "Using a remote browser in host #{config.headless_browser_host}, port #{config.headless_browser_port}"
      uri = Addressable::URI.new
      uri.host = config.headless_browser_host
      uri.scheme = 'http'
      uri.port = config.headless_browser_port
      uri.path = '/wd/hub'
      Selenium::WebDriver.for :remote, url: uri.to_s, desired_capabilities: :chrome
    else
      Rails.logger.info "Using a local browser"
      opts = Selenium::WebDriver::Chrome::Options.new
      opts.add_argument '--headless'
      Selenium::WebDriver.for :chrome, options: opts
    end
  end
  private_class_method :start_browser

  ##
  # If the page currently loaded in the browser is the GDPR interstitial page, click on its "OK" button and wait
  # (up to 20 seconds) until the browser navigates away from it. If the expected interstitial elements are not
  # found, do nothing besides logging it.
  #
  # Receives as arguments the browser driver and the url being fetched (used only for logging).
  #
  # A timeout error raised by the wait is not rescued here; it propagates to the caller.

  def self.accept_gdpr_interstitial(browser, url)
    # Look for the div that tells us this is the GDPR interstitial page. If it isn't present,
    # #find_element raises a NoSuchElementError and we don't have to do anything more with the response
    browser.find_element :xpath, '//div[@data-view="guce-gdpr"]//div[@class="guce-consent-form"]'

    # Click on the OK button to access the actual content
    interstitial_url = browser.current_url
    browser.find_element(:xpath, '//div[@class="final-btn-consent"]//button[@class="btn yes"]').click

    # Wait until actual content is loaded (the URL changes), or a timeout happens
    wait = Selenium::WebDriver::Wait.new timeout: 20
    wait.until { browser.current_url != interstitial_url }
  rescue Selenium::WebDriver::Error::NoSuchElementError
    # Tumblr didn't send us the GDPR interstitial page, return whatever they've sent us
    Rails.logger.info "Tumblr URL #{url} has returned the actual content, not the GDPR interstitial page"
  end
  private_class_method :accept_gdpr_interstitial
end