DanMeakin/ratebeer

View on GitHub
lib/ratebeer/search.rb

Summary

Maintainability
A
0 mins
Test Coverage
require 'i18n'
require 'thread'
require_relative 'beer'
require_relative 'brewery'
require_relative 'scraping'
require_relative 'urls'

module RateBeer
  # Stop I18N from enforcing locale, to avoid error message
  I18n.enforce_available_locales = false

  # This class provides functionality for searching RateBeer.com for a
  # specific beer or brewery.
  #
  class Search
    # Names of the data items a search exposes.
    #
    # @return [Array<Symbol>] :query, :beers and :breweries, in that order
    #
    def self.data_keys
      %i[query beers breweries]
    end

    include RateBeer::Scraping
    include RateBeer::URLs

    class << self
      # Build a search for +query+ and return what it finds straight away.
      #
      # @param [String] query Term to use to search RateBeer
      # @return [Hash{Symbol => Object}] the new instance's #beers under
      #   :beers and its #breweries under :breweries
      #
      def search(query)
        instance        = new(query)
        found_beers     = instance.beers
        found_breweries = instance.breweries
        { beers: found_beers, breweries: found_breweries }
      end
    end

    # Search term currently in use, as stored by #query= (that is, after
    # fix_query_param has been applied to the value supplied).
    attr_reader :query

    # Create a RateBeer::Search instance.
    #
    # @param [String] query Term to use to search RateBeer
    # @param [Boolean] scrape_beer_brewers When true, each beer found is asked
    #   for its brewery's name while the results are processed
    #
    def initialize(query, scrape_beer_brewers = false)
      self.query = query
      # #process_beer_row reads @scrape_beer_brewers. Previously only
      # @scrape_breweries was assigned, so the option never took effect.
      @scrape_beer_brewers = scrape_beer_brewers
      # Original variable name, still assigned for backward compatibility.
      @scrape_breweries = scrape_beer_brewers
    end

    # Assign a new search term.
    #
    # Any cached results are dropped first, then the term is passed through
    # fix_query_param and the outcome stored in @query.
    #
    # @param [String] qry Raw search term
    #
    def query=(qry)
      clear_cached_data
      normalised = fix_query_param(qry)
      @query = normalised
    end

    # Two searches are equal when their (normalised) queries are equal.
    #
    # Objects that do not expose a public #query (nil, strings, ...) are
    # simply unequal rather than raising NoMethodError.
    #
    # @param [Object] other Object to compare with
    # @return [Boolean]
    #
    def ==(other)
      other.respond_to?(:query) && query == other.query
    end

    # Short description of the search: class name and query, followed by the
    # number of beers and breweries once either result set is present.
    #
    # @return [String]
    #
    def inspect
      counts = ''
      if @beers || @breweries
        beer_total    = @beers ? @beers.count : 0
        brewery_total = @breweries ? @breweries.count : 0
        counts = " - #{beer_total} beers / #{brewery_total} breweries"
      end
      "#<#{self.class} - #{@query}#{counts}>"
    end

    # Search RateBeer for beers and brewers matching the current query.
    #
    # The results page is read as the list of "h2" headings paired, in
    # document order, with the "table" elements on the page. Tables headed
    # "brewers" and "beers" are processed; anything else is ignored.
    #
    # Results are stored in @beers and @breweries. Either is left as nil when
    # the page has no table under the corresponding heading.
    #
    # @return [RateBeer::Search] self, so that calls can be chained
    #
    def run_search
      @beers = nil
      @breweries = nil
      headings = doc.css('h2').map(&:text)
      headings.zip(doc.css('table')).each do |heading, table|
        next if table.nil? # more headings than tables: nothing to parse

        case heading
        when 'brewers' then @breweries = process_breweries_table(table)
        when 'beers'   then @beers = process_beers_table(table)
        end
      end

      # RateBeer is inconsistent with searching for IPAs. If the query contains
      # IPA (or IPAs) as a word, search again with it spelled out as India Pale
      # Ale and add the additional results to these results. The word boundary
      # stops unrelated words that merely start with "ipa" from triggering a
      # second, pointless search.
      ipa_word = / ipa(s?)\b/
      lowered  = query.downcase
      if lowered =~ ipa_word
        alt_query   = lowered.gsub(ipa_word, ' india pale ale\1')
        extra_beers = self.class.new(alt_query).run_search.beers
        @beers = ((@beers || []) + (extra_beers || [])).uniq
      end
      self
    end

    # Make the search callable as #retrieve_details as well.
    # NOTE(review): presumably the name RateBeer::Scraping expects - confirm.
    alias retrieve_details run_search

    private
    
    # Results page for the current query, fetched once and then memoized
    # in @doc.
    #
    # @return [Object] whatever post_request returns for the search URL
    #
    def doc
      return @doc if @doc

      search_uri = URI.join(BASE_URL, SEARCH_URL)
      @doc = post_request(search_uri, post_params)
    end

    # Beers found by the search.
    #
    # When @beers has not been set yet, the search is run and the beers it
    # produced (if any) are sorted by ID. An already-set @beers is returned
    # untouched.
    #
    # @return [Array, nil] beers, or nil when the search produced none
    #
    def scrape_beers
      return @beers if instance_variable_defined?('@beers')

      run_search
      @beers = @beers.sort_by(&:id) if @beers
      @beers
    end

    # Breweries found by the search.
    #
    # When @breweries has not been set yet, the search is run and the
    # breweries it produced (if any) are sorted by ID. An already-set
    # @breweries is returned untouched.
    #
    # @return [Array, nil] breweries, or nil when the search produced none
    #
    def scrape_breweries
      unless instance_variable_defined?('@breweries')
        run_search
        # run_search leaves @breweries nil when the page has no brewers table,
        # so guard the sort exactly as scrape_beers does.
        @breweries = @breweries && @breweries.sort_by(&:id)
      end
      @breweries
    end

    # Build the form parameters sent with the search POST request.
    #
    # @return [Hash{String => String}] the current query under 'beername'
    #
    def post_params
      params = {}
      params['beername'] = @query
      params
    end

    # Process breweries table returned in search.
    #
    # Every row of the table is turned into a brewery. The text of the row's
    # first cell (after fix_characters) is used as the name, and the last path
    # segment of the row's first link, converted with to_i, as the ID.
    #
    # @param [Nokogiri::XML::Element] table An HTML table containing breweries
    #   information
    # @return [Array<Brewery::Brewery>] One brewery per table row
    #
    def process_breweries_table(table)
      table.css('tr').map do |row|
        cell_texts   = row.element_children.map { |cell| fix_characters(cell.text) }
        brewery_name = cell_texts.first
        link_target  = row.at_css('a')['href']
        brewery_id   = link_target.split('/').last.to_i
        Brewery::Brewery.new(brewery_id, name: brewery_name)
      end
    end

    # Process beers table returned in search.
    #
    # The first row in the table contains headings, and is disregarded. Each
    # remaining row is handed to process_beer_row on its own thread, so rows
    # are processed concurrently.
    #
    # Results are returned in table-row order regardless of which thread
    # finishes first. An exception raised while processing a row is re-raised
    # here when that thread's value is collected.
    #
    # @param [Nokogiri::XML::Element] table An HTML table containing beers
    #   information
    # @return [Array] One process_beer_row result per data row, in row order
    #
    def process_beers_table(table)
      rows = table.css('tr').drop(1)
      # Start every worker before collecting any result so rows run in parallel.
      workers = rows.map { |row| Thread.new { process_beer_row(row) } }
      # Thread#value joins the thread and returns its block's result.
      workers.map(&:value)
    end

    # Processes one row from a beer table.
    #
    # The beer's name is the text of the link inside the row's first cell; its
    # ID is the last path segment of the row's first link, converted with
    # to_i. Score and ratings cells are not read, because the resulting beer
    # is built from the ID and name only.
    #
    # @param [Object] row A table row from the beers table
    # @return [Beer::Beer] Beer built from the row's ID and name
    #
    def process_beer_row(row)
      name = row.element_children.first.at_css('a').text
      id   = row.at_css('a')['href'].split('/').last.to_i
      beer = Beer::Beer.new(id, name: name)
      # Ask for the brewery name now when the option is set.
      # NOTE(review): presumably this forces the brewery lookup early - confirm.
      beer.brewery.name if @scrape_beer_brewers
      beer
    end

    # Amend search query string for better results.
    #
    # RateBeer is a little finicky about finding search results. The query is
    # cleaned in three steps, in this order: generic terms are stripped
    # (strip_generic_terms), known problematic terms are replaced
    # (substitute_known_terms), and the result is transliterated with
    # I18n.transliterate.
    #
    # @param [String] query Raw query parameter
    # @return [String] Query parameter amended to improve results
    #
    def fix_query_param(query)
      without_generics = strip_generic_terms(query)
      I18n.transliterate(substitute_known_terms(without_generics))
    end

    # Strip defined generic terms from query.
    #
    # This method removes all generic terms which may refer to a brewery, but
    # which may not appear in the brewery's proper name, e.g. brewers. A term
    # is only removed when it stands alone, delimited by spaces or by the
    # start/end of the text; matching ignores case.
    #
    # The argument itself is never modified, so frozen strings are accepted.
    #
    # @param [String] query Raw query parameter
    # @return [String] Query parameter with generics stripped out
    #
    def strip_generic_terms(query)
      # Regexp fragments. Single quotes keep the backslash, so '\.' matches a
      # literal full stop; in a double-quoted string it would become '.',
      # i.e. "any character", and e.g. "Cow" would be stripped as "Co".
      generic_terms = ['Brew',
                       'Brewers',
                       'Brewery',
                       'Brewing',
                       'Brewhouse',
                       'Company',
                       'Co\.?',
                       'Inc\.?',
                       'Ltd\.?',
                       'Limited']
      # Terms are applied one after another because each match consumes its
      # surrounding spaces; a later pass can then catch a term that directly
      # followed an earlier one.
      stripped = generic_terms.reduce(query) do |text, term|
        text.gsub(/(^| )#{term}( |$)/i, ' ')
      end
      stripped.strip
    end

    # Substitute known problematic terms in query.
    #
    # This method will replace terms which are known to cause problems in the
    # search with different terms which do not cause the same problem:
    # "six°north" (in any letter case) becomes "Six Degrees North", and every
    # "/" or ":" becomes a space.
    #
    # The argument itself is never modified, so frozen strings are accepted.
    #
    # @param [String] query Raw query parameter
    # @return [String] Query parameter with terms substituted
    #
    def substitute_known_terms(query)
      # Problem patterns mapped to their replacements
      problem_terms = { /six°north/i => 'Six Degrees North',
                        %r{[/:]}     => ' ' }
      cleaned = problem_terms.reduce(query) do |text, (term, substitute)|
        text.gsub(term, substitute)
      end
      cleaned.strip
    end

    # Clear cached search data.
    #
    # Removes the stored beers, breweries and the memoized results page, so
    # that the next search fetches a fresh page instead of re-reading the one
    # retrieved for a previous query. Variables that are not set are skipped.
    #
    def clear_cached_data
      %w[@beers @breweries @doc].each do |ivar|
        remove_instance_variable(ivar) if instance_variable_defined?(ivar)
      end
    end
  end
end