lib/replica.rb
# frozen_string_literal: true
require_dependency "#{Rails.root}/lib/revision_data_parser"
require_dependency "#{Rails.root}/lib/errors/api_error_handling"
#= Fetches wiki revision data from an endpoint that provides SQL query
#= results from a replica wiki database on Wikimedia Cloud (formerly wmflabs):
#= https://replica-revision-tools.wmcloud.org/
#= For what's going on at the other end, see:
#= https://github.com/WikiEducationFoundation/WikiEduDashboardTools
class Replica
include ApiErrorHandling
def initialize(wiki, update_service = nil)
@wiki = wiki
@update_service = update_service
end
# This is the maximum number of concurrent queries the system should run
# against the wmflabs replica endpoint.
CONCURRENCY_LIMIT = 2
################
# Entry points #
################
# Given a list of users and a start and end date, return a nicely formatted
# hash of page_ids and revisions made by those users between those dates.
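# The returned hash is keyed by mw_page_id; its exact shape depends on
# RevisionDataParser, but it looks roughly like:
#   { 44962463 => { 'article'   => { 'mw_page_id' => 44962463, ... },
#                   'revisions' => [{ ...revision data... }, { ...revision data... }] } }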
def get_revisions(users, rev_start, rev_end)
raw = get_revisions_raw(users, rev_start, rev_end)
@data = {}
return @data unless raw.is_a?(Enumerable)
raw.each { |revision| extract_revision_data(revision) }
@data
end
# Given a list of users and a start and end date, get data about revisions
# those users made between those dates.
# As of 2015-02-24, revisions.php only queries namespaces:
# 0 ([mainspace])
# 2 (User:)
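# The assembled query fragment looks something like (to_query URL-encodes the
# array-style params):
#   "usernames%5B%5D=Ragesoss&start=20140101003430&end=20171231003430"
# with "&oauth_tags%5B%5D=..." inserted after the usernames when
# ENV['oauth_ids'] is set.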
def get_revisions_raw(users, rev_start, rev_end)
user_list = compile_usernames_query(users)
oauth_tags = compile_oauth_tags
oauth_tags = oauth_tags.blank? ? oauth_tags : "&#{oauth_tags}"
query = user_list + oauth_tags + "&start=#{rev_start}&end=#{rev_end}"
api_get('revisions.php', query)
end
# Given a list of articles *or* hashes of the form { 'mw_page_id' => 1234 },
# see which ones have not been deleted.
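# For example, get_existing_articles_by_id([{ 'mw_page_id' => 100 }, { 'mw_page_id' => 300 }])
# queries articles.php with "article_ids[]=100&article_ids[]=300" (URL-encoded).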
def get_existing_articles_by_id(articles)
article_list = compile_article_ids_query(articles)
api_get('articles.php', article_list)
end
# Given a list of articles *or* hashes of the form { 'title' => 'Something' },
# see which ones have not been deleted.
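# For example, posting [{ 'title' => 'Foo' }, { 'title' => 'Bar' }] sends a form body
# along the lines of "post_article_titles[]=Foo&post_article_titles[]=Bar", alongside
# the db/lang/project params from project_database_params_post.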
def post_existing_articles_by_title(articles)
article_list = articles.map { |article| article['title'] }
api_post('articles.php', 'post_article_titles[]', article_list)
end
# Given a list of revisions, see which ones have not been deleted
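# For example, revisions that respond to #mw_rev_id with 641297913 and 641298113
# produce the query "revision_ids[]=641297913&revision_ids[]=641298113" (URL-encoded).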
def get_existing_revisions_by_id(revisions)
revision_list = compile_revision_ids_query(revisions)
api_get('revisions.php', revision_list)
end
###################
# Private methods #
###################
private
###################
# Parsing methods #
###################
def extract_revision_data(revision)
parsed = RevisionDataParser.new(@wiki).parse_revision(revision)
page_id = parsed['article']['mw_page_id']
unless @data.include?(page_id)
@data[page_id] = {}
@data[page_id]['article'] = parsed['article']
@data[page_id]['revisions'] = []
end
@data[page_id]['revisions'].push parsed['revision']
end
###############
# API methods #
###############
# Given an endpoint ('articles.php' or 'revisions.php') and a
# query appropriate to that endpoint, return the parsed json response.
#
# Example revisions.php query:
# https://replica-revision-tools.wmcloud.org/revisions.php?lang=en&project=wikipedia&usernames[]=Ragesoss&start=20140101003430&end=20171231003430
#
# Example revisions.php parsed response:
# [{"page_id"=>"44962463",
# "page_title"=>"Swarfe/ENGL-122-2014",
# "page_namespace"=>"2",
# "rev_id"=>"641297913",
# "rev_timestamp"=>"20150106205746",
# "rev_user_text"=>"Sage (Wiki Ed)",
# "rev_user"=>"21515199",
# "new_article"=>"false",
# "byte_change"=>"38"},
# {"page_id"=>"44962463",
# "page_title"=>"Swarfe/ENGL-122-2014",
# "page_namespace"=>"2",
# "rev_id"=>"641298113",
# "rev_timestamp"=>"20150106205902",
# "rev_user_text"=>"Sage (Wiki Ed)",
# "rev_user"=>"21515199",
# "new_article"=>"false",
# "byte_change"=>"-50"
# }]
def api_get(endpoint, query = '')
tries ||= 3
response = do_query(endpoint, query)
raise unless response.code == '200'
response_body = response.body
parsed = Oj.load(response_body)
raise unless parsed['success']
parsed['data']
rescue StandardError => e
tries -= 1
# Wait briefly before retrying errors we expect to be transient (see TYPICAL_ERRORS),
# then give up and log the error once the tries are exhausted.
sleep 2 if TYPICAL_ERRORS.include?(e.class)
retry unless tries.zero?
log_error(e, update_service: @update_service,
sentry_extra: { endpoint:, query:,
language: @wiki.language, project: @wiki.project })
end
# rubocop:disable Metrics/CyclomaticComplexity
# rubocop:disable Metrics/PerceivedComplexity
def api_post(endpoint, key, data)
tries ||= 3
response = do_post(endpoint, key, data)
raise unless response.code == '200'
return if response.body.empty?
parsed = Oj.load(response.body)
raise unless parsed['success']
parsed['data']
rescue StandardError => e
tries -= 1
sleep 2 if TYPICAL_ERRORS.include?(e.class)
retry unless tries.zero?
log_error(e, update_service: @update_service,
sentry_extra: { query: data,
response_body: response&.body,
language: @wiki.language,
project: @wiki.project })
end
# rubocop:enable Metrics/CyclomaticComplexity
# rubocop:enable Metrics/PerceivedComplexity
def do_query(endpoint, query)
url = compile_query_url(endpoint, query)
Net::HTTP.get_response(URI.parse(url))
end
REPLICA_TOOL_URL = 'https://replica-revision-tools.wmcloud.org/'
def do_post(endpoint, key, data)
url = "#{REPLICA_TOOL_URL}#{endpoint}"
database_params = project_database_params_post
Net::HTTP.post_form(URI.parse(url),
'db' => database_params['db'],
'lang' => database_params['lang'],
'project' => database_params['project'],
key => data)
end
# Query URL for the WikiEduDashboardTools repository
def compile_query_url(endpoint, query)
"#{REPLICA_TOOL_URL}#{endpoint}?#{project_database_params}&#{query}"
end
###############################
# Replica database parameters #
###############################
# The dashboard replica endpoint tool connects to a replica database for whichever
# wiki is being queried. Normally the database name can be derived directly
# from the language code and project (this happens at the Toolforge end),
# but some wikis don't follow the standard naming convention.
#
# Hyphens in language codes are usually replaced with underscores in the
# database name.
#
# See https://quarry.wmflabs.org/query/4031 to look up database name for a wiki
# Or check the table here: https://db-names.toolforge.org/
#
# Here are some of the naming exceptions that we must special-case:
SPECIAL_DB_NAMES = { 'www.wikidata.org' => 'wikidatawiki',
'wikisource.org' => 'sourceswiki',
'incubator.wikimedia.org' => 'incubatorwiki',
'commons.wikimedia.org' => 'commonswiki',
'meta.wikimedia.org' => 'metawiki' }.freeze
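# For example, a wiki with language 'zh-yue' and project 'wikipedia' yields
# "lang=zh_yue&project=wikipedia", while www.wikidata.org yields "db=wikidatawiki".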
def project_database_params
# Use special-case db param if available.
return "db=#{SPECIAL_DB_NAMES[@wiki.domain]}" if SPECIAL_DB_NAMES[@wiki.domain]
# Otherwise, uses the language and project, and replica API infers the standard db name.
"lang=#{@wiki.language.tr('-', '_')}&project=#{@wiki.project}"
end
def project_database_params_post
db = SPECIAL_DB_NAMES[@wiki.domain] || ''
{ 'db' => db, 'lang' => @wiki.language&.tr('-', '_'), 'project' => @wiki.project }
end
def compile_usernames_query(users)
{ usernames: users.map(&:username) }.to_query
end
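# ENV['oauth_ids'] is expected to be a comma-separated list of OAuth consumer IDs;
# for example, '1234,5678' yields roughly (once URL-encoded by to_query):
#   "oauth_tags%5B%5D=OAuth+CID%3A+1234&oauth_tags%5B%5D=OAuth+CID%3A+5678"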
def compile_oauth_tags
oauth_ids = ENV['oauth_ids']
return '' if oauth_ids.nil?
oauth_id_tags = oauth_ids.split(',').map { |id| "OAuth CID: #{id}" }
{ oauth_tags: oauth_id_tags }.to_query
end
# Compile an article list to send to the replica endpoint, which might look
# something like this:
# "article_ids[]=100&article_ids[]=300"
def compile_article_ids_query(articles)
article_page_ids = articles.map { |article| article['mw_page_id'] }
{ article_ids: article_page_ids }.to_query
end
def compile_revision_ids_query(revisions)
{ revision_ids: revisions.map(&:mw_rev_id) }.to_query
end
# These are typical transient errors (network failures and truncated or invalid
# responses) that we expect to encounter; the API methods above wait briefly
# before retrying them.
TYPICAL_ERRORS = [Errno::ETIMEDOUT, Net::ReadTimeout, Errno::ECONNREFUSED,
Oj::ParseError].freeze
end