WikiEducationFoundation/WikiEduDashboard

View on GitHub
lib/commons.rb

Summary

Maintainability
A
1 hr
Test Coverage
# frozen_string_literal: true

require 'json'
require_dependency "#{Rails.root}/lib/wiki_api"

#= This class is for getting data directly from the Wikimedia Commons API.
class Commons
  # query          - Hash of MediaWiki API query parameters. It is mutated in
  #                  place: continuation values are written into it between
  #                  requests.
  # update_service - optional object passed through unchanged to WikiApi.new.
  def initialize(query, update_service = nil)
    @query = query
    @update_service = update_service
  end

  ###################
  # Request methods #
  ###################

  # Get user contribution data that corresponds to new file uploads.
  # Returns an Array with the 'usercontribs' entries of every response that
  # was fetched (empty when the first request fails).
  def self.get_uploads(users, start_date: nil, end_date: nil, update_service: nil)
    upload_query = build_upload_query(users, start_date, end_date)
    new(upload_query, update_service).fetch_all_uploads
  end

  # Get data about how files are being used across Wikimedia sites.
  # Returns an Array of page entries that have a non-blank 'globalusage' value.
  def self.get_usages(commons_uploads, update_service: nil)
    usage_query = build_usage_query(commons_uploads)
    new(usage_query, update_service).get_image_data('globalusage', 'gucontinue')
  end

  # Returns the members of commons_uploads whose id matches the 'pageid' of a
  # page entry that carries a 'missing' key in the API response.
  def self.find_missing_files(commons_uploads)
    missing_query = build_info_query(commons_uploads)
    # The empty continue param means no continuation value is ever found in
    # the response, so at most one request is made.
    pages = new(missing_query).get_image_data('pageid', '')
    missing_page_ids = pages.select { |page| page['missing'] }
                            .map { |page| page['pageid'] }
    commons_uploads.select { |file| missing_page_ids.include?(file.id) }
  end

  # Get thumbnail/file URL data for the given uploads.
  # Returns an Array of page entries that have a non-blank 'imageinfo' value.
  def self.get_urls(commons_uploads, update_service: nil)
    url_query = build_url_query(commons_uploads)
    new(url_query, update_service).get_image_data('imageinfo', 'iicontinue')
  end

  ##################
  # Query builders #
  ##################

  # Builds the usercontribs query for the given users (objects responding to
  # #username). start_date and end_date are optional and must respond to
  # #strftime when given.
  def self.build_upload_query(users, start_date, end_date)
    upload_query = { list: 'usercontribs',
                     ucuser: users.map(&:username),
                     ucnamespace: 6, # File: namespace
                     ucshow: 'new', # New pages ~= new uploads
                     uclimit: 500, # 500 is max for non-bots
                     continue: '' }
    # The Mediawiki API starts from the 'ucstart' and works backwards to 'ucend'
    # so we put the start_date for ucend and vice versa.
    upload_query[:ucend] = start_date.strftime('%Y%m%d%H%M%S') if start_date
    upload_query[:ucstart] = end_date.strftime('%Y%m%d%H%M%S') if end_date
    upload_query
  end

  # Builds the globalusage query for the ids of the given uploads.
  def self.build_usage_query(commons_uploads)
    { prop: 'globalusage',
      pageids: commons_uploads.map(&:id),
      gulimit: 500, # 500 is max for non-bots
      gufilterlocal: 'true', # Don't return local Commons usage
      guprop: 'namespace', # Fetch NS for each usage
      continue: '' }
  end

  # Builds a bare page query (no prop) for the ids of the given uploads.
  def self.build_info_query(commons_uploads)
    { pageids: commons_uploads.map(&:id),
      continue: '' }
  end

  # Builds the imageinfo query for the ids of the given uploads.
  def self.build_url_query(commons_uploads)
    { prop: 'imageinfo',
      iiprop: 'url',
      iiurlheight: 480,
      pageids: commons_uploads.map(&:id),
      iilimit: 50, # 50 is max when iiurlheight is used.
      continue: '' }
  end

  ##########################
  # Instance query methods #
  ##########################

  # Runs the query repeatedly, following 'uccontinue', and returns every
  # 'usercontribs' entry collected. Whatever was collected so far is returned
  # if a request fails.
  def fetch_all_uploads
    @uploads = []
    loop do
      response = api_get
      break unless response # fall back gracefully if the query fails

      # A response without a 'usercontribs' key contributes nothing instead of
      # raising a TypeError.
      @uploads += response.data['usercontribs'] || []
      @continue = response['continue'] # nil if there is no continue
      break unless @continue

      @query['uccontinue'] = @continue['uccontinue']
    end
    @uploads
  end

  # Runs the query repeatedly, following continue_param, and returns every
  # page entry whose prop value is non-blank.
  #
  # prop           - key that a page entry must have a non-blank value for.
  # continue_param - name of the continuation key to copy from the response's
  #                  'continue' data into the next query.
  def get_image_data(prop, continue_param)
    @continue_param = continue_param
    @prop = prop
    @image_data = []

    @continue = true # non-nil sentinel so that the first request is made
    until @continue.nil?
      response = api_get
      return @image_data if response.blank?

      parse_image_data_and_update_continue(response)
    end
    @image_data
  end

  # Appends the matching page entries of one response to @image_data and
  # prepares @query / @continue for the next request.
  def parse_image_data_and_update_continue(response)
    # A response without a 'pages' key (for example when no page ids were
    # queried) is treated as having no pages rather than raising.
    results = response.data['pages'] || []
    # Account for the different format returned when only a single, missing
    # page is queried, which looks like: [{"pageid"=>0, "missing"=>""}]
    results = results.values unless results.is_a?(Array)
    @image_data.concat(results.select { |page| page[@prop].present? })

    @continue = response['continue'] # nil if there is no continue
    return if @continue.nil?

    next_value = @continue[@continue_param]
    if @query[@continue_param] == next_value
      # Workaround for MediaWiki bug where continue runs the same query infinitely
      # https://phabricator.wikimedia.org/T101532
      @continue = nil
    else
      @query[@continue_param] = next_value
    end
  end

  ###################
  # Private methods #
  ###################
  private

  # Performs one API request with the current state of @query.
  def api_get
    WikiApi.new(CommonsWiki.new, @update_service).query(@query)
  end
end