WikiEducationFoundation/WikiEduDashboard

View on GitHub
lib/importers/revision_importer.rb

Summary

Maintainability
A
0 mins
Test Coverage
# frozen_string_literal: true

require_dependency "#{Rails.root}/lib/replica"
require_dependency "#{Rails.root}/lib/duplicate_article_deleter"
require_dependency "#{Rails.root}/lib/importers/article_importer"
require_dependency "#{Rails.root}/app/helpers/encoding_helper"

#= Imports and updates revisions from Wikipedia into the dashboard database
#
# Revision data is fetched through Replica for the students of one course on
# one wiki, then bulk-imported as Article and Revision records.
class RevisionImporter
  include EncodingHelper

  # Maximum number of users sent to Replica in a single query. We limit the
  # number of usernames per query in order to avoid hitting the memory limit
  # of the Replica endpoint.
  MAX_USERNAMES = 10

  # Number of fetched entries that are imported per batch.
  IMPORT_SLICE_SIZE = 10_000

  # wiki           - wiki whose revisions are imported; its id is used to scope
  #                  Article lookups and it is handed to Replica.
  # course         - course whose students' revisions are imported.
  # update_service - optional object that is passed through to Replica.
  def initialize(wiki, course, update_service: nil)
    @wiki = wiki
    @course = course
    @update_service = update_service
  end

  # Fetches and imports revisions for the course's students.
  #
  # all_time - when truthy, revisions are fetched for every student starting
  #            from the course start date; otherwise only revisions that are
  #            expected to be missing are fetched (see new_revisions_for_course).
  def import_revisions_for_course(all_time:)
    data = all_time ? all_revisions_for_course : new_revisions_for_course
    import_revisions(data)
  end

  ###########
  # Helpers #
  ###########
  private

  # Revisions by all students from the course start up to the end of the
  # update period.
  def all_revisions_for_course
    get_revisions(@course.students, course_start_date, end_of_update_period)
  end

  # Revisions that are expected not to be in the database yet.
  def new_revisions_for_course
    results = []

    # Users with no revisions are considered "new". For them, we search for
    # revisions starting from the beginning of the course, in case they were
    # just added to the course.
    @new_users = users_with_no_revisions
    results += revisions_from_new_users unless @new_users.empty?

    # For users who already have revisions during the course, we assume that
    # previous updates imported their revisions prior to the latest revisions.
    # We only need to import revisions made since the date of the latest
    # revision already recorded for the course on this wiki.
    @old_users = @course.students - @new_users
    results += revisions_from_old_users unless @old_users.empty?
    results
  end

  # Revisions by the "new" users, searched from the course start date.
  def revisions_from_new_users
    get_revisions(@new_users, course_start_date, end_of_update_period)
  end

  # Revisions by the remaining users, searched from the date of the latest
  # known course revision on this wiki. Falls back to the course start date
  # when no such revision exists.
  def revisions_from_old_users
    latest_rev = latest_revision_of_course
    start = if latest_rev.blank?
              course_start_date
            else
              latest_rev.date.strftime('%Y%m%d%H%M%S')
            end
    get_revisions(@old_users, start, end_of_update_period)
  end

  # Use revision data fetched from Replica to add new Revisions as well as
  # new Articles where appropriate. The data is processed in slices so that
  # each bulk import stays bounded in size.
  def import_revisions(data)
    data.each_slice(IMPORT_SLICE_SIZE) do |sub_data|
      import_revisions_slice(sub_data)
    end
  end

  # Get revisions made by a set of users between two dates. Users are queried
  # in groups of at most MAX_USERNAMES.
  def get_revisions(users, start, end_date)
    Utils.chunk_requests(users, MAX_USERNAMES) do |block|
      Replica.new(@wiki, @update_service).get_revisions(block, start, end_date)
    end
  end

  # Course start formatted as YYYYMMDD.
  def course_start_date
    @course.start.strftime('%Y%m%d')
  end

  # pull all revisions until present, so that we have any after-the-end revisions
  # included for calculating retention when a past course gets updated.
  # The returned date (YYYYMMDD) lies two days in the future.
  def end_of_update_period
    2.days.from_now.strftime('%Y%m%d')
  end

  # Students whose courses_users record has a revision_count of zero.
  def users_with_no_revisions
    @course.users.role('student')
           .joins(:courses_users)
           .where(courses_users: { revision_count: 0 })
  end

  # Most recent (by date) of the course's recent revisions on this wiki,
  # or nil when there is none.
  def latest_revision_of_course
    @course.recent_revisions.where(wiki_id: @wiki.id).order('date DESC').first
  end

  # Imports the articles and revisions contained in one slice of fetched data.
  def import_revisions_slice(sub_data)
    # Extract all article data from the slice as an array of attribute hashes.
    article_attributes = sub_data_to_article_attributes(sub_data)

    # We rely on the unique index here, mw_page_id and wiki_id
    Article.import article_attributes, on_duplicate_key_update: [:title, :namespace]
    page_ids = article_attributes.map { |attrs| attrs['mw_page_id'] }
    articles = Article.where(wiki_id: @wiki.id, mw_page_id: page_ids)

    # Prep: get a user dictionary for all users referred to by revisions.
    user_ids = user_dict_from_sub_data(sub_data)

    # Now build the revisions. They need a lookup from mw_page_id to the
    # database id of the corresponding Article.
    article_ids = articles.each_with_object({}) do |article, dict|
      dict[article.mw_page_id] = article.id
    end
    revisions = sub_data_to_revision_attributes(sub_data, user_ids, article_ids)
    Revision.import revisions, on_duplicate_key_ignore: true

    DuplicateArticleDeleter.new(@wiki).resolve_duplicates(articles)
  end

  # Converts the strings 'true' / 'false' to booleans. Values that are already
  # booleans are passed through, so that an already-decoded flag is not turned
  # into nil. Anything else yields nil.
  def string_to_boolean(string)
    case string
    when 'true', true then true
    when 'false', false then false
    end
  end

  # Builds one Article attribute hash per entry of the slice. Each entry is
  # destructured as a pair whose second element holds the data under 'article'.
  def sub_data_to_article_attributes(sub_data)
    sub_data.map do |_a_id, article_data|
      article = article_data['article']
      {
        'mw_page_id' => article['mw_page_id'],
        'wiki_id' => @wiki.id,
        'title' => sanitize_4_byte_string(article['title']),
        'namespace' => article['namespace']
      }
    end
  end

  # Maps the usernames that occur in the slice's revisions to User ids.
  # Usernames without a matching User record are absent from the result.
  # Returns e.g. {"Nalumc"=>4, "Twkpassmore"=>3}
  def user_dict_from_sub_data(sub_data)
    usernames = sub_data.flat_map { |_a_id, article_data| article_data['revisions'] }
                        .map { |rev_data| rev_data['username'] }
                        .uniq
    User.where(username: usernames).pluck(:username, :id).to_h
  end

  # Builds one Revision attribute hash per revision in the slice.
  #
  # users    - Hash of username => user id.
  # articles - Hash of mw_page_id (Integer) => article id.
  #
  # Lookups that miss leave user_id / article_id as nil.
  def sub_data_to_revision_attributes(sub_data, users, articles)
    sub_data.flat_map do |_a_id, article_data|
      article_data['revisions'].map do |rev_data|
        # The articles lookup is keyed by integer page ids.
        mw_page_id = rev_data['mw_page_id'].to_i
        {
          mw_rev_id: rev_data['mw_rev_id'],
          date: rev_data['date'],
          characters: rev_data['characters'],
          article_id: articles[mw_page_id],
          mw_page_id:,
          user_id: users[rev_data['username']],
          new_article: string_to_boolean(rev_data['new_article']),
          system: string_to_boolean(rev_data['system']),
          wiki_id: rev_data['wiki_id']
        }
      end
    end
  end
end