UNC-Libraries/hy-c

View on GitHub
app/services/tasks/solr_migration_service.rb

Summary

Maintainability
A
0 mins
Test Coverage
A
97%
require 'date'
require 'time'
require 'ruby-progressbar'

module Tasks
  require 'tasks/migrate/services/progress_tracker'
  require 'tasks/migration_helper'
  # Service for reindexing objects from one solr instance to another
  class SolrMigrationService
    # Number of ids requested from solr per page
    PAGE_SIZE = 1000
    ADMIN_TYPES = 'has_model_ssim:AdminSet'.freeze
    AF_TYPES = 'has_model_ssim:ActiveFedora*'.freeze
    HYDRA_TYPES = 'has_model_ssim:Hydra*'.freeze
    ALL_OTHER_TYPES = '-has_model_ssim:AdminSet AND -has_model_ssim:ActiveFedora* AND -has_model_ssim:Hydra*'.freeze
    # Queries are issued in this order, which determines the order of ids in the list file
    BASE_QUERIES = [AF_TYPES, HYDRA_TYPES, ADMIN_TYPES, ALL_OTHER_TYPES].freeze

    # List all object ids in the repository, ordered by object type.
    #
    # @param output_path [String] directory in which the id list file is created
    # @param after_timestamp [String, nil] optional ISO8601 timestamp (underscores may be used in place
    #   of colons, as in list filenames); when given, only objects modified at or after it are listed
    # @return [String] path to the file containing the list of ids. Its name contains the timestamp
    #   when the command was issued.
    # @raise [ArgumentError] if after_timestamp is not a valid ISO8601 timestamp
    def list_object_ids(output_path, after_timestamp = nil)
      # Validate up front so that an invalid timestamp does not leave an empty list file behind
      normalized_timestamp = normalize_timestamp(after_timestamp)
      # Capture time in UTC, as snapshot of starting point for ids
      # (back dated a minute to ensure no changes get lost between now and when the query executes)
      start_time = (Time.now - 60).utc.iso8601.tr(':', '_')
      # Start list file for IDs, in file named with starting point timestamp
      filename = "id_list_#{start_time}.txt"
      file_path = File.join(output_path, filename)
      File.open(file_path, 'w') do |file|
        BASE_QUERIES.each do |base_query|
          record_paged_type_query(file, base_query, normalized_timestamp)
        end
      end
      file_path
    end

    # Page through the solr results for one base query, writing each matching id to the file, one per line.
    #
    # @param file [IO] open, writable destination for the ids
    # @param base_query [String] solr query selecting the objects to list
    # @param after_timestamp [String, nil] optional ISO8601 timestamp restricting results by modification time
    # @raise [ArgumentError] if after_timestamp is not a valid ISO8601 timestamp
    def record_paged_type_query(file, base_query, after_timestamp)
      # Works on a copy, so the caller's string is never modified (and may be frozen)
      timestamp = normalize_timestamp(after_timestamp)
      query = base_query
      query = "#{query} AND system_modified_dtsi:[#{timestamp} TO *]" unless timestamp.nil?

      start_row = 0
      loop do
        docs = ActiveFedora::SolrService.get(query,
                                             sort: 'system_create_dtsi ASC',
                                             start: start_row,
                                             rows: PAGE_SIZE,
                                             fl: 'id')['response']['docs']
        docs.each { |doc| file.puts(doc['id']) }
        start_row += PAGE_SIZE
        # A page shorter than PAGE_SIZE means there are no further results
        break unless docs.length == PAGE_SIZE
      end
    end

    # Trigger indexing of all objects listed in the provided file.
    # ArgumentErrors are passed on to the caller, any other StandardError is logged and not re-raised.
    #
    # @param id_list_file [String] path to a file containing one object id per line
    # @param clean_index [Boolean] if true, all documents are deleted from the index before indexing starts
    # @raise [ArgumentError] if a clean index is requested while resuming a previous run
    def reindex(id_list_file, clean_index)
      reindex_list(id_list_file, clean_index)
    rescue ArgumentError
      raise
    rescue StandardError => e
      Rails.logger.error 'Execution interrupted by unexpected error'
      # Joined with an explicit newline: $RS is only defined once the 'English' library has been required
      Rails.logger.error [e.class.to_s, e.message, *e.backtrace].join("\n")
    end

    # Index every id in the list file which is not already recorded as completed in the progress log.
    #
    # @param id_list_file [String] path to a file containing one object id per line
    # @param clean_index [Boolean] if true, all documents are deleted from the index before indexing starts
    # @raise [ArgumentError] if a clean index is requested while resuming a previous run
    def reindex_list(id_list_file, clean_index)
      # count the number of lines in the file to get the total number of ids being indexed for presenting progress
      id_total = File.foreach(id_list_file).count

      # Start or resume from progress log, which is a sidecar file based off the id list.
      # For example, /tmp/id_list_2022-06-21T19_55_08Z.txt logs progress to /tmp/id_list_2022-06-21T19_55_08Z.txt-progress.log
      progress_file = progress_log_path(id_list_file)
      progress_tracker = Migrate::Services::ProgressTracker.new(progress_file)
      completed = progress_tracker.completed_set
      resuming = !completed.empty?

      puts "**** Resuming reindexing, #{completed.length} previously completed ****" if resuming

      if clean_index
        if resuming
          raise ArgumentError, "Cannot request clean index when resuming. To start over, delete the progress log at #{progress_file}"
        end

        puts '**** Clearing index ****'
        Blacklight.default_index.connection.delete_by_query('*:*')
        Blacklight.default_index.connection.commit
      end

      progressbar = ProgressBar.create(total: id_total,
                                       starting_at: completed.length,
                                       length: 80,
                                       format: "%E |%b\u{15E7}%i| %p%% (%c / %C \u{0394}%R)",
                                       progress_mark: ' ',
                                       remainder_mark: "\u{FF65}")

      # Read input file line by line; File.foreach closes the file once iteration finishes
      File.foreach(id_list_file) do |id_line|
        id = id_line.chomp
        # skip id if it has previously been indexed
        next if resuming && completed.include?(id)

        begin
          object = ActiveFedora::Base.find(id)
          # Must use update_index instead of going to SolrService.add in order to trigger NestingCollection behaviors
          object.update_index
        rescue Ldp::Gone
          puts "Object with id #{id} is gone, skipping"
          Rails.logger.warn "Object with id #{id} is gone, skipping"
        rescue ActiveFedora::ObjectNotFoundError => e
          Rails.logger.warn "Object with id #{id} was not found, skipping: #{e.message}"
        rescue JSON::GeneratorError => e
          Rails.logger.error "Failed to generate doc for #{id}, skipping: #{e.message}"
        end
        # Skipped ids are recorded as well, so they are not attempted again when resuming
        progressbar.increment
        progress_tracker.add_entry(id)
      end
    end

    # @param id_list_file [String] path to the id list file
    # @return [String] path of the progress log which accompanies the given id list file
    def progress_log_path(id_list_file)
      "#{id_list_file}-progress.log"
    end

    private

    # Convert a timestamp from the underscore form used in list filenames into ISO8601 and validate it.
    # The argument itself is left untouched.
    #
    # @param after_timestamp [String, nil]
    # @return [String, nil] the timestamp with underscores replaced by colons, or nil if none was given
    # @raise [ArgumentError] if the timestamp is not in ISO8601 format
    def normalize_timestamp(after_timestamp)
      return nil if after_timestamp.nil?

      # Replace underscores with :'s since that is the format used in the list filenames
      timestamp = after_timestamp.tr('_', ':')
      begin
        DateTime.iso8601(timestamp)
      rescue ArgumentError
        # Date::Error is a subclass of ArgumentError
        raise ArgumentError, "Invalid after timestamp, must be in ISO8601 format but was #{timestamp}"
      end
      timestamp
    end
  end
end