lib/tasks/dimensions_ingest.rake
# frozen_string_literal: true
namespace :dimensions do
DIMENSIONS_URL = 'https://app.dimensions.ai/api/'
EARLIEST_DATE = '1970-01-01'
desc 'Ingest metadata and publications from Dimensions'
task :ingest_publications, [:admin_set, :start_date, :end_date] => :environment do |t, args|
Rails.logger.info "[#{Time.now}] starting dimensions publications ingest"
is_cron_job = false
start_date = args[:start_date]
end_date = args[:end_date]
# Ensure start_date and end_date are provided, or neither are provided
if args[:start_date].present? && !args[:end_date].present?
raise ArgumentError, 'Both start_date and end_date must be provided if specifying a date range. Only start_date provided.'
elsif !args[:start_date].present? && args[:end_date].present?
raise ArgumentError, 'Both start_date and end_date must be provided if specifying a date range. Only end_date provided.'
end
start_date, end_date, is_cron_job = get_date_range(args)
config = {
'admin_set' => args[:admin_set],
'depositor_onyen' => ENV['DIMENSIONS_INGEST_DEPOSITOR_ONYEN'],
}
# Query and ingest publications
query_service = Tasks::DimensionsQueryService.new
ingest_service = Tasks::DimensionsIngestService.new(config)
publications = ingest_service.ingest_publications(query_service.query_dimensions(start_date: start_date, end_date: end_date))
report = Tasks::DimensionsReportingService.new(publications, query_service.dimensions_total_count, { start_date: start_date, end_date: end_date }, is_cron_job).generate_report
begin
DimensionsReportMailer.dimensions_report_email(report).deliver_now
Rails.logger.info 'Dimensions ingest report email sent successfully.'
Rails.logger.info "Ingested Publications (Total #{publications[:ingested].count}): #{publications[:ingested].map { |pub| pub['id'] }}"
Rails.logger.info "Failed Publication Ingest (Total #{publications[:failed].count}): #{publications[:failed].map { |pub| pub['id'] }}"
Rails.logger.info "[#{Time.now}] completed dimensions publications ingest"
rescue StandardError => e
Rails.logger.error "Failed to send email: #{e.message}"
end
# Write the last run time to a file only if this is a cron job
if is_cron_job
File.open(dimensions_last_run_path, 'w') do |f|
f.puts Time.current
end
else
Rails.logger.info 'Not a cron job. Skipping writing last run time to file.'
end
end
# Helper method to compute the last run file path
def self.dimensions_last_run_path
File.join(ENV['DATA_STORAGE'], 'last_dimensions_ingest_run.txt')
end
def self.get_date_range(args)
if args[:start_date] && args[:end_date]
start_date = Date.parse(args[:start_date]).strftime('%Y-%m-%d')
end_date = Date.parse(args[:end_date]).strftime('%Y-%m-%d')
Rails.logger.info "Using provided date range: #{start_date} to #{end_date}"
is_cron_job = false
else
last_run_time = File.exist?(dimensions_last_run_path) ? Date.parse(File.read(dimensions_last_run_path).strip) : nil
if last_run_time
Rails.logger.info "Last ingest run was at: #{last_run_time}"
start_date = last_run_time.strftime('%Y-%m-%d')
else
Rails.logger.info "No previous run time found. Starting from default date. (#{EARLIEST_DATE})"
start_date = EARLIEST_DATE
end
is_cron_job = true
end_date = Date.today.strftime('%Y-%m-%d')
Rails.logger.info "Using date range: #{start_date} to #{end_date}"
end
[start_date, end_date, is_cron_job]
end
end