indexer/app/lib/indexer_common.rb
require 'ashttp'
require 'uri'
require 'json'
require 'fileutils'
require 'aspace_i18n'
require 'set'
require 'asutils'
require 'jsonmodel'
require 'jsonmodel_client'
require 'config/config-distribution'
require 'record_inheritance'
require_relative 'index_batch'
require_relative 'indexer_common_config'
require_relative 'indexer_timing'
require_relative 'fake_solr_timeout_response'
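# IndexerCommon is the shared base class for the ArchivesSpace indexers
# (the realtime/periodic indexers and the PUI indexer build on it). It
# converts backend JSONModel records into Solr documents via a pipeline of
# "document prepare" hooks and posts them to Solr in batches.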
class IndexerCommon
include JSONModel
@@record_types = IndexerCommonConfig.record_types
@@global_types = IndexerCommonConfig.global_types
@@records_with_children = []
@@init_hooks = []
@@resolved_attributes = IndexerCommonConfig.resolved_attributes
@@paused_until = Time.now
def self.add_indexer_initialize_hook(&block)
@@init_hooks << block
end
def self.add_attribute_to_resolve(attr)
@@resolved_attributes.push(attr) unless @@resolved_attributes.include?(attr)
end
def resolved_attributes
@@resolved_attributes
end
def record_types
@@record_types
end
# Pause the indexer for the given number of seconds (15 minutes by default).
def self.pause(duration = 900)
@@paused_until = Time.now + duration
end
def self.paused?
@@paused_until > Time.now
end
def initialize(backend_url)
@backend_url = backend_url
@document_prepare_hooks = []
@extra_documents_hooks = []
@delete_hooks = []
@batch_hooks = []
@current_session = nil
while true
begin
JSONModel::init(:client_mode => true, :url => @backend_url)
break
rescue
Log.error("Connection to backend failed (#{$!}). Retrying...")
sleep(5)
end
end
# Force load up front
self.enum_fields
configure_doc_rules
@@init_hooks.each do |hook|
hook.call(self)
end
end
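# Expand a begin/end date pair into the list of years it covers, using the
# leading (up to 4-digit) year of each date string. For example:
#   generate_years_for_date_range('1905-01', '1907') #=> ["1905", "1906", "1907"]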
def self.generate_years_for_date_range(begin_date, end_date)
return [] unless begin_date
end_date ||= begin_date
b = begin_date.scan(/\A[0-9]{1,4}/).first
e = end_date.scan(/\A[0-9]{1,4}/).first
if b && e
(b..e).to_a
else
[]
end
end
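# Generate searchable variants of an identifier: the identifier itself,
# punctuation replaced by spaces, punctuation and spaces removed, and the
# digit/non-digit runs separated by spaces. For example:
#   generate_permutations_for_identifier('MS.0144')
#   #=> ["MS.0144", "MS 0144", "MS0144", "MS. 0144"]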
def self.generate_permutations_for_identifier(identifier)
return [] if identifier.nil?
[
identifier,
identifier.gsub(/[[:punct:]]+/, " "),
identifier.gsub(/[[:punct:] ]+/, ""),
identifier.scan(/([0-9]+|[^0-9]+)/).flatten(1).join(" ")
].uniq
end
# Isolate leading alpha and numeric values to create a sortable string
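# e.g. with size = 5, "box 9A" becomes "box##" + "00009" + "A    "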
def self.generate_sort_string_for_identifier(identifier, size = 255)
letters, numbers, rest = identifier.scan(/([^0-9]*)([0-9]*)(.*)/)[0]
letters.strip.ljust(size).gsub(' ', '#') + numbers.strip.rjust(size).gsub(' ', '0') + rest.strip.ljust(size)
end
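# Walk a record (hashes nested in arrays nested in hashes...) and collect
# every string value, split into published and unpublished buckets. Subtrees
# whose "publish" flag is false are deferred to the unpublished queue (or
# skipped entirely when :published_only is given). Returns
# [published_strings, unpublished_strings], or just the published strings
# when :published_only is passed.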
def self.extract_string_values(doc, *opts)
return doc, doc if doc.is_a?(String)
if doc.is_a?(Array)
published_queue = doc.flatten
else
published_queue = [doc]
end
extract_unpublished = !opts.include?(:published_only)
unpublished_queue = []
published_strings = []
unpublished_strings = []
published_done = false
[
[published_queue, published_strings],
[unpublished_queue, unpublished_strings]
].each do |queue, strings|
while !queue.empty?
doc = queue.pop
# Bare strings can arrive via the flattened array input; collect them directly
if doc.is_a?(String)
strings.push(doc)
next
end
if !published_done && doc.has_key?("publish") && !doc["publish"]
unpublished_queue.push(doc) if extract_unpublished
next
end
doc.each do |key, val|
if IndexerCommonConfig.fullrecord_excludes.include?(key) || key =~ /_enum_s$/
next # ignored
elsif val.is_a?(String)
strings.push(val)
elsif val.is_a?(Hash)
queue.push(val)
elsif val.is_a?(Array)
val.flatten.each do |v|
if v.is_a?(String)
strings.push(v)
elsif v.is_a?(Hash)
queue.push(v)
end
end
end
end
end
if extract_unpublished
published_done = true
else
break
end
end
if extract_unpublished
return published_strings, unpublished_strings
else
return published_strings
end
end
def build_fullrecord(doc, record)
# 'fullrecord' only contains unpublished text at this stage, but 'fullrecord_published'
# will be merged into it by Solr using copyField
doc['fullrecord_published'], doc['fullrecord'] = IndexerCommon.extract_string_values(record['record'])
end
# Translation paths don't load correctly when the selenium tests are running,
# so call this instead of I18n.t to avoid raising in that environment.
@@selenium = Dir.getwd.end_with? 'selenium'
def t(*args)
if @@selenium
args[0]
else
args << {} unless args.last.is_a?(Hash)
args[1][:default] ||= args[0].split('.').last
I18n.t(*args)
end
end
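# Index linked agents: sort names and URIs for all agents, a published-only
# subset, the creators, and a per-agent relator sort field (with role
# precedence creator > subject > source).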
def add_agents(doc, record)
if record['record']['linked_agents']
# index all linked agents first
doc['agents'] = record['record']['linked_agents'].collect{|link| link['_resolved']['display_name']['sort_name']}
doc['agent_uris'] = record['record']['linked_agents'].collect{|link| link['ref']}
# only published agents
doc['published_agents'] = []
doc['published_agent_uris'] = []
record['record']['linked_agents'].each do |link|
if link['_resolved']['publish']
doc['published_agents'] << link['_resolved']['display_name']['sort_name']
doc['published_agent_uris'] << link['ref']
end
end
# index the creators only
creators = record['record']['linked_agents'].select {|link| link['role'] == 'creator'}
doc['creators'] = creators.collect {|link| link['_resolved']['display_name']['sort_name']} unless creators.empty?
# make a special sort field for each agent
# creator > subject > source
seen = {}
record['record']['linked_agents'].each do |link|
if seen[link['ref']] == 'creator'
# do nothing
elsif seen[link['ref']] == 'subject' && link['role'] != 'creator'
# do nothing
else
relator_label = link['relator'] ? t("enumerations.linked_agent_archival_record_relators.#{link['relator']}") : ''
doc["#{link['ref'].gsub(/\//, '_')}_relator_sort"] = "#{link['role']} #{relator_label}"
seen[link['ref']] = link['role']
end
end
end
end
def add_subjects(doc, record)
if record['record']['subjects']
doc['subjects'] = record['record']['subjects'].map {|s| s['_resolved']['title']}.compact
doc['subject_uris'] = record['record']['subjects'].collect{|link| link['ref']}
end
end
def add_subjects_subrecord(doc, record, subrecord, type = 'subjects')
doc['subjects'] ||= []
doc['subject_uris'] ||= []
if record['record'][subrecord]
record['record'][subrecord].each do |sr|
next unless sr[type]
doc['subjects'].concat(sr[type].map {|s| s['_resolved']['title']}.compact)
doc['subject_uris'].concat(sr[type].collect{|link| link['ref']})
end
end
end
def add_audit_info(doc, record)
['created_by', 'last_modified_by', 'user_mtime', 'system_mtime', 'create_time'].each do |f|
doc[f] = record['record'][f] if record['record'].has_key?(f)
end
end
def add_notes(doc, record)
if record['record']['notes']
# 'notes' only contains unpublished notes at this stage, but 'notes_published'
# will be merged into it by Solr using copyField
doc['notes_published'], doc['notes'] = IndexerCommon.extract_string_values(record['record']['notes'])
end
end
def add_years(doc, record)
if record['record']['dates']
doc['years'] = []
record['record']['dates'].each do |date|
doc['years'] += IndexerCommon.generate_years_for_date_range(date['begin'], date['end'])
end
unless doc['years'].empty?
doc['years'] = doc['years'].sort.uniq
doc['year_sort'] = doc['years'].first.rjust(4, '0') + doc['years'].last.rjust(4, '0')
end
dates = record['record']['dates']
display_dates = dates.select {|date| date['date_type'] == 'inclusive'}
display_dates = dates if display_dates.empty?
doc['dates'] = []
display_dates.each do |date|
if date['expression']
doc['dates'] << date['expression']
elsif date['date_type'] == "single"
doc['dates'] << date['begin']
elsif date['date_type']
doc['dates'] << "#{date['begin']} - #{date['end']}"
end
end
end
end
def add_level(doc, record)
if record['record'].has_key? 'level'
doc['level'] = (record['record']['level'] == 'otherlevel') ? record['record']['other_level'] : record['record']['level']
end
end
def add_summary(doc, record)
if record['record'].has_key?('notes') && record['record']['notes'].is_a?(Array)
notes = record['record']['notes']
abstract = notes.find {|note| note['type'] == 'abstract'}
if abstract
doc['summary'] = abstract['content'].join("\n")
else
scopecontent = notes.find {|note| note['type'] == 'scopecontent'}
if scopecontent && scopecontent.has_key?('subnotes')
doc['summary'] = scopecontent['subnotes'].map {|sn| sn['content']}.join("\n")
end
end
end
end
def add_extents(doc, record)
if record['record']['extents']
extents = record['record']['extents']
display_extents = extents.select {|extent| extent['portion'] == 'whole'}
display_extents = extents if display_extents.empty?
doc['extents'] = []
display_extents.each do |extent|
doc['extents'] << "#{extent['number']} --- #{extent['extent_type']}"
end
end
end
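# Walk every JSONModel schema once and collect the names of all properties
# backed by a dynamic enum; these get '_enum_s' companion fields in Solr.
# The result is memoized in @enum_fields.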
def enum_fields
return @enum_fields if @enum_fields
enum_fields = []
queue = JSONModel.models.map {|_,model| model.schema['properties']}.flatten.uniq
while !queue.empty?
elt = queue.shift
if elt.is_a?(Hash)
elt.each do |k, v|
if v.is_a?(Hash)
enum_fields.push(k) if v['dynamic_enum'] || v.dig('items', 'dynamic_enum')
end
queue << v
end
elsif elt.is_a?(Array)
queue.concat(elt)
end
end
enum_fields.delete('items') # not an enum, creeps in through dynamic enum lists
@enum_fields = enum_fields.uniq
end
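# Strip everything before the 'ark:' label so only the ARK itself is indexed,
# e.g. "https://example.org/ark:/12345/x1" => "ark:/12345/x1"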
def trim_ark_value(s)
s.gsub(/\A.*ark:/, 'ark:')
end
def add_arks(doc, record)
return unless AppConfig[:arks_enabled]
if (arks = record['record']['ark_name'])
doc['ark_name'] = ([arks.fetch('current', nil)] + arks.fetch('previous', [])).compact.map {|s| trim_ark_value(s)}
end
end
def configure_doc_rules
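# Each of the hooks registered below runs against every record during
# index_records; they receive the in-progress Solr document and the resolved
# backend record.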
add_document_prepare_hook {|doc, record|
found_keys = Set.new
ASUtils.search_nested(record["record"], enum_fields, ['_resolved']) do |field, field_value|
key = "#{field}_enum_s"
doc[key] ||= Set.new
doc[key] << field_value
found_keys << key
end
ASUtils.search_nested(record["record"], ['items'], ['_resolved']) do |field, field_value|
if field_value.is_a?(Hash) && field_value.key?('type')
doc['type_enum_s'] ||= Set.new
doc['type_enum_s'] << field_value.fetch('type')
found_keys << 'type_enum_s'
end
end
# Turn our sets back into regular arrays so they serialize out to JSON correctly
found_keys.each do |key|
doc[key] = doc[key].to_a.flatten
end
}
add_document_prepare_hook {|doc, record|
if doc['primary_type'] == 'archival_object'
doc['resource'] = record['record']['resource']['ref'] if record['record']['resource']
doc['title'] = record['record']['display_string']
doc['identifier'] = record['record']['component_id']
doc['component_id'] = record['record']['component_id']
doc['ref_id'] = record['record']['ref_id']
doc['slug'] = record['record']['slug']
doc['is_slug_auto'] = record['record']['is_slug_auto']
end
}
add_document_prepare_hook {|doc, record|
add_subjects(doc, record)
add_agents(doc, record)
add_audit_info(doc, record)
add_notes(doc, record)
add_years(doc, record)
add_level(doc, record)
add_summary(doc, record)
add_extents(doc, record)
add_arks(doc, record)
}
add_document_prepare_hook {|doc, record|
if doc['primary_type'] == 'accession'
date = record['record']['accession_date']
if date == '9999-12-31'
unknown = t('accession.accession_date_unknown')
doc['accession_date'] = unknown
doc['fullrecord'] ||= ''
doc['fullrecord'] << unknown + ' '
else
doc['accession_date'] = date
end
doc['accession_date_year'] = Date.parse(date).year
doc['identifier'] = (0...4).map {|i| record['record']["id_#{i}"]}.compact.join("-")
doc['title'] = record['record']['display_string']
doc['acquisition_type'] = record['record']['acquisition_type']
doc['resource_type'] = record['record']['resource_type']
doc['restrictions_apply'] = record['record']['restrictions_apply']
doc['access_restrictions'] = record['record']['access_restrictions']
doc['use_restrictions'] = record['record']['use_restrictions']
doc['related_resource_uris'] = record['record']['related_resources'].
collect { |resource| resource["ref"] }.
compact.uniq
doc['related_accession_uris'] = record['record']['related_accessions'].
collect { |accession| accession["ref"] }.
compact.uniq
doc['slug'] = record['record']['slug']
doc['is_slug_auto'] = record['record']['is_slug_auto']
if cm = record['record']['collection_management']
doc['processing_priority'] = cm['processing_priority']
doc['processors'] = cm['processors']
end
end
}
add_document_prepare_hook {|doc, record|
if doc['primary_type'] == 'subject'
doc['source'] = record['record']['source']
doc['first_term_type'] = record['record']['terms'][0]['term_type']
doc['publish'] = record['record']['publish'] && record['record']['is_linked_to_published_record']
doc['slug'] = record['record']['slug']
doc['is_slug_auto'] = record['record']['is_slug_auto']
end
}
add_document_prepare_hook {|doc, record|
if record['record'].has_key?('used_within_repositories')
doc['used_within_repository'] = record['record']['used_within_repositories']
doc['used_within_published_repository'] = record['record']['used_within_published_repositories']
end
}
add_document_prepare_hook {|doc, record|
if doc['primary_type'] == 'repository'
doc['repository'] = doc["id"]
doc['title'] = record['record']['repo_code']
doc['repo_sort'] = record['record']['display_string']
doc['slug'] = record['record']['slug']
doc['is_slug_auto'] = record['record']['is_slug_auto']
doc['position_int_sort'] = record['record']['position']
end
}
add_document_prepare_hook {|doc, record|
if doc['primary_type'] == 'location'
if record['record'].has_key? 'temporary'
doc['temporary'] = record['record']['temporary']
end
doc['building'] = record['record']['building']
doc['floor'] = record['record']['floor']
doc['room'] = record['record']['room']
doc['area'] = record['record']['area']
if record['record']['owner_repo']
repo = JSONModel::HTTP.get_json(record['record']['owner_repo']['ref'])
doc['owner_repo_uri_u_sstr'] = record['record']['owner_repo']['ref']
doc['owner_repo_display_string_u_ssort'] = repo["repo_code"]
end
end
}
add_document_prepare_hook {|doc, record|
if doc['primary_type'] == 'digital_object_component'
doc['digital_object'] = record['record']['digital_object']['ref']
doc['digital_object_id'] = record['record']['component_id']
doc['identifier'] = record['record']['component_id']
doc['title'] = record['record']['display_string']
doc['slug'] = record['record']['slug']
doc['is_slug_auto'] = record['record']['is_slug_auto']
end
}
add_document_prepare_hook {|doc, record|
if doc['primary_type'] == 'resource'
doc['finding_aid_title'] = record['record']['finding_aid_title']
doc['finding_aid_filing_title'] = record['record']['finding_aid_filing_title']
doc['identifier'] = (0...4).map {|i| record['record']["id_#{i}"]}.compact.join("-")
doc['resource_type'] = record['record']['resource_type']
doc['level'] = record['record']['level']
doc['restrictions'] = record['record']['restrictions']
doc['ead_id'] = record['record']['ead_id']
doc['finding_aid_status'] = record['record']['finding_aid_status']
doc['related_accession_uris'] = record['record']['related_accessions'].
collect{|accession| accession["ref"]}.
compact.uniq
doc['slug'] = record['record']['slug']
doc['is_slug_auto'] = record['record']['is_slug_auto']
if cm = record['record']['collection_management']
doc['processing_priority'] = cm['processing_priority']
doc['processors'] = cm['processors']
end
end
if doc['primary_type'] == 'digital_object'
doc['digital_object_type'] = record['record']['digital_object_type']
doc['digital_object_id'] = record['record']['digital_object_id']
doc['identifier'] = record['record']['digital_object_id']
doc['level'] = record['record']['level']
doc['restrictions'] = record['record']['restrictions']
doc['slug'] = record['record']['slug']
doc['is_slug_auto'] = record['record']['is_slug_auto']
doc['collection_uri_u_sstr'] = record['record']['collection'].map {|collection| collection['ref']}
doc['linked_instance_uris'] = record['record']['linked_instances'].
collect{|instance| instance["ref"]}.
compact.uniq
end
}
add_document_prepare_hook {|doc, record|
if doc['primary_type'] == 'repository'
doc['repository'] = doc["id"]
end
}
add_document_prepare_hook {|doc, record|
if doc['primary_type'] == 'event'
doc['event_type'] = record['record']['event_type']
doc['title'] = record['record']['event_type'] # adding this for embedded searches
doc['outcome'] = record['record']['outcome']
doc['linked_record_uris'] = record['record']['linked_records'].map { |c| c['ref'] }
# ANW-1635: index linked record titles/display names so they are available in CSV output
doc['linked_record_titles'] = record['record']['linked_records'].map do |rec|
case rec['_resolved']['jsonmodel_type']
when 'agent_person', 'agent_family', 'agent_corporate_entity', 'agent_software'
rec['_resolved']['display_name']['sort_name']
when 'accession', 'resource', 'digital_object', 'digital_object_component'
rec['_resolved']['title']
when 'archival_object'
rec['_resolved']['display_string']
else
"not_found"
end
end
end
}
add_document_prepare_hook {|doc, record|
if ['agent_person', 'agent_family', 'agent_software', 'agent_corporate_entity'].include?(doc['primary_type'])
# Exclude agent contact details from the indexed record
record['record'].reject! { |key, _value| key == 'agent_contacts' }
doc['title'] = record['record']['display_name']['sort_name']
authorized_name = record['record']['names'].find {|name| name['authorized']}
primary_record_id = record['record']['agent_record_identifiers'].find {|ari| ari['primary_identifier'] == true}
conventions_dec = record['record']['agent_conventions_declarations'].first
if primary_record_id
doc['authority_id'] = primary_record_id['record_identifier']
doc['source'] = primary_record_id['source']
elsif authorized_name
doc['authority_id'] = authorized_name['authority_id']
doc['source'] = authorized_name['source']
end
if conventions_dec && conventions_dec['name_rule']
doc['rules'] = conventions_dec['name_rule']
elsif authorized_name
doc['rules'] = authorized_name['rules']
end
doc['linked_agent_roles'] = record['record']['linked_agent_roles']
doc['related_agent_uris'] = ASUtils.wrap(record['record']['related_agents']).collect{|ra| ra['ref']}
doc['slug'] = record['record']['slug']
doc['is_slug_auto'] = record['record']['is_slug_auto']
if record['record']['is_user']
doc['is_user'] = true
doc['types'] << 'agent_with_user'
else
doc['is_user'] = false
end
add_subjects_subrecord(doc, record, 'agent_functions')
add_subjects_subrecord(doc, record, 'agent_occupations')
add_subjects_subrecord(doc, record, 'agent_places')
add_subjects_subrecord(doc, record, 'agent_topics')
add_subjects_subrecord(doc, record, 'agent_functions', 'places')
add_subjects_subrecord(doc, record, 'agent_occupations', 'places')
add_subjects_subrecord(doc, record, 'agent_resources', 'places')
add_subjects_subrecord(doc, record, 'agent_topics', 'places')
# Assign the additional type of 'agent'
doc['types'] << 'agent'
end
}
add_document_prepare_hook {|doc, record|
doc['external_id'] = Array(record['record']['external_ids']).map do |eid|
eid['external_id']
end
}
add_document_prepare_hook {|doc, record|
if ['classification', 'classification_term'].include?(doc['primary_type'])
doc['classification_path'] = ASUtils.to_json(record['record']['path_from_root'])
doc['agent_uris'] = ASUtils.wrap(record['record']['creator']).collect{|agent| agent['ref']}
doc['published_agent_uris'] = []
if !record.dig('record', 'creator', '_resolved').nil?
if record['record']['creator']['_resolved']['publish'] && !record['record']['creator']['ref'].nil?
doc['published_agent_uris'] << record['record']['creator']['ref']
end
end
doc['agents'] = ASUtils.wrap(record['record']['creator']).collect{|link| link['_resolved']['display_name']['sort_name']}
doc['identifier_sort'] = IndexerCommon.generate_sort_string_for_identifier(record['record']['identifier'])
doc['repo_sort'] = record['record']['repository']['_resolved']['display_string']
doc['has_classification_terms'] = record['record']['has_classification_terms']
doc['slug'] = record['record']['slug']
doc['is_slug_auto'] = record['record']['is_slug_auto']
doc['identifier'] = record['record']['identifier']
end
}
add_document_prepare_hook {|doc, record|
if doc['primary_type'] == 'classification_term'
doc['classification'] = record['record']['classification']['ref']
end
}
add_document_prepare_hook {|doc, record|
if doc['primary_type'] == 'job'
report_type = record['record']['job']['report_type']
doc['title'] = (report_type ? t("reports.#{report_type}.title", :default => report_type) :
t("job.types.#{record['record']['job_type']}"))
doc['types'] << record['record']['job_type']
doc['types'] << report_type if report_type
doc['job_type'] = record['record']['job_type']
doc['report_type'] = report_type
doc['job_report_type'] = report_type || doc['job_type']
doc['status'] = record['record']['status']
doc['owner'] = record['record']['owner']
doc['time_submitted'] = Time.parse(record['record']['time_submitted']).getlocal if record['record']['time_submitted']
doc['time_started'] = Time.parse(record['record']['time_started']).getlocal if record['record']['time_started']
doc['time_finished'] = Time.parse(record['record']['time_finished']).getlocal if record['record']['time_finished']
filenames = record['record']['job']['filenames'] || [record['record']['job']['filename']].compact
doc['files'] = []
doc['job_data'] = []
files = JSONModel::HTTP::get_json("#{record['record']['uri']}/output_files")
files.each do |file|
job_id = record['record']['uri'].split('/').last
link = "/jobs/#{job_id}/file/#{file}"
doc['files'] << link
filename = filenames.shift
doc['job_data'] << (filename ? "input_file --- #{filename}" : "output_file --- #{link}")
end
unless record['record']['job'].is_a? String
record['record']['job'].reject { |k, _v| ['jsonmodel_type', 'filenames', 'report_type'].include? k }.each do |k, v|
doc['job_data'] << "#{k} --- #{v}"
end
end
doc['queue_position'] = record['record']['queue_position']
end
}
add_document_prepare_hook {|doc, record|
records_with_classifications = ['resource', 'accession', 'digital_object']
if records_with_classifications.include?(doc['primary_type']) && record['record']['classifications'].length > 0
doc['classification_paths'] = record['record']['classifications'].map { |c| ASUtils.to_json(c['_resolved']['path_from_root']) }
doc['classification_uris'] = record['record']['classifications'].map { |c| c['ref'] }
end
}
add_document_prepare_hook {|doc, record|
if ['resource', 'archival_object', 'accession'].include?(doc['primary_type']) && record['record']['instances'] && record['record']['instances'].length > 0
doc['location_uris'] = record['record']['instances'].
collect{|instance| instance["sub_container"]}.compact.
collect{|sub_container| sub_container["top_container"]["_resolved"]}.compact.
collect{|top_container| top_container["container_locations"]}.flatten.
collect{|container_location| container_location["ref"]}.uniq
doc['digital_object_uris'] = record['record']['instances'].
collect{|instance| instance["digital_object"]}.compact.
collect{|digital_object_instance| digital_object_instance["ref"]}.
flatten.uniq
end
}
# Index four-part IDs separately
add_document_prepare_hook {|doc, record|
four_part_id = (0..3).map {|n| record['record']["id_#{n}"]}.compact.join(" ")
unless four_part_id.empty?
doc['four_part_id'] = four_part_id
end
}
add_document_prepare_hook {|doc, record|
if record['record']['jsonmodel_type'] == 'top_container'
doc['title'] = record['record']['long_display_string']
doc['display_string'] = record['record']['display_string']
doc['type_u_ssort'] = record['record']['type']
doc['notes'] = record['record']['internal_note']
if record['record']['series']
doc['series_uri_u_sstr'] = record['record']['series'].map {|series| series['ref']}
doc['series_title_u_sstr'] = record['record']['series'].map {|series| series['display_string']}
doc['series_level_u_sstr'] = record['record']['series'].map {|series| series['level_display_string']}
doc['series_identifier_stored_u_sstr'] = record['record']['series'].map {|series| series['identifier']}
doc['series_identifier_u_stext'] = record['record']['series'].map {|series|
IndexerCommon.generate_permutations_for_identifier(series['identifier'])
}.flatten
record['record']['series'].select{|series| series['publish']}.each do |series|
doc['published_series_uri_u_sstr'] ||= []
doc['published_series_uri_u_sstr'] << series['ref']
doc['published_series_title_u_sstr'] ||= []
doc['published_series_title_u_sstr'] << series['display_string']
end
end
if record['record']['collection']
doc['collection_uri_u_sstr'] = record['record']['collection'].map {|collection| collection['ref']}
doc['collection_display_string_u_sstr'] = record['record']['collection'].map {|collection| collection['display_string']}
doc['collection_identifier_stored_u_sstr'] = record['record']['collection'].map {|collection| collection['identifier']}
doc['collection_identifier_u_stext'] = record['record']['collection'].map {|collection|
IndexerCommon.generate_permutations_for_identifier(collection['identifier'])
}.flatten
end
if record['record']['container_profile']
doc['container_profile_uri_u_sstr'] = record['record']['container_profile']['ref']
doc['container_profile_display_string_u_sstr'] = record['record']['container_profile']['_resolved']['display_string']
end
if record['record']['container_locations'].length > 0
doc['has_location_u_sbool'] = true
doc['location_uri_u_sstr'] = []
doc['location_uris'] = []
doc['location_display_string_u_sstr'] = []
record['record']['container_locations'].each do |container_location|
if container_location['status'] == 'current'
doc['location_uri_u_sstr'] << container_location['ref']
doc['location_uris'] << container_location['ref']
doc['location_display_string_u_sstr'] << container_location['_resolved']['title']
end
end
else
doc['has_location_u_sbool'] = false
end
doc['exported_u_sbool'] = record['record'].has_key?('exported_to_ils')
doc['empty_u_sbool'] = record['record']['collection'].empty?
if record['record']['indicator']
doc['indicator_u_icusort'] = record['record']['indicator']
end
doc['top_container_u_typeahead_utext'] = record['record']['display_string'].gsub(/[^0-9A-Za-z]/, '').downcase
doc['top_container_u_icusort'] = record['record']['display_string']
doc['barcode_u_sstr'] = record['record']['barcode']
doc['barcode_u_icusort'] = record['record']['barcode']
doc['subcontainer_barcodes_u_sstr'] = record["record"]["subcontainer_barcodes"]
doc['created_for_collection_u_sstr'] = record['record']['created_for_collection']
end
}
add_document_prepare_hook {|doc, record|
if ['resource', 'archival_object', 'accession'].include?(doc['primary_type'])
# we no longer want the contents of containers to be indexed at the container's location
doc.delete('location_uris')
# index the top_container's linked via a sub_container
ASUtils.wrap(record['record']['instances']).each{|instance|
if instance['sub_container'] && instance['sub_container']['top_container']
doc['top_container_uri_u_sstr'] ||= []
doc['top_container_uri_u_sstr'] << instance['sub_container']['top_container']['ref']
if instance['sub_container']['type_2']
doc['child_container_u_sstr'] ||= []
doc['child_container_u_sstr'] << "#{instance['sub_container']['type_2']} #{instance['sub_container']['indicator_2']} #{instance['sub_container']['barcode_2']}"
end
if instance['sub_container']['type_3']
doc['grand_child_container_u_sstr'] ||= []
doc['grand_child_container_u_sstr'] << "#{instance['sub_container']['type_3']} #{instance['sub_container']['indicator_3']}"
end
end
}
end
}
add_document_prepare_hook {|doc, record|
if doc['primary_type'] == 'container_profile'
doc['title'] = record['record']['display_string']
doc['display_string'] = record['record']['display_string']
doc['note'] = record['record']['note']
['width', 'height', 'depth'].each do |property|
doc["container_profile_#{property}_u_sstr"] = record['record'][property]
end
doc["container_profile_dimension_units_u_sstr"] = record['record']['dimension_units']
doc['typeahead_sort_key_u_sort'] = record['record']['display_string']
end
}
add_document_prepare_hook { |doc, record|
if !self.instance_of?(PUIIndexer)
# The PUI indexer makes its own call to build_fullrecord, so only call it here for realtime and periodic
build_fullrecord(doc, record)
end
}
add_document_prepare_hook {|doc, record|
if doc['primary_type'] == 'location_profile'
doc['title'] = record['record']['display_string']
doc['display_string'] = record['record']['display_string']
['width', 'height', 'depth'].each do |property|
doc["location_profile_#{property}_u_sstr"] = record['record'][property]
end
doc["location_profile_dimension_units_u_sstr"] = record['record']['dimension_units']
doc['typeahead_sort_key_u_sort'] = record['record']['display_string']
end
if record['record']['location_profile']
doc['location_profile_uri_u_sstr'] = record['record']['location_profile']['ref']
doc['location_profile_display_string_u_ssort'] = record['record']['location_profile']['_resolved']['display_string']
end
}
add_document_prepare_hook {|doc, record|
doc['ancestors'] = ASUtils.wrap(record['record']['ancestors']).map {|ancestor|
ancestor.fetch('ref')
}
}
add_document_prepare_hook {|doc, record|
ASUtils.wrap(record['record']['rights_statements']).each do |rights_statement|
ASUtils.wrap(rights_statement['linked_agents']).each do |agent_link|
doc['rights_statement_agent_uris'] ||= []
doc['rights_statement_agent_uris'] << agent_link['ref']
end
end
}
record_has_children('collection_management')
add_extra_documents_hook {|record|
docs = []
cm = record['record']['collection_management']
if cm
parent_type = JSONModel.parse_reference(record['uri'])[:type]
title = record['record']['title'] || record['record']['display_string']
docs << {
'id' => cm['uri'],
'uri' => cm['uri'],
'parent_id' => record['uri'],
'parent_title' => title,
'parent_type' => parent_type,
'title' => title,
'title_sort' => clean_for_sort(title),
'types' => ['collection_management'],
'primary_type' => 'collection_management',
'json' => cm.to_json(:max_nesting => false),
'processing_priority' => cm['processing_priority'],
'processing_status' => cm['processing_status'],
'processing_hours_total' => cm['processing_hours_total'],
'processing_funding_source' => cm['processing_funding_source'],
'processors' => cm['processors'],
'suppressed' => record['record']['suppressed'],
'repository' => get_record_scope(record['uri']),
'created_by' => cm['created_by'],
'last_modified_by' => cm['last_modified_by'],
'system_mtime' => cm['system_mtime'],
'user_mtime' => cm['user_mtime'],
'create_time' => cm['create_time'],
}
end
docs
}
add_document_prepare_hook {|doc, record|
if doc['primary_type'] == 'assessment'
doc['assessment_id'] = JSONModel.parse_reference(record['record']['uri']).fetch(:id)
doc['title'] = record['record']['display_string']
doc['display_string'] = record['record']['display_string']
doc['assessment_record_uris'] = ASUtils.wrap(record['record']['records']).map{|r| r['ref']}
doc['assessment_records'] = ASUtils.wrap(record['record']['records']).map{|r| r['_resolved']['display_string'] || r['_resolved']['title']}
doc['assessment_record_types'] = ASUtils.wrap(record['record']['records']).map{|r| r['_resolved']['jsonmodel_type']}.uniq.sort
doc['assessment_surveyor_uris'] = ASUtils.wrap(record['record']['surveyed_by']).map{|r| r['ref']}
doc['assessment_surveyors'] = ASUtils.wrap(record['record']['surveyed_by']).map{|r| r['_resolved']['title']}
doc['assessment_survey_begin'] = "#{record['record']['survey_begin']}T00:00:00Z"
doc['assessment_survey_end'] = "#{record['record']['survey_end']}T00:00:00Z" if record['record']['survey_end']
doc['assessment_review_required'] = record['record']['review_required']
doc['assessment_sensitive_material'] = record['record']['sensitive_material']
if (ASUtils.wrap(record['record']['reviewer']).length > 0)
doc['assessment_reviewer_uris'] = ASUtils.wrap(record['record']['reviewer']).map{|r| r['ref']}
doc['assessment_reviewers'] = ASUtils.wrap(record['record']['reviewer']).map{|r| r['_resolved']['title']}
end
doc['assessment_inactive'] = record['record']['inactive']
doc['assessment_survey_year'] = IndexerCommon.generate_years_for_date_range(record['record']['survey_begin'], record['record']['survey_end'])
doc['assessment_collection_uris'] = ASUtils.wrap(record['record']['collections']).map{|r| r['ref']}
doc['assessment_collections'] = ASUtils.wrap(record['record']['collections']).map{|r| r['_resolved']['display_string'] || r['_resolved']['title']}
doc['assessment_completed'] = !record['record']['survey_end'].nil?
doc['assessment_formats'] = record['record']['formats'].select{|r| r.has_key?('value')}.map{|r| r['label']}
doc['assessment_ratings'] = record['record']['ratings'].select{|r| r.has_key?('value') || r.has_key?('note')}.map{|r| r['label']}
doc['assessment_conservation_issues'] = record['record']['conservation_issues'].select{|r| r.has_key?('value')}.map{|r| r['label']}
doc['title_sort'] = doc['assessment_id'].to_s.rjust(10, '0')
end
}
add_document_prepare_hook {|doc, record|
doc['langcode'] ||= []
if record['record']['lang_materials'].is_a?(Array)
record['record']['lang_materials'].each { |langmaterial|
if langmaterial.has_key?('language_and_script')
doc['langcode'].push(langmaterial['language_and_script']['language'])
end
}
doc['langcode'].uniq!
end
}
end
def add_document_prepare_hook(&block)
@document_prepare_hooks << block
end
def record_has_children(record_type)
@@records_with_children << record_type.to_s
end
def records_with_children
@@records_with_children || []
end
def add_extra_documents_hook(&block)
@extra_documents_hooks << block
end
def add_batch_hook(&block)
@batch_hooks << block
end
def add_delete_hook(&block)
@delete_hooks << block
end
def solr_url
URI.parse(AppConfig[:solr_url])
end
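# Issue an HTTP request with the current session attached. Solr timeouts are
# converted into a FakeSolrTimeoutResponse so callers can handle them like
# any other failed response instead of rescuing Timeout::Error themselves.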
def do_http_request(url, req)
req['X-ArchivesSpace-Session'] = @current_session
opts = {
:read_timeout => AppConfig[:indexer_solr_timeout_seconds].to_i
}
ASHTTP.start_uri(url, opts) do |http|
http.request(req)
end
rescue Timeout::Error
FakeSolrTimeoutResponse.new(req)
end
def reset_session
@current_session = nil
end
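# Authenticate against the backend as the search user and cache the session
# token for subsequent requests (also shared with JSONModel::HTTP).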
def login
if @current_session
return @current_session
end
username = AppConfig[:search_username]
password = AppConfig[:search_user_secret]
url = URI.parse(@backend_url + "/users/#{username}/login")
request = Net::HTTP::Post.new(url.request_uri)
request.set_form_data("expiring" => "false",
"password" => password)
response = do_http_request(url, request)
if response.code == '200'
auth = ASUtils.json_parse(response.body)
@current_session = auth['session']
JSONModel::HTTP.current_backend_session = auth['session']
else
raise "Authentication to backend failed: #{response.body}"
end
end
def get_record_scope(uri)
JSONModel.parse_reference(uri)[:repository] || "global"
end
def is_repository_unpublished?(uri, values)
repo_id = get_record_scope(uri)
return false if (repo_id == "global")
values['repository']['_resolved']['publish'] == false
end
def delete_records(records, opts = {})
return if records.empty?
req = Net::HTTP::Post.new("#{solr_url.path}/update")
req['Content-Type'] = 'application/json'
# Delete the ID plus any documents that were the child of that ID
delete_request = {:delete => records.map {|id|
[{"id" => id},
{'query' => opts.fetch(:parent_id_field, 'parent_id') + ":\"#{id}\""}]}.flatten(1)
}
@delete_hooks.each do |hook|
hook.call(records, delete_request)
end
req.body = delete_request.to_json
response = do_http_request(solr_url, req)
if response.code == '200'
Log.info "Deleted #{records.length} documents: #{response}"
else
Log.error "SolrIndexerError when deleting records: #{response.body}"
end
end
# When applying a batch of updates, keep only the most recent version of each record
def dedupe_by_uri(records)
result = []
seen = {}
records.reverse.each do |record|
if !seen[record['uri']]
result << record
seen[record['uri']] = true
end
end
result.reverse
end
def clean_whitespace(doc)
if doc.is_a?(String) && !doc.frozen?
doc.strip!
elsif doc.is_a?(Hash)
doc.values.each {|v| clean_whitespace(v)}
elsif doc.is_a?(Array)
doc.each {|v| clean_whitespace(v)}
end
doc
end
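# Build a sort-friendly key: strip markup, turn hyphens into spaces, drop
# punctuation, and collapse whitespace, e.g.
#   clean_for_sort('The <em>Über</em>-Case!') #=> "The Über Case"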
def clean_for_sort(value)
return nil if value.nil?
out = value.gsub(/<[^>]+>/, '')
out.gsub!(/-/, ' ')
out.gsub!(/[^\p{L}\d\s_]/, '')
out.gsub!(/\s+/, ' ')
out.strip
end
# ANW-1065
# iterate through the do_not_index list and scrub out that part of the JSON tree
def sanitize_json(json)
IndexerCommonConfig.do_not_index.each do |k, v|
if json["jsonmodel_type"] == k
# subrec is a reference used to navigate inside of the JSON as specified by the v[:location] to find the part of the tree to sanitize
subrec = json
v[:location].each do |l|
unless subrec.nil?
subrec = subrec[l]
end
end
unless subrec.nil?
subrec[v[:to_clean]] = []
end
end
end
return json
end
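# Convert a batch of backend records into Solr documents and send them off:
# dedupe by URI, build the base document, run the document prepare hooks,
# apply the PUI fields, then hand the batch to index_batch. Returns the
# timing object for instrumentation.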
def index_records(records, timing = IndexerTiming.new)
batch = IndexBatch.new
records = dedupe_by_uri(records)
timing.time_block(:conversion_ms) do
records.each do |record|
values = record['record']
uri = record['uri']
reference = JSONModel.parse_reference(uri)
record_type = reference && reference[:type]
if !record_type || skip_index_record?(record) || (record_type != 'repository' && !record_types.include?(record_type.intern))
next
end
doc = {}
doc['id'] = uri
doc['uri'] = uri
if !values["finding_aid_filing_title"].nil? && values["finding_aid_filing_title"].length > 0
doc['title'] = values["finding_aid_filing_title"]
else
doc['title'] = values['title']
end
doc['primary_type'] = record_type
doc['types'] = [record_type]
doc['json'] = ASUtils.to_json(sanitize_json(values))
doc['suppressed'] = values.has_key?('suppressed') && values['suppressed']
if doc['suppressed']
doc['publish'] = false
elsif is_repository_unpublished?(uri, values)
doc['publish'] = false
elsif values['has_unpublished_ancestor']
doc['publish'] = false
else
doc['publish'] = values.has_key?('publish') && values['publish']
end
doc['system_generated'] = values.has_key?('system_generated') ? values['system_generated'].to_s : 'false'
doc['repository'] = get_record_scope(uri)
@document_prepare_hooks.each do |hook|
hook.call(doc, record)
end
doc['title_sort'] ||= clean_for_sort(doc['title'])
# do this last of all so we know for certain the doc is published
apply_pui_fields(doc, record)
next if skip_index_doc?(doc)
batch << clean_whitespace(doc)
# Allow a single record to spawn multiple Solr documents if desired
@extra_documents_hooks.each do |hook|
batch.concat(hook.call(record))
end
end
end
index_batch(batch, timing)
timing
end
def index_batch(batch, timing = IndexerTiming.new, opts = {})
timing ||= IndexerTiming.new
timing.time_block(:batch_hooks_ms) do
# Allow hooks to operate on the entire batch if desired
@batch_hooks.each do |hook|
hook.call(batch)
end
end
if !batch.empty?
# For any record we're updating, delete any child records first (where applicable)
records_with_children = self.records_with_children.map {|record_type|
batch.record_info_for_type(record_type).map {|info| '"%s"' % [info[:id]]}
}.flatten
if !records_with_children.empty?
req = Net::HTTP::Post.new("#{solr_url.path}/update")
req['Content-Type'] = 'application/json'
req.body = {:delete => {'query' => opts.fetch(:parent_id_field, 'parent_id') + ":(" + records_with_children.join(" OR ") + ")"}}.to_json
response = do_http_request(solr_url, req)
Log.error "SolrIndexerError when deleting child records: #{response.body}" if response.code != '200'
end
# Now apply the updates
req = Net::HTTP::Post.new("#{solr_url.path}/update")
req['Content-Type'] = 'application/json'
# Note: We call to_json_stream before asking for the count because this
# writes out the closing array and newline.
stream = batch.to_json_stream
req['Content-Length'] = batch.byte_count
req.body_stream = stream
timing.time_block(:solr_add_ms) do
response = do_http_request(solr_url, req)
stream.close
batch.destroy
if response.code != '200'
Log.error "SolrIndexerError when indexing records: #{response.body}"
end
end
end
end
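# Ask Solr to commit: a hard commit by default, or a soft (searcher-only)
# commit when type == :soft.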
def send_commit(type = :hard)
req = Net::HTTP::Post.new("#{solr_url.path}/update")
req['Content-Type'] = 'application/json'
req.body = {:commit => {"softCommit" => (type == :soft) }}.to_json
response = do_http_request(solr_url, req)
if response.code != '200'
if response.body =~ /exceeded limit of maxWarmingSearchers/
Log.info "INFO: #{response.body}"
else
Log.error "SolrIndexerError when committing: #{response.body}"
end
end
end
def paused?
self.class.paused?
end
def skip_index_record?(record)
false
end
def skip_index_doc?(doc)
false
end
def apply_pui_fields(doc, record)
# only add pui types if the record is published
if doc['publish']
object_record_types = ['accession', 'digital_object', 'digital_object_component']
if object_record_types.include?(doc['primary_type'])
doc['types'] << 'pui_record'
end
if ['agent_person', 'agent_corporate_entity'].include?(doc['primary_type'])
doc['types'] << 'pui_agent'
end
unless RecordInheritance.has_type?(doc['primary_type'])
# All record types are available to PUI except archival objects, since
# our pui_indexer indexes a specially formatted version of those.
if ['resource'].include?(doc['primary_type'])
doc['types'] << 'pui_collection'
elsif ['classification'].include?(doc['primary_type'])
doc['types'] << 'pui_record_group'
elsif ['agent_person'].include?(doc['primary_type'])
doc['types'] << 'pui_person'
else
doc['types'] << 'pui_' + doc['primary_type']
end
doc['types'] << 'pui'
end
end
# index all top containers for pui
if doc['primary_type'] == 'top_container'
doc['publish'] = record['record']['is_linked_to_published_record']
if doc['publish']
doc['types'] << 'pui_container'
doc['types'] << 'pui'
end
end
end
end
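# A minimal usage sketch (hypothetical; in the application this class is
# normally used via subclasses such as PUIIndexer rather than directly):
#
#   indexer = IndexerCommon.new(AppConfig[:backend_url])
#   indexer.login
#   indexer.index_records(records)   # records: [{'uri' => ..., 'record' => {...}}, ...]
#   indexer.send_commit
#
# Load any plugin-provided indexer extensions.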
ASUtils.find_local_directories('indexer').each do |dir|
Dir.glob(File.join(dir, "*.rb")).sort.each do |file|
require file
end
end