# app/harvesters/smithsonian_harvester.rb
##
# A harvester implementation that can handle a directory of Smithsonian
# .xml[.gz] files.
#
# A typical invocation is:
#
# SmithsonianHarvester.new(uri: '/data-files/Smithsonian/')
#
# Note that `:uri` is a local directory in this case. Any files ending in
# '.xml' or '.xml.gz' (case insensitive) will be harvested.
#
class SmithsonianHarvester
  include Krikri::Harvester

  # Matches file names that END in `.xml` or `.xml.gz`, ignoring case. The
  # `\z` anchor keeps names such as `foo.xml.bak` from being harvested.
  XML_FILE_PATTERN = /\.xml(\.gz)?\z/i.freeze
  private_constant :XML_FILE_PATTERN

  ##
  # Counting is not supported by this harvester.
  #
  # @raise [NotImplementedError] always
  # @see Krikri::Harvester#count
  def count
    raise NotImplementedError
  end

  ##
  # @return [Enumerator::Lazy] an enumerator of the records targeted by this
  #   harvester; each element is the result of {#build_record}.
  def records
    enumerate_records.lazy.map { |rec| build_record(rec) }
  end

  ##
  # @return [Enumerator::Lazy] an enumerator over the ids for the records
  #   targeted by this harvester. The ids are the raw identifiers read from
  #   each document (they are not passed through `mint_id`).
  def record_ids
    enumerate_records.lazy.map(&:identifier)
  end

  ##
  # Gets a single record with the given identifier. Not supported by this
  # harvester.
  #
  # @param _ [#to_s] the identifier of the record to get (ignored)
  # @raise [NotImplementedError] always
  def get_record(_)
    raise NotImplementedError
  end

  ##
  # @return [String] the content type for the records generated by this
  #   harvester
  def content_type
    'text/xml'
  end

  private

  ##
  # Opens, in turn, every regular file directly inside the `uri` directory
  # whose name ends in `.xml` or `.xml.gz` (case insensitive) and yields it.
  # Gzipped files are yielded as a decompressing reader. Each handle is
  # closed when the block returns for that file.
  #
  # `uri` is not defined in this class; presumably it is supplied by
  # Krikri::Harvester -- confirm against that module.
  #
  # @yield [IO, Zlib::GzipReader] an open, readable stream of XML
  def each_collection
    # Dir.foreach releases the directory handle itself, unlike a bare
    # Dir.open(...).each.
    Dir.foreach(uri) do |name|
      next unless name =~ XML_FILE_PATTERN

      path = File.join(uri, name)
      # Skip directories (or other non-regular entries) that happen to carry
      # a matching name.
      next unless File.file?(path)

      if name.downcase.end_with?('.gz')
        # The block form closes the gzip reader and the underlying file.
        Zlib::GzipReader.open(path) { |gz| yield gz }
      else
        File.open(path) { |fh| yield fh }
      end
    end
  end

  ##
  # Streams every file given by {#each_collection} through a Nokogiri XML
  # reader and emits one SmithsonianDoc per opening `<doc>` element.
  #
  # @return [Enumerator] a (non-lazy) enumerator of SmithsonianDoc objects;
  #   callers apply `.lazy` themselves.
  def enumerate_records
    Enumerator.new do |yielder|
      each_collection do |xml|
        Nokogiri::XML::Reader(xml).each do |node|
          # The reader reports the same name again for the closing tag, so
          # only element-start nodes are taken.
          next unless node.name == 'doc' &&
                      node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT

          yielder << SmithsonianDoc.new(node)
        end
      end
    end
  end

  ##
  # Produce an OriginalRecord
  #
  # `@record_class` and `mint_id` are not defined in this class; presumably
  # both come from Krikri::Harvester -- confirm against that module.
  #
  # @param doc [SmithsonianDoc] A Smithsonian record
  # @return [Krikri::OriginalRecord]
  def build_record(doc)
    @record_class.build(mint_id(doc.identifier),
                        doc.source,
                        content_type)
  end

  ##
  # A value object containing the Smithsonian document's identifier and its XML
  # source.
  #
  # Note: `private` above applies to methods only, so this constant is still
  # reachable as SmithsonianHarvester::SmithsonianDoc.
  class SmithsonianDoc
    attr_reader :identifier, :source

    ##
    # @param nokogiri_doc [#outer_xml] the reader node positioned on a
    #   `<doc>` element; its outer XML becomes `source`.
    def initialize(nokogiri_doc)
      @source = nokogiri_doc.outer_xml
      # Re-parse the fragment so it can be queried with XPath.
      doc = Nokogiri::XML(@source)
      @identifier = doc.xpath('//descriptiveNonRepeating/record_ID').text
    end
  end
end