app/harvesters/smithsonian_harvester.rb from dpla/heidrun

app/harvesters/smithsonian_harvester.rb
Summary

Maintainability

35 mins
Test Coverage

Issues
##
# A harvester implementation that can handle a directory of Smithsonian
# .xml[.gz] files.
#
# A typical invocation is:
#
#   SmithsonianHarvester.new(uri: '/data-files/Smithsonian/')
#
# Note that `:uri` is a local directory in this case.  Any files ending in
# '.xml' or '.xml.gz' (case insensitive) will be harvested.
#
class SmithsonianHarvester
  include Krikri::Harvester

  # Matches file names that END in '.xml' or '.xml.gz' (case insensitive).
  # Anchored with \z so names such as 'records.xml.bak' are not harvested.
  XML_FILE_PATTERN = /\.xml(\.gz)?\z/i

  ##
  # Counting is not supported by this harvester.
  #
  # @raise [NotImplementedError] always
  # @see Krikri::Harvester#count
  def count
    raise NotImplementedError
  end

  ##
  # @return [Enumerator::Lazy] an enumerator of the records targeted by this
  #   harvester; each element is the result of {#build_record}.
  def records
    enumerate_records.lazy.map { |rec| build_record(rec) }
  end

  ##
  # @return [Enumerator::Lazy] an enumerator over the ids for the records
  #   targeted by this harvester.
  def record_ids
    enumerate_records.lazy.map(&:identifier)
  end

  ##
  # Gets a single record with the given identifier. Not supported by this
  # harvester; the argument is ignored.
  #
  # @param _ [#to_s] the identifier of the record to get (unused)
  # @raise [NotImplementedError] always
  def get_record(_)
    raise NotImplementedError
  end

  ##
  # @return [String] the content type for the records generated by this
  #   harvester
  def content_type
    'text/xml'
  end

  private

  ##
  # Opens, in turn, each file directly inside the `uri` directory whose name
  # matches XML_FILE_PATTERN. Files ending in '.gz' are wrapped in a
  # Zlib::GzipReader so the block always sees decompressed XML.
  #
  # @yield [IO, Zlib::GzipReader] an open, readable stream for one XML file.
  #   The stream is only valid for the duration of the block.
  def each_collection
    # Dir.foreach closes the directory handle once iteration finishes,
    # unlike Dir.open(...).each, which leaves it open.
    Dir.foreach(uri) do |path|
      next unless path =~ XML_FILE_PATTERN

      File.open(File.join(uri, path)) do |fh|
        if path.downcase.end_with?('.gz')
          gz = Zlib::GzipReader.new(fh)
          begin
            yield gz
          ensure
            # `finish` releases the zlib stream without closing `fh`; the
            # enclosing File.open block remains responsible for the file.
            gz.finish unless gz.closed?
          end
        else
          yield fh
        end
      end
    end
  end

  ##
  # Streams every `<doc>` element found in the files given by
  # {#each_collection}, using Nokogiri's pull parser (XML::Reader).
  #
  # @return [Enumerator<SmithsonianDoc>] an enumerator of the documents
  #   targeted by this harvester. Callers apply `.lazy` themselves.
  def enumerate_records
    Enumerator.new do |yielder|
      each_collection do |xml|
        Nokogiri::XML::Reader(xml).each do |node|
          # Only element-type 'doc' nodes are turned into documents.
          if node.name == 'doc' &&
             node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
            yielder << SmithsonianDoc.new(node)
          end
        end
      end
    end
  end

  ##
  # Produce an OriginalRecord
  #
  # @param doc [SmithsonianDoc] A Smithsonian record
  # @return [Krikri::OriginalRecord]
  def build_record(doc)
    @record_class.build(mint_id(doc.identifier),
                        doc.source,
                        content_type)
  end

  ##
  # A value object containing the Smithsonian document's identifier and its XML
  # source
  class SmithsonianDoc
    attr_reader :identifier, :source

    ##
    # @param nokogiri_doc [#outer_xml] a reader node positioned on a `<doc>`
    #   element; its serialized XML becomes {#source}.
    def initialize(nokogiri_doc)
      @source = nokogiri_doc.outer_xml
      # The fragment is re-parsed as a standalone document so XPath can be
      # used to find the identifier.
      doc = Nokogiri::XML(@source)

      @identifier = doc.xpath('//descriptiveNonRepeating/record_ID').text
    end
  end
end