sul-dlss/was_robot_suite

View on GitHub
lib/dor/was_crawl/cdxj_generator_service.rb

Summary

Maintainability
A
0 mins
Test Coverage
A
100%
# frozen_string_literal: true

module Dor
  module WasCrawl
    class CdxjGeneratorService
      def initialize(collection_path, druid_id)
        @druid_id = druid_id
        @collection_path = collection_path
      end

      def generate(warc_file_list)
        FileUtils.mkdir_p druid_dir
        warc_file_list.each do |warc_file_name|
          generate_cdx_for_one_warc(warc_file_name)
        end
      end

      private

      def druid_base_directory
        DruidTools::AccessDruid.new(@druid_id, @collection_path).path
      end

      def druid_dir
        @druid_dir ||= "#{Settings.cdxj_indexer.working_directory}/#{@druid_id}"
      end

      def cdx_file_name(warc_file_name)
        file_name = File.basename(warc_file_name, '.*')
        "#{druid_dir}/#{file_name}.cdxj"
      end

      def generate_cdx_for_one_warc(warc_file_name)
        cdx_file_path  = cdx_file_name(warc_file_name)
        warc_file_path = "#{druid_base_directory}/#{warc_file_name}"
        cmd_string = "TMPDIR=#{Settings.cdxj_indexer.tmpdir} " \
                     "#{Settings.cdxj_indexer.bin} #{warc_file_path} --output #{cdx_file_path} --dir-root #{Settings.was_crawl_dissemination.stacks_collections_path} --post-append 2>> log/cdx_indexer.log"
        Dor::WasCrawl::Dissemination::Utilities.run_sys_cmd(cmd_string, 'extracting CDXJ')
      end
    end
  end
end