# lib/relaton_nist/data_fetcher.rb
# frozen_string_literal: true
require "fileutils"
require "open-uri"
require "set"
require "yaml"
module RelatonNist
  #
  # Downloads the NIST Tech Pubs record dump, converts every record into a
  # bibliographic item and writes one file per document into an output
  # directory, registering each document ID and file path in a Relaton index.
  #
  class DataFetcher
    URL = "https://raw.githubusercontent.com/usnistgov/NIST-Tech-Pubs/nist-pages/xml/allrecords.xml"

    #
    # @param output [String] directory to save the documents to
    # @param format [String] format to save the documents in (yaml, xml, bibxml)
    #
    def initialize(output, format)
      @output = output
      @format = format
      # A leading "bib" is dropped, so "bibxml" documents get an ".xml" extension.
      @ext = format.sub(/^bib/, "")
      # Paths written so far; a Set keeps the per-document duplicate check O(1).
      @files = Set.new
    end

    #
    # Index that maps document IDs to the files written for them (memoized).
    #
    # @return [Object] whatever Relaton::Index.find_or_create returns
    #
    def index
      @index ||= Relaton::Index.find_or_create :nist, file: "index-v1.yaml"
    end

    #
    # Content of the series.yaml file located next to this source file
    # (memoized). It is handed to TechPubsParser for every record.
    #
    # @return [Object] parsed YAML (presumably a Hash of series; verify against series.yaml)
    #
    def series
      @series ||= YAML.load_file File.expand_path("series.yaml", __dir__)
    end

    #
    # Save document
    #
    # The file name is derived from the first document identifier: slashes,
    # whitespace, colons and dots become underscores, the result is upcased and
    # a leading "NIST_IR" is collapsed to "NISTIR". When two documents map to
    # the same file a warning is printed and the later document overwrites the
    # earlier one.
    #
    # @param bib [RelatonNist::NistBibliographicItem]
    #
    def write_file(bib)
      docid = bib.docidentifier[0].id
      name = docid.gsub(%r{[/\s:.]}, "_").upcase.sub(/^NIST_IR/, "NISTIR")
      file = File.join(@output, "#{name}.#{@ext}")
      # Set#add? returns nil when the path has already been recorded.
      warn "File #{file} exists. Docid: #{docid}" unless @files.add?(file)
      index.add_or_update docid, file
      File.write file, output(bib), encoding: "UTF-8"
    end

    #
    # Serialize a document in the format given to the constructor.
    #
    # @param bib [RelatonNist::NistBibliographicItem]
    #
    # @return [String] serialized document
    #
    def output(bib)
      case @format
      when "yaml" then bib.to_hash.to_yaml
      when "xml" then bib.to_xml bibdata: true
      else bib.send "to_#{@format}"
      end
    end

    #
    # Fetch all the documents from the dataset
    #
    # Creates the output directory, removes files with the target extension
    # left there by earlier runs, writes the Tech Pubs records and the static
    # documents, then saves the index. Start and stop times are printed to
    # stdout. Any StandardError is reported on stderr and not re-raised, in
    # which case the index is not saved.
    #
    def fetch # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
      t1 = Time.now
      puts "Started at: #{t1}"
      FileUtils.mkdir_p @output
      FileUtils.rm Dir[File.join(@output, "*.#{@ext}")]
      fetch_tech_pubs
      add_static_files
      index.save
      t2 = Time.now
      puts "Stopped at: #{t2}"
      puts "Done in: #{(t2 - t1).round} sec."
    rescue StandardError => e
      warn e.message
      # Array() guards against exceptions that carry no backtrace.
      warn Array(e.backtrace).first(6).join("\n")
    end

    #
    # Download the record dump from URL and write a file for every
    # report-paper metadata element found in it.
    #
    def fetch_tech_pubs
      # Block form so the downloaded stream (open-uri may back it with a
      # Tempfile) is closed as soon as the XML has been parsed.
      docs = OpenURI.open_uri(URL) { |io| Nokogiri::XML(io) }
      docs.xpath("/body/query/doi_record/report-paper/report-paper_metadata")
          .each { |doc| write_file TechPubsParser.parse(doc, series) }
    end

    #
    # Write a file for every YAML document in ./static, resolved against the
    # current working directory.
    #
    def add_static_files
      Dir["./static/*.yaml"].each do |file|
        hash = YAML.load_file file
        write_file RelatonNist::NistBibliographicItem.from_hash(hash)
      end
    end

    #
    # Fetch all the documents from the dataset
    #
    # @param [String] output folder name to save the documents
    # @param [String] format format to save the documents (yaml, xml, bibxml)
    #
    def self.fetch(output: "data", format: "yaml")
      new(output, format).fetch
    end
  end
end