# lib/moab/signature_catalog.rb
# frozen_string_literal: true
module Moab
# A digital object's Signature Catalog is derived from a filtered aggregation of the file inventories
# of a digital object's set of versions. (see {#update})
# It has an entry for every file (identified by {FileSignature}) found in any of the versions,
# along with a record of the SDR storage location that was used to preserve a single file instance.
# Once this catalog has been populated, it has multiple uses:
# * The signature index is used to determine which files of a newly submitted object version
# are new additions and which are duplicates of files previously ingested. (See {#version_additions})
# (When a new version contains a mixture of added files and files carried over from the previous version
# we only need to store the files from the new version that have unique file signatures.)
# * Reconstruction of an object version (see {StorageObject#reconstruct_version}) requires a combination
# of a full version's {FileInventory} and the SignatureCatalog.
# * The catalog can also be used for performing consistency checks between manifest files and storage
#
# ====Data Model
# * <b>{SignatureCatalog} = lookup table containing a cumulative collection of all files ever ingested</b>
# * {SignatureCatalogEntry} [1..*] = a row in the lookup table containing storage information about a single file
# * {FileSignature} [1] = file fixity information
#
# @example {include:file:spec/fixtures/derivatives/manifests/v3/signatureCatalog.xml}
# @see StorageObject
# @see Bagger
# @note Copyright (c) 2012 by The Board of Trustees of the Leland Stanford Junior University.
# All rights reserved. See {file:LICENSE.rdoc} for details.
class SignatureCatalog < Serializer::Manifest
include HappyMapper
# The name of the XML element used to serialize this object's data
tag 'signatureCatalog'
# (see Serializable#initialize)
def initialize(opts = {})
  # Start with an empty signature index and an empty entry list, then hand +opts+ to the superclass.
  # NOTE(review): the containers are presumably created before +super+ because the superclass may
  # apply +opts+ through setters such as {#entries=}, which write to both -- confirm in Serializer::Manifest.
  @signature_hash = {}
  @entries = []
  super(opts)
end
# The identifier of the digital object this catalog belongs to.
# Serialized under the name 'objectId', which differs from the Ruby accessor name.
# @attribute
# @return [String] The object ID (druid)
attribute :digital_object_id, String, tag: 'objectId'
# The version this catalog was last updated to; {#update} copies it from the version inventory.
# Serialized under the name 'versionId'; +on_save+ converts the value with +to_s+.
# NOTE(review): the effect of +key: true+ is defined by the serializer -- confirm there.
# @attribute
# @return [Integer] The ordinal version number
attribute :version_id, Integer, tag: 'versionId', key: true, on_save: proc(&:to_s)
# Joins the object id and the version directory name (as produced by
# {StorageObject.version_dirname}) with a hyphen.
# @return [String] The unique identifier concatenating digital object id with version id
def composite_key
  version_dir = StorageObject.version_dirname(@version_id)
  "#{@digital_object_id}-#{version_dir}"
end
# Serialized under the name 'catalogDatetime'. Although declared as Time, reads and writes go
# through the custom accessors defined right after this declaration, which delegate to Moab::UtcTime.
# @attribute
# @return [String] The datetime at which the catalog was updated
attribute :catalog_datetime, Time, tag: 'catalogDatetime'
# Stores the catalog timestamp after passing it through Moab::UtcTime.input.
# NOTE(review): the accepted input types are whatever Moab::UtcTime.input allows -- confirm there.
# @param datetime The timestamp value to store
def catalog_datetime=(datetime)
  normalized = Moab::UtcTime.input(datetime)
  @catalog_datetime = normalized
end

# @return [String] The stored timestamp as rendered by Moab::UtcTime.output
def catalog_datetime
  stored = @catalog_datetime
  Moab::UtcTime.output(stored)
end
# Serialized under the name 'fileCount' (converted with +to_s+ on save).
# The value is not stored: the reader defined right after this declaration counts {#entries}.
# @attribute
# @return [Integer] The total number of data files (dynamically calculated)
attribute :file_count, Integer, tag: 'fileCount', on_save: proc(&:to_s)
# Computed on demand from the entry list rather than read from a stored value.
# @return [Integer] The number of entries currently in {#entries}
def file_count
  entries.length
end
# Serialized under the name 'byteCount' (converted with +to_s+ on save).
# The value is not stored: the reader defined right after this declaration sums the entry sizes.
# @attribute
# @return [Integer] The total size (in bytes) of all data files (dynamically calculated)
attribute :byte_count, Integer, tag: 'byteCount', on_save: proc(&:to_s)
# Adds up the signature size of every catalog entry; each size is coerced with +to_i+,
# so string sizes are parsed and a nil size counts as 0.
# @return [Integer] The total size (in bytes) of all cataloged files
def byte_count
  entries.sum { |catalog_entry| catalog_entry.signature.size.to_i }
end
# Serialized under the name 'blockCount' (converted with +to_s+ on save).
# The value is not stored: the reader defined right after this declaration derives it from the entry sizes.
# @attribute
# @return [Integer] The total disk usage (in 1 kB blocks) of all data files (estimating du -k result) (dynamically calculated)
attribute :block_count, Integer, tag: 'blockCount', on_save: proc(&:to_s)
# Estimates disk usage by rounding each file's size up to a whole number of 1024-byte blocks
# and adding the per-file block counts together.
# @return [Integer] The total number of 1024-byte blocks across all catalog entries
def block_count
  bytes_per_block = 1024
  entries.sum do |catalog_entry|
    whole_blocks, remainder = catalog_entry.signature.size.to_i.divmod(bytes_per_block)
    # A partially filled trailing block still occupies a full block.
    remainder.zero? ? whole_blocks : whole_blocks + 1
  end
end
# Lists the accessor names that make up a summary report, in reporting order.
# @return [Array<String>] The data fields to include in summary reports
def summary_fields
  %w[
    digital_object_id
    version_id
    catalog_datetime
    file_count
    byte_count
    block_count
  ]
end
# The catalog's entry list, serialized under the name 'entry'.
# Assignment goes through the custom writer defined right after this declaration, which adds
# each element via {#add_entry} so that {#signature_hash} is populated as well.
# @attribute
# @return [Array<SignatureCatalogEntry>] The entries held in this catalog
has_many :entries, SignatureCatalogEntry, tag: 'entry'
# Adds every element of +entry_array+ through {#add_entry}, so the signature index is filled
# alongside the entry list. Entries already present are kept; nothing is cleared first.
# @param entry_array [Array<SignatureCatalogEntry>] The entries to add to the catalog
def entries=(entry_array)
  entry_array.each { |catalog_entry| add_entry(catalog_entry) }
end
# Lookup index filled by {#add_entry}. Assigning to it directly replaces the index
# without touching {#entries}.
# @return [Hash] An index having {FileSignature} objects as keys and {SignatureCatalogEntry} objects as values
attr_accessor :signature_hash
# @api internal
# Registers +entry+ in the {#signature_hash} index under its signature and appends it to {#entries}.
# An entry whose signature is already indexed replaces the indexed value but is still appended
# to the entry list.
# @param entry [SignatureCatalogEntry] The new catalog entry
# @return [void]
def add_entry(entry)
  @signature_hash.store(entry.signature, entry)
  entries.push(entry)
end
# Looks the signature up in the {#signature_hash} index and returns the matching entry's storage path.
# @param [FileSignature] file_signature The signature of the file whose path is sought
# @return [String] The object-relative path of the file having the specified signature
# @raise [FileNotFoundException] if the catalog has no entry for the signature
def catalog_filepath(file_signature)
  found = @signature_hash[file_signature]
  return found.storage_path unless found.nil?

  raise FileNotFoundException,
        "catalog entry not found for #{file_signature.fixity.inspect} in #{@digital_object_id} - #{@version_id}"
end
# Inspect and upgrade the group's signature data to include all desired checksums.
# For each file whose signature is not yet complete:
# * if the catalog index has an entry for the signature, the entry's signature is reused;
# * otherwise, if +group_pathname+ was given, the signature is replaced by the result of
#   +normalized_signature+ for the file's first instance path under that directory;
# * otherwise the signature is left as it is.
# @param group [FileGroup] A group of the files from a file inventory
# @param group_pathname [Pathname, String, nil] The location of the directory containing the group's files
# @raise [MoabRuntimeError] if +group_pathname+ is given but does not exist
# @return [void]
def normalize_group_signatures(group, group_pathname = nil)
  unless group_pathname.nil?
    group_pathname = Pathname(group_pathname)
    raise(MoabRuntimeError, "Could not locate #{group_pathname}") unless group_pathname.exist?
  end
  group.files.each do |file|
    next if file.signature.complete?

    # Direct index lookup: #add_entry stores each entry under its own signature, so the entry
    # already carries the catalog's signature and no scan over every catalog key is needed.
    catalog_entry = @signature_hash[file.signature]
    if catalog_entry
      file.signature = catalog_entry.signature
    elsif group_pathname
      file_pathname = group_pathname.join(file.instances[0].path)
      file.signature = file.signature.normalized_signature(file_pathname)
    end
  end
end
# @api external
# Compares the {FileSignature} entries in the new version's {FileInventory} against the signatures
# in this catalog and creates new {SignatureCatalogEntry} additions to the catalog for every file
# whose signature is not yet indexed. Afterwards the catalog's version id is set to the
# inventory's version id and the catalog datetime to the current time.
# @param version_inventory [FileInventory] The complete inventory of the files comprising a digital object version
# @param data_pathname [Pathname, String] The location of the object's data directory
#   (only consulted for files whose signature is not complete)
# @return [void]
# @example {include:file:spec/features/catalog/catalog_update_spec.rb}
def update(version_inventory, data_pathname)
  version_inventory.groups.each do |group|
    group.files.each do |file|
      next if @signature_hash.key?(file.signature)

      entry = SignatureCatalogEntry.new
      entry.version_id = version_inventory.version_id
      entry.group_id = group.group_id
      entry.path = file.instances.first.path
      entry.signature =
        if file.signature.complete?
          file.signature
        else
          # Coerce the same way #normalize_group_signatures does, so a String path is accepted
          # as well as a Pathname. Done lazily: the path is irrelevant when signatures are complete.
          file_pathname = Pathname(data_pathname).join(group.group_id, entry.path)
          file.signature.normalized_signature(file_pathname)
        end
      add_entry(entry)
    end
  end
  @version_id = version_inventory.version_id
  @catalog_datetime = Time.now
end
# @api external
# Builds an 'additions' inventory holding, per group, only the files whose signatures are not
# yet indexed in this catalog. Groups that contribute no new files are left out.
# @param version_inventory (see #update)
# @return [FileInventory] Returns a filtered copy of the input inventory
#   containing only those files that were added in this version
# @example {include:file:spec/features/catalog/version_additions_spec.rb}
def version_additions(version_inventory)
  additions = FileInventory.new(type: 'additions')
  additions.copy_ids(version_inventory)
  version_inventory.groups.each do |group|
    new_files = FileGroup.new(group_id: group.group_id)
    group.files.each do |file|
      next if @signature_hash.key?(file.signature)

      new_files.add_file_instance(file.signature, file.instances[0])
    end
    next if new_files.files.empty?

    additions.groups << new_files
  end
  additions
end
end
end