sul-dlss/moab-versioning

View on GitHub
lib/moab/file_group_difference.rb

Summary

Maintainability
A
3 hrs
Test Coverage
A
100%
# frozen_string_literal: true

module Moab
  # Performs analysis and reports the differences between two matching {FileGroup} objects.
  # The descending elements of the report hold a detailed breakdown of file-level differences, organized by change type.
  # This stanza is a child element of {FileInventoryDifference}, the documentation of which contains a full example.
  #
  # In order to determine the detailed nature of the differences that are present between the two manifests,
  # this algorithm first compares the sets of file signatures present in the groups being compared,
  # then uses the result of that operation for subsequent analysis of filename correspondences.
  #
  # For the first step, a Ruby Hash is extracted from each of the of the two groups, with an array of
  # {FileSignature} object used as hash keys, and the corresponding {FileInstance} arrays as the hash values.
  # The set of keys from the basis hash can be compared against the keys from the other hash using {Array} operators:
  # * <i>matching</i>  = basis_array & other_array
  # * <i>basis_only</i> = basis_array - other_array
  # * <i>other_only</i> = other_array - basis_array
  #
  # For the second step of the comparison, the matching and non-matching sets of hash entries
  # are further categorized as follows:
  # * <i>identical</i> = signature and file path is the same in both basis and other file group
  # * <i>renamed</i> = signature is unchanged, but the path has moved
  # * <i>copyadded</i> = duplicate copy of file was added
  # * <i>copydeleted</i> = duplicate copy of file was deleted
  # * <i>modified</i> = path is same in both groups, but the signature has changed
  # * <i>added</i> = signature and path are only in the other inventor
  # * <i>deleted</i> = signature and path are only in the basis inventory
  #
  # ====Data Model
  # * {FileInventoryDifference} = compares two {FileInventory} instances based on file signatures and pathnames
  #   * <b>{FileGroupDifference} [1..*] = performs analysis and reports differences between two matching {FileGroup} objects</b>
  #     * {FileGroupDifferenceSubset} [1..5] = collects a set of file-level differences of a give change type
  #       * {FileInstanceDifference} [1..*] = contains difference information at the file level
  #         * {FileSignature} [1..2] = contains the file signature(s) of two file instances being compared
  #
  # @note Copyright (c) 2012 by The Board of Trustees of the Leland Stanford Junior University.
  #   All rights reserved.  See {file:LICENSE.rdoc} for details.
  class FileGroupDifference < Serializer::Serializable
    include HappyMapper

    # The name of the XML element used to serialize this objects data
    tag 'fileGroupDifference'

    # @return [Hash<Symbol,FileGroupDifferenceSubset>] A set of containers (one for each change type),
    #    each of which contains a collection of file-level differences having that change type.
    attr_accessor :subset_hash

    # @param change [String] the change type to search for
    # @return [FileGroupDifferenceSubset] Find a specified subset of changes
    def subset(change)
      subset_hash[change.to_sym]
    end

    # (see Serializable#initialize)
    def initialize(opts = {})
      @subset_hash = Hash.new { |hash, key| hash[key] = FileGroupDifferenceSubset.new(change: key.to_s) }
      super(opts)
    end

    # @attribute
    # @return [String] The name of the file group
    attribute :group_id, String, tag: 'groupId', key: true

    # @attribute
    # @return [Integer] the total number of differences found between the two inventories that were
    #   compared  (dynamically calculated)
    attribute :difference_count, Integer, tag: 'differenceCount', on_save: proc(&:to_s)

    def difference_count
      count = 0
      subset_hash.each do |type, subset|
        count += subset.count if type != :identical
      end
      count
    end

    # @attribute
    # @return [Integer] How many files were unchanged
    attribute :identical, Integer, on_save: proc(&:to_s)
    def identical
      subset_hash[:identical].count
    end

    # @attribute
    # @return [Integer] How many duplicate copies of files were added
    attribute :copyadded, Integer, on_save: proc(&:to_s)
    def copyadded
      subset_hash[:copyadded].count
    end

    # @attribute
    # @return [Integer] How many duplicate copies of files were deleted
    attribute :copydeleted, Integer, on_save: proc(&:to_s)
    def copydeleted
      subset_hash[:copydeleted].count
    end

    # @attribute
    # @return [Integer] How many files were renamed
    attribute :renamed, Integer, on_save: proc(&:to_s)
    def renamed
      subset_hash[:renamed].count
    end

    # @attribute
    # @return [Integer] How many files were modified
    attribute :modified, Integer, on_save: proc(&:to_s)
    def modified
      subset_hash[:modified].count
    end

    # @attribute
    # @return [Integer] How many files were added
    attribute :added, Integer, on_save: proc(&:to_s)
    def added
      subset_hash[:added].count
    end

    # @attribute
    # @return [Integer] How many files were deleted
    attribute :deleted, Integer, on_save: proc(&:to_s)
    def deleted
      subset_hash[:deleted].count
    end

    # @attribute
    # @return [Array<FileGroupDifferenceSubset>] A set of Arrays (one for each change type),
    #    each of which contains an collection of file-level differences having that change type.
    has_many :subsets, FileGroupDifferenceSubset, tag: 'subset'

    def subsets
      subset_hash.values
    end

    def subsets=(array)
      return unless array

      array.each { |subset| subset_hash[subset.change.to_sym] = subset }
    end

    # @return [Array<String>] The data fields to include in summary reports
    def summary_fields
      %w[group_id difference_count identical copyadded copydeleted renamed modified deleted added]
    end

    # @api internal
    # @return [FileGroupDifference] Clone just this element for inclusion in a versionMetadata structure
    def summary
      FileGroupDifference.new(
        group_id: group_id,
        identical: identical,
        copyadded: copyadded,
        copydeleted: copydeleted,
        renamed: renamed,
        modified: modified,
        added: added,
        deleted: deleted
      )
    end

    # @api internal
    # @param basis_hash [Hash] The first hash being compared
    # @param other_hash [Hash] The second hash being compared
    # @return [Array] Compare the keys of two hashes and return the intersection
    def matching_keys(basis_hash, other_hash)
      basis_hash.keys & other_hash.keys
    end

    # @api internal
    # @param (see #matching_keys)
    # @return [Array] Compare the keys of two hashes and return the keys unique to the first hash
    def basis_only_keys(basis_hash, other_hash)
      basis_hash.keys - other_hash.keys
    end

    # @api internal
    # @param (see #matching_keys)
    # @return [Array] Compare the keys of two hashes and return the keys unique to the second hash
    def other_only_keys(basis_hash, other_hash)
      other_hash.keys - basis_hash.keys
    end

    # @api internal
    # @param basis_group [FileGroup] The file group that is the basis of the comparison
    # @param other_group [FileGroup] The file group that is compared against the basis group
    # @return [FileGroupDifference] Compare two file groups and return a differences report
    def compare_file_groups(basis_group, other_group)
      @group_id = basis_group.group_id
      compare_matching_signatures(basis_group, other_group)
      compare_non_matching_signatures(basis_group, other_group)
      self
    end

    # @api internal
    # @param (see #compare_file_groups)
    # @return [FileGroupDifference] For signatures that are present in both groups,
    #   report which file instances are identical or renamed
    def compare_matching_signatures(basis_group, other_group)
      matching_signatures = matching_keys(basis_group.signature_hash, other_group.signature_hash)
      tabulate_unchanged_files(matching_signatures, basis_group.signature_hash, other_group.signature_hash)
      tabulate_renamed_files(matching_signatures, basis_group.signature_hash, other_group.signature_hash)
      self
    end

    # @api internal
    # @param (see #compare_file_groups)
    # @return [FileGroupDifference] For signatures that are present in only one or the other group,
    #   report which file instances are modified, deleted, or added
    def compare_non_matching_signatures(basis_group, other_group)
      basis_only_signatures = basis_only_keys(basis_group.signature_hash, other_group.signature_hash)
      other_only_signatures = other_only_keys(basis_group.signature_hash, other_group.signature_hash)
      basis_path_hash = basis_group.path_hash_subset(basis_only_signatures)
      other_path_hash = other_group.path_hash_subset(other_only_signatures)
      tabulate_modified_files(basis_path_hash, other_path_hash)
      tabulate_added_files(basis_path_hash, other_path_hash)
      tabulate_deleted_files(basis_path_hash, other_path_hash)
      self
    end

    # @api internal
    # @param matching_signatures [Array<FileSignature>] The file signature of the file manifestations being compared
    # @param basis_signature_hash [Hash<FileSignature, FileManifestation>]
    #   Signature to file path mapping from the file group that is the basis of the comparison
    # @param other_signature_hash [Hash<FileSignature, FileManifestation>]
    #   Signature to file path mapping from the file group that is the being compared to the basis group
    # @return [FileGroupDifference]
    #   Container for reporting the set of file-level differences of type 'identical'
    def tabulate_unchanged_files(matching_signatures, basis_signature_hash, other_signature_hash)
      matching_signatures.each do |signature|
        basis_paths = basis_signature_hash[signature].paths
        other_paths = other_signature_hash[signature].paths
        matching_paths = basis_paths & other_paths
        matching_paths.each do |path|
          fid = FileInstanceDifference.new(change: 'identical')
          fid.basis_path = path
          fid.other_path = 'same'
          fid.signatures << signature
          subset_hash[:identical].files << fid
        end
      end
      self
    end

    # @api internal
    # @param matching_signatures [Array<FileSignature>] The file signature of the file manifestations being compared
    # @param basis_signature_hash [Hash<FileSignature, FileManifestation>]
    #   Signature to file path mapping from the file group that is the basis of the comparison
    # @param other_signature_hash [Hash<FileSignature, FileManifestation>]
    #   Signature to file path mapping from the file group that is the being compared to the basis group
    # @return [FileGroupDifference]
    #   Container for reporting the set of file-level differences of type 'renamed','copyadded', or 'copydeleted'
    def tabulate_renamed_files(matching_signatures, basis_signature_hash, other_signature_hash)
      matching_signatures.each do |signature|
        basis_paths = basis_signature_hash[signature].paths
        other_paths = other_signature_hash[signature].paths
        basis_only_paths = basis_paths - other_paths
        other_only_paths = other_paths - basis_paths
        maxsize = [basis_only_paths.size, other_only_paths.size].max
        (0..maxsize - 1).each do |n|
          fid = FileInstanceDifference.new
          fid.basis_path = basis_only_paths[n]
          fid.other_path = other_only_paths[n]
          fid.signatures << signature
          if fid.basis_path.nil?
            fid.change = 'copyadded'
            fid.basis_path = basis_paths[0]
          elsif fid.other_path.nil?
            fid.change = 'copydeleted'
          else
            fid.change = 'renamed'
          end
          subset_hash[fid.change.to_sym].files << fid
        end
      end
      self
    end

    # @api internal
    # @param basis_path_hash [Hash<String,FileSignature>]
    #   The file paths and associated signatures for manifestations appearing only in the basis group
    # @param other_path_hash [Hash<String,FileSignature>]
    #   The file paths and associated signatures for manifestations appearing only in the other group
    # @return [FileGroupDifference]
    #   Container for reporting the set of file-level differences of type 'modified'
    def tabulate_modified_files(basis_path_hash, other_path_hash)
      matching_keys(basis_path_hash, other_path_hash).each do |path|
        fid = FileInstanceDifference.new(change: 'modified')
        fid.basis_path = path
        fid.other_path = 'same'
        fid.signatures << basis_path_hash[path]
        fid.signatures << other_path_hash[path]
        subset_hash[:modified].files << fid
      end
      self
    end

    # @api internal
    # @param basis_path_hash [Hash<String,FileSignature>]
    #   The file paths and associated signatures for manifestations appearing only in the basis group
    # @param other_path_hash [Hash<String,FileSignature>]
    #   The file paths and associated signatures for manifestations appearing only in the other group
    # @return [FileGroupDifference]
    #   Container for reporting the set of file-level differences of type 'added'
    def tabulate_added_files(basis_path_hash, other_path_hash)
      other_only_keys(basis_path_hash, other_path_hash).each do |path|
        fid = FileInstanceDifference.new(change: 'added')
        fid.basis_path = ''
        fid.other_path = path
        fid.signatures << other_path_hash[path]
        subset_hash[:added].files << fid
      end
      self
    end

    # @api internal
    # @param basis_path_hash [Hash<String,FileSignature>]
    #   The file paths and associated signatures for manifestations appearing only in the basis group
    # @param other_path_hash [Hash<String,FileSignature>]
    #   The file paths and associated signatures for manifestations appearing only in the other group
    # @return [FileGroupDifference]
    #   Container for reporting the set of file-level differences of type 'deleted'
    def tabulate_deleted_files(basis_path_hash, other_path_hash)
      basis_only_keys(basis_path_hash, other_path_hash).each do |path|
        fid = FileInstanceDifference.new(change: 'deleted')
        fid.basis_path = path
        fid.other_path = ''
        fid.signatures << basis_path_hash[path]
        subset_hash[:deleted].files << fid
      end
      self
    end

    # @return [Hash<Symbol,Array>] Sets of filenames grouped by change type for use in performing file or metadata operations
    def file_deltas
      # The hash to be returned
      deltas = Hash.new { |hash, key| hash[key] = [] }
      # case where other_path is empty or 'same'.  (create array of strings)
      %i[identical modified deleted copydeleted].each do |change|
        deltas[change].concat(subset_hash[change].files.collect(&:basis_path))
      end
      # case where basis_path and other_path are both present.  (create array of arrays)
      %i[copyadded renamed].each do |change|
        deltas[change].concat(subset_hash[change].files.collect { |file| [file.basis_path, file.other_path] })
      end
      # case where basis_path is empty.  (create array of strings)
      [:added].each do |change|
        deltas[change].concat(subset_hash[change].files.collect(&:other_path))
      end
      deltas
    end

    # @param [Array<Array<String>>] filepairs The set of oldname, newname pairs for all files being renamed
    # @return [Boolean] Test whether any of the new names are the same as one of the old names,
    #    such as would be true for insertion of a new file into a page sequence, or a circular rename.
    #    In such a case, return true, indicating that use of intermediate temporary files would be required
    #    when updating a copy of an object's files at a given location.
    def rename_require_temp_files(filepairs)
      # Split the filepairs into two arrays
      oldnames = []
      newnames = []
      filepairs.each do |old, new|
        oldnames << old
        newnames << new
      end
      # Are any of the filenames the same in set of oldnames and set of newnames?
      intersection = oldnames & newnames
      intersection.count > 0
    end

    # @param [Array<Array<String>>] filepairs The set of oldname, newname pairs for all files being renamed
    # @return [Array<Array<String>>] a set of file triples containing oldname, tempname, newname
    def rename_tempfile_triplets(filepairs)
      filepairs.collect { |old, new| [old, new, "#{new}-#{Time.now.strftime('%Y%m%d%H%H%S')}-tmp"] }
    end
  end
end