QutBioacoustics/baw-server

View on GitHub
lib/gems/baw-workers/lib/baw_workers/jobs/audio_check/csv_helper.rb

Summary

Maintainability
D
1 day
Test Coverage
# frozen_string_literal: true

require 'csv'

# Postgresql query to export csv file
# COPY (SELECT id, uuid, recorded_date || 'Z' as recorded_date, duration_seconds, sample_rate_hertz, channels,bit_rate_bps, media_type, data_length_bytes, file_hash, original_file_name FROM "audio_recordings") TO '/tmp/staging_audio_recordings.csv' DELIMITER ',' CSV HEADER;

module BawWorkers
  module Jobs
    module AudioCheck
      class CsvHelper
        class << self
          def logged_csv_line(file_path, exists, moved_path = nil,
                              compare_hash = nil, api_result_hash = nil,
                              api_response = nil, review_level = :none_all_good,
                              audio_recording_id)
            csv_headers = [
              :file_path, :exists,

              :moved_path,
              :errors,

              :check_new_file_name, :check_file_errors,

              :check_file_hash, :check_extension, :check_media_type,
              :check_sample_rate_hertz, :check_channels, :check_bit_rate_bps,
              :check_data_length_bytes, :check_duration_seconds,

              :expected_file_hash, :expected_extension, :expected_media_type,
              :expected_sample_rate_hertz, :expected_channels, :expected_bit_rate_bps,
              :expected_data_length_bytes, :expected_duration_seconds,

              :actual_file_hash, :actual_extension, :actual_media_type,
              :actual_sample_rate_hertz, :actual_channels, :actual_bit_rate_bps,
              :actual_data_length_bytes, :actual_duration_seconds,

              :api_file_hash, :api_media_type,
              :api_sample_rate_hertz, :api_channels, :api_bit_rate_bps,
              :api_data_length_bytes, :api_duration_seconds,

              :api_response,

              :review_level,

              :audio_recording_id
            ]

            csv_values = []

            # file path and exists must always be available
            csv_values[0] = file_path
            csv_values[1] = exists

            # add moved path - this might be nil if the file wasn't moved
            csv_values[2] = moved_path unless moved_path.nil?

            # add all the info from comparison hash if it is available
            unless compare_hash.blank?
              csv_values[3] = compare_hash[:actual][:errors]

              csv_values[4] = compare_hash[:checks][:new_file_name]
              csv_values[5] = compare_hash[:checks][:file_errors]

              csv_values[6] = compare_hash[:checks][:file_hash]
              csv_values[7] = compare_hash[:checks][:extension]
              csv_values[8] = compare_hash[:checks][:media_type]
              csv_values[9] = compare_hash[:checks][:sample_rate_hertz]
              csv_values[10] = compare_hash[:checks][:channels]
              csv_values[11] = compare_hash[:checks][:bit_rate_bps]
              csv_values[12] = compare_hash[:checks][:data_length_bytes]
              csv_values[13] = compare_hash[:checks][:duration_seconds]

              csv_values[14] = compare_hash[:expected][:file_hash]
              csv_values[15] = compare_hash[:expected][:extension]
              csv_values[16] = compare_hash[:expected][:media_type]
              csv_values[17] = compare_hash[:expected][:sample_rate_hertz]
              csv_values[18] = compare_hash[:expected][:channels]
              csv_values[19] = compare_hash[:expected][:bit_rate_bps]
              csv_values[20] = compare_hash[:expected][:data_length_bytes]
              csv_values[21] = compare_hash[:expected][:duration_seconds]

              csv_values[22] = compare_hash[:actual][:file_hash]
              csv_values[23] = compare_hash[:actual][:extension]
              csv_values[24] = compare_hash[:actual][:media_type]
              csv_values[25] = compare_hash[:actual][:sample_rate_hertz]
              csv_values[26] = compare_hash[:actual][:channels]
              csv_values[27] = compare_hash[:actual][:bit_rate_bps]
              csv_values[28] = compare_hash[:actual][:data_length_bytes]
              csv_values[29] = compare_hash[:actual][:duration_seconds]
            end

            # add values from api results
            api_result_hash_blank = api_result_hash.blank?

            csv_values[30] =
              if api_result_hash_blank
                :notsent
              else
                api_result_hash.include?(:file_hash) ? :updated : :noaction
              end
            csv_values[31] =
              if api_result_hash_blank
                :notsent
              else
                api_result_hash.include?(:media_type) ? :updated : :noaction
              end
            csv_values[32] =
              if api_result_hash_blank
                :notsent
              else
                api_result_hash.include?(:sample_rate_hertz) ? :updated : :noaction
              end
            csv_values[33] = if api_result_hash_blank
                               :notsent
                             else
                               api_result_hash.include?(:channels) ? :updated : :noaction
                             end
            csv_values[34] =
              if api_result_hash_blank
                :notsent
              else
                api_result_hash.include?(:bit_rate_bps) ? :updated : :noaction
              end
            csv_values[35] =
              if api_result_hash_blank
                :notsent
              else
                api_result_hash.include?(:data_length_bytes) ? :updated : :noaction
              end
            csv_values[36] =
              if api_result_hash_blank
                :notsent
              else
                api_result_hash.include?(:duration_seconds) ? :updated : :noaction
              end

            # record response from api request
            api_response_value = api_response.nil? ? :invalid : api_response
            csv_values[37] = api_response_value

            csv_values[38] = review_level
            csv_values[39] = audio_recording_id

            {
              headers: csv_headers,
              values: csv_values
            }
          end

          def read_audio_file_hash_csv(csv_file)
            index_to_key_map = {
              file_hash: 0,
              uuid: 1
            }

            # load csv file
            CSV.foreach(csv_file, { headers: false, return_headers: false }) do |row|
              audio_params = {}

              # get values from row, put into hash that matches what check action expects
              audio_params = { file_hash: row[0].strip, uuid: row[1].strip } if row && !row[0].blank? && !row[1].blank?

              # provide the audio parameters to yield
              yield audio_params if block_given?
            end
          end

          def write_audio_recordings_csv(original_csv, hash_csv, result_csv)
            audio_info = {}

            BawWorkers::ReadCsv.read_audio_recording_csv(original_csv) do |audio_params|
              audio_info[audio_params[:uuid]] = audio_params
              print '.'
            end

            BawWorkers::Jobs::AudioCheck::CsvHelper.read_audio_file_hash_csv(hash_csv) do |hash_params|
              new_hash = hash_params[:file_hash]
              new_uuid = hash_params[:uuid]
              prefix = 'SHA256::'

              if audio_info.include?(new_uuid)
                file_hash = audio_info[new_uuid][:file_hash]
                if file_hash.blank? || file_hash == prefix
                  audio_info[new_uuid][:file_hash] = "#{prefix}#{new_hash}"
                  print ','
                else
                  print ' '
                end
              else
                audio_info[new_uuid] = {}
                audio_info[new_uuid][:file_hash] = "#{prefix}#{new_hash}"
                audio_info[new_uuid][:uuid] = new_uuid
                print ';'
              end
            end

            # write to csv
            csv_options = { col_sep: ',', force_quotes: true }

            index_to_key_map = {
              id: 0,
              uuid: 1,
              recorded_date: 2,
              duration_seconds: 3,
              sample_rate_hertz: 4,
              channels: 5,
              bit_rate_bps: 6,
              media_type: 7,
              data_length_bytes: 8,
              file_hash: 9,
              original_file_name: 10
            }

            column_order = []
            index_to_key_map.each do |key, value|
              column_order[value] = key
            end

            CSV.open(result_csv, 'w', csv_options) do |writer|
              audio_info.each do |_key, value|
                row_values = []
                column_order.each do |attr|
                  row_values.push(value[attr])
                end

                writer << row_values
              end
            end
          end

          # extract the CSV log lines from a log file.
          def extract_csv_logs(read_path, write_path)
            # for lines that start with a datestamp (format: 2015-04-12T23:06:48.295+0000) and log info
            # keep only if the row then contains '[CSV], '.

            line_start_regexp = /\A\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}.\d{3}\+\d{4} \[[^\]]+\] \[CSV-data\], /

            File.open(write_path, 'a') do |dest|
              File.open(read_path).each_line do |line|
                next unless line.match(line_start_regexp)

                dest.puts line
              end
            end
          end

          # Compare csv file and files.
          # @return [Hash]
          def compare_csv_db(csv_file)
            original_audio = BawWorkers::Config.original_audio_helper

            # gather all existing files
            files = []
            original_audio.existing_files do |file|
              file_name = File.basename(file).downcase
              files.push(file_name)
            end

            intersection = []
            # actually copy the array without keeping any references
            files_without_db_entry = Marshal.load(Marshal.dump(files))
            db_entries_without_file = []

            # compare with details in db
            BawWorkers::ReadCsv.read_audio_recording_csv(csv_file) do |audio_params|
              opts =
                {
                  uuid: audio_params[:uuid],
                  datetime_with_offset: BawWorkers::Validation.normalise_datetime(audio_params[:recorded_date]),
                  original_format: audio_params[:original_format]
                }
              utc_file_name = original_audio.file_names(opts)[1].downcase

              match_result = files & [utc_file_name]
              is_match = match_result.any?

              # add file names that exist on disk to intersection if one or more exist
              intersection.push(*match_result) if is_match

              # remove file names from files_without_db_entry if there is a match on disk
              match_result.each do |match|
                files_without_db_entry.delete(match)
              end

              # add utc file name to db_entries_without_file if no file matches on disk
              db_entries_without_file.push(utc_file_name) unless is_match
            end

            name = 'baw:worker:audio_check:standalone:compare'

            BawWorkers::Config.logger_worker.warn(name) do
              "Exact intersection (only utc file names are included) (#{intersection.size}): #{intersection.join(', ')}"
            end

            BawWorkers::Config.logger_worker.warn(name) do
              "Existing files without db entry (only utc file names are included) (#{files_without_db_entry.size}): #{files_without_db_entry.join(', ')}"
            end

            # BawWorkers::Config.logger_worker.warn(name) {
            #   "Db entries with no files (only utc file names are included) (#{db_entries_without_file.size}): #{db_entries_without_file.join(', ')}"
            # }

            {
              intersection: intersection,
              files_without_db_entry: files_without_db_entry,
              db_entries_without_file: db_entries_without_file
            }
          end
        end
      end
    end
  end
end