app/models/media_file_modules/meta_data_extraction.rb

Summary

Maintainability
A
1 hr
Test Coverage
# Note: git blame, most if not all of this was copied over
#
# this module is used to extract meta_data from an imported
# file and set the meta_data attribute for the media_file
#
# this code has (almost) nothing to do with the MetaData model related
# to a MediaResource
#
# there is a similarly named module used with media_entry_incomplete to
# set actual MetaData
#
module MediaFileModules
  module MetaDataExtraction
    extend ActiveSupport::Concern
    # No class-level methods are defined yet; the module body is empty.
    # NOTE(review): presumably kept as the ActiveSupport::Concern hook for
    # future class methods -- confirm before removing.
    module ClassMethods
    end

    # Extracts meta data from the stored file and persists it.
    #
    # The importer is selected by matching +content_type+ against a fixed
    # list of patterns (the first match wins) and is handed
    # +file_storage_location+. Content types matching none of the patterns
    # are not processed (TODO implement other content_types). In every case
    # the current +meta_data+ is written back through +update_attributes+,
    # whose result is returned.
    def import_meta_data
      importer =
        case content_type
        when /image/ then :import_image_metadata
        when /video/ then :import_audio_video_metadata
        when /audio/ then :import_audio_metadata
        when /application/ then :import_document_metadata
        end

      send(importer, file_storage_location) if importer
      update_attributes(:meta_data => meta_data)
    end


    # Returns the meta data as an Array of [key, value] pairs with binary
    # entries removed.
    #
    # An entry counts as binary when the YAML dump of its value contains
    # "!binary |" or "Binary data". Keys and values of the remaining entries
    # are converted to Strings tagged as UTF-8 (copies; the stored meta data
    # is not modified).
    #
    # Returns an empty Array when no meta data has been set yet.
    def meta_data_without_binary
      binary_markers = ["!binary |", "Binary data"]

      printable = (meta_data || {}).reject do |_key, value|
        # Dump once per value instead of once per marker.
        dumped = value.to_yaml
        binary_markers.any? { |marker| dumped.include?(marker) }
      end

      printable.map do |pair|
        pair.map { |item| item.to_s.dup.force_encoding('utf-8') }
      end
    end


    # Extracts EXIF-style meta data and the pixel dimensions of an image.
    #
    # Tags returned by +Exiftool.parse_metadata+ for the listed tag groups are
    # merged into +meta_data+, skipping every tag whose name contains one of
    # the ignored fragments. Width and height are read with ImageMagick's
    # +identify+ and stored through +update_attributes+.
    #
    # full_path_file - path of the image file to inspect.
    #
    # Raises RuntimeError ("ImageMagick: No sizes found!") when +identify+
    # reports no dimensions.
    def import_image_metadata(full_path_file)
      self.meta_data ||= {}
      group_tags = ['File:', 'Composite:', 'IFD', 'ICC-','ICC_Profile','XMP-exif', 'XMP-xmpMM', 'XMP-aux', 'XMP-tiff', 'Photoshop:', 'ExifIFD:', 'JFIF', 'IFF:', 'GPS:', 'PNG:' ] #'System:' leaks system info
      ignore_fields = ['UserComment','ImageDescription', 'ProfileCopyright', 'System:']
      exif_hash = {}

      Exiftool.parse_metadata(full_path_file, group_tags).each do |tag_array_entry|
        tag_array_entry.each do |tag, value|
          exif_hash[tag] = value unless ignore_fields.any? { |field| tag.include?(field) }
        end
      end
      meta_data.merge!(exif_hash)

      # Use ImageMagick to get dimensions; exiftool turned out to be unreliable.
      # The command is passed as an argument list, so no shell is involved and
      # paths containing spaces or shell metacharacters are handled safely.
      # HACK: the exit status is deliberately ignored, working around a bug
      # where ImageMagick exits with an error even if only warnings were
      # issued (issue reported upstream).
      # identify prints one record per frame with no separator of its own;
      # the trailing '\n' in the format keeps the frames of multi-frame images
      # apart so that the first frame's size can be picked.
      output = IO.popen(['identify', '-format', '%w %h\n', full_path_file.to_s], &:read)
      img_x, img_y = output.to_s.lines.first.to_s.split
      raise "ImageMagick: No sizes found!" unless img_x && img_y

      update_attributes(:width => img_x, :height => img_y)
    end

    # Extracts meta data and the frame dimensions of a video file.
    #
    # Tags returned by +Exiftool.parse_metadata+ for the listed tag groups
    # (including "Track1:" to "Track10:") are merged into +meta_data+,
    # skipping every tag whose name contains one of the ignored fragments.
    # The dimensions come from the "Composite:ImageSize" tag when present and
    # from +get_sizes_from_ffmpeg+ otherwise; they are stored through
    # +update_attributes+.
    #
    # full_path_file - path of the media file to inspect.
    def import_audio_video_metadata(full_path_file)
      # TODO: merge with import_image_metadata, make fields configurable in :options hash
      self.meta_data ||= {}

      # The Tracks: entries describe video, audio or subtitle tracks present in the container. We extract 10
      # because we think no one would be mad enough to have more.
      # Note: this must iterate the range itself; [1..10] is a one-element
      # array and would yield the single bogus tag "Track1..10:".
      tracks = (1..10).map { |n| "Track#{n}:" }

      group_tags = ['File:', 'Composite:', 'IFD', 'ICC-','ICC_Profile','XMP-exif', 'XMP-xmpMM', 'XMP-aux', 'XMP-tiff', 'Photoshop:', 'ExifIFD:', 'JFIF', 'IFF:', 'GPS:', 'PNG:', 'QuickTime:'] + tracks #'System:' leaks system info
      ignore_fields = ['UserComment','ImageDescription', 'ProfileCopyright', 'System:']
      exif_hash = {}

      Exiftool.parse_metadata(full_path_file, group_tags).each do |tag_array_entry|
        tag_array_entry.each do |tag, value|
          exif_hash[tag] = value unless ignore_fields.any? { |field| tag.include?(field) }
        end
      end
      meta_data.merge!(exif_hash)

      image_size = exif_hash["Composite:ImageSize"]
      img_x, img_y =
        if image_size.nil?
          # Exiftool couldn't get the dimensions. Let's try with ffmpeg
          get_sizes_from_ffmpeg(full_path_file)
        else
          image_size.split("x")
        end
      update_attributes(:width => img_x, :height => img_y)
    end

    # Reads the frame size of the first video stream described by
    # +ffmpeg -i+.
    #
    # path - path of the media file to probe.
    #
    # Returns [width, height] as Strings, or [nil, nil] when the ffmpeg
    # output has no video stream line carrying a WIDTHxHEIGHT token.
    def get_sizes_from_ffmpeg(path)
      # The command is passed as an argument list, so no shell is involved and
      # paths containing spaces or shell metacharacters are handled safely.
      # ffmpeg writes the stream description to stderr, hence the redirect
      # into the captured output.
      out = IO.popen(['ffmpeg', '-i', path.to_s], :err => [:child, :out], &:read)

      out.to_s.each_line do |line|
        next unless line =~ /Stream.*Video/

        # Both numbers must start with a non-zero digit so that a codec tag
        # such as "0x31637661" is not mistaken for the frame size.
        size = line.match(/\b([1-9]\d*)x([1-9]\d*)\b/)
        return [size[1], size[2]] if size
      end

      [nil, nil]
    end

    # Placeholder for audio meta data extraction. The body is empty, so a
    # call has no effect and returns nil.
    #
    # full_path_file - path of the audio file (currently unused).
    def import_audio_metadata(full_path_file)

      # TODO refactor to use ffmpeg, some id3 tag extractor, etc.
    end

    # Placeholder for document meta data extraction. The body is empty, so a
    # call has no effect and returns nil.
    #
    # full_path_file - path of the document file (currently unused).
    def import_document_metadata(full_path_file)
      #TODO - specifically for other non-zipped documents (e.g. source code, application binary, etc)
    end

  end
end