lib/link_preview/parser.rb from socialcast/link_preview

lib/link_preview/parser.rb
Summary

Maintainability

4 hrs
Test Coverage

Issues
# Copyright (c) 2014-2016, VMware, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# of the Software, and to permit persons to whom the Software is furnished to do
# so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

require 'multi_json'
require 'nokogiri'
require 'set'

module LinkPreview
  class Parser
    attr_accessor :discovered_uris

    # Creates a parser.
    #
    # config  - object that is later asked for `ignore_opengraph_video_type_html`.
    # options - handed unchanged to every LinkPreview::URI.parse call made by the parser.
    #
    # The set of discovered URIs starts out empty.
    def initialize(config, options = {})
      self.discovered_uris = Set.new
      @options = options
      @config = config
    end

    # Routes a fetched response to the parser that matches its content type.
    #
    # data - response-like object; its `headers[:content_type]` selects the parser.
    #
    # Returns the result of the selected parse_* method, or {} when the data does
    # not pass valid_data? or the content type is not one of the handled kinds.
    def parse(data)
      return {} unless valid_data?(data)

      content_type = data.headers[:content_type]
      case content_type
      when /image/, 'binary/octet-stream' then parse_image(data)
      when %r{\Atext/html.*} then parse_html(data)
      # XML and JSON responses are both treated as oEmbed payloads.
      when %r{\Atext/xml.*}, %r{\Aapplication/json.*} then parse_oembed(data)
      else
        {}
      end
    end

    private

    # Returns the `ignore_opengraph_video_type_html` setting of the config object.
    def ignore_opengraph_video_type_html?
      config = @config
      config.ignore_opengraph_video_type_html
    end

    # Checks that the response has headers, a body and a :content_type header.
    #
    # Returns a truthy value (the content type itself) when all are present,
    # otherwise the first nil/false encountered.
    def valid_data?(data)
      headers = data && data.headers
      headers && data.body && headers[:content_type]
    end

    # Extracts preview data from an HTML response body.
    #
    # Side effect: every oEmbed discovery link found in the document head is
    # parsed with LinkPreview::URI.parse and added to discovered_uris.
    #
    # Returns a Hash with :opengraph, :opengraph_embed and :html entries. When no
    # document could be built an empty Hash is returned, so the result is a Hash
    # just like every other branch of #parse.
    def parse_html(data)
      doc = Nokogiri::HTML.parse(data.body, nil, 'UTF-8')
      return {} unless doc

      enum_oembed_link(doc) do |link_rel|
        discovered_uris << LinkPreview::URI.parse(link_rel, @options)
      end

      {
        opengraph: parse_opengraph_video_data(doc),
        opengraph_embed: parse_opengraph_embed_data(doc),
        html: parse_html_data(doc)
      }
    end

    # Gathers the plain-HTML preview fields of a document.
    #
    # Returns a Hash with :title, :description and :tags; :tags is always an Array.
    def parse_html_data(doc)
      title = find_title(doc)
      description = find_meta_description(doc)
      tags = Array.wrap(find_rel_tags(doc))
      { title: title, description: description, tags: tags }
    end

    # Describes an image response.
    #
    # Returns { image: {...} } holding the source URL, an IO over the body, the
    # content type header and a file name.
    def parse_image(data)
      image = {}
      image[:image_url] = data.url
      image[:image_data] = parse_image_data(data)
      image[:image_content_type] = data.headers[:content_type]
      image[:image_file_name] = parse_image_file_name(data)
      { image: image }
    end

    # Picks a file name for an image response.
    #
    # The Content-Disposition filename wins when present. Otherwise the last path
    # segment of the URL is used, falling back to the hostname with dots turned
    # into underscores when the path has no segments.
    #
    # Returns nil when there is neither a header filename nor a URL.
    def parse_image_file_name(data)
      from_header = parse_content_disposition_filename(data)
      return from_header if from_header.present?
      return unless data.url

      uri = LinkPreview::URI.parse(data.url, @options)
      last_segment = uri.path.split('/').last
      last_segment || uri.hostname.tr('.', '_')
    end

    # Reads the Open Graph properties shared by the video and embed results.
    #
    # Image values come from the first og:image group; the remaining values are
    # the first matching meta property of each name.
    def parse_opengraph_common_data(doc)
      image = find_meta_property_array(doc, 'og:image').first

      common = {}
      common[:title] = find_meta_property(doc, 'og:title')
      common[:description] = find_meta_property(doc, 'og:description')
      common[:image_url] = image['og:image'] || image['og:image:url']
      common[:image_secure_url] = image['og:image:secure_url']
      common[:tag] = find_meta_property(doc, 'og:tag')
      common[:url] = find_meta_property(doc, 'og:url')
      common[:type] = find_meta_property(doc, 'og:type')
      common[:site_name] = find_meta_property(doc, 'og:site_name')
      common
    end

    # Builds the Open Graph result from the first og:video group whose type is
    # not 'text/html'.
    #
    # A group without any og:video:type also qualifies, because its type (nil)
    # differs from 'text/html'; its video fields are then nil.
    #
    # Returns {} when every group is of type 'text/html'; otherwise the common
    # Open Graph data merged with the video fields.
    def parse_opengraph_video_data(doc)
      video = find_meta_property_array(doc, 'og:video').find do |group|
        group['og:video:type'] != 'text/html'
      end
      return {} unless video

      video_fields = {
        video_url: video['og:video'] || video['og:video:url'],
        video_secure_url: video['og:video:secure_url'],
        video_type: video['og:video:type'],
        video_width: video['og:video:width'],
        video_height: video['og:video:height']
      }
      parse_opengraph_common_data(doc).merge(video_fields)
    end

    # Builds the Open Graph embed result from the first og:video group whose
    # type is 'text/html'.
    #
    # Returns {} when the config asks to ignore 'text/html' videos or when no
    # such group exists; otherwise the common Open Graph data merged with the
    # embed's video fields.
    def parse_opengraph_embed_data(doc)
      return {} if ignore_opengraph_video_type_html?

      embed = find_meta_property_array(doc, 'og:video').detect { |group| group['og:video:type'] == 'text/html' }
      return {} unless embed

      # Prefer the HTTPS URL. og:video is accepted as a last resort because the
      # Open Graph protocol defines og:video:url as identical to og:video, and
      # the non-embed parser already honours both spellings.
      video_url = embed['og:video:secure_url'] || embed['og:video:url'] || embed['og:video']

      parse_opengraph_common_data(doc).merge(
        video_url: video_url,
        video_type: 'text/html',
        video_width: embed['og:video:width'],
        video_height: embed['og:video:height']
      )
    end

    # Wraps a decoded oEmbed payload.
    #
    # Returns { oembed: payload } with a :url entry (the content URL derived from
    # the request URL) merged in, or {} when the payload is not a Hash or has no
    # 'type' entry.
    def parse_oembed(data)
      payload = parse_oembed_data(data)
      if payload.is_a?(Hash) && payload['type']
        { oembed: payload.merge(url: parse_oembed_content_url(data)) }
      else
        {}
      end
    end

    # Decodes an oEmbed response body according to its content type.
    #
    # XML bodies are converted to a Hash and their 'oembed' entry is returned;
    # JSON bodies are loaded with MultiJson. Other content types give nil.
    #
    # Decoding is best effort: any StandardError raised along the way results in nil.
    def parse_oembed_data(data)
      case data.headers[:content_type]
      when /xml/
        xml = Nokogiri::XML.parse(data.body, nil, 'UTF-8').to_s
        Hash.from_xml(xml)['oembed']
      when /json/
        MultiJson.load(data.body)
      end
    rescue StandardError
      nil
    end

    # Converts the response URL into its content URI string via
    # LinkPreview::URI#as_content_uri. Returns nil when the response has no URL.
    def parse_oembed_content_url(data)
      url = data.url
      return unless url

      LinkPreview::URI.parse(url, @options).as_content_uri.to_s
    end

    # Wraps a copy of the response body in a StringIO, so the IO never shares
    # its buffer with the original body. Returns nil when there is no body.
    def parse_image_data(data)
      body = data.body
      return unless body

      StringIO.new(body.dup)
    end

    # Extracts the filename parameter from the Content-Disposition header.
    # see http://www.ietf.org/rfc/rfc1806.txt (and RFC 6266 for its use in HTTP)
    #
    # The value ends at the closing quote of a quoted string, or at the next ';'
    # for an unquoted token, so parameters that follow the filename
    # (e.g. `; size=123`) are not swallowed into the name. The parameter name is
    # matched case-insensitively; surrounding whitespace and quotes are removed.
    #
    # Returns the filename String, or nil when the header is missing or has no
    # filename parameter.
    def parse_content_disposition_filename(data)
      header = data.headers[:'content-disposition']
      return unless header =~ /filename=\s*("[^"]*"|'[^']*'|[^;]*)/i

      Regexp.last_match(1).strip.gsub(/\A['"]+|['"]+\z/, '')
    end

    def enum_oembed_link(doc, &_block)
      doc.search("//head/link[@rel='alternate'][@type='application/json+oembed']", "//head/link[@rel='alternate'][@type='text/xml+oembed']").each do |node|
        next unless node && node.respond_to?(:attributes) && node.attributes['href']
        yield node.attributes['href'].value
      end
    end

    # Returns the text of the document's head/title node, or nil when there is none.
    def find_title(doc)
      title_node = doc.at('head/title')
      title_node.try(:inner_text)
    end

    # Collects the non-blank texts of all anchors marked rel="tag".
    # See http://microformats.org/wiki/rel-tag
    def find_rel_tags(doc)
      tag_texts = doc.search("//a[@rel='tag']").map(&:inner_text)
      tag_texts.reject(&:blank?)
    end

    def enum_meta_pair(doc, key, value)
      Enumerator.new do |e|
        doc.search('head/meta').each do |node|
          next unless matching_meta_pair?(node, key, value)
          e.yield OpenStruct.new(key: node.attributes['property'].value, value: node.attributes['content'].value)
        end
      end
    end

    def matching_meta_pair?(node, key, value)
      return false unless valid_meta_node?(node)
      return false unless node.attributes[key]
      return false unless node.attributes[key].value
      return false unless matching_meta_value?(node, key, value)
      true
    end

    def matching_meta_value?(node, key, value)
      case value
      when String
        node.attributes[key].value.casecmp(value.downcase).zero?
      when Regexp
        node.attributes[key].value =~ value
      end
    end

    def valid_meta_node?(node)
      return false unless node
      return false unless node.respond_to?(:attributes)
      return false unless node.attributes['content']
      return false unless node.attributes['content'].value
      true
    end

    def find_meta_description(doc)
      Enumerator.new do |e|
        doc.search('head/meta[name=description]').each do |node|
          next unless matching_meta_pair?(node, 'name', 'description')
          e.yield node.attributes['content'].value
        end
      end.first
    end

    # Returns the content of the first meta node whose property attribute matches
    # `property`, or nil when no node matches.
    def find_meta_property(doc, property)
      first_pair = enum_meta_pair(doc, 'property', property).first
      first_pair.try(:value)
    end

    # Groups the meta properties starting with `property` into an Array of Hashes
    # (property name => content), one Hash per occurrence of the structured property.
    #
    # A new group begins whenever a delimiter property is met while the current
    # group already holds something. The group being built is always appended at
    # the end, so the result contains at least one (possibly empty) Hash.
    def find_meta_property_array(doc, property)
      prefix = /\A#{Regexp.escape(property)}/
      groups = []
      current = {}

      enum_meta_pair(doc, 'property', prefix).each do |pair|
        if property_array_delimiter?(property, pair) && !current.empty?
          groups << current
          current = {}
        end
        current[pair.key] = pair.value
      end

      groups << current
    end

    # Tells whether the pair opens a new structured-property group: its key is
    # either the property itself or the property's ":url" variant.
    def property_array_delimiter?(property, pair)
      [property, "#{property}:url"].include?(pair.key)
    end
  end
end