socialcast/link_preview

View on GitHub
lib/link_preview/content.rb

Summary

Maintainability
C
1 day
Test Coverage
# Copyright (c) 2014-2016, VMware, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# of the Software, and to permit persons to whom the Software is furnished to do
# so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

require 'link_preview/uri'
require 'link_preview/parser'
require 'link_preview/http_crawler'
require 'link_preview/null_crawler'

require 'active_support/core_ext/object'

module LinkPreview
  class Content
    PROPERTIES = [
      :title,
      :description,
      :site_name,
      :site_url,
      :image_url,
      :image_data,
      :image_content_type,
      :image_file_name,
      :content_url,
      :content_type,
      :content_width,
      :content_height
    ].freeze

    SOURCES = [:initial, :image, :oembed, :opengraph_embed, :opengraph, :html].freeze

    SOURCE_PROPERTIES_TABLE =
      {
        oembed: {
          site_name: :provider_name,
          site_url: :provider_url,
          image_url: :thumbnail_url
        },
        opengraph: {
          image_url: [:image_secure_url, :image_url],
          content_url: [:video_secure_url, :video_url],
          content_type: :video_type,
          content_width: :video_width,
          content_height: :video_height
        },
        opengraph_embed: {
          image_url: [:image_secure_url, :image_url],
          content_url: [:video_secure_url, :video_url],
          content_type: :video_type,
          content_width: :video_width,
          content_height: :video_height
        }
      }.freeze

    PROPERTIES_SOURCE_TABLE =
      Hash.new { |h, k| h[k] = {} }.tap do |reverse_property_table|
        SOURCE_PROPERTIES_TABLE.each do |source, table|
          table.invert.each_pair do |keys, val|
            Array.wrap(keys).each do |key|
              reverse_property_table[source][key] = val
            end
          end
        end
      end

    def initialize(config, content_uri, options = {}, sources = {})
      @config = config
      @content_uri = content_uri
      @options = options
      @sources = Hash.new { |h, k| h[k] = {} }
      crawler.enqueue!(@content_uri)

      add_source_properties!(sources)
    end

    # @return [String] permalink URL of resource
    def url
      extract(:url) || @content_uri
    end

    PROPERTIES.each do |property|
      define_method(property) do
        extract(property)
      end
    end

    # @return [Boolean] true of at least related content URI has been successfully fetched
    def found?
      extract_all
      crawler.success?
    end

    # @return [Boolean] true of at least one content property is present
    def empty?
      extract_all
      SOURCES.none? do |source|
        @sources[source].any?(&:present?)
      end
    end

    attr_reader :sources

    def as_oembed
      if content_type_embed? || content_type_iframe? || content_type_video? || content_type_flash?
        @sources[:oembed].reverse_merge(as_oembed_video)
      else
        @sources[:oembed].reverse_merge(as_oembed_link)
      end
    end

    protected

    def crawler
      @crawler ||= crawler_class.new(@config, @options)
    end

    def parser
      @parser ||= LinkPreview::Parser.new(@config, @options)
    end

    def parsed_url
      LinkPreview::URI.parse(url, @options)
    end

    def default_property(property)
      send("default_#{property}") if respond_to?("default_#{property}", true)
    end

    # called via default_property
    def default_title
      parsed_url.for_display.to_s
    end

    # called via default_property
    def default_site_name
      parsed_url.host
    end

    # called via default_property
    def default_site_url
      return unless parsed_url.scheme && parsed_url.host
      "#{parsed_url.scheme}://#{parsed_url.host}"
    end

    def normalize_property(property, value)
      if respond_to?("normalize_#{property}", true)
        send("normalize_#{property}", value)
      else
        normalize_generic(property, value)
      end
    end

    def normalize_generic(property, value)
      case value
      when String
        strip_html(value.strip)
      when Array
        value.compact.map { |elem| normalize_property(property, elem) }
      else
        value
      end
    end

    # called via normalize_property
    def normalize_image_url(partial_image_url)
      return unless partial_image_url
      parsed_partial_image_url = LinkPreview::URI.parse(partial_image_url, @options)
      parsed_absolute_image_url = parsed_partial_image_url.to_absolute(@content_uri)
      parsed_absolute_image_url.to_s.tap do |absolute_image_url|
        crawler.enqueue!(absolute_image_url, :image)
      end
    end

    # called via normalize_property
    def normalize_url(partial_url)
      return unless partial_url
      partial_unencoded_url = LinkPreview::URI.unescape(partial_url)
      parsed_partial_url = LinkPreview::URI.parse(partial_unencoded_url, @options)
      parsed_absolute_url = parsed_partial_url.to_absolute(@content_uri)
      crawler.enqueue!(parsed_absolute_url, :html)
      parsed_absolute_url.for_display.to_s
    end

    # called via normalize_property
    def normalize_content_url(content_url)
      return unless content_url
      LinkPreview::URI.safe_escape(content_url).to_s
    end

    # called via normalize_property
    def normalize_title(title)
      CGI.unescapeHTML(title)
    end

    # called via normalize_property
    def normalize_html(html)
      html
    end

    def get_property(property)
      SOURCES.map do |source|
        @sources[source][property_alias(source, property)]
      end.compact.first || default_property(property)
    end

    def property?(property)
      SOURCES.map do |source|
        @sources[source][property_alias(source, property)]
      end.any?(&:present?)
    end

    def property_alias(source, property)
      property_aliases(source, property).detect { |p| @sources[source].key?(p) }
    end

    def property_aliases(source, property)
      Array.wrap(SOURCE_PROPERTIES_TABLE.fetch(source, {}).fetch(property, property))
    end

    def property_unalias(source, property)
      PROPERTIES_SOURCE_TABLE.fetch(source, {}).fetch(property, property)
    end

    def property_source_priority(property)
      case property
      when :description
        [:html, :oembed, :opengraph_oembed, :opengraph, :default]
      when :image_data, :image_content_type, :image_file_name
        [:image, :oembed, :opengraph_oembed, :opengraph, :default]
      else
        [:oembed, :opengraph_oembed, :opengraph, :html, :image, :default]
      end
    end

    def add_source_properties!(sources)
      sources.symbolize_keys!
      sources.reject! { |_, properties| properties.empty? }
      sources.select! { |source, _| SOURCES.include?(source) }
      sources.each do |source, properties|
        properties.symbolize_keys!
        properties.reject! { |_, value| value.blank? }
        prioritized_properties(source, properties).each do |property, value|
          next if @sources[source][property]
          @sources[source][property] = normalize_property(property_unalias(source, property), value)
        end
      end
      parser.discovered_uris.each do |uri|
        crawler.enqueue!(uri)
      end
    end

    def extract(property)
      until crawler.finished?
        break if property?(property)
        data = crawler.dequeue!(property_source_priority(property))
        properties = parser.parse(data)
        add_source_properties!(properties)
      end
      get_property(property)
    end

    def extract_all
      PROPERTIES.each do |property|
        send(property)
      end
    end

    def strip_html(value)
      Nokogiri::HTML(value).xpath('//text()').remove.to_s
    end

    def as_oembed_link
      {
        version: '1.0',
        provider_name: site_name,
        provider_url: site_url,
        url: url,
        title: title,
        description: description,
        type: 'link',
        thumbnail_url: image_url
      }.reject { |_, v| v.nil? }
    end

    def as_oembed_video
      as_oembed_link.merge(type: 'video',
                           html: content_html,
                           width: content_width_scaled.to_i,
                           height: content_height_scaled.to_i)
    end

    def content_type_video?
      content_type =~ %r{\Avideo/.*} ? true : false
    end

    def content_type_iframe?
      content_type =~ %r{\Atext/html} ? true : false
    end

    def content_type_flash?
      content_type == 'application/x-shockwave-flash'
    end

    def content_type_embed?
      get_property(:html) ? true : false
    end

    def content_html
      return content_html_embed if content_type_embed?
      return content_html_iframe if content_type_iframe?
      return content_html_video if content_type_video?
      return content_html_flash if content_type_flash?
    end

    def content_html_embed
      get_property(:html)
    end

    def content_html_video
      return unless content_url.present?
      width_attribute = %(width="#{content_width_scaled}") if content_width_scaled > 0
      height_attribute = %(height="#{content_height_scaled}") if content_height_scaled > 0
      <<-EOF.strip.gsub(/\s+/, ' ').gsub(/>\s+</, '><')
        <video #{width_attribute} #{height_attribute} controls>
          <source src="#{content_url}"
                  type="#{content_type}" />
        </video>
      EOF
    end

    def content_html_iframe
      return unless content_url.present?
      width_attribute = %(width="#{content_width_scaled}") if content_width_scaled > 0
      height_attribute = %(height="#{content_height_scaled}") if content_height_scaled > 0
      <<-EOF.strip.gsub(/\s+/, ' ').gsub(/>\s+</, '><')
        <iframe src="#{content_url}" #{width_attribute} #{height_attribute} allowfullscreen="true" />
      EOF
    end

    def content_html_flash
      return unless content_url.present?
      <<-EOF.strip.gsub(/\s+/, ' ').gsub(/>\s+</, '><')
        <object width="#{content_width_scaled}" height="#{content_height_scaled}">
          <param name="movie" value="#{content_url}"></param>
          <param name="allowScriptAccess" value="always"></param>
          <param name="allowFullScreen" value="true"></param>
          <embed src="#{content_url}"
                 type="#{content_type}"
                 allowscriptaccess="always"
                 allowfullscreen="true"
                 width="#{content_width_scaled}" height="#{content_height_scaled}"></embed>
        </object>
      EOF
    end

    def content_width_scaled
      # Width takes precedence over height
      if @options[:width].to_i > 0
        @options[:width]
      elsif @options[:height].to_i > 0 && content_height.to_i > 0
        # Compute scaled width using the ratio of requested height to actual height, round up to prevent truncation
        (((@options[:height].to_i * 1.0) / (content_height.to_i * 1.0)) * content_width.to_i).ceil
      else
        content_width.to_i
      end
    end

    def content_height_scaled
      # Width takes precedence over height
      if @options[:width].to_i > 0 && content_width.to_i > 0 && content_height.to_i > 0
        # Compute scaled height using the ratio of requested width to actual width, round up to prevent truncation
        (((@options[:width].to_i * 1.0) / (content_width.to_i * 1.0)) * content_height.to_i).ceil
      elsif @options[:height].to_i > 0
        @options[:height]
      elsif @options[:width].to_i > 0
        (@options[:width].to_i * (1.0 / @config.default_content_aspect_ratio)).ceil
      else
        content_height.to_i
      end
    end

    private

    def crawler_class
      @crawler_class ||= @options.fetch(:allow_requests, true) ? LinkPreview::HTTPCrawler : LinkPreview::NullCrawler
    end

    def prioritized_properties(source, properties)
      return properties unless prioritized_properties_for_source(source)
      Hash[properties.sort_by { |key, _| prioritized_properties_for_source(source).find_index(key) || -1 }]
    end

    def prioritized_properties_for_source(source)
      @prioritized_properties_for_source ||= {}
      @prioritized_properties_for_source[source] = SOURCE_PROPERTIES_TABLE[source] ? SOURCE_PROPERTIES_TABLE[source].values.flatten : nil
    end
  end
end