greshny/diffbot

View on GitHub
lib/diffbot/article.rb

Summary

Maintainability
A
0 mins
Test Coverage
A
100%
require "yajl"
require 'hashie/extensions/dash/indifferent_access'
require 'hashie/extensions/indifferent_access'
require 'hashie/trash'
require 'hashie/extensions/coercion'

module Diffbot
  # Representation of an article (i.e. a blog post or similar). This class
  # offers a single entry point: the `.fetch` method, which, given a URL,
  # returns the article as analyzed by Diffbot.
  class Article < Hashie::Trash
    include Hashie::Extensions::Coercion
    include Hashie::Extensions::IndifferentAccess

    # Public: Fetch an article from a URL.
    #
    # url      - The article URL.
    # token    - The API token for Diffbot. Defaults to Diffbot.token.
    # parser   - The callable object that will parse the raw output from the
    #            API. Defaults to Yajl::Parser.method(:parse).
    # defaults - The default request options. See Diffbot.article_defaults.
    #
    # Yields the request configuration (a `dup` of `defaults`).
    #
    # Examples
    #
    #   # Request an article with the default options.
    #   article = Diffbot::Article.fetch(url, api_token)
    #
    #   # Pass options to the request. See Diffbot::Article::RequestParams to
    #   # see the available configuration options.
    #   article = Diffbot::Article.fetch(url, api_token) do |req|
    #     req.html = true
    #   end
    #
    # Returns a Diffbot::Article.
    def self.fetch(url, token = Diffbot.token, parser = Yajl::Parser.method(:parse),
                   defaults = Diffbot.article_defaults)
      # Work on a copy so per-request tweaks are applied to `params`, not to
      # the object passed in as `defaults`.
      params = defaults.dup
      yield params if block_given?

      request = Diffbot::Request.new(token)
      response = request.perform(:get, endpoint, params) do |req|
        req[:query][:url] = url
      end

      new(parser.call(response.body))
    end

    # The API endpoint where requests should be made.
    #
    # Returns a URL.
    def self.endpoint
      "http://www.diffbot.com/api/article"
    end

    # Public: URL of the article.
    property :url

    # Public: Title of the article.
    property :title

    # Public: Number of pages of the article.
    #
    # NOTE(review): `num_pages` (declared further down) uses
    # `from: :numPages`, so this key is also translated to `num_pages`.
    # Confirm whether both readers are still needed.
    property :numPages

    # Public: Author (or Authors) of the article.
    property :author

    # Public: Date of the article (as a string).
    property :date

    # Public: Meta tags of the article.
    #
    # Only present if you set `meta` to true in the request.
    property :meta

    # Public: The (spoken/human) language of the submitted URL, using two-letter
    # ISO 639-1 nomenclature.
    property :human_language, from: :humanLanguage

    # A single entry of the `media` list.
    class MediaItem < Hashie::Trash
      include Hashie::Extensions::IndifferentAccess
      property :type
      property :link
      property :primary, default: false
      property :caption
    end

    # Public: List of media items related to the articles. Each item is an
    # object with the following attributes:
    #
    # type    - Either `"image"` or `"video"`.
    # link    - The URL of the given media resource.
    # primary - Only present in one of the items. This is assumed to be the most
    #           representative media for this article. Defaults to false.
    # caption - The caption of the media item (presumably only when Diffbot
    #           detects one - TODO confirm).
    property :media
    coerce_key :media, Array[MediaItem]

    # A single entry of the `images` list.
    class ImageItem < Hashie::Trash
      include Hashie::Extensions::IndifferentAccess
      property :url
      property :pixel_height, from: :pixelHeight
      property :pixel_width,  from: :pixelWidth
      property :caption
      property :primary
    end

    # Public: Array of images, if present within the article body. Each item
    # is an object with the following attributes:
    #
    # url          - Direct (fully resolved) link to image.
    # pixel_height - Image height, in pixels.
    # pixel_width  - Image width, in pixels.
    # caption      - Diffbot-determined best caption for the image, if detected.
    # primary      - Returns "true" if image is identified as primary.
    #
    # The property must be declared explicitly: `coerce_key` only registers the
    # coercion, and a Dash rejects keys that have no matching `property`.
    property :images
    coerce_key :images, Array[ImageItem]

    # Public: The raw text of the article, without formatting.
    property :text

    # Public: The contents of the article in HTML, stripped of any ads or other
    # chunks of HTML which are considered unrelated by Diffbot, unless you set
    # the `dont_strip_ads` option in the request.
    #
    # Only present if you set `html` to true in the request.
    property :html

    # Public: A summary line for this article.
    #
    # Only present if you set `summary` to true in the request.
    property :summary

    # Public: A list of tags related to this article.
    #
    # Only present if you set `tags` to true in the request.
    property :tags

    # Public: Comment data for this article.
    #
    # Only present if you set `comments` to true in the request.
    #
    # NOTE(review): the structure of this value is not visible in this file;
    # it is stored as returned by the parser.
    property :comments

    # Public: The favicon of the page where this article was extracted from.
    property :icon

    # NOTE(review): the following keys are accepted from the API response, but
    # their meaning is not documented in this file - confirm against the
    # Diffbot API documentation.
    property :date_created

    property :cid

    property :categories

    property :supertags

    # A single entry of the `videos` list.
    class VideoItem < Hashie::Trash
      include Hashie::Extensions::IndifferentAccess
      property :url
      property :pixel_height, from: :pixelHeight
      property :pixel_width,  from: :pixelWidth
      property :primary
    end

    # Public: Array of videos, if present within the article body. Each item
    # is an object with the following attributes:
    #
    # url          - Direct (fully resolved) link to the video content.
    # pixel_height - Video height, in pixels, if accessible.
    # pixel_width  - Video width, in pixels, if accessible.
    # primary      - Returns "true" if the video is identified as primary.
    #
    # Declared explicitly so the key is accepted; see `images`.
    property :videos
    coerce_key :videos, Array[VideoItem]

    # Request statistics, as exposed through the `stats` property.
    class Stats < Hashie::Trash
      include Hashie::Extensions::IndifferentAccess
      property :fetch_time, from: :fetchTime
      property :confidence
    end

    # Public: Returns an object with the following attributes:
    #
    # fetch_time - The time of the request, in ms.
    # confidence - The confidence of Diffbot that the returned text is really
    #              the text of the article. Between 0.0 and 1.0.
    #
    # Only present if you set `stats` to true in the request.
    #
    # Declared explicitly so the key is accepted; see `images`.
    property :stats
    coerce_key :stats, Stats

    # Public: The XPath selector at which the body of the article was found in
    # the page.
    property :xpath

    # Public: If there was an error in the request, this will contain the error
    # message.
    property :error

    # Public: If there was an error in the request, this will contain the error
    # code.
    property :error_code, from: :errorCode

    # Public: Number of pages automatically concatenated to form the text or
    # html response.
    property :num_pages, from: :numPages

    # NOTE(review): the meaning of the following keys is not documented in this
    # file - confirm against the Diffbot API documentation.
    property :next_pages, from: :nextPages
    property :resolved_url
    property :type

    # This represents the parameters you can pass to Diffbot to configure a
    # given request. These are either set globally with Diffbot.article_defaults
    # or on a request basis by passing a block to Diffbot::Article.fetch.
    #
    # Example:
    #
    #   # All article requests will include the HTML and tags.
    #   Diffbot.configure do |config|
    #     config.article_defaults do |defaults|
    #       defaults.html = true
    #       defaults.tags = true
    #     end
    #   end
    #
    #   # This article request will *also* include the summary.
    #   Diffbot::Article.fetch(url, token) do |req|
    #     req.summary = true
    #   end
    class RequestParams < Hashie::Trash
      include Hashie::Extensions::IndifferentAccess
      # Public: Set to true to return HTML instead of plain-text.
      #
      # Defaults to nil.
      #
      # If enabled, sets the `html` key in the `Diffbot::Article`.
      property :html

      # Public: Set to true to keep any inline ads in the generated story.
      #
      # Defaults to nil.
      #
      # If enabled, it will change the `html` key in the `Diffbot::Article`.
      #
      # Set through the Ruby-style `dont_strip_ads` name; it is stored under
      # `dontStripAds`.
      property :dontStripAds, from: :dont_strip_ads

      # Public: Set to true to return meta tags.
      #
      # Defaults to nil.
      #
      # If enabled, sets the `meta` key in the `Diffbot::Article`.
      property :meta

      # Public: Set to true to generate tags for the extracted story.
      #
      # Defaults to nil.
      #
      # If enabled, sets the `tags` key in the `Diffbot::Article`.
      property :tags

      # Public: Set to true to find the comments and identify count, link, etc.
      #
      # Defaults to nil.
      #
      # If enabled, sets the `comments` key in the `Diffbot::Article`.
      property :comments

      # Public: Set to true to return a summary text.
      #
      # Defaults to nil.
      #
      # If enabled, sets the `summary` key in the `Diffbot::Article`.
      property :summary

      # Public: Set to true to include performance and probabilistic scoring
      # stats.
      #
      # Defaults to nil.
      #
      # If enabled, sets the `stats` key in the `Diffbot::Article`.
      property :stats
    end
  end
end