lib/gini-api/client.rb
require 'uri'
require 'json'
require 'logger'
require 'faraday'
require 'benchmark'
module Gini
module Api
# Main class to operate on the Gini API
#
class Client
attr_reader :token, :log
# Instantiate a new Gini::Api::Client object with OAuth capabilities
#
# @param [Hash] options Hash of available config settings
# @option options [String] :client_id OAuth client_id
# @option options [String] :client_secret OAuth client_secret
# @option options [String] :oauth_site OAuth site to connect to (https://user.gini.net)
# @option options [String] :oauth_redirect Redirect URI
# @option options [Integer] :upload_timeout Upload timeout in seconds
# @option options [Integer] :processing_timeout API operational timeout in seconds
# @option options [String] :api_uri API URI (https://api.gini.net)
# @option options [String] :api_version API version to use (v1)
# @option options [Logger] :log logger object to use (initialized with STDOUT otherwise)
# @option options [String] :user_agent HTTP User-Agent (gini-api-ruby/VERSION (Faraday vFaraday::VERSION))
#
# @example
# api = Gini::Api::Client.new(
# client_id: 'my_client_id',
# client_secret: 'my_client_secret',
# )
#
def initialize(options = {})
opts = {
oauth_site: 'https://user.gini.net/',
oauth_redirect: 'http://localhost',
api_uri: 'https://api.gini.net',
api_version: 'v1',
api_type: 'json',
upload_timeout: 90,
processing_timeout: 180,
log: Logger.new(STDOUT),
user_agent: "gini-api-ruby/#{VERSION} (Faraday v#{Faraday::VERSION})"
}.merge(options)
# Ensure mandatory keys are set
[:client_id, :client_secret].each do |k|
raise Gini::Api::Error.new("Mandatory option key is missing: #{k}") unless opts.key?(k)
end
# Populate instance variables from merged opts
opts.each do |k, v|
instance_variable_set("@#{k}", v)
self.class.send(:attr_reader, k)
end
# Ensure STDOUT is flushed
STDOUT.sync = true
# Sanitize api_uri
@api_uri.sub!(/(\/)+$/, '')
# Register parser (json+xml) based on API version
register_parser
@log.info('Gini API client initialized')
@log.info("Target: #{@api_uri}")
end
# Register OAuth2 response parser
#
def register_parser
OAuth2::Response.register_parser(:gini_json, [version_header(:json)[:accept]]) do |body|
MultiJson.load(body, symbolize_keys: true) rescue body
end
OAuth2::Response.register_parser(:gini_xml, [version_header(:xml)[:accept]]) do |body|
MultiXml.parse(body) rescue body
end
OAuth2::Response.register_parser(:gini_incubator, [version_header(:json, :incubator)[:accept]]) do |body|
MultiJson.load(body, symbolize_keys: true) rescue body
end
end
# Acquire OAuth2 token and popolate @oauth (instance of Gini::Api::OAuth.new)
# and @token (OAuth2::AccessToken). Supports 2 strategies: username/password and authorization code
#
# @param [Hash] opts Your authorization credentials
# @option opts [String] :auth_code OAuth authorization code. Will be exchanged for a token
# @option opts [String] :username API username
# @option opts [String] :password API password
#
# @example
# api.login(auth_code: '1234567890')
# @example
# api.login(username: 'me@example.com', password: 'secret')
#
def login(opts)
@oauth = Gini::Api::OAuth.new(self, opts)
@token = @oauth.token
end
# Destroy OAuth2 token
#
def logout
@oauth.destroy
end
# Version accept header based on @api_version
#
# @param [Symbol, String] type Expected response type (:xml, :json)
# @param [Symbol, String] version API version (:v1, :incubator)
#
# @return [Hash] Return accept header or empty hash
#
def version_header(type = @api_type, version = @api_version)
{ accept: "application/vnd.gini.#{version}+#{type}" }
end
# Request wrapper that sets URI and accept header
#
# @param [Symbol] verb HTTP request verb (:get, :post, :put, :delete)
# @param [String] resource API resource like /documents
# @param [Hash] options Optional type and custom headers
# @option options [String] :type Type to pass to version_header (:xml, :json)
# @option options [Hash] :headers Custom headers. Must include accept
#
def request(verb, resource, options = {})
opts = {
headers: version_header(options.delete(:type) || @api_type)
}.merge(options)
timeout(@processing_timeout) do
@token.send(verb.to_sym, resource_to_location(resource).to_s , opts)
end
rescue OAuth2::Error => e
raise Gini::Api::RequestError.new(
"API request failed: #{verb} #{resource} (code=#{e.response.status})",
e.response
)
rescue Timeout::Error => e
raise Gini::Api::ProcessingError.new(
"API request timed out: #{verb} #{resource} (#{e.message})"
)
end
# Upload a document
#
# @param [String] file path or open filehandle of the document to upload
# @param [Hash] options Hash of available upload settings
# @option options [String] :doctype_hint Document type hint to optimize results or get incubator results
# @option options [String] :text Use given file-string as text upload
# @option options [Float] :interval Interval to poll progress
#
# @return [Gini::Api::Document] Return Gini::Api::Document object for uploaded document
#
# @example Upload and wait for completion
# doc = api.upload('/tmp/myfile.pdf')
# @example Upload with doctype hint
# doc = api.upload('/tmp/myfile.pdf', doctype_hint: 'Receipt')
# @example Upload and monitor progress
# doc = api.upload('/tmp/myfile.pdf') { |d| puts "Progress: #{d.progress}" }
# @example Upload and monitor progress
# doc = api.upload('This is a text message i would love to get extractions from', text: true)
#
def upload(file, options = {}, &block)
opts = {
doctype_hint: nil,
text: false,
interval: 0.5
}.merge(options)
duration = Hash.new(0)
# Document upload
duration[:upload], response = upload_document(file, opts)
# Start polling (0.5s) when document has been uploaded successfully
if response.status == 201
doc = Gini::Api::Document.new(self, response.headers['location'])
duration[:processing] = poll_document(doc, opts[:interval], &block)
duration[:total] = duration.values.inject(:+)
doc.duration = duration
doc
else
fail Gini::Api::UploadError.new(
"Document upload failed with HTTP code #{response.status}",
response
)
end
end
# Delete document
#
# @param [String] id document ID
#
def delete(id)
response = request(:delete, "/documents/#{id}")
unless response.status == 204
raise Gini::Api::DocumentError.new(
"Deletion of docId #{id} failed (code=#{response.status})",
response
)
end
@log.info("Deleted document #{id}")
end
# Get document by Id
#
# @param [String] id document ID
#
# @return [Gini::Api::Document] Return Gini::Api::Document object
#
def get(id)
Gini::Api::Document.new(self, "/documents/#{id}")
end
# List all documents
#
# @param [Hash] options List options (offset and limit)
# @option options [Integer] :limit Maximum number of documents to return (defaults to 20)
# @option options [Integer] :offset Start offset. Defaults to 0
#
# @return [Gini::Api::DocumentSet] Returns a DocumentSet with total, offset and a list of Document objects
#
def list(options = {})
opts = { limit: 20, offset: 0 }.merge(options)
limit = Integer(opts[:limit])
offset = Integer(opts[:offset])
response = request(:get, "/documents?limit=#{limit}&next=#{offset}")
unless response.status == 200
raise Gini::Api::DocumentError.new(
"Failed to get list of documents (code=#{response.status})",
response
)
end
Gini::Api::DocumentSet.new(self, response.parsed)
end
# Fulltext search for documents
#
# @param [String, Array] query The search term(s), separated by space. Multiple terms as array
# @param [Hash] options Search options
# @option options [String] :type Only include documents with the given doctype
# @option options [Integer] :limit Number of results per page. Must be between 1 and 250. Defaults to 20
# @option options Integer] :offset Start offset. Defaults to 0
#
# @return [Gini::Api::DocumentSet] Returns a DocumentSet with total, offset and a list of Document objects
#
def search(query, options = {})
opts = { type: '', limit: 20, offset: 0 }.merge(options)
query = URI.escape(query)
type = URI.escape(opts[:type])
limit = Integer(opts[:limit])
offset = Integer(opts[:offset])
response = request(:get, "/search?q=#{query}&type=#{type}&limit=#{limit}&next=#{offset}")
unless response.status == 200
raise Gini::Api::SearchError.new(
"Search query failed with code #{response.status}",
response
)
end
Gini::Api::DocumentSet.new(self, response.parsed)
end
private
# Helper to covert resource to a valid location.
#
# @param [String] resource URI to be converted
#
# @return [URI::HTTPS] URI::HTTPS object create from resource
#
def resource_to_location(resource)
parsed_resource = URI.parse(resource)
@api_host ||= URI.parse(@api_uri).host
URI::HTTPS.build(
host: @api_host,
path: parsed_resource.path,
query: parsed_resource.query
)
end
# Poll document and duration
#
# @param [Gini::Api::Document] doc Document instance to poll
# @param [Float] interval Polling interval for completion
#
# @return [Integer] Processing duration
#
def poll_document(doc, interval, &block)
duration = 0
timeout(@processing_timeout) do
duration = Benchmark.realtime do
doc.poll(interval, &block)
end
end
duration
rescue Timeout::Error => e
ex = Gini::Api::ProcessingError.new(e.message)
ex.docid = doc.id
raise ex
end
# Setup API upload connection
#
# @return [Faraday] Faraday object to use in upload
#
def upload_connection
@upload_connection ||= Faraday.new(url: @api_uri) do |builder|
builder.use(Faraday::Request::Multipart)
builder.use(Faraday::Request::UrlEncoded)
builder.request(:retry, 3)
builder.adapter(Faraday.default_adapter)
end
end
# Helper to upload document
#
# @param [String] file location of document or open filehandle to be uploaded
# @param [String] doctype_hint Document type hint to optimize results or get incubator results
#
# @return [Faraday::Response] Response object from upload
#
def upload_document(file, opts)
response = nil
# Use StringIO on file string and force utf-8
file = StringIO.new(file.force_encoding('UTF-8')) if opts[:text]
duration = Benchmark.realtime do
response = upload_connection.post do |req|
req.options[:timeout] = @upload_timeout
req.url 'documents'
req.params[:doctype] = opts[:doctype_hint] if opts[:doctype_hint]
req.headers['Content-Type'] = 'multipart/form-data'
req.headers['Authorization'] = "Bearer #{@token.token}"
req.headers.merge!(version_header)
req.body = { file: Faraday::UploadIO.new(file, 'application/octet-stream') }
end
end
return duration, response
end
end
end
end