# app/models/request.rb
require 'digest/md5'
require 'cgi'
# An ActiveRecord model which represents a parsed OpenURL resolve service
# request (and other persistent state related to Umlaut's handling of that
# OpenURL request). It should not be confused with the Rails
# ActionController::Request class (which represents the complete details of
# the current 'raw' HTTP request, and is not stored persistently in the db).
#
# Constituent openurl data is stored in Referent and Referrer.
class Request < ActiveRecord::Base
has_many :dispatched_services

# service_responses are always loaded ordered by id, so whatever was added
# to the db first comes first. A consistent order is less confusing, and it
# lets an installation count on services that run first having their
# responses show up first.
#
# The referent association is declared to include its referent_values.
if Rails::VERSION::MAJOR < 4
  # Rails 3 hash-option syntax. Drop this branch once Rails 3 support ends.
  has_many :service_responses, :order => 'id ASC'
  belongs_to :referent, :include => :referent_values
else
  has_many :service_responses, -> { order('id ASC') }
  belongs_to :referent, -> { includes(:referent_values) }
end

has_many :clickthroughs

# http_env holds a hash of selected http headers, stored serialized.
serialize :http_env
# Returns the Request matching the incoming OpenURL, either by recovering
# one already stored in the db or by creating a new one.
#
# Lookup order:
# 1. By params['umlaut.request_id'], when given and found. The session is
#    deliberately not matched here, so background processing can hook up
#    with the right request even if the user has no cookies. The client IP
#    is not checked on this path either.
# 2. Otherwise by session id + context object fingerprint + client ip,
#    newest first. Skipped when params["umlaut.force_new_request"] is
#    "true" or when no fingerprint could be computed.
# 3. Otherwise a new Request is created -- unless
#    options[:allow_create] == false, in which case nil is returned.
#
# A recovered request whose referent is gone (it may have been purged) gets
# a fresh referent built from the current context object.
# NOTE(review): that new referent is only assigned here, the request is not
# saved afterwards in this method -- confirm that is intended.
def self.find_or_create(params, session, a_rails_request, options = {})
  # Context-object http params as a CGI::parse style hash, in the shape
  # ContextObject.new_from_form_vars wants.
  co_params = context_object_params(a_rails_request)
  context_object = OpenURL::ContextObject.new_from_form_vars(co_params)

  # An explicit request id that matches nothing is treated exactly as if
  # no id had been passed at all.
  explicit_id = params['umlaut.request_id']
  req = explicit_id.nil? ? nil : Request.find_by_id(explicit_id)

  # Fingerprint of the openurl params, for looking up whether we've seen
  # this request before. co_params came from parsing the raw request, so
  # merge in umlaut.service_group from params in case a before_filter
  # added it afterwards.
  service_group = { "umlaut.service_group" => params["umlaut.service_group"] }
  service_group.delete_if { |_key, value| value.blank? }
  param_fingerprint = co_params_fingerprint(co_params.merge(service_group))

  client_ip = params['req.ip'] || a_rails_request.remote_ip

  if !req && params["umlaut.force_new_request"] != "true" && !param_fingerprint.blank?
    # No explicit id: reuse only a request with the same openurl params
    # from the same session (and same client ip).
    matches = Request.where(
      :session_id => a_rails_request.session_options[:id],
      :contextobj_fingerprint => param_fingerprint,
      :client_ip_addr => client_ip
    )
    req = matches.order("created_at DESC, id DESC").first
  end

  if req
    # The stored request may have lost its referent; rebuild it if so.
    req.referent = Referent.create_by_context_object(context_object) unless req.referent
    return req
  end

  return nil if options[:allow_create] == false

  # Nothing reusable was found, really create one.
  create_new_request!(
    :params => params,
    :session => session,
    :rails_request => a_rails_request,
    :contextobj_fingerprint => param_fingerprint,
    :context_object => context_object
  )
end
# input is a Rails request (representing http request)
# We pull out a hash of request params (get and post) that
# define a context object. We use CGI::parse instead of relying
# on Rails parsing because rails parsing ignores multiple params
# with same key value, which is legal in CGI and is sometimes used in OpenURLs.
#
# So in general values of this hash will be an array.
# ContextObject.new_from_form_vars is good with that.
# Exception is url_ctx_fmt and url_ctx_val, which we'll
# convert to single values, because ContextObject wants it so.
def self.context_object_params(a_rails_request)
# GET params
co_params = CGI::parse( a_rails_request.query_string )
# add in the POST params please
co_params.merge!( CGI::parse(a_rails_request.raw_post)) if a_rails_request.raw_post
# default value nil please, that's what ropenurl wants
co_params.default = nil
# CGI::parse annoyingly sometimes puts a nil key in there, for an empty
# query param (like a url that has two consecutive && in it). Let's get rid
# of it please, only confuses our code.
co_params.delete(nil)
# Exclude params that are for Rails or Umlaut, and don't belong to the
# context object. Except leave in umlaut.* keys that DO matter for
# cacheability, like umlaut.institution (legacy) and umlaut.service_group
excluded_keys = ["action", "controller", "page", /\Aumlaut\.(?!(institution|service_group\[\])\Z)/, 'rft.action', 'rft.controller']
co_params.keys.each do |key|
excluded_keys.each do |exclude|
co_params.delete(key) if exclude === key;
end
end
# 'id' is a special one, cause it can be a OpenURL 0.1 key, or
# it can be just an application-level primary key. If it's only a
# number, we assume the latter--an openurl identifier will never be
# just a number.
if co_params['id']
co_params['id'].each do |id|
co_params['id'].delete(id) if id =~ /^\d+$/
end
end
return co_params
end
# Records the dispatch status of a service taking part in this request.
#
# service::   the service whose status is being recorded
# status::    true (shorthand for DispatchedService::Success), false
#             (shorthand for DispatchedService::FailedTemporary), or any
#             other DispatchedService status code
# exception:: optional, usually given with failed statuses; handed to
#             store_exception so it is kept for debugging
#
# An already existing DispatchedService row for the service is re-used and
# over-written with the new status; otherwise one is created. The work runs
# inside connection_pool.with_connection, so it may be called from a thread.
def dispatched(service, status, exception = nil)
  ActiveRecord::Base.connection_pool.with_connection do
    record = find_dispatch_object(service) || new_dispatch_object!(service, status)

    # Set status even on a freshly built record: harmless there, required
    # for one that was already in the db.
    record.status = status
    record.store_exception(exception)
    record.save!
  end
end
# Answers whether the given service may be dispatched for this request.
# True when no DispatchedService row exists for it yet, or when the
# existing row is marked Queued or FailedTemporary. Any other status means
# it is already being worked on, or is finished.
def can_dispatch?(service)
  existing = dispatched_services.where(:service_id => service.service_id).first
  return true if existing.nil?

  state = existing.status
  state == DispatchedService::Queued || state == DispatchedService::FailedTemporary
end
# Marks the given service as InProgress for this request, but only when
# that is allowed: either no DispatchedService row exists for it yet, or
# the existing row is marked Queued or FailedTemporary.
#
# Returns true if the service was registered as InProgress, false if an
# existing row had some other status (and was therefore left alone).
#
# Runs inside connection_pool.with_connection, so it may be called from a
# threaded context.
def register_in_progress(service)
  ActiveRecord::Base.connection_pool.with_connection do
    ds = find_dispatch_object(service)

    unless ds
      # Create a new one. If a race happened between the find above and
      # now, this may raise a constraint violation.
      ds = new_dispatch_object!(service, DispatchedService::InProgress)
      ds.save!
      return true
    end

    # Already existed: flip it with a single conditional UPDATE, so it only
    # changes if it still has a compatible status. Both statuses must be
    # listed as separate array elements; `Queued || FailedTemporary` would
    # collapse to just Queued.
    retryable = [DispatchedService::Queued, DispatchedService::FailedTemporary]
    updated_count = dispatched_services.
      where(:id => ds.id, :status => retryable).
      update_all(:status => DispatchedService::InProgress)

    return updated_count > 0
  end
end
# Builds and saves a ServiceResponse attached to this request.
#
# response_data is a hash of key/values. Keys MUST include:
# * :service => the actual Service object, not just its id
# * :service_type_value => the ServiceTypeValue object (or its string
#   name) for the 'type' of response this is
#
# All remaining keys are handed to ServiceResponse#take_key_values, which
# decides what is stored where; the caller just passes everything in.
#
# Eg, called from a service adapter plugin:
#   request.add_service_response(:service=>self,
#     :service_type_value => 'cover_image',
#     :display_text => 'Cover Image',
#     :url => img.inner_html,
#     :asin => asin,
#     :size => size)
#
# The hash passed in is not modified, so a caller may safely re-use it for
# several calls.
#
# Raises ArgumentError when either required key is missing (or :service is
# not a Service). Returns the saved ServiceResponse. Runs inside
# connection_pool.with_connection, so it may be called from a thread.
def add_service_response(response_data)
  raise ArgumentError.new("missing required `:service` key") unless response_data[:service].kind_of?(Service)
  raise ArgumentError.new("missing required `:service_type_value` key") unless response_data[:service_type_value]

  # Work on a shallow copy: the two required keys are stripped out below
  # and that must not leak back into the caller's hash.
  key_values = response_data.dup
  service = key_values.delete(:service)
  type_value = key_values.delete(:service_type_value)

  ActiveRecord::Base.connection_pool.with_connection do
    svc_resp = service_responses.build
    svc_resp.service_id = service.service_id

    type_value = ServiceTypeValue[type_value.to_s] unless type_value.kind_of?(ServiceTypeValue)
    svc_resp.service_type_value = type_value

    # key_values now holds only the actual key/values for the response.
    svc_resp.take_key_values(key_values)
    svc_resp.save!
    svc_resp
  end
end
# Methods to look at status of dispatched services

# Returns an array of this request's DispatchedService rows whose status
# is FailedTemporary or FailedFatal, fetched with a query.
def failed_service_dispatches
  failure_states = [DispatchedService::FailedTemporary, DispatchedService::FailedFatal]
  dispatched_services.where(:status => failure_states).to_a
end
# Returns an array of the Services that are currently Queued or
# InProgress for this request.
#
# The dispatched_services association is converted to a plain array and
# filtered in Ruby on purpose, rather than queried with conditions: that
# avoids a trip to the db on every call, and narrows (without removing)
# the race condition on progress checks.
def services_in_progress
  pending = dispatched_services.to_a.select do |dispatch|
    state = dispatch.status
    state == DispatchedService::Queued || state == DispatchedService::InProgress
  end

  pending.map { |dispatch| dispatch.service }
end
# Single-type convenience wrapper around service_types_in_progress?:
# wraps svc_type in a one-element array and returns that method's answer.
def service_type_in_progress?(svc_type)
  service_types_in_progress?([svc_type])
end
# Takes an array of ServiceTypeValue objects and/or string names of them.
# Returns true if ANY service currently in progress generates at least one
# of those types, false otherwise.
def service_types_in_progress?(type_array)
  # Normalize string names into ServiceTypeValues.
  wanted = type_array.map do |type|
    type.kind_of?(ServiceTypeValue) ? type : ServiceTypeValue[type]
  end

  services_in_progress.any? do |service|
    # A non-empty intersection means this service generates a wanted type.
    !(service.service_types_generated & wanted).empty?
  end
end
# True when services_in_progress returns at least one service.
def any_services_in_progress?
  !services_in_progress.empty?
end
# Builds a context object for this request. It is mostly the referent's
# own context object, with two request-level additions when present: the
# referrer identifier, and the client ip stored as 'ip' metadata on the
# requestor.
def to_context_object
  ctx = referent.to_context_object

  rfr_id = referrer_id
  ctx.referrer.add_identifier(rfr_id) if rfr_id

  ip = client_ip_addr
  ctx.requestor.set_metadata('ip', ip) if ip

  ctx
end
# Is the citation represented by this request a title-level only citation,
# with no more specific article info?
#
# Returns false as soon as any article-level hint is found:
# * an atitle (it can't generally get us article-level, but it can with
#   lexis nexis, so it counts), a volume or an issue in the referent
#   metadata, or
# * a doi or pmid identifier -- those are considered article-level
#   because SFX can respond to them; other identifiers may be useless.
# Otherwise returns true.
def title_level_citation?
  metadata = referent.metadata

  article_keys = ['atitle', 'volume', 'issue']
  return false unless article_keys.all? { |key| metadata[key].blank? }

  referent.identifiers.none? { |identifier| identifier =~ /^info\:(doi|pmid)/ }
end
# Returns the ServiceResponse objects of the given type that belong to
# this request.
#
# svc_type:: a ServiceTypeValue, or the string name of one
# options::  :refresh => true queries the db for the latest values (inside
#            connection_pool.with_connection); otherwise the
#            service_responses association is converted to an array and
#            filtered in Ruby.
#
# Responses whose service_id has no definition in ServiceStore are left
# out of the result; when that happens a warning listing those ids is
# logged.
def get_service_type(svc_type, options = {})
  svc_type_obj = svc_type.kind_of?(ServiceTypeValue) ? svc_type : ServiceTypeValue[svc_type]

  candidates =
    if options[:refresh]
      ActiveRecord::Base.connection_pool.with_connection do
        service_responses.where(["service_type_value_name = ?", svc_type_obj.name]).to_a
      end
    else
      # A finder on the association would go to the db, so turn it into a
      # plain old array first.
      service_responses.to_a.select do |response|
        response.service_type_value == svc_type_obj
      end
    end

  # Split off responses from services not currently registered.
  known, unknown = candidates.partition do |response|
    ServiceStore.service_definition_for(response.service_id).present?
  end

  unless unknown.empty?
    skipped_ids = unknown.map { |response| response.service_id }.uniq.join(",")
    Rails.logger.warn("ServiceResponses skipped for unknown service_ids: " + skipped_ids)
  end

  known
end
# Builds a DispatchedService with the given status for the given service
# and appends it to this request's dispatched_services.
#
# service may be a Service object (its service_id is used) or anything
# else, which is turned into the id with to_s.
#
# Warning: does not check for an already existing object first, so use
# carefully -- usually paired with find_dispatch_object. There is no
# explicit save here either; the caller does that, in case it wants to
# initialize the object further first.
#
# Returns the new DispatchedService.
def new_dispatch_object!(service, status)
  service_id = service.kind_of?(Service) ? service.service_id : service.to_s

  dispatch = DispatchedService.new
  dispatch.service_id = service_id
  dispatch.status = status

  dispatched_services << dispatch
  dispatch
end
# Returns an array of 0 or more DispatchedService objects matching the
# given conditions. Only one condition is supported right now:
#
#   dispatch_objects_with(:service_type_values => values)
#
# values is one or more string names of service types. The result holds
# the DispatchedServices whose service generates at least one of the named
# types. The dispatched_services association is filtered in Ruby.
#
# Raises ArgumentError when no :service_type_values are supplied.
def dispatch_objects_with(options = {})
  value_names = Array(options[:service_type_values])
  raise ArgumentError.new("Need to supply a :service_type_values argument") if value_names.empty?

  dispatched_services.to_a.select do |dispatch|
    generated_names = dispatch.service.service_types_generated.map { |type| type.name }
    !(value_names & generated_names).empty?
  end
end
protected
# Called by self.find_or_create when a new request _really_ needs to be
# created. Builds, saves (with save!) and returns the new Request.
#
# args is a hash; these keys are read:
# * :params => request params; 'umlaut.referent_id' and 'req.ip' are used
# * :rails_request => the Rails request (session_options, remote_ip, env)
# * :contextobj_fingerprint => fingerprint to store on the request
# * :context_object => used to create the referent and find the referrer
#
# Note this is a class-level method, so the bare `protected` above it does
# not restrict it.
def self.create_new_request!(args)
  params = args[:params]
  a_rails_request = args[:rails_request]
  context_object = args[:context_object]

  # Try re-using an already existing referent when its id was passed in.
  # No id given, or no object found? Create one.
  rft = nil
  if params['umlaut.referent_id']
    rft = Referent.where(:id => params['umlaut.referent_id']).first
  end
  rft ||= Referent.create_by_context_object(context_object)

  req = Request.new
  req.session_id = a_rails_request.session_options[:id]
  req.contextobj_fingerprint = args[:contextobj_fingerprint]

  # Assign from the request side. Don't do `rft.requests << req`, it is a
  # performance problem.
  req.referent = rft

  referrer = context_object.referrer
  req.referrer_id = referrer.identifier unless referrer.empty? || referrer.identifier.empty?

  # Save client ip; a 'req.ip' param overrides the real one, and is then
  # flagged as simulated.
  req.client_ip_addr = params['req.ip'] || a_rails_request.remote_ip
  req.client_ip_is_simulated = true if req.client_ip_addr != a_rails_request.remote_ip

  # Save selected http headers: every HTTP_* entry except HTTP_COOKIE,
  # plus REQUEST_URI and SERVER_NAME. In retrospect saving a hash is not a
  # great design, individual columns would be better.
  #
  # To keep from exceeding the width of the db column, each value is cut
  # to its first 800 characters.
  #
  # Keys and values are tagged as ISO-8859-1 to save space in the YAML
  # encoding, which otherwise encodes 'binary' strings at greater length.
  # The tagging is done on copies: the strings in the request env belong
  # to the rest of the request cycle (and may be frozen), so they must not
  # be changed in place.
  req.http_env = {}
  a_rails_request.env.each do |k, v|
    wanted = (k.slice(0, 5) == 'HTTP_' && k != 'HTTP_COOKIE') ||
             k == 'REQUEST_URI' ||
             k == 'SERVER_NAME'
    next unless wanted

    key = k.dup.force_encoding("ISO-8859-1")
    value = v.dup.force_encoding("ISO-8859-1")
    value.scrub!
    req.http_env[key] = value.slice(0, 800) # only first 800 chars sorry
  end

  req.save!
  req
end
# Looks up this request's DispatchedService row for the given service,
# matching on service.service_id. Returns the first match, or nil.
def find_dispatch_object(service)
  dispatched_services.where(:service_id => service.service_id).first
end
# Input is a CGI::parse style hash of HTTP params (array values).
# Output is a string "fingerprint" (an MD5 hex digest) canonically
# representing those params, which can be stored in the db so that when
# another request comes in we can easily see if this exact request was
# seen before.
#
# Params that are not part of the context object, or that should not count
# for equality, are excluded:
# * ctx_tim -- two context objects equal but for ctx_tim are equal
# * "_" -- the cache-busting key JQuery adds; fine to bust the HTTP cache,
#   but it should not force new Umlaut processing
# * umlaut.jsonp, umlaut.response_format and format -- these shouldn't
#   affect cache lookup
# * action, controller, page, rft.action, rft.controller
#
# What is left is sorted by key, with nils dropped from each value list
# and the list itself sorted, so that two equivalent sets of params
# serialize identically.
#
# The hash and the value arrays passed in are not modified.
#
# Returns nil if there aren't any params to include in the fingerprint.
def self.co_params_fingerprint(params)
  excluded_keys = ["action", "controller", "page", "rft.action", "rft.controller", "ctx_tim", "_", "umlaut.jsonp", "umlaut.response_format", "format"]

  # Hash#sort gives an array of [key, value] pairs ordered by key. Filter
  # into a new array instead of deleting while iterating, which would skip
  # the pair following each deleted one. === works for regexp and string.
  kept = params.sort.reject do |key, _values|
    excluded_keys.any? { |excluded| excluded === key }
  end

  canonical = kept.map do |key, values|
    # CGI::parse sometimes leaves nils in a value list, especially for
    # malformed requests. A value is not always a list (a plain string can
    # be merged in by the caller), hence the respond_to? checks.
    values = values.compact if values.respond_to?(:compact)
    values = values.sort if values.respond_to?(:sort!)
    [key, values]
  end

  return nil if canonical.empty?

  # YAML-ize for a serialization, then digest it: no need to store the
  # whole thing if all we do is look it up.
  Digest::MD5.hexdigest(canonical.to_yaml)
end
end