app/service_adaptors/internet_archive.rb
# This service searches the Internet Archive (archive.org) by title
# and, if present, creator. Results are broken down by mediatypes. Which
# mediatypes are searched can be configured via umlaut_config/services.yml.
# Also an optional link to a full search in the native interface can be
# presented to the user.
# Property settings can be set in services.yml
# url:
# num_results: a number. This is the number of results returned for each
# mediatype within the main section of the view
# mediatypes: an array of the mediatypes searched. insure there is an
# appropriate mediatype as defined by IA. Searching by mediatype searches
# across collections.
# The following link will (currently) show the possible mediatypes:
# http://homeserver7.us.archive.org:8983/solr/select?q=[*+TO+*]&fl=identifier&wt=json&rows=0&indent=yes&facet=true&facet.field=mediatype
# show_web_link: boolean. If set to true, if there are more results than
# num_results a link to those further results will display
# with highlighted_links
# display_name: defaults to "Internet Archive"
class InternetArchive < Service
require 'open-uri' #
require 'cgi'
require 'multi_json' #we ask IA for json
require 'timeout' # used to timeout our requests
include MetadataHelper
# No parameters are required, we have working defaults for them all.
attr_reader :url, :num_results, :mediatypes
# maps the IA mediatype to Umlaut service type
SERVICE_TYPE_MAP = {
"texts" => :fulltext,
"audio" => :audio
}
def service_types_generated
types = [
ServiceTypeValue[:fulltext],
ServiceTypeValue[:audio],
ServiceTypeValue[:'highlighted_link']
]
types << ServiceTypeValue[:search_inside] if @include_search_inside
return types
end
def initialize(config)
# Default base URL for IA advanced search. We use this base link rather than
# the this rather than the IA Solr index directly because IA suggests that
# the Solr home may change over time.
@url = 'http://www.archive.org/advancedsearch.php?'
# default number of results to return
@num_results = 1
# default IA mediatypes to search
@mediatypes = ["texts", "audio"]
# Should the web link to further results be shown? default to true
@show_web_link = true
@display_name = "the Internet Archive"
@http_timeout = 5.seconds
@include_search_inside = false
@credits = {
"The Internet Archive" => "http://archive.org/"
}
super(config)
@num_results_for_types ||= {}
@mediatypes.each do |type|
@num_results_for_types[type] ||= @num_results
end
end
def handle(request)
begin
do_query(request)
rescue Timeout::Error => e
return request.dispatched(self, false, e)
end
return request.dispatched(self, true)
end
def do_query(request)
# get the search terms for use in both fulltext search and highlighted_link
# IA does index apostrophes, although not generally other puncutation. Need to keep em.
search_terms = {:title => get_search_title(request.referent ,:keep_apostrophes=>true),
:creator => get_search_creator(request.referent)}
# We need both title and author to continue
return nil if (search_terms[:title].blank? || search_terms[:creator].blank?)
# Return if this is an journal article link, an IA search can do nothing
# for us except waste CPU cycles for us and IA.
metadata = request.referent.metadata
return nil unless metadata["atitle"].blank? &&
metadata["issue"].blank? &&
metadata["volume"].blank?
# create one link that searches all configured mediatypes
link = @url + ia_params(search_terms)
# using open() conveniently follows the redirect for us. Alas, it
# doesn't give us access to the IA http status code response though.
response = nil
timeout(@http_timeout.to_i) {
response = open(link).read
}
if response.blank?
raise Exception.new("InternetArchive returned empty response for #{link}")
end
doc = MultiJson.load(response)
results = doc['response']['docs']
@mediatypes.each do |type|
hits = matching_hits(request, search_terms, results, type)
# if we have more results than we want to show in the main view
# we can ceate a link (highlighted_link) to the search in the sidebar
num_found = hits.length #doc['response']['numFound']
if (@show_web_link and not hits.empty? and @num_results_for_types[type] < num_found )
do_web_link(request, search_terms, type, num_found)
end
# Check for search inside only for first result of type 'text'
if (@include_search_inside &&
type == 'texts' &&
(first_hit = hits[0]) &&
(identifier = first_hit["identifier"])
)
direct_url = URI.parse("http://www.archive.org/stream/" + identifier)
# Head request, if we get a 200, we think it means we have page
# turner with search.
req = Net::HTTP.new(direct_url.host, direct_url.port)
response = req.request_head(direct_url.path)
if response.code == "200"
# search inside!
request.add_service_response(
:service => self,
:display_text=> @display_name,
:display_text_i18n => "display_name",
:url => direct_url.to_s,
:service_type_value => :search_inside
)
end
end
# add a service response for each result for this mediatype
hits.each_with_index do |result, index|
break if index >= @num_results_for_types[type]
display_name = @display_name
if result["contributor"] && result["contributor"].first
display_name += ": " + result["contributor"].first
elsif ( result["collection"] && COLLECTION_LABELS[result["collection"][0]])
display_name += ": " + COLLECTION_LABELS[result["collection"][0]]
end
service_type = SERVICE_TYPE_MAP[type]
request.add_service_response(
:service=>self,
:display_text=>display_name,
:display_text_i18n => "display_name",
:url=>create_result_url(result),
:match_reliability => ServiceResponse::MatchUnsure,
:edition_str => edition_str(result),
:service_type_value => service_type )
end
end
end
# Here we create params in the format that the IA advanced search needs.
# These are solr-like params.
def ia_params(search_terms)
return nil if search_terms[:title].nil?
params = 'fl%5B%5D=*&fmt=json&xmlsearch=Search' #&indent=yes
params << "&rows=999&q=" #is 999 too many or even too few?
params << create_query_params(search_terms)
end
def create_result_url(result)
'http://archive.org/details/' + result['identifier']
end
# displaying the num_found relies on the number of results from ia_params being
# enough to capture all results for a mediatype. If there are more potential
# results then num_found will not be accurate. But good enough.
def do_web_link(request, search_terms, type, num_found)
display_text = "#{num_found} digital #{type.singularize} " + (num_found > 1 ? "files" : "file")
url = create_web_link_url(search_terms, type)
request.add_service_response(
:service=>self,
:url=>url,
:display_text=>display_text,
:service_type_value => :highlighted_link
)
end
def create_web_link_url(search_terms, type)
'http://www.archive.org/search.php?query=' << create_query_params(search_terms, type)
#url << CGI.escape('mediatype:' << type << ' AND ')
end
# if given a type it will only search for one mediatype. otherwise it
# does an OR search for all configured mediatypes
def create_query_params(search_terms, type=nil)
# Downcase params to avoid weird misconfiguration in IA's SOLR
# installation, where it's interpreting uppercase words as
# commands even within quotes. Also take out any parens in input.
# Also IA does not semi-colons in input?!?
title = safe_argument(search_terms[:title])
params = 'title:' << CGI.escape('"' << title << '"')
if (! search_terms[:creator].blank?)
creator = safe_argument(search_terms[:creator])
params << '+AND+creator:' << CGI.escape('(' << creator << ')')
end
mt = []
params << '+AND+('
if type
params << 'mediatype:' << type
else
@mediatypes.each do |t|
mt << ('mediatype:' << t)
end
params << mt.join('+OR+')
end
params << ')' #closing the mediatypes with a paren
end
# used on what will be values stuck into a URL as search terms,
# does NOT cgi escape, but does safe-ify them in other ways for IA.
def safe_argument(string)
# Downcase params to avoid weird misconfiguration in IA's SOLR
# installation, where it's interpreting uppercase words as
# commands even within quotes.
output = string.downcase
# Remove parens, semi-colons, brackets, hyphens, colons -- they all mess
# up IA, which thinks they are special chars. Remove double quote,
# special char, which sometimes we want to use ourselves. Replace
# all with spaces to avoid accidentally conjoining words.
# (could be
# escaping instead? Not worth it, we don't want to search
# on these anyway. Remove ALL punctuation? Not sure.)
output.gsub!(/[)(\]\[;"\=\-\:]/, ' ')
return output
end
def matching_hits(request, search_terms, results, type)
full_title = raw_search_title(request.referent)
hits = results.find_all do |r|
r["mediatype"] == type &&
titles_sufficiently_matched(search_terms[:title], full_title, r["title"])
end
return hits
end
# Some obtuse code to heuristically decide if our query title and a result
# title fuzzily match sufficiently to be considered a match.
def titles_sufficiently_matched(query_title, full_title, result)
normalized_query = normalize_title(query_title)
normalized_full_title = normalize_title(full_title)
# If the title has more than 3 words, and our IA query returned
# a result for it -- that's probably good enough.
return true if normalized_query.split(" ").length > 3
# Otherwise, make multiple versions of the candidate
# title -- the whole thing, the title until the first colon,
# and the title until the first comma or semi-colon or other punct. Normalize
# them all. See if any of them match EITHER our search title or
# our full title.
candidates = [
result,
result.split(":").first,
result.split(/[\;\,\(\)]/).first
].compact.uniq.collect {|a| normalize_title(a)}
return (candidates & [normalized_query, normalized_full_title]).present?
end
def edition_str(result)
edition_str = ""
edition_str << result['title'] unless result['title'].blank?
edition_str << " / #{result['creator'].first}" unless result['creator'].blank?
edition_str << ". #{result["publisher"].first}" unless result['publisher'].blank?
unless result['date'].blank?
year = result['date'].slice(0,4)
edition_str << ": #{year}"
end
edition_str = nil if edition_str.blank?
return edition_str
end
# catch and redirect response_url fo rsearch_inside
def response_url(service_type, submitted_params)
if ( ! (service_type.service_type_value.name == "search_inside" ))
return super(service_type, submitted_params)
else
base = service_type.service_response[:url]
query = CGI.escape(submitted_params["query"] || "")
url = base + "#search/#{query}"
return url
end
end
## collection labels
# list of collection labels can be found here:
# http://www.archive.org/advancedsearch.php?q=mediatype%3Acollection&fl[]=collection&fl[]=identifier&fl[]=title&sort[]=&sort[]=&sort[]=&rows=9999&indent=yes&fmt=json&xmlsearch=Search
# FIXME either get these dynamically at intervals or add a fuller set below.
# Currently there are over 4300 collections.
# If we're going to do this as a static hash then it should be a class
# constant. Currently this hash contains a small selection of collections
# which include the 'audio' mediatype and all that contain the 'texts' mediatype.
COLLECTION_LABELS = {
"CaliforniaFishandGame"=>"California Fish and Game",
"ol_data"=>"Open Library Data",
"worldhealthorganization"=>"World Health Organization",
"opensource_movies"=>"Open Source Movies",
"clairetcarneylibrary"=>
"Claire T. Carney Library, University of Massachusetts Dartmouth",
"university_of_illinois_urbana-champaign"=>
"University of Illinois Urbana-Champaign",
"smithsonian_books"=>"Smithsonian",
"nhml_london"=>"Natural History Museum Library, London",
"animationandcartoons"=>"Animation & Cartoons",
"university_of_toronto_regis"=>"Regis College Library",
"vlogs"=>"Vlogs",
"opensource"=>"Open Source Books",
"USGovernmentDocuments"=>"US Government Documents",
"danceman"=>"Dance Manuals",
"additional_collections"=>"Additional Collections",
"internet_archive_books"=>"Internet Archive Books",
"sloan"=>"Sloan Foundation",
"iacl"=>"Children's Library",
"audio_religion"=>"Spirituality & Religion",
"microfilm"=>"Books from Microfilm",
"toronto"=>"Canadian Libraries",
"prelinger"=>"Prelinger Archives",
"bostonpubliclibrary"=>"Boston Public Library",
"sports"=>"Sports Videos",
"universallibrary"=>"Universal Library",
"sfpl"=>"The San Francisco Public Library",
"university_of_toronto_knox"=>"Caven Library, Knox College",
"memorial_university"=>"Memorial University of Newfoundland & Labrador",
"MBLWHOI"=>"MBLWHOI Library",
"oreilly_books"=>"O'Reilly",
"burstein"=>"The Burstein Alice in Wonderland Collection",
"ucroho"=>"Regional Oral History Office",
"Brandeis_University"=>"Brandeis University Libraries",
"birney_anti_slavery_collection"=>"Birney Anti-Slavery Collection",
"Johns_Hopkins_University"=>"The Johns Hopkins University Sheridan Libraries",
"culturalandacademicfilms"=>"Cultural & Academic Films",
"Harvard_University"=>"Harvard University",
"montana_state_publications"=>"Montana State Government Publications",
"national_institute_for_newman_studies"=>
"National Institute for Newman Studies",
"buddha"=>"Buddha Books",
"university_of_toronto_fisher"=>"Thomas Fisher Rare Book Library",
"ryerson_university"=>"Ryerson University",
"university_of_toronto_emmanuel"=>
"Emmanuel College Library, Victoria University",
"unica"=>"Unica: Rare Books from UIUC",
"mugar"=>"The Mugar Memorial Library, Boston University",
"havergal"=>"Havergal College",
"university_of_toronto_gerstein"=>
"University of Toronto - Gerstein Science Information Centre",
"NY_Botanical_Garden"=>"The New York Botanical Garden",
"calacademy"=>"California Academy of Sciences",
"chm_fiche"=>"Computer History Museum",
"university_of_toronto_crrs"=>
"Centre for Reformation and Renaissance Studies Library",
"djo"=>"Dickens Journals Online",
"unclibraries"=>"University of North Carolina at Chapel Hill",
"university_of_toronto_oise"=>"OISE/UT Library",
"newsandpublicaffairs"=>"News & Public Affairs",
"biodiversity"=>"Biodiversity Heritage Library",
"university_of_ottawa"=>"University of Ottawa",
"Wellesley_College_Library"=>"Wellesley College Library",
"audio_foreign"=>"Non-English Audio",
"national_library_of_australia"=>"National Library of Australia",
"datadumps"=>"Open Library Data",
"microfilmreel"=>"Reels of Microfilm",
"saint_marys_college"=>"Saint Mary's College of California",
"university_of_toronto_pratt"=>"E.J. Pratt Library",
"Boston_College_Library"=>"Boston College Library",
"uchicago"=>"University of Chicago",
"audio_podcast"=>"Podcasts",
"tufts"=>"Tufts University",
"opensource_audio"=>"Open Source Audio",
"university_of_toronto_trinity"=>"John W. Graham Library, Trinity College",
"audio_tech"=>"Computers & Technology",
"moviesandfilms"=>"Movies",
"etree"=>"Live Music Archive",
"marcuslucero"=>"the Marucs Lucero",
"opencontentalliance"=>"Open Content Alliance",
"radioprograms"=>"Radio Programs",
"university_of_toronto_pims"=>"PIMS - University of Toronto",
"newspapers"=>"Newspapers",
"university_of_california_libraries"=>"University of California Libraries",
"millionbooks"=>"Million Book Project",
"university_of_toronto_robarts"=>"University of Toronto - Robarts Library",
"university_of_toronto"=>"University of Toronto",
"montana_state_library"=>"Montana State Library",
"bancroft_library"=>"The Bancroft Library",
"prelinger_library"=>"Prelinger Library",
"libraryofcongress"=>"The Library of Congress",
"richtest"=>"Test books from California",
"mobot"=>"Missouri Botanical Garden",
"gamevideos"=>"Video Games",
"blc"=>"The Boston Library Consortium",
"cdl"=>"California Digital Library",
"Princeton"=>"Princeton Theological Seminary",
"mcmaster_university"=>"McMaster University",
"sanfranciscopubliclibrary"=>"San Francisco Public Library",
"spanish_texts"=>"The Spanish Language Library",
"boston_college_libraries"=>"The Boston College Libraries",
"gutenberg"=>"Project Gutenberg",
"Music_UniversityofToronto"=>"Music - University of Toronto",
"msn_books"=>"Microsoft",
"youth_media"=>"Youth Media",
"independent"=>"independent texts",
"carletonlibrary"=>"Carleton University Library",
"arpanet"=>"Arpanet",
"yahoo_books"=>"Yahoo!",
"johnadamsBPL"=>"The John Adams Library at the Boston Public Library",
"library_of_congress"=>"The Library of Congress",
"ColumbiaUniversityLibraries"=>"Columbia University Libraries",
"university_of_guelph"=>"University of Guelph",
"GratefulDead"=>"Grateful Dead",
"audio_bookspoetry"=>"Audio Books & Poetry",
"ncsulibraries"=>"North Carolina State University Libraries",
"brown_university_library"=>"Brown University Library",
"Allen_County_Public_Library"=>"Allen County Public Library",
"yrlsc"=>"The Charles E. Young Research Library Special Collections",
"torontotest"=>"Test books from Canada",
"americana"=>"American Libraries",
"librivoxaudio"=>"LibriVox",
"audio_music"=>"Music & Arts",
"toronto_public_library"=>"Toronto Public Library",
"getty"=>"Research Library, Getty Research Institute",
"ontla"=>"The Legislative Assembly of Ontario Collection",
"TheChristianRadical"=>"The Christian Radical",
"netlabels"=>"Netlabels",
"newyorkpubliclibrary"=>"New York Public Library",
"University_of_New_Hampshire_Library"=>"University of New Hampshire Library",
"cbk"=>"Cook Books and Home Economics",
"audio_news"=>"News & Public Affairs",
"ant_texts"=>"Ant Texts",
"computersandtechvideos"=>"Computers & Technology",
"the_beat_within"=>"The Beat Within Magazine",
"university_of_toronto_kelly"=>"University of Toronto - John M Kelly Library",
"library_and_archives_canada"=>"Library and Archives Canada",
"ephemera"=>"Ephemeral Films",
"OXFAM"=>"Oxfam",
"foreignlanguagevideos"=>"Non-English Videos",
"MontanaStateLibrary"=>"Montana State Library",
"EarthSciences_UniversityofToronto"=>"Earth Sciences University of Toronto",
"octavo"=>"Octavo",
"artsandmusicvideos"=>"Arts & Music"
}
end
# Test URLs using defaults
# Shows texts and audio under fulltext, but only a see also for texts
# http://localhost:3000/resolve?&rft.title=Fairy+Tales&rft.aulast=Andersen&ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook
#
# Shows texts and audio, but only see also for audio
# http://localhost:3000/resolve?&rft.title=Frankenstein&rft.aulast=Shelley&ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook
#
# WorldCat links
# If you have OpenURL Referrer or another Firefox add-on configured to
# turn COiNS into an OpenURL to localhost:3000, these links have hits in IA.
# Frankenstein: http://www.worldcat.org/oclc/33045872
# Alice in Wonderland: http://www.worldcat.org/oclc/221499
# Fairy Tales by Andersen: http://www.worldcat.org/oclc/68711386
# Adventures of Huckleberry Finn: http://www.worldcat.org/oclc/2985768
# Gift of the Magi: http://www.worldcat.org/oclc/9065223
# Heart of the West: http://www.worldcat.org/oclc/49293242
# Little Women; or, Meg, Jo, Beth, and Amy: http://www.worldcat.org/oclc/1157
# FIXME should we remove everything after ; as well?
# Letters from a Cat: http://www.worldcat.org/oclc/13529549
# Uncle Tom's Cabin: http://www.worldcat.org/oclc/7945691
# needed apostrophe to succeed
# Goody Two-Shoes: http://www.worldcat.org/oclc/32678428
# The Snow-Image: http://www.worldcat.org/oclc/5020610
# Les Canadiens-Français: http://www.worldcat.org/oclc/186641188
# FIXME should match 1 record and doesn't. character encoding problems?
# John L. Stoddard's Lectures: http://www.worldcat.org/oclc/2181690