#!/usr/local/bin/ruby
# -*- coding: utf-8 -*-
# $Id$
$:.unshift "."
require "net/http"
require "net/https"
#require "pp"
require "digest/md5"
require "tempfile"
require "erb"
require "cgi"
require "nkf"
require "kconv"
require "rubygems"
require "json"
require "libxml"
begin
require "MeCab"
rescue LoadError
begin
require "mecab"
rescue
require "mecab_local.rb"
end
end
begin
require "extractcontent"
rescue LoadError
# require "extractcontent_local.rb"
end
Encoding.default_external = Encoding::UTF_8
module Math
def self.log2( n )
Math.log10( n ) / Math.log10( 2 )
end
end
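# NOTE: Ruby 1.9+ ships a native Math.log2 with the same semantics; the
# definition above mainly keeps 1.8-era interpreters working.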
class String
def shorten( len = 120 )
matched = self.gsub( /\n/, ' ' ).scan( /^.{0,#{len - 2}}/u )[0]
if $'.nil? || $'.empty?
matched
else
matched + '..'
end
end
end
class Array
def uniq_by
result = []
hash = {}
self.each do |e|
key = yield( e )
next if hash[ key ]
hash[ key ] = true
result << e
end
result
end
end
class Hash
def make_uri_params
if self.empty?
""
else
self.keys.map do |e|
"#{ e }=#{ URI.escape( self[e].to_s ) }"
end.join( "&" )
end
end
end
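# Much of this file relies on URI.escape, which was deprecated in Ruby 2.7
# and removed in 3.0. A minimal compatibility shim for newer interpreters
# (a sketch, not part of the original code; ERB::Util.url_encode escapes
# more aggressively than URI.escape did, which is acceptable for the query
# values built below):
unless URI.respond_to?( :escape )
module URI
def self.escape( str )
ERB::Util.url_encode( str.to_s )
end
end
end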
module Fuwatto
VERSION = '3.2.0'
BASE_URI = 'https://fuwat.to/'
USER_AGENT = "Fuwatto Search/#{ VERSION }; #{ BASE_URI }"
CACHE_TIME = 60 * 60 * 24 * 5 # cache entries stay valid for 5 days
MAX_PAGE = 19 # number of items shown in the pagination bar
PRF_TOP_K = 10
PRF_ALPHA = 0.5 # weight of the PRF terms relative to the original query vector
# (given as a ratio to the maximum weight in the original vector)
# Bag-of-words document representation
class Document < Array
include Fuwatto
attr_reader :content
# opts - selects how term scores are computed
#
# :term_weight
# - :default (:logcost) => log of the MeCab word cost
# - :cost => MeCab word cost
# - :tf => TF (term frequency)
def initialize( content, mode = :mecab, opts = {} )
super()
return if content.nil?
@content = NKF.nkf( "-wm0XZ1", content ).gsub( /\s+/, " " ).strip
normalized_content = @content.downcase
opts[ :term_weight ] = :default if not opts[ :term_weight ]
clear
case mode
when "yahoo"
self.push( *extract_keywords_yahooapi( normalized_content ) )
else
self.push( *extract_keywords_mecab( normalized_content, opts ) )
end
#puts self
end
# Similarity to a query vector (dot product averaged over the query terms)
def sim( vector )
sum = 0.0
vector.each do |k, v|
term = self.assoc( k )
sum += v * term[1] if term
end
sum / vector.size
end
end
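# Usage sketch (hypothetical strings; a Document is an Array of
# [ term, score ] pairs sorted by descending score):
#   doc = Fuwatto::Document.new( "情報検索システムの評価実験" )
#   doc.first                                             #=> e.g. [ "情報検索", 12.3 ]
#   doc.sim( Fuwatto::Document.new( "検索エンジンの評価" ) ) #=> Float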
YAHOO_KEYWORD_BASEURI = "http://jlp.yahooapis.jp/KeyphraseService/V1/extract"
YAHOO_APPID = "W11oHSWxg65mAdRwjBT4ylIdfS9PkHPjVvtJzx9Quwy.um8e1LPf_b.4usSBcmI-"
def extract_keywords_yahooapi( str )
#cont = open( "?appid=#{ YAHOO_APPID }&sentence=#{ URI.escape( str ) }&output=xml" ){|io| io.read }
uri = URI.parse( YAHOO_KEYWORD_BASEURI )
#response = http_get( uri )
http = Net::HTTP.new( uri.host, uri.port )
xml = nil
http.start do |conn|
data = "appid=#{ YAHOO_APPID }&sentence=#{ URI.escape( str ) }&output=xml"
# p data
res, = conn.post( uri.path, data )
xml = res.body
end
#p xml
parser = LibXML::XML::Parser.string( xml )
doc = parser.parse
keywords = doc.find( "//y:Keyphrase", "y:urn:yahoo:jp:jlp:KeyphraseService" ).map{|e| e.content }
#keywords.each do |e|
# p e.content
#end
keywords.select{|e| not e.nil? and not e.empty? }
end
def extract_keywords_mecab( str, opts )
return [] if str.strip.empty?
mecab = MeCab::Tagger.new( '--node-format=%m\t%H\t%c\n --unk-format=%m\tUNK\t%c\n' )
lines = mecab.parse( str.toeuc )
#puts lines
lines = lines.toutf8.split( /\n/ ).map{|l| l.split(/\t/) }
lines = lines.select{|l| l[2] and l[1] =~ /^名詞|UNK|形容詞/o and l[1] !~ /接[頭尾]|非自立|代名詞/o }
#p lines
if lines.empty?
raise Fuwatto::NoKeywordExtractedError
end
min = lines.map{|e| e[2].to_i }.min
lines = lines.map{|e| [ e[0], e[1], e[2].to_i + min.abs + 1 ] } if min < 0
count = Hash.new( 0 )
score = 0
lines.each_with_index do |line, idx|
# Ignore ASCII and other symbol chars
next if line[0] =~ /\A[\x00-\x2f\x3a-\x40\x5b-\x60\x7b-\x7f、。「『』」・〇〜ー□‰±♪゜]+\Z/o
# Ignore one or two digit numbers
next if line[0] =~ /\A[0-9][0-9]?\Z/o
# Stop words, derived from Lucene
next if line[0] =~ /\A(?:w(?:h(?:e(?:re(?:a(?:[st]|fter)|u(?:nto|pon)|in(?:to)?|o[fn]?|from|with|ver|by)?|n(?:(?:so)?ever|ce)?|ther)|o(?:m(?:(?:so)?ever)?|s(?:oever|e)|ever|le)?|i(?:ch(?:(?:so)?ever)?|l(?:st|e)|ther)|at(?:(?:so)?ever)?|y)|i(?:th(?:out|in)?|ll)|e(?:ll|re)?|ould|as)|a(?:l(?:(?:bei|mos)t|on[eg]|though|ready|ways|so|l)|n(?:y(?:(?:wher|on)e|thing|how)?|other|d)?|fter(?:wards)?|bo(?:ut|ve)|gain(?:st)?|mong(?:st)?|r(?:ound|e)|(?:cros)?s|dj|t)?|t(?:h(?:e(?:re(?:(?:upo|i)n|afte|for|by)?|m(?:selves)?|n(?:ce)?|ir|se|y)?|r(?:ough(?:out)?|u)|o(?:ugh|se)|[iu]s|a[nt])|o(?:gether|wards?|o)?)?|s(?:o(?:me(?:t(?:imes?|hing)|(?:wher|on)e|how)?)?|e(?:em(?:ing|ed|s)?|veral)|(?:inc|am)e|h(?:ould|e)|till|uch)?|b(?:e(?:c(?:om(?:es?|ing)|a(?:us|m)e)|fore(?:hand)?|(?:hi|yo)nd|(?:twe)?en|sides?|ing|low)?|oth|ut|y)|h(?:e(?:r(?:e(?:(?:upo|i)n|by)?|s(?:elf)?)?|eafter|nce)?|i(?:m(?:self)?|s)|a(?:[ds]|ve)|ow(?:ever)?)|o(?:u(?:r(?:(?:selve)?s)?|t)|n(?:ce one|ly|to)?|ther(?:wise|s)?|f(?:ten|f)?|(?:ve)?r|wn)|e(?:ve(?:r(?:y(?:(?:wher|on)e|thing)?)?|n)|ls(?:ewher)?e|(?:noug|ac)h|ither|xcept|tc|g)|n(?:o(?:[rw]|t(?:hing)?|body|o?ne)?|e(?:ver(?:theless)?|ither|xt)|amely|where)|m(?:o(?:re(?:over)?|st(?:ly)?)|(?:eanwhil)?e|u(?:ch|st)|y(?:self)?|an?y|ight)|i(?:[efs]|n(?:deed|to|c)?|t(?:s(?:elf)?)?)?|f(?:or(?:mer(?:ly)?)?|urther|irst|rom|ew)|l(?:a(?:tter(?:ly)?|st)|e(?:ast|ss)|td)|y(?:ou(?:r(?:s(?:el(?:ves|f))?)?)?|et)|x(?:author|other |note|subj|cal)|u(?:n(?:der|til)|p(?:on)?|s)|c(?:an(?:not)?|o(?:uld)?)|d(?:uring|own)|per(?:haps)?|v(?:ery|ia)|rather)\Z/o
#next if line[0].size < 3
#p line[2]
#puts line.join("\t")
case opts[ :term_weight ]
when :tf
score = 1
when :count, :cost
score = line[2].to_i
else # :logcost, :default
score = Math.log2( line[2].to_i + 1 )
end
#p [ line[0], score, idx ]
score /= Math.log2( idx + 2 ) if opts[ :term_weight_position ]
count[ line[0] ] += score
#count[ line[0] ] += 1
end
# p count
ranks = count.keys.sort_by{|e| count[e] }.reverse.map{|e| [e,count[e]] }
#pp ranks
#3.times do |i|
# puts [ i+1, ranks[i], count[ ranks[i] ] ].join( "\t" )
#end
ranks
end
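# The tagger above emits "surface\tfeature\tcost" lines (e.g.
# "検索\t名詞,サ変接続,...\t4998"); only nouns, adjectives and unknown
# words survive the filters, and the result is [ [ term, score ], ... ]
# in descending score order (illustrative values, not actual output).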
# Supports redirect
def http_get( uri, limit = 10 )
#STDERR.puts uri.to_s
raise "Too many redirects: #{ uri }" if limit < 0
http_proxy = ENV[ "http_proxy" ]
proxy, proxy_port = nil
if http_proxy
proxy_uri = URI.parse( http_proxy )
proxy = proxy_uri.host
proxy_port = proxy_uri.port
end
http = Net::HTTP.Proxy( proxy, proxy_port ).new( uri.host, uri.port )
http.use_ssl = true if uri.scheme == "https"
http.start do |http|
response, = http.get( uri.request_uri, { 'User-Agent'=>USER_AGENT } )
#if response.code !~ /^2/
# response.each do |k,v|
# p [ k, v ]
# end
#end
case response
when Net::HTTPSuccess
response
when Net::HTTPRedirection
redirect_uri = URI.parse( response['Location'] )
STDERR.puts "redirect to #{ redirect_uri } (#{limit})"
http_get( uri + redirect_uri, limit - 1 )
else
response.error!
end
end
end
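# Usage sketch: http_get follows up to 10 redirects and honours the
# http_proxy environment variable:
#   response = http_get( URI.parse( "https://fuwat.to/" ) )
#   response.body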
CACHE_DIR = "cache"
class Cache
CACHE_DIR = "cache"
def initialize( prefix )
@prefix = prefix
@cache_dir = File.join( CACHE_DIR, @prefix )
Dir.mkdir( @cache_dir ) unless File.exist?( @cache_dir )
end
def filename( name, page = 0 )
xml_fname = name.dup
if xml_fname.size > 245
xml_fname = Digest::MD5.hexdigest( xml_fname )
end
xml_fname << ":#{ page }" if not page.nil? and not page == 0
xml_fname << ".xml"
File.join( @cache_dir, xml_fname )
end
def fetch( name, page = 0 )
fname = filename( name, page )
if File.exist?( fname ) and ( Time.now - File.mtime( fname ) ) < CACHE_TIME
cont = open( fname ){|io| io.read }
else
cont = yield
open( fname, "w" ){|io| io.print cont }
end
cont
end
end
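# Usage sketch (hypothetical query; fetch returns the cached body while it
# is newer than CACHE_TIME, otherwise stores and returns the block's value):
#   cache = Cache.new( "cinii" )
#   body = cache.fetch( "information retrieval" ){ http_get( uri ).body }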
def cache_xml( prefix, name, page = 0 )
xml_fname = name.dup
if xml_fname.size > 245
xml_fname = Digest::MD5.hexdigest( xml_fname )
end
xml_fname << ":#{ page }" if not page.nil? and not page == 0
xml_fname << ".xml"
File.join( CACHE_DIR, prefix, xml_fname )
end
PDFTOTEXT = "/usr/bin/pdftotext"
def pdftotext( pdf_str )
pdf_file = Tempfile.new( [ "pdf", ".pdf" ] )
pdf_file.print pdf_str
pdf_file.flush
#p pdf_file.size
IO.popen( "#{ PDFTOTEXT } -raw -enc EUC-JP #{ pdf_file.path } -" ) do |io|
io.read
end
end
CINII_APPID = "CiNii10-281c60b8055e2d850686566dedb7c922"
# CiNii Opensearch API
def cinii_search( keyword, opts = {} )
base_uri = "http://ci.nii.ac.jp/opensearch/search"
q = URI.escape( keyword )
cont = nil
cache_file = cache_xml( "cinii", q, opts[:start] )
#p File.mtime( cache_file )
if File.exist?( cache_file ) and ( Time.now - File.mtime( cache_file ) ) < CACHE_TIME
cont = open( cache_file ){|io| io.read }
else
# TODO: support both Atom and RSS (currently Atom only)
opts[ :format ] = "atom"
opts_s = opts.make_uri_params
opensearch_uri = URI.parse( "#{ base_uri }?q=#{ q }&appid=#{ CINII_APPID }&#{ opts_s }" )
response = http_get( opensearch_uri )
cont = response.body
open( cache_file, "w" ){|io| io.print cont }
end
data = {}
parser = LibXML::XML::Parser.string( cont )
doc = parser.parse
# ref. http://ci.nii.ac.jp/info/ja/if_opensearch.html
data[ :q ] = keyword
data[ :link ] = doc.find( "//atom:id", "atom:http://www.w3.org/2005/Atom" )[0].content.gsub( /&(format=atom|appid=#{ CINII_APPID })\b/, "" )
data[ :totalResults ] = doc.find( "//opensearch:totalResults" )[0].content.to_i
if data[ :totalResults ] > 0
data[ :itemsPerPage ] = doc.find( "//opensearch:itemsPerPage" )[0].content.to_i
end
entries = doc.find( "//atom:entry", "atom:http://www.w3.org/2005/Atom" )
data[ :entries ] = []
entries.each do |e|
title = e.find( "./atom:title", "atom:http://www.w3.org/2005/Atom" )[0]
title = title ? title.content : "(No Title)"
url = e.find( "./atom:id", "atom:http://www.w3.org/2005/Atom" )[0].content
author = e.find( ".//atom:author/atom:name", "atom:http://www.w3.org/2005/Atom" ).to_a.map{|name|
a = name.content
/^\s*\W/.match( a ) ? a.gsub( /\s*,\s*/, " " ) : a
}.join( "; " )
pubname = e.find( "./prism:publicationName", "prism:http://prismstandard.org/namespaces/basic/2.0/" )[0]
if pubname.nil?
pubname = e.find( "./dc:publisher", "dc:http://purl.org/dc/elements/1.1/" )[0]
pubname = pubname.content if pubname
else
pubname = pubname.content
end
pubdate = e.find( "./prism:publicationDate", "prism:http://prismstandard.org/namespaces/basic/2.0/" )[0] #.content
pubdate = pubdate.nil? ? "" : pubdate.content
description = e.find( "./atom:content", "atom:http://www.w3.org/2005/Atom" )[0]
description = description.nil? ? "" : description.content
data[ :entries ] << {
:title => title,
:url => url,
:author => author,
:publicationName => pubname,
:publicationDate => pubdate,
:description => description,
}
end
data
end
def cinii_research_search( keyword, opts = {} )
base_uri = "https://cir.nii.ac.jp/opensearch/all"
q = URI.escape( keyword )
cont = nil
cache_file = cache_xml( "cinii_r", q, opts[:start] )
#p File.mtime( cache_file )
if File.exist?( cache_file ) and ( Time.now - File.mtime( cache_file ) ) < CACHE_TIME
cont = open( cache_file ){|io| io.read }
else
# TODO: support both Atom and RSS (currently Atom only)
opts[ :format ] = "atom"
opts_s = opts.make_uri_params
opensearch_uri = URI.parse( "#{ base_uri }?q=#{ q }&appid=#{ CINII_APPID }&#{ opts_s }" )
response = http_get( opensearch_uri )
cont = response.body
open( cache_file, "w" ){|io| io.print cont }
end
data = {}
parser = LibXML::XML::Parser.string( cont )
doc = parser.parse
# ref. https://support.nii.ac.jp/ja/cir/r_opensearch, (old) http://ci.nii.ac.jp/info/ja/if_opensearch.html
data[ :q ] = keyword
data[ :link ] = doc.find( "//atom:id", "atom:http://www.w3.org/2005/Atom" )[0].content.gsub( /&(format=atom|appid=#{ CINII_APPID })\b/, "" )
data[ :totalResults ] = doc.find( "//opensearch:totalResults" )[0].content.to_i
if data[ :totalResults ] > 0
data[ :itemsPerPage ] = doc.find( "//opensearch:itemsPerPage" )[0].content.to_i
end
entries = doc.find( "//atom:entry", "atom:http://www.w3.org/2005/Atom" )
data[ :entries ] = []
entries.each do |e|
title = e.find( "./atom:title", "atom:http://www.w3.org/2005/Atom" )[0]
title = title ? title.content : "(No Title)"
url = e.find( "./atom:id", "atom:http://www.w3.org/2005/Atom" )[0].content
author = e.find( ".//atom:author/atom:name", "atom:http://www.w3.org/2005/Atom" ).to_a.map{|name|
a = name.content
/^\s*\W/.match( a ) ? a.gsub( /\s*,\s*/, " " ) : a
}.join( "; " )
pubname = e.find( "./prism:publicationName", "prism:http://prismstandard.org/namespaces/basic/2.0/" )[0]
if pubname.nil?
pubname = e.find( "./dc:publisher", "dc:http://purl.org/dc/elements/1.1/" )[0]
pubname = pubname.content if pubname
else
pubname = pubname.content
end
pubdate = e.find( "./prism:publicationDate", "prism:http://prismstandard.org/namespaces/basic/2.0/" )[0] #.content
pubdate = pubdate.nil? ? "" : pubdate.content
description = e.find( "./atom:content", "atom:http://www.w3.org/2005/Atom" )[0]
description = description.nil? ? "" : description.content
data[ :entries ] << {
:title => title,
:url => url,
:author => author,
:publicationName => pubname,
:publicationDate => pubdate,
:description => description,
}
end
data
end
# CiNii (Author) Opensearch API
def cinii_author_search( keyword, opts = {} )
base_uri = "http://ci.nii.ac.jp/opensearch/author"
q = URI.escape( keyword )
cont = nil
cache_file = cache_xml( "cinii_author", q, opts[:start] )
#p File.mtime( cache_file )
if File.exist?( cache_file ) and ( Time.now - File.mtime( cache_file ) ) < CACHE_TIME
cont = open( cache_file ){|io| io.read }
else
opts[ :format ] = "atom"
opts[ :sortorder ] ||= 3
opts_s = opts.make_uri_params
opensearch_uri = URI.parse( "#{ base_uri }?q=#{ q }&appid=#{ CINII_APPID }&#{ opts_s }" )
response = http_get( opensearch_uri )
cont = response.body
open( cache_file, "w" ){|io| io.print cont }
end
data = {}
parser = LibXML::XML::Parser.string( cont )
doc = parser.parse
# ref. http://ci.nii.ac.jp/info/ja/if_opensearch_auth.html
data[ :q ] = keyword
data[ :link ] = doc.find( "//atom:id", "atom:http://www.w3.org/2005/Atom" )[0].content.gsub( /&(format=atom|appid=#{ CINII_APPID })\b/, "" )
data[ :totalResults ] = doc.find( "//opensearch:totalResults" )[0].content.to_i
entries = doc.find( "//atom:entry", "atom:http://www.w3.org/2005/Atom" )
data[ :entries ] = []
entries.each do |e|
title = e.find( "./atom:title", "atom:http://www.w3.org/2005/Atom" )[0].content
url = e.find( "./atom:id", "atom:http://www.w3.org/2005/Atom" )[0].content
affiliation = e.find( "./atom:content", "atom:http://www.w3.org/2005/Atom" )[0]
affiliation = affiliation ? affiliation.content : ""
data[ :entries ] << {
:title => title,
:url => url,
:affiliation => affiliation,
}
end
data
end
# CiNii (Author:NRID) Opensearch API
def cinii_nrid_search( nrid, opts = {} )
q = URI.escape( nrid )
base_uri = "http://ci.nii.ac.jp/opensearch/nrid/"
cont = nil
cache_file = cache_xml( "cinii_nrid", q, opts[:start] )
#p File.mtime( cache_file )
if File.exist?( cache_file ) and ( Time.now - File.mtime( cache_file ) ) < CACHE_TIME
cont = open( cache_file ){|io| io.read }
else
opts[ :format ] = "atom"
# opts[ :sortorder ] ||= 3
opts_s = opts.make_uri_params
opensearch_uri = URI.parse( "#{ base_uri }#{ q }?appid=#{ CINII_APPID }&#{ opts_s }" )
response = http_get( opensearch_uri )
cont = response.body
open( cache_file, "w" ){|io| io.print cont }
end
data = {}
parser = LibXML::XML::Parser.string( cont )
doc = parser.parse
# ref. http://ci.nii.ac.jp/info/ja/if_opensearch_auth.html
data[ :q ] = nrid
data[ :link ] = doc.find( "//atom:id", "atom:http://www.w3.org/2005/Atom" )[0].content.gsub( /&(format=atom|appid=#{ CINII_APPID })\b/, "" )
data[ :totalResults ] = doc.find( "//opensearch:totalResults" )[0].content.to_i
entries = doc.find( "//atom:entry", "atom:http://www.w3.org/2005/Atom" )
data[ :entries ] = []
entries.each do |e|
title = e.find( "./atom:title", "atom:http://www.w3.org/2005/Atom" )[0].content
url = e.find( "./atom:id", "atom:http://www.w3.org/2005/Atom" )[0].content
author = e.find( ".//atom:author/atom:name", "atom:http://www.w3.org/2005/Atom" ).to_a.map{|name| name.content }.join( "; " )
pubname = e.find( "./prism:publicationName", "prism:http://prismstandard.org/namespaces/basic/2.0/" )[0]
if pubname.nil?
pubname = e.find( "./dc:publisher", "dc:http://purl.org/dc/elements/1.1/" )[0]
pubname = pubname.content if pubname
else
pubname = pubname.content
end
pubdate = e.find( "./prism:publicationDate", "prism:http://prismstandard.org/namespaces/basic/2.0/" )[0] #.content
pubdate = pubdate.nil? ? "" : pubdate.content
description = e.find( "./atom:content", "atom:http://www.w3.org/2005/Atom" )[0]
description = description.nil? ? "" : description.content
data[ :entries ] << {
:title => title,
:url => url,
:author => author,
:publicationName => pubname,
:publicationDate => pubdate,
:description => description,
}
end
data
end
def cinii_author_nrid_search( name, naid = [] )
data = cinii_author_search( name )
result = data
if not naid.empty?
entries = []
result[ :entries ].each do |author|
#p author[ :url ]
if /nrid\/(.+)\Z/ =~ author[ :url ]
nrid = $1
if cinii_nrid_search( nrid )[ :entries ].find{|e| naid.include?( e[ :url ] ) }
entries << author
end
end
end
result[ :entries ] = entries
end
result
end
# NDL Porta Opensearch
def ndl_search( keyword, opts = {} )
base_uri = "http://api.porta.ndl.go.jp/servicedp/opensearch"
q = URI.escape( keyword )
cont = nil
cache_file = cache_xml( "ndl", q, opts[:start] )
if File.exist?( cache_file ) and ( Time.now - File.mtime( cache_file ) ) < CACHE_TIME
cont = open( cache_file ){|io| io.read }
else
opts[ :format ] = "atom"
opts[ :dpgroupid ] = "ndl"
if opts[ :start ]
opts[ :idx ] = opts[ :start ].dup
opts.delete( :start )
end
opts_s = opts.make_uri_params
opensearch_uri = URI.parse( "#{ base_uri }?any=#{ q }&#{ opts_s }" )
response = http_get( opensearch_uri )
cont = response.body
open( cache_file, "w" ){|io| io.print cont }
end
data = {}
parser = LibXML::XML::Parser.string( cont )
doc = parser.parse
data[ :q ] = keyword
data[ :link ] = "http://porta.ndl.go.jp/cgi-bin/openurl.cgi"
data[ :totalResults ] = doc.find( "//openSearch:totalResults", "http://a9.com/-/spec/opensearchrss/1.0/" )[0].content.to_i
entries = doc.find( "//item" )
data[ :entries ] = []
entries.each do |e|
dpid = e.find( "./dcndl_porta:dpid", "http://ndl.go.jp/dcndl/dcndl_porta/" )[0].content
title = e.find( "./title" )[0].content
url = e.find( "./link" )[0].content
author = e.find( ".//author" )
if author and author[0]
author = author[0].content
else
author = ""
end
source = e.find( "./source" )
if source and source[0]
source = source[0].content
else
source = ""
end
publicationName = e.find( "dcterms:bibliographicCitation", "http://purl.org/dc/terms/" )
if publicationName and publicationName[0]
publicationName = publicationName[0].content
else
publicationName = nil
end
date = e.find( "./dcterms:issued", "http://purl.org/dc/terms/" )
if date and date[0]
date = date[0].content
else
date = e.find( "./dcterms:modified", "http://purl.org/dc/terms/" )
if date and date[0]
date = date[0].content
else
date = nil
end
end
publisher = e.find( "./dc:publisher", "http://purl.org/dc/elements/1.1/" )
if publisher and publisher[0]
publisher = publisher[0].content
else
publisher = nil
end
description = e.find( "./description" )
if description and description[0] and dpid != "zassaku"
description = description[0].content
else
description = ""
end
if publicationName.nil? or publicationName.empty?
publicationName = [ source, publisher ].select{|type|
not type.nil? and not type.empty?
}.join( "; " )
end
isbn = e.find( "./dc:identifier[@xsi:type='dcndl:ISBN']",
[ "dc:http://purl.org/dc/elements/1.1/",
"xsi:http://www.w3.org/2001/XMLSchema-instance",
"dcndl:http://ndl.go.jp/dcndl/terms/" ] )[0]
isbn = isbn.content if isbn
data[ :entries ] << {
:title => title,
:url => url,
:author => author,
:source => source,
:date => date,
:publicationDate => date,
:publisher => publisher,
:publicationName => publicationName,
:description => description,
:dpid => dpid,
:isbn => isbn,
}
end
data
end
# NDL Search version:
def iss_ndl_search( keyword, opts = {} )
base_uri = "http://iss.ndl.go.jp/api/opensearch"
q = URI.escape( keyword )
cont = nil
cache_file = cache_xml( "ndl", q, opts[:start] )
if File.exist?( cache_file ) and ( Time.now - File.mtime( cache_file ) ) < CACHE_TIME
cont = open( cache_file ){|io| io.read }
else
opts[ :dpgroupid ] = "ndl"
if opts[ :start ]
opts[ :idx ] = opts[ :start ]
opts.delete( :start )
end
opts_s = opts.make_uri_params
opensearch_uri = URI.parse( "#{ base_uri }?any=#{ q }&#{ opts_s }" )
response = http_get( opensearch_uri )
cont = response.body
open( cache_file, "w" ){|io| io.print cont }
end
data = {}
#STDERR.puts cont
parser = LibXML::XML::Parser.string( cont )
doc = parser.parse
data[ :q ] = keyword
data[ :link ] = doc.find( "//link" )[0].content.sub( /\/api\/opensearch/, '/books' ).sub( /&dpgroupid=ndl/, "" )
data[ :totalResults ] = doc.find( "//openSearch:totalResults", "http://a9.com/-/spec/opensearchrss/1.0/" )[0].content.to_i
entries = doc.find( "//item" )
data[ :entries ] = []
entries.each do |e|
#dpid = e.find( "./dcndl:dpid", "http://ndl.go.jp/dcndl/dcndl_porta/" )[0].content
title = e.find( "./title" )[0].content
url = e.find( "./link" )[0].content
author = e.find( ".//author" )
if author and author[0]
author = author[0].content
else
author = ""
end
creators = e.find( ".//dc:creator", "http://purl.org/dc/elements/1.1/" )
creator = creators.map{|e| e.content }.join( ", " )
source = e.find( "./source" )
if source and source[0]
source = source[0].content
else
source = ""
end
publicationName = e.find( "./dcndl:publicationName", "dcndl:http://ndl.go.jp/dcndl/terms/" )
if publicationName and publicationName[0]
publicationName = publicationName[0].content
else
publicationName = nil
end
volume = e.find( "./dcndl:publicationVolume", "dcndl:http://ndl.go.jp/dcndl/terms/" )
if volume and volume[0]
volume = volume[0].content
else
volume = nil
end
date = e.find( "./dcterms:issued", "http://purl.org/dc/terms/" )
if date and date[0]
date = date[0].content
else
date = e.find( "./dcterms:modified", "http://purl.org/dc/terms/" )
if date and date[0]
date = date[0].content
else
date = nil
end
end
publisher = e.find( "./dc:publisher", "http://purl.org/dc/elements/1.1/" )
if publisher and publisher[0]
publisher = publisher[0].content
else
publisher = nil
end
description = e.find( "./description" )
if description and description[0] #and dpid != "zassaku"
description = description[0].content
else
description = ""
end
isbn = e.find( "./dc:identifier[@xsi:type='dcndl:ISBN']",
[ "dc:http://purl.org/dc/elements/1.1/",
"xsi:http://www.w3.org/2001/XMLSchema-instance",
"dcndl:http://ndl.go.jp/dcndl/terms/" ] )[0]
isbn = isbn.content if isbn
data[ :entries ] << {
:title => title,
:url => url,
:author => author,
:creator => creator,
:source => source,
:date => date,
:publicationDate => date,
:publisher => publisher,
:publicationName => publicationName,
:description => description,
#:dpid => dpid,
:isbn => isbn,
:volume => volume,
}
end
data
end
# Collaborative Reference Database (refkyo) API via NDL PORTA Opensearch
def crd_search2( keyword, opts = {} )
base_uri = "http://api.porta.ndl.go.jp/servicedp/opensearch"
q = URI.escape( keyword )
cont = nil
cache_file = cache_xml( "crd2", q, opts[:start] )
if File.exist?( cache_file ) and ( Time.now - File.mtime( cache_file ) ) < CACHE_TIME
cont = open( cache_file ){|io| io.read }
else
opts[ :format ] = "atom"
opts[ :dpid ] = "refkyo"
opts_s = opts.make_uri_params
opensearch_uri = URI.parse( "#{ base_uri }?any=#{ q }&#{ opts_s }" )
response = http_get( opensearch_uri )
cont = response.body
open( cache_file, "w" ){|io| io.print cont }
end
data = {}
parser = LibXML::XML::Parser.string( cont )
doc = parser.parse
data[ :q ] = keyword
data[ :link ] = "http://iss.ndl.go.jp/"
data[ :totalResults ] = doc.find( "//openSearch:totalResults", "http://a9.com/-/spec/opensearchrss/1.0/" )[0].content.to_i
entries = doc.find( "//item" )
data[ :entries ] = []
entries.each do |e|
dpid = e.find( "./dcndl_porta:dpid", "http://ndl.go.jp/dcndl/terms/" )[0].content
title = e.find( "./title" )[0].content
url = e.find( "./link" )[0].content
author = e.find( ".//author" )
if author and author[0]
author = author[0].content
else
author = ""
end
source = e.find( "./source" )
if source and source[0]
source = source[0].content
else
source = ""
end
publicationName = e.find( "dcterms:bibliographicCitation", "http://purl.org/dc/terms/" )
if publicationName and publicationName[0]
publicationName = publicationName[0].content
else
publicationName = nil
end
date = e.find( "./dcterms:issued", "http://purl.org/dc/terms/" )
if date and date[0]
date = date[0].content
else
date = e.find( "./dcterms:modified", "http://purl.org/dc/terms/" )
if date and date[0]
date = date[0].content
else
date = nil
end
end
publisher = e.find( "./dc:publisher", "http://purl.org/dc/elements/1.1/" )
if publisher and publisher[0]
publisher = publisher[0].content
else
publisher = nil
end
description = e.find( "./description" )
if description and description[0] and dpid != "zassaku"
description = description[0].content
else
description = ""
end
if publicationName.nil? or publicationName.empty?
publicationName = [ source, publisher ].select{|e|
not e.nil? and not e.empty?
}.join( "; " )
end
data[ :entries ] << {
:title => title,
:url => url,
:author => author,
:source => source,
:date => date,
:publicationDate => date,
:publisher => publisher,
:publicationName => publicationName,
:description => description,
:dpid => dpid,
}
end
data
end
# Collaborative Reference Database API
def crd_search( keyword, opts = {} )
require "htmlentities"
base_uri = "http://crd.ndl.go.jp/refapi/servlet/refapi.RSearchAPI"
q = URI.escape( keyword )
opts[ :query_logic ] = "2"
opts_s = opts.make_uri_params
query = "01_" + q + ".02_" + q
opts_s = "&" + opts_s if not opts_s.empty?
uri = URI.parse( "#{ base_uri }?query=#{ query }#{ opts_s }" )
cont = nil
cache_file = cache_xml( "crd", q, opts[:start] )
if File.exist?( cache_file ) and ( Time.now - File.mtime( cache_file ) ) < CACHE_TIME
cont = open( cache_file ){|io| io.read }
else
response = http_get( uri )
cont = response.body
open( cache_file, "w" ){|io| io.print cont }
end
data = {}
parser = LibXML::XML::Parser.string( cont )
doc = nil
begin
doc = parser.parse
rescue LibXML::XML::Error => e
begin
parser = LibXML::XML::Parser.string( NKF.nkf( "-Ww", cont ) )
doc = parser.parse
rescue
File.unlink( cache_file )
raise e
end
end
data[ :q ] = keyword
#data[ :link ] = doc.find( "//atom:id", "atom:http://www.w3.org/2005/Atom" )[0].content.sub( /&format=atom\b/, "" )
data[ :link ] = "http://crd.ndl.go.jp/" # TODO: リンク先を適宜補完すること。
data[ :totalResults ] = doc.find( "//crd:hit_num", "crd:http://crd.ndl.go.jp/refapi/servlet/refapi.RSearchAPI" )[0].content.to_i
entries = doc.find( "//crd:result", "crd:http://crd.ndl.go.jp/refapi/servlet/refapi.RSearchAPI" )
data[ :entries ] = []
entries.each do |e|
title = e.find( "./crd:QUESTION", "crd:http://crd.ndl.go.jp/refapi/servlet/refapi.RSearchAPI" )[0].content
url = e.find( "./crd:URL", "crd:http://crd.ndl.go.jp/refapi/servlet/refapi.RSearchAPI" )[0].content
author = e.find( "./crd:LIB-NAME", "crd:http://crd.ndl.go.jp/refapi/servlet/refapi.RSearchAPI" ).to_a.map{|name| name.content }.join( "; " )
description = e.find( "./crd:ANSWER", "crd:http://crd.ndl.go.jp/refapi/servlet/refapi.RSearchAPI" )[0].content
if description.nil? or description.empty?
description = e.find( "./crd:ANS-PROC", "crd:http://crd.ndl.go.jp/refapi/servlet/refapi.RSearchAPI" )[0].content
end
pubdate = e.find( "./crd:CRT-DATE", "crd:http://crd.ndl.go.jp/refapi/servlet/refapi.RSearchAPI" )[0]
pubdate = pubdate ? pubdate.content : e.find( "./crd:REG-DATE", "crd:http://crd.ndl.go.jp/refapi/servlet/refapi.RSearchAPI" )[0].content
solution = e.find( "./crd:SOLUTION", "crd:http://crd.ndl.go.jp/refapi/servlet/refapi.RSearchAPI" )[0]
solution = solution ? solution.content : nil
data[ :entries ] << {
:title => title,
:url => url,
:author => author,
:description => description,
:publicationDate => pubdate,
:solution => solution,
}
end
data
end
# WorldCat Basic API (Opensearch)
WORLDCAT_BASIC_WSKEY = "5lIR8i5bSQQNg4Xb3ro6QbOzXiGSs6PrGGJ02BKolP9qTUQRcui2Ze9AsQIlM8IzV0E9XMcrWWieWvrM"
def worldcat_search( keyword, opts = {} )
base_uri = "http://worldcat.org/webservices/catalog/search/opensearch"
q = URI.escape( keyword )
cont = nil
cache_file = cache_xml( "worldcat", q, opts[:start] )
if File.exist?( cache_file ) and ( Time.now - File.mtime( cache_file ) ) < CACHE_TIME
cont = open( cache_file ){|io| io.read }
else
opts[ :format ] = "atom"
opts[ :wskey ] = WORLDCAT_BASIC_WSKEY
opts_s = opts.make_uri_params
opensearch_uri = URI.parse( "#{ base_uri }?q=#{ q }&#{ opts_s }" )
response = http_get( opensearch_uri )
cont = response.body
open( cache_file, "w" ){|io| io.print cont }
end
data = {}
parser = LibXML::XML::Parser.string( cont )
doc = parser.parse
data[ :q ] = keyword
# data[ :link ] = doc.find( "//atom:id", "atom:http://www.w3.org/2005/Atom" )[0].content.sub( /&format=atom\b/, "" ).sub( /&wskey=\w+/, "" )
data[ :link ] = "http://www.worldcat.org/search?q=#{ q }"
#STDERR.puts q.inspect
#STDERR.puts doc.find( "//opensearch:totalResults" )[0].inspect
#STDERR.puts doc.find( "//opensearch:totalResults" )[0].class
totalResults = doc.find( "//opensearch:totalResults" )[0]
data[ :totalResults ] = totalResults ? totalResults.content.to_i : 0
entries = doc.find( "//atom:entry", "atom:http://www.w3.org/2005/Atom" )
data[ :entries ] = []
entries.each do |e|
title = e.find( "./atom:title", "atom:http://www.w3.org/2005/Atom" )[0].content
url = e.find( "./atom:id", "atom:http://www.w3.org/2005/Atom" )[0].content
author = e.find( ".//atom:author/atom:name", "atom:http://www.w3.org/2005/Atom" ).to_a.map{|name| name.content }.join( "; " )
content = e.find( "./atom:content", "atom:http://www.w3.org/2005/Atom" )[0]
isbn = nil
e.find( "./dc:identifier", "dc:http://purl.org/dc/elements/1.1/" ).each do |identifier|
if identifier.content =~ /\Aurn:ISBN:(\d{9,12}[\dx])\Z/o
isbn = $1
break
end
end
data[ :entries ] << {
:title => title,
:url => url,
:author => author,
:content => content,
:publicationName => content,
:isbn => isbn,
}
end
data
end
# JAWikipedia API
def wikipedia_ja_search( keyword, opts = {} )
base_uri = "http://ja.wikipedia.org/w/api.php"
q = URI.escape( keyword )
cont = nil
cache_file = cache_xml( "jawikipedia", q, opts[:start] )
if File.exist?( cache_file ) and ( Time.now - File.mtime( cache_file ) ) < CACHE_TIME
cont = open( cache_file ){|io| io.read }
else
opts[ :action ] = "query"
opts[ :list ] = "search"
opts[ :format ] = "xml"
opts_s = opts.make_uri_params
search_uri = URI.parse( "#{ base_uri }?srsearch=#{ q }&#{ opts_s }" )
response = http_get( search_uri )
cont = response.body
open( cache_file, "w" ){|io| io.print cont }
end
data = {}
parser = LibXML::XML::Parser.string( cont )
doc = parser.parse
data[ :q ] = keyword
# data[ :link ] = doc.find( "//atom:id", "atom:http://www.w3.org/2005/Atom" )[0].content.sub( /&format=atom\b/, "" ).sub( /&wskey=\w+/, "" )
data[ :link ] = "http://ja.wikipedia.org/wiki/Sepecial:Search/#{ q }"
data[ :totalResults ] = doc.find_first( "//searchinfo" ).attributes[ "totalhits" ].to_i
entries = doc.find( "//p" )
data[ :entries ] = []
entries.each do |e|
title = e.attributes[ "title" ]
url = "http://ja.wikipedia.org/wiki/#{ URI.escape( title ) }"
content = e.attributes[ "snippet" ]
timestamp = e.attributes[ "timestamp" ]
data[ :entries ] << {
:title => title,
:url => url,
:content => content,
:publicationName => content,
:publicationDate => timestamp,
}
end
data
end
# EPI SRU API
def epi_search( keyword, opts = {} )
base_uri = "http://dl.nier.go.jp/epi"
client_base_uri = "http://dl.nier.go.jp/epi-search/sru-gw.rb"
q = URI.escape( keyword.split.join( " AND " ) )
cont = nil
cache_file = cache_xml( "epi", q, opts[:start] )
#p File.mtime( cache_file )
if File.exist?( cache_file ) and ( Time.now - File.mtime( cache_file ) ) < CACHE_TIME
cont = open( cache_file ){|io| io.read }
else
opts[ :version ] = "1.1"
opts[ :operation ] = "searchRetrieve"
opts[ :maximumRecords ] ||= 20
opts[ :startRecord ] ||= opts[ :start ] if opts[ :start ]
#p opts
params = [ :operation, :version, :query, :startRecord, :maximumRecords, :recordPacking, :recordSchema, :recordXPath, :resultSetTTL, :sortKeys, :stylesheet, :extraRequestData ].map do |e|
opts[ e ] ? "#{ e }=#{ URI.escape( opts[e].to_s ) }" : nil
end.compact.join( "&" )
#p params
sru_uri = URI.parse( "#{ base_uri }?query=#{ q }&#{ params }" )
response = http_get( sru_uri )
cont = response.body
open( cache_file, "w" ){|io| io.print cont }
end
data = {}
parser = LibXML::XML::Parser.string( cont )
doc = parser.parse
#p [ q, cont ]
data[ :q ] = keyword
data[ :link ] = client_base_uri + "?keyword=#{ q }"
data[ :entries ] = []
data[ :totalResults ] = doc.find( "//srw:numberOfRecords", "srw:http://www.loc.gov/zing/srw/" )[0].content.to_i
#p data[ :totalResults ]
#if data[ :totalResults ]
# data[ :totalResults ] = data[ :totalResults ].content.to_i
#else
# data[ :totalResults ] = 0
# return data
#end
entries = doc.find( "//srw:record", "srw:http://www.loc.gov/zing/srw/" )
#p entries.to_a
entries.each do |e|
title = e.find( "./srw:recordData/xml/title", "srw:http://www.loc.gov/zing/srw/" )[0]
title = title.nil? ? "(無題)" : title.content
bibid = e.find( "./srw:recordData/xml/bibid", "srw:http://www.loc.gov/zing/srw/" )[0].content
url = client_base_uri + "?keyword=bibid%20exact%20#{ bibid }"
author = e.find( "./srw:recordData/xml/author", "srw:http://www.loc.gov/zing/srw/" )[0]
author = author ? author.content : ""
pubname = e.find( "./srw:recordData/xml/journal", "srw:http://www.loc.gov/zing/srw/" )[0].content
pubdate = e.find( "./srw:recordData/xml/pubdate", "srw:http://www.loc.gov/zing/srw/" )[0]
pubdate = pubdate ? pubdate.content : ""
data[ :entries ] << {
:title => title,
:url => url,
:author => author,
:publicationName => pubname,
:publicationDate => pubdate,
}
end
data
end
# JSTAGE API (Opensearch-based?)
def jstage_search( keyword, opts = {} )
base_uri = "http://api.jstage.jst.go.jp/searchapi/do"
client_base_uri = "http://www.jstage.jst.go.jp/search/-char/ja?d6=te&typer=on&searchtype=1"
q = URI.escape( keyword )
cont = nil
cache_file = cache_xml( "jstage", q, opts[:start] )
#p File.mtime( cache_file )
if File.exist?( cache_file ) and ( Time.now - File.mtime( cache_file ) ) < CACHE_TIME
cont = open( cache_file ){|io| io.read }
else
opts[ :service ] = 3
opts[ :text ] = keyword
#p opts
params = [ :service, :system, :start, :count ].map do |e|
opts[ e ] ? "#{ e }=#{ URI.escape( opts[e].to_s ) }" : nil
end.compact.join( "&" )
#p params
sru_uri = URI.parse( "#{ base_uri }?text=#{ q }&#{ params }" )
response = http_get( sru_uri )
cont = response.body
open( cache_file, "w" ){|io| io.print cont }
end
data = {}
parser = LibXML::XML::Parser.string( cont )
doc = parser.parse
#p [ q, cont ]
data[ :q ] = keyword
data[ :link ] = client_base_uri + "&dp6=#{ q }"
data[ :entries ] = []
data[ :totalResults ] = doc.find( "//opensearch:totalResults" )[0].content.to_i
return data if data[ :totalResults ] == 0
entries = doc.find( "//atom:entry", "atom:http://www.w3.org/2005/Atom" )
data[ :entries ] = []
entries.each do |e|
title = e.find( "./atom:title", "atom:http://www.w3.org/2005/Atom" )[0].content
url = e.find( "./atom:id", "atom:http://www.w3.org/2005/Atom" )[0].content
author = e.find( "./atom:author/atom:ja/atom:name", "atom:http://www.w3.org/2005/Atom" )
author = e.find( "./atom:author/atom:en/atom:name", "atom:http://www.w3.org/2005/Atom" ) if author[0].nil?
author = author.to_a.map{|name| name.content }.join( "; " )
material_title = e.find( "./atom:material_title/atom:ja", "atom:http://www.w3.org/2005/Atom" )
material_title = e.find( "./atom:material_title/atom:en", "atom:http://www.w3.org/2005/Atom" ) if material_title[0].nil?
material_title = material_title[0].content
pubyear = e.find( "./atom:pubyear", "atom:http://www.w3.org/2005/Atom" )[0].content
data[ :entries ] << {
:title => title,
:url => url,
:author => author,
:publicationName => material_title,
:publicationDate => pubyear,
}
end
data
end
# Hitotsubashi University OPAC Opensearch (scraping; not an API)
def opac_hit_u_search( keyword, opts = {} )
require "htmlentities"
base_uri = "https://opac.lib.hit-u.ac.jp/opac/opac_list.cgi"
q = URI.escape( keyword )
opts[ :amode ] = 9 if opts.key?( :key )
opts_s = opts.make_uri_params
opts_s = "&" + opts_s if not opts_s.empty?
uri = URI.parse( "#{ base_uri }?kywd=#{ q }#{ opts_s }" )
cont = nil
cache_file = cache_xml( "opac_hit_u", q, opts[:start] )
if File.exist?( cache_file ) and ( Time.now - File.mtime( cache_file ) ) < CACHE_TIME
cont = open( cache_file ){|io| io.read }
else
response = http_get( uri )
cont = response.body
open( cache_file, "w" ){|io| io.print cont }
end
cont.force_encoding( 'UTF-8' )
data = {}
# <td class="list_result"><span class="name"><a href="/opac/opac_details.cgi?lang=0&amode=11&place=&bibid=1000258087&key=B126875030611669&start=1&srmode=0"><strong>Take the test : sample questions from OECD's PISA assessments</strong></a></span><div class="other">[Paris] : OECD , c2009.</div></td>
data[ :opac_hit_u_key ] = $1 if %r[&key=(\w+)&] =~ cont # ad-hoc...
data[ :q ] = keyword
data[ :link ] = uri.to_s
if %r[該当件数(?: |[:\s])*<b>(\d+)</b> 件] =~ cont
totalResults = $1.to_i
else
totalResults = 0
end
htmlentities = HTMLEntities.new
data[ :totalResults ] = totalResults
data[ :entries ] = []
cont.gsub( %r|<td class="list_result"><span class="name"><a href="([^\"]+)"><strong>([^<]*)</strong></a></span><div class="other">(.*?)</div></td>| ) do |entry|
url, title, other = $1, $2, $3
url = uri.merge( $1 )
if url.query =~ /(bibid=\w+)/
url.query = "amode=11&#$1"
end
title = htmlentities.decode( title )
author = ""
if title =~ / \/ /
#STDERR.puts title.split( / \/ / )
title, author = title.split( / \/ / )
end
other.gsub!( /<[^>]+>/, "" )
other = htmlentities.decode( other )
case other
when /c?(\d{4})\.\Z/, /\b(\d{4}[\.\d+])\.\Z/
date = $1
when /c?(\d{4})\./
date = $1
end
data[ :entries ] << {
:title => title,
:author => author,
:url => url,
:description => other,
:publicationDate => date,
}
end
data
end
SPRINGER_METADATA_APIKEY = "hczcn8hnkx8c2zuzhnkmbz5j"
SPRINGER_INTERVAL = 0.5
# Springer Metadata API
## cf. http://dev.springer.com/docs
def springer_metadata_search( keyword, opts = {} )
base_uri = "http://api.springer.com/metadata/pam"
q = URI.escape( keyword )
cont = nil
cache_file = cache_xml( "springer", q, opts[:start] )
#p File.mtime( cache_file )
if File.exist?( cache_file ) and ( Time.now - File.mtime( cache_file ) ) < CACHE_TIME
cont = open( cache_file ){|io| io.read }
else
if not opts.empty?
opts_s = opts.keys.map do |e|
next if e.to_s =~ /\A_/
if e == :start
"s=#{ URI.escape( opts[e].to_s ) }"
else
"#{ e }=#{ URI.escape( opts[e].to_s ) }"
end
end.join( "&" )
end
if opts[ :_prev_time ]
elapsed = Time.now - opts[ :_prev_time ]
if SPRINGER_INTERVAL > elapsed
sleep( SPRINGER_INTERVAL - elapsed )
end
end
opensearch_uri = URI.parse( "#{ base_uri }?q=#{ q }&api_key=#{ SPRINGER_METADATA_APIKEY }&#{ opts_s }" )
response = http_get( opensearch_uri )
cont = response.body
open( cache_file, "w" ){|io| io.print cont }
opts[ :_prev_time ] = Time.now
end
data = {}
parser = LibXML::XML::Parser.string( cont )
doc = parser.parse
data[ :q ] = keyword
# data[ :link ] = doc.find( "//atom:id", "atom:http://www.w3.org/2005/Atom" )[0].content.gsub( /&(format=atom|apikey=#{ SPRINGER_METADATA_APIKEY })\b/, "" )
data[ :totalResults ] = doc.find( "//result/total" )[0].content.to_i
if data[ :totalResults ] > 0
data[ :itemsPerPage ] = doc.find( "//result/pageLength" )[0].content.to_i
end
xmlns = [ "dc:http://purl.org/dc/elements/1.1/",
"pam:http://prismstandard.org/namespaces/pam/2.0/",
"prism:http://prismstandard.org/namespaces/basic/2.0/",
"xhtml:http://www.w3.org/1999/xhtml",
]
entries = doc.find( "//pam:message", xmlns )
data[ :entries ] = []
entries.each do |e|
title = e.find( "./xhtml:head/pam:article/dc:title", xmlns )[0].content
url = e.find( "./xhtml:head/pam:article/prism:url", xmlns )[0].content
author = e.find( "./xhtml:head/pam:article/dc:creator", xmlns ).to_a.map{|au| au.content }.join( "; " )
pubname = e.find( "./xhtml:head/pam:article/prism:publicationName", xmlns )[0].content
pubdate = e.find( "./xhtml:head/pam:article/prism:publicationDate", xmlns )[0].content
if pubdate
pubdate.gsub!( /-01-01\Z/, "" )
pubdate.gsub!( /-01\Z/, "" )
end
doi = e.find( "./xhtml:head/pam:article/prism:doi", xmlns )[0].content
description = e.find( "./xhtml:body/p", xmlns )[0]
description = description.nil? ? "" : description.content
bibdata = {
:title => title,
:url => url,
:author => author,
:publicationName => pubname,
:publicationDate => pubdate,
:description => description,
:doi => doi,
}
[ :isbn, :issn, :volume, :number, :startingPage ].each do |type|
cont = e.find( "./xhtml:head/pam:article/prism:#{ type }", xmlns )[0]
bibdata[ type ] = cont.content if cont and not cont.empty?
end
data[ :entries ] << bibdata
end
data
end
DPLA_APIKEY = "76ac9bd08ecf381128a4da86f49feb95"
def dpla_search( keyword, opts = {} )
base_uri = "http://api.dp.la/v2/items"
q = CGI.escape( keyword )
cont = nil
cache = Cache.new( "dpla" )
cont = cache.fetch( q, opts[ :page ] ) do
if opts[ :page ] and opts[ :page ] > 0
opts[ :page ] += 1
end
if opts[ :count ]
opts[ :page_size ] = opts[ :count ]
opts.delete( :count )
end
opts_s = opts.make_uri_params
uri = URI.parse( "#{ base_uri }?q=#{ q }&api_key=#{ DPLA_APIKEY }&#{ opts_s }" )
#p [ opts, uri ]
response = http_get( uri )
cont = response.body
end
data = {}
json = JSON.load( cont )
data[ :q ] = keyword
data[ :link ] = "http://dp.la/search?q=#{ q }"
data[ :totalResults ] = json[ "count" ]
entries = json[ "docs" ]
data[ :entries ] = []
entries.each do |e|
title = e[ "sourceResource" ][ "title" ]
title = title.join( "\n" ) if title.respond_to? :join
url = e[ "@id" ]
url.sub!( "http://dp.la/api/items/", "http://dp.la/item/" )
author = e[ "creator" ]
#pubdate = e[ "date" ][ "displayDate" ] if e[ "date" ]
pubdate = e[ "sourceResource" ][ "date" ]
pubdate = pubdate[ "displayDate" ] if pubdate
description = e[ "sourceResource" ][ "description" ]
description = description.join( "\t" ) if description.respond_to? :join
data[ :entries ] << {
:title => title,
:url => url,
:author => author,
:publicationDate => pubdate,
:description => description,
}
end
data
end
SPRINGER_IMAGES_APIKEY = "uwud8n4tbkmr4bqw6zfq8ab8"
# Springer Images API
## cf. http://dev.springer.com/docs
def springer_images_search( keyword, opts = {} )
base_uri = "http://api.springer.com/images/xml"
q = CGI.escape( keyword )
cont = nil
cache_file = cache_xml( "springer-images", q )
if File.exist?( cache_file ) and ( Time.now - File.mtime( cache_file ) ) < CACHE_TIME
cont = open( cache_file ){|io| io.read }
else
if opts[ :start ]
opts[ :s ] = opts[ :start ].dup
opts.delete( :start )
end
opts_s = opts.make_uri_params
uri = URI.parse( "#{ base_uri }?q=#{ q }&api_key=#{ SPRINGER_IMAGES_APIKEY }&#{ opts_s }" )
response = http_get( uri )
cont = response.body
open( cache_file, "w" ){|io| io.print cont }
end
data = {}
parser = LibXML::XML::Parser.string( cont )
doc = parser.parse
data[ :q ] = keyword
# data[ :link ] = doc.find( "//atom:id", "atom:http://www.w3.org/2005/Atom" )[0].content.gsub( /&(format=atom|apikey=#{ SPRINGER_METADATA_APIKEY })\b/, "" )
data[ :totalResults ] = doc.find( "//result/total" )[0].content.to_i
if data[ :totalResults ] > 0
data[ :itemsPerPage ] = doc.find( "//result/pageLength" )[0].content.to_i
end
xmlns = [ "dc:http://purl.org/dc/elements/1.1/",
"pam:http://prismstandard.org/namespaces/pam/2.0/",
"prism:http://prismstandard.org/namespaces/basic/2.0/",
"xhtml:http://www.w3.org/1999/xhtml",
]
entries = doc.find( "//pam:message", xmlns )
data[ :entries ] = []
entries.each do |e|
title = e.find( "./xhtml:head/pam:article/dc:title", xmlns )[0].content
url = e.find( "./xhtml:head/pam:article/prism:url", xmlns )[0].content
author = e.find( "./xhtml:head/pam:article/dc:creator", xmlns ).to_a.map{|au| au.content }.join( "; " )
pubname = e.find( "./xhtml:head/pam:article/prism:publicationName", xmlns )[0].content
pubdate = e.find( "./xhtml:head/pam:article/prism:publicationDate", xmlns )[0].content
if pubdate
pubdate.gsub!( /-01-01\Z/, "" )
pubdate.gsub!( /-01\Z/, "" )
end
doi = e.find( "./xhtml:head/pam:article/prism:doi", xmlns )[0].content
description = e.find( "./xhtml:body/p", xmlns )[0]
description = description.nil? ? "" : description.content
bibdata = {
:title => title,
:url => url,
:author => author,
:publicationName => pubname,
:publicationDate => pubdate,
:description => description,
:doi => doi,
}
[ :isbn, :issn, :volume, :number, :startingPage ].each do |type|
cont = e.find( "./xhtml:head/pam:article/prism:#{ type }", xmlns )[0]
bibdata[ type ] = cont.content if cont and not cont.empty?
end
data[ :entries ] << bibdata
end
data
end
class NoHitError < StandardError; end
class NoKeywordExtractedError < StandardError; end
class UnsupportedURI < StandardError; end
class Message < Hash
ERROR_MESSAGE = {
"Fuwatto::NoHitError" => "関連する文献を見つけることができませんでした。", # "No related documents could be found."
"UnsupportedURI" => "未対応のURL形式が指定されています。", # "An unsupported URL format was specified."
}
def initialize
super
update( ERROR_MESSAGE )
end
end
class BaseApp
attr_reader :format, :content, :url, :html
attr_reader :count, :page, :mode
TERMS = 10
EXAMPLE_TEXT = <<EOF
<p>
例:
<a href="?url=http://www.asahi.com/paper/editorial.html">朝日新聞社説</a> <span style="font-size:smaller;">(<a href="http://www.asahi.com/paper/editorial.html">元記事(asahi.com)</a>)</span>
</p>
<div id="feed_mainichi_opinion"></div>
EOF
def initialize( cgi )
@cgi = cgi
@url = @cgi.params["url"][0]
@content = @cgi.params["text"][0]
@html = @cgi.params["html"][0]
@format = @cgi.params["format"][0] || "html"
@count = @cgi.params["count"][0].to_i
@count = 20 if @count < 1
@page = @cgi.params["page"][0].to_i
@mode = @cgi.params["mode"][0] || "mecab"
@callback = @cgi.params["callback"][0]
raise( "Crawler access is limited to the first page." ) if @page > 0 and @cgi.user_agent =~ /bot|slurp|craw|spid/i
raise( "Crawler access is limited" ) if @cgi.user_agent =~ Regexp.new(Regexp.union(load_robots_txt))
end
def query_url?
not url.nil? and not url.empty? and url != "http://"
end
def query_text?
not content.nil? and not content.empty?
end
def query_html?
not html.nil? and not html.empty?
end
def query?
query_url? or query_text? or query_html?
end
def load_robots_txt
robots = []
open("robots.txt") do |io|
io.each do |line|
if line =~ /\A\s*User\-Agent\:\s*(.+?)\Z/i
bot = $1.strip
next if ( bot.empty? or bot == "*" )
robots << bot
end
end
end
robots
end
include Fuwatto
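# Core search flow: build a term vector from the query text/URL, probe the
# backend with single terms to estimate document frequencies, then query
# with conjunctions of the top terms, backing off one term at a time until
# enough results are collected. A minimal caller sketch (hypothetical
# subclass; the real per-database apps are wired up outside this file):
#   class CiniiApp < Fuwatto::BaseApp
#     def run
#       output( "cinii", execute( :cinii_search, TERMS ) )
#     end
#   end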
def execute( search_method, terms, opts = {} )
data = {}
opts[ :use_df ] = true if not opts.has_key?( :use_df )
opts[ :prf_alpha ] = PRF_ALPHA if opts[ :prf ] and not opts.has_key?( :prf_alpha )
prev_time = nil
interval = nil
interval = opts[ :_interval ] if opts[ :_interval ]
if not query?
return data
end
time_pre = Time.now
search_opts = {}
opts.each do |k, v|
case k
when :term_weight, :term_weight_position, :use_df, :reranking, :combination, :prf, :prf_alpha
# skip document weighting params
else
search_opts[ k ] = v
end
end
#p search_opts
if query_url?
uri = URI.parse( url )
if not uri.respond_to?( :request_uri )
data[ :error ] = :UnsupportedURI
return data
end
response = http_get( uri )
@content = response.body
case response[ "content-type" ]
when /^text\/html\b/
@content = ExtractContent::analyse( @content ).join( "\n" )
when /^text\//
when /^application\/pdf\b/
@content = pdftotext( @content )
else
raise "Unknown Content-Type: #{ response[ "content-type" ] }"
end
elsif query_html?
@content = ExtractContent::analyse( @html ).join( "\n" )
end
#p content
vector = Document.new( content, mode, opts )
#STDERR.puts vector.inspect
#vector[0..20].each do |e|
# puts e.join("\t")
#end
prev_scores = []
vector1 = Document.new( nil ) # empty vector
vector_orig = Document.new( nil ) # ditto
single_entries = []
while vector.size > 0
k = vector.shift
vector_orig << k
prev_scores << k[1]
res = send( search_method, k[0], search_opts )
single_entries += res[ :entries ]
next if res[ :totalResults ] < 1
score = k[1] / Math.log2( res[ :totalResults ] + 1 )
vector1 << [ k[0], score ]
break if vector1.size >= terms
#STDERR.puts [ vector1.size, vector.inspect ]
end
#STDERR.puts [ vector1.size, vector.inspect ]
#STDERR.puts [ vector1.size, vector1.inspect ]
raise Fuwatto::NoHitError if vector1.empty?
single_entries = single_entries.uniq_by{|e| e[ :url ] }
if opts[ :use_df ]
vector1.sort!{|a, b| b[1] <=> a[1] }
prev_min = prev_scores.min
cur_min = vector1[-1][1]
vector.map! do |k|
factor = prev_min / cur_min
score = k[1] / factor
[ k[0], score ]
end
vector1.concat( vector ) if not vector.empty?
vector = vector1
else
vector_orig.concat( vector )
vector = vector_orig
end
#STDERR.puts vector1.inspect
if opts[ :prf ] # Rocchio-based blind query expansion
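# Each of the top-K pseudo-relevant entries contributes its term weights,
# scaled by relative document length and averaged over PRF_TOP_K; the
# merged weights are then rescaled so their maximum equals prf_alpha times
# the top original query weight before being folded into the query vector.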
# STDERR.puts :prf
prf_top_k = single_entries.map do |e|
begin
Document.new( [ e[:title], e[:description] ].join("\n"),
mode, opts )
rescue Fuwatto::NoKeywordExtractedError
Document.new( nil )
end
end.sort_by do |e|
e.sim( vector )
end.reverse
prf_top_k = prf_top_k.map{|e|
e.select{|w| w.first.size > 1 }
}
prf_top_k = prf_top_k[ 0, PRF_TOP_K ]
prf_weight = Hash.new( 0 )
total_words = 0
prf_top_k.each do |d|
total_words += d.size
end
avg_words = total_words.to_f / PRF_TOP_K
# STDERR.puts "avg_words: #{ avg_words }"
prf_top_k.each do |d|
words_factor = d.size / avg_words
d.each do |k, v|
prf_weight[ k ] += v.to_f * words_factor / PRF_TOP_K
end
end
# $KCODE="u"
# STDERR.puts prf_weight.map{|k,v| [k,v] }.sort_by{|e| e[1] }.reverse[0,20].inspect
# STDERR.puts vector.inspect
max_weight = prf_weight.map{|k,v| v }.max
prf_factor = max_weight / vector[0][1]
# STDERR.puts "prf_factor: #{ prf_factor }"
prf_weight.each do |k, v|
v /= prf_factor / opts[ :prf_alpha ] # scale so the largest PRF weight becomes prf_alpha * top query weight
w = vector.assoc( k )
if w
w[ 1 ] += v
else
vector << [ k, v ]
end
end
#STDERR.puts vector.inspect
vector.sort!{|a,b| b[1] <=> a[1] }
#STDERR.puts vector[0,20].inspect
end
#p vector
#vector[0..20].each do |e|
# puts e.join("\t")
#end
#p vector
keywords = {}
vector[ 0..20 ].each do |k,v|
keywords[ k ] = v
end
keyword = ""
entries = []
additional_keywords = []
if opts[ :combination ]
vector[ 0..(terms-1) ].map{|k| k[0] }.combination(3) do |v|
keyword = v.join( " " )
STDERR.puts keyword
data = send( search_method, keyword, search_opts )
if data[ :totalResults ] > 0
entries = ( entries + data[ :entries ] ).uniq_by{|e| e[:url] }
end
end
if entries.size < count and entries.size < count * ( page + 1 )
vector[ 0..(terms-1) ].map{|k| k[0] }.combination(2) do |v|
keyword = v.join( " " )
STDERR.puts keyword
data = send( search_method, keyword, search_opts )
if data[ :totalResults ] > 0
entries = ( entries + data[ :entries ] ).uniq_by{|e| e[:url] }
end
end
end
if entries.size < count and entries.size < count * ( page + 1 )
entries = ( entries + single_entries ).uniq_by{|e| e[:url] }
end
data[ :q ] = vector[ 0..(terms-1) ].map{|k| k[0] }.join( " " )
data[ :totalResults ] = entries.size
else
terms.times do |i|
next if vector.size < terms - i
keyword = vector[ 0..(terms-i-1) ].map{|k| k[0] }.join( " " )
STDERR.puts keyword
data = send( search_method, keyword, search_opts )
if data[ :totalResults ] > 0
entries = ( entries + data[ :entries ] ).uniq_by{|e| e[:url] }
#p [ entries.size, count, page, count * ( page + 1 ) ]
#p [ vector.size, terms, i ]
if entries.size < count or entries.size < count * ( page + 1 )
start = 1 + count
start = 1 + data[ :itemsPerPage ] if data[ :itemsPerPage ]
#p [ :start, data[ :totalResults ], start, count * (page+1) ]
while data[ :totalResults ] > start and entries.size < count * ( page + 1 )
#p [ entries.size, start ]
if search_method == :dpla_search
search_opts[ :page ] = start / count
else
search_opts[ :start ] = start
end
search_opts[ :key ] = data[ :opac_hit_u_key ] if data[ :opac_hit_u_key ]
data = send( search_method, keyword, search_opts )
entries = ( entries + data[ :entries ] ).uniq_by{|e| e[:url] }
if data[ :itemsPerPage ]
start += data[ :itemsPerPage ]
else
start += count
end
end
search_opts.delete( :start )
end
if entries.size < count or entries.size < count * ( page + 1 )
if ( terms-i ) > 1
#p [ vector, terms - i - 1 ]
additional_keywords.unshift( vector[ terms - i - 1 ][0] )
#p additional_keywords
next
end
end
break
end
end
end
if opts[ :reranking ]
entries.each do |e|
e[ :score ] = begin
sim = vector.sim( Document.new( [ e[:title], e[:description], e[:publicationName] ].join("\n"), mode, opts ) )
if sim.nan?
#STDERR.puts sim
#STDERR.puts e[:url]
sim = 0
end
sim
rescue Fuwatto::NoKeywordExtractedError
0.0
end
end
entries = entries.sort_by{|e| e[ :score ] }.reverse
end
#p entries[ 0, 5 ]
data[ :keywords ] = keywords
data[ :entries ] = entries
data[ :entries ] = entries[0, @count] if @format == "json"
data[ :additional_keywords ] = additional_keywords
data[ :count ] = count
data[ :page ] = page
data[ :database ] = self.class.to_s.sub( /\AFuwatto::(\w+)App\Z/, '\1' ).downcase
data[ :searchTime ] = "%0.02f" % ( Time.now - time_pre )
data
end
def output( prefix, data = {} )
#STDERR.puts data.inspect
case format
when "html"
result = eval_rhtml( "./#{ prefix }.rhtml", binding ) if query? and not data.has_key?( :error )
print @cgi.header
print eval_rhtml( "./top.rhtml", binding )
when "json"
print @cgi.header "application/json"
result = JSON::generate( data, :ascii_only => true )
if @callback and @callback =~ /^\w+$/
result = "#{ @callback }(#{ result })"
end
print result
else
raise "unknown format specified: #{ format }"
end
end
include ERB::Util
def eval_rhtml( fname, binding )
rhtml = open( fname ){|io| io.read }
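# NOTE: the positional ($SAFE, trim_mode) signature below was removed in
# newer Rubies; on Ruby 2.6+ the equivalent call is
# ERB.new( rhtml, trim_mode: "<>" ) (assuming no safe level is needed).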
result = ERB::new( rhtml, $SAFE, "<>" ).result( binding )
end
end
end