nadoka/nadoka

View on GitHub
plugins/googlebot.nb

Summary

Maintainability
Test Coverage
# -*-ruby; coding: utf-8 -*- vim:set ft=ruby:
#
# Copyright (c) 2004-2005 SASADA Koichi <ko1 at atdot.net>
# Copyright (c) 2009, 2010 Kazuhiro NISHIYAMA
#
# This program is free software with ABSOLUTELY NO WARRANTY.
# You can re-distribute and/or modify this program under
# the same terms of the Ruby's license.
#
#
# $Id$
#

=begin

== Usage with irc client

  google> keyword
    -> search keyword by google with default search language

  google:[lang]> keyword
    -> search keyword by google with [lang] language

  googlec> k1 k2 k3 k4 k5(max 5 words)
    -> search and show each hit count

  googlec> k1 k2 k3 k4 k5(max 5 words)
    -> search and show each hit count with default count language

  googlec:[lang]> k1 k2 k3 k4 k5(max 5 words)
    -> search and show each hit count with [lang] language


== Configuration:

BotConfig = [
{
  :name                => :GoogleBot,
  :ch                  => /.*/,
  :headers             => {
    #"User-Agent" => "Ruby/#{RUBY_VERSION}",
    'Referer' => 'https://github.com/nadoka/nadoka',
  },
  # API key
  :api_key             => 'INSERT_YOUR_API_KEY',
  # Custom search engine ID
  :cx                  => '017576662512468239146:omuauf_lfve',
  :googlec_maxwords    => 5,
  :search_default_lang => 'ja',
  :count_default_lang  => '',
  :ch_kcode            => :tojis,
},
]

=end


unless "".respond_to?(:encode)
  require 'iconv'
end
require 'kconv'
require 'shellwords'
require 'cgi'
require 'open-uri'
begin
  require 'json'
rescue LoadError
  require 'rubygems'
  require 'json'
end

if $0 == __FILE__
  # Stand-alone mode only: a minimal replacement for Nadoka::NDK_Bot so the
  # plugin can be run directly as a script for ad hoc testing.
  module Nadoka
    class NDK_Bot
      # Builds a configuration that contains nothing but a browser-like
      # User-Agent header, then hands over to the subclass's bot_initialize.
      def initialize
        @bot_config = {
          :headers => {
            "User-Agent" => "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) snap Chromium/80.0.3987.132 Chrome/80.0.3987.132 Safari/537.36",
          },
        }
        bot_initialize
      end

      # No-op placeholder for the helper that bot_initialize calls.
      def bot_init_utils; end
    end
  end
end

class GoogleBot < Nadoka::NDK_Bot
  # Copies the plugin settings from @bot_config into instance variables.
  #
  # Defaults applied when a key is missing: search language 'ja', count
  # language '', at most 5 googlec words, no extra headers, URI logging off
  # and :tojis as the channel encoding method. A leading "lang_" on either
  # language setting is removed.
  def bot_initialize
    bot_init_utils

    config = @bot_config
    strip_lang_prefix = lambda { |value| value.sub(/^lang_/, '') }

    @search_default_lang = strip_lang_prefix.call(config[:search_default_lang] || 'ja')
    @count_default_lang  = strip_lang_prefix.call(config[:count_default_lang] || '')
    @googlec_maxwords    = config[:googlec_maxwords] || 5
    @api_key, @cx        = config[:api_key], config[:cx]
    @headers  = config.fetch(:headers, {})
    @uri_slog = config.fetch(:uri_slog, false)
    @ch_kcode = config.fetch(:ch_kcode, :tojis)
  end

  # PRIVMSG handler: answers a recognised command with a NOTICE on +ch+.
  #
  # Nothing happens when +ch+ is not matched by @available_channel or when
  # same_bot?(ch) is true. Otherwise the message is converted with
  # NKF.nkf('-w', ...) before dispatch, and a non-nil reply is converted by
  # calling the method named in @ch_kcode on it before it is sent.
  def on_privmsg prefix, ch, msg
    return if !(@available_channel === ch) || same_bot?(ch)

    reply = dispatch_command(NKF.nkf('-w', msg))
    send_notice(ch, reply.send(@ch_kcode)) if reply
  end

  # Searcher names accepted in commands; search_searcher returns the first
  # entry that matches an abbreviation, so the order matters.
  SEARCHER = %w[web calc code local video blogs news books images ime imed patent suggest].freeze
  # Non-capturing alternation of every SEARCHER name, embedded in the command
  # regexps of dispatch_command.
  SEARCHER_RE = /(?:#{SEARCHER.join('|')})/.freeze

  # Expands an abbreviated searcher name to the first SEARCHER entry that
  # starts with it (e.g. "c" -> "calc", "im" -> "images").
  #
  # key:: abbreviation typed by the user; compared literally, so regexp
  #       metacharacters in it are never interpreted as a pattern.
  #
  # Returns the full searcher name, or nil when no entry starts with +key+.
  def search_searcher key
    prefix = key.to_s
    SEARCHER.find { |searcher| searcher.start_with?(prefix) }
  end

  # Parses one message and runs the Google command it contains.
  #
  # Recognised forms, tried in this order:
  #   g> words                           -> custom_search
  #   goo..gle [searcher][:lang]> words  -> search (also gu..guru)
  #   googlec [searcher][:lang]> words   -> googlec
  #   gABBR[:lang]> words                -> search, ABBR expanded by
  #                                         search_searcher
  # For the goo..gle / gu..guru forms the number of extra "o" / "u" letters
  # is passed to search as its cnt argument.
  #
  # Returns the reply string, or nil when +msg+ is not a command (or the
  # abbreviation is unknown). Any exception is reported through
  # @manager.ndk_error and turned into a "google bot: ..." reply.
  def dispatch_command msg
    case msg
    when /^g>\s*(.+)/
      custom_search $1
    when /^goo(o*)gle( #{SEARCHER_RE})?(:.*?)?>\s*(.+)/o, /^gu(u*)guru(#{SEARCHER_RE})?(:.+)?>\s*(.+)/o
      "goo#{$1}gle#{$2} bot#{$3}: #{search($1.length, $3, $4, $2)}"
    when /^googlec( #{SEARCHER_RE})?(:.*?)?>\s*(.+)/o
      # Captures: $1 = searcher, $2 = lang, $3 = words. googlec expects
      # (lang, word, searcher), so the lang capture has to go first.
      "googlec#{$1} bot#{$2}: #{googlec($2, $3, $1)}"
    when /^g(\w+)?(:.*?)?>\s*(.+)/
      searcher = $1 ? search_searcher($1) : 'web'
      "google #{searcher} bot#{$2}: #{search(0, $2, $3, searcher)}" if searcher
    end
  rescue Exception => e
    # NOTE(review): Exception (not only StandardError) is rescued here, which
    # looks like a deliberate catch-all so that a failing search is reported
    # on the channel instead of propagating -- confirm before narrowing.
    @manager.ndk_error e
    "google bot: #{e.class} (#{e.message} @ #{e.backtrace[0]})"
  end

  # Looks +word+ up through the Google Custom Search JSON API endpoint and
  # summarises the first hit on one line.
  #
  # word:: query text; CGI-escaped before it is placed in the URL.
  #
  # Returns "TITLE - URL (and N hit[s])" with CR/LF removed, "no match" when
  # the response reports zero results or carries no items, or the first error
  # reason from the response body when the request fails with an HTTP error.
  def custom_search word
    # The key and engine id are escaped as well, so characters such as ":" in
    # the engine id cannot corrupt the query string.
    uri = "https://www.googleapis.com/customsearch/v1" \
          "?key=#{CGI.escape(@api_key.to_s)}&cx=#{CGI.escape(@cx.to_s)}" \
          "&q=#{CGI.escape(word)}"
    @logger.slog "GoogleBot: #{uri}" if @uri_slog
    # URI#open comes from open-uri; Kernel#open stopped fetching URLs in
    # Ruby 3.0, so the request goes through the parsed URI object instead.
    result = URI.parse(uri).open(@headers) do |f|
      JSON.parse(f.read)
    end
    @logger.slog "GoogleBot: #{result}" if @uri_slog

    count = result["searchInformation"]["totalResults"].to_i
    item, = result["items"]
    return "no match" if count == 0 || item.nil?

    # Keep the integer for the plural test: the comma-formatted string would
    # turn "1,234" back into 1 via to_i.
    formatted = count.to_s.gsub(/(\d)(?=\d{3}+$)/, '\\1,')
    "#{item['title']} - #{item['link']} (and #{formatted} hit#{count > 1 ? 's' : ''})".delete("\r\n")
  rescue OpenURI::HTTPError => e
    @logger.slog "GoogleBot: #{e.inspect}" if @uri_slog
    # The error response body is parsed as JSON and its first reason returned.
    result = JSON.parse(e.io.read)
    @logger.slog "GoogleBot: #{result.inspect}" if @uri_slog
    result["error"]["errors"][0]["reason"].delete("\r\n")
  end

  # Sends one query to the ajax.googleapis.com search service and returns the
  # parsed JSON response.
  #
  # word::     query text (CGI-escaped here).
  # cnt::      result offset; sent as "start" only when its integer value is
  #            greater than zero.
  # lang::     language code or nil; sent as "hl", and for the 'web' searcher
  #            also as "lr=lang_<lang>".
  # searcher:: service name appended to the endpoint path.
  #
  # The returned object additionally responds to estimatedTotalResultsCount,
  # which reads responseData -> cursor -> estimatedResultCount.
  #
  # A failing request is attempted up to five times in total; the last error
  # is re-raised. Only StandardError is retried, so interrupts and exit
  # requests are not swallowed by the retry loop.
  def do_search word, cnt, lang, searcher='web'
    uri = "http://ajax.googleapis.com/ajax/services/search/#{searcher}?v=1.0&q=#{CGI.escape(word)}"
    uri << "&key=#{CGI.escape(@api_key)}" if @api_key
    start = cnt.to_i
    uri << "&start=#{start}" if start > 0
    if lang
      uri << "&hl=#{CGI.escape(lang)}"
      uri << "&lr=lang_#{CGI.escape(lang)}" if searcher == 'web'
    end

    attempts = 0
    begin
      @logger.slog "GoogleBot: #{uri}" if @uri_slog
      # URI#open comes from open-uri; Kernel#open stopped fetching URLs in
      # Ruby 3.0, so the request goes through the parsed URI object instead.
      result = URI.parse(uri).open(@headers) do |f|
        JSON.parse(f.read)
      end
    rescue StandardError
      attempts += 1
      retry if attempts < 5
      raise
    end

    def result.estimatedTotalResultsCount
      self["responseData"]["cursor"]["estimatedResultCount"]
    end
    result
  end

  # Runs do_search and formats the first hit as a one-line reply.
  #
  # word, cnt, lang, searcher:: passed to do_search unchanged.
  #
  # Returns
  # * "error STATUS: DETAILS" when the response has no responseData, e.g.
  #   {"responseData": null, "responseDetails": "qps rate exceeded",
  #    "responseStatus": 503};
  # * "no match" when the estimated count is zero or the result list is
  #   empty;
  # * otherwise "TITLE - URL (and N hit[s])".
  def api_search word, cnt, lang, searcher
    result = do_search word, cnt, lang, searcher

    if result["responseData"].nil?
      return "error #{result['responseStatus']}: #{result['responseDetails']}"
    end

    count = result.estimatedTotalResultsCount.to_i
    hit = (result["responseData"]["results"] || []).first
    return "no match" if count <= 0 || hit.nil?

    url   = shorten_url(hit['unescapedUrl'] || hit['url'] || hit['postUrl'])
    title = show_char_code_and_erase_tag(hit['titleNoFormatting'])
    # Keep the integer for the plural test: the comma-formatted string would
    # turn "1,234" back into 1 via to_i.
    formatted = count.to_s.gsub(/(\d)(?=\d{3}+$)/, '\\1,')
    "#{title} - #{url} (and #{formatted} hit#{count > 1 ? 's' : ''})"
  end

  # Sends +exp+ to the www.google.co.jp search page and extracts the answer
  # from the returned HTML.
  #
  # Several page layouts are tried in turn, each recognised by one regexp
  # below. Returns the extracted text, "response error" when no layout
  # matches, or the message of any StandardError raised on the way.
  def google_calc exp
    @logger.slog("google_calc<#{exp.dump}")
    uri = "https://www.google.co.jp/search?ie=UTF8&oe=UTF-8&q=#{CGI.escape(exp)}"
    # URI#open comes from open-uri; Kernel#open stopped fetching URLs in
    # Ruby 3.0, so the request goes through the parsed URI object instead.
    html = URI.parse(uri).open(@headers) do |f|
      f.read
    end
    # With ruby -d the raw page is saved to g.html for inspection.
    File.open("g.html", "wb") { |f| f.write html } if $DEBUG
    if /class=r [^<>]+><b>(.+?)<\/b>/u =~ html
      result = $1
      # @logger.slog("google_calc>#{result.dump}")
      # <sup>n</sup> becomes ^(n); every other tag is dropped.
      result.gsub!(/<sup>(.+?)<\/sup>/u) { "^(#{$1})" }
      result.gsub!(/<.+?>/u, '')
      # &#215; is replaced by the two bytes \303\227.
      result.gsub!(/&\#215;/u, "\303\227")
      return result
    elsif /<[^<>]+ id="cwos"[^<>]*>([^<>]+)</u =~ html
      result = $1
      # The text of the id="cwles" element, when present, goes in front.
      if /<[^<>]+ id="cwles"[^<>]*>([^<>]+)</u =~ html
        result = "#{$1}#{result}"
      end
      #@logger.slog("google_calc>#{result.dump}")
      result.gsub!(/&nbsp;/u, " ")
      result.gsub!(/\s+/, " ")
      return result
    elsif /<div class="leg_calc[^<>]*>(?:<div[^<>]*>)+([^<>]+)<\/div><div[^<>]*>([^<>]+)</u =~ html
      result = "#{$1} #{$2}"
      #@logger.slog("google_calc>#{result.dump}")
      return result
    elsif /<g-card>(.*)<\/g-card>/ =~ html
      result = $1
      # Everything from the first <a onwards is cut, then tags are dropped.
      result.sub!(/<a.*/u, '')
      result.gsub!(/<.+?>/u, '')
      return result
    elsif /<div class="RJn8N xXEKkb ellip[^"]*">= ([^<>]+)</ =~ html
      result = $1
      #@logger.slog("google_calc>#{result.dump}")
      return result
    else
      #IO.write('g.html', html) if STDOUT.tty?
      "response error"
    end
  rescue StandardError => e
    e.to_s
  end

  # Asks suggestqueries.google.com for completions of +word+.
  #
  # word:: text to complete (CGI-escaped here).
  # lang:: language code sent as "hl", or nil to omit it.
  #
  # Returns the suggestions (second element of the JSON response) joined
  # with ", ".
  def google_suggest(word, lang)
    uri = "http://suggestqueries.google.com/complete/search?output=firefox&q=#{CGI.escape(word)}"
    uri << "&hl=#{CGI.escape(lang)}" if lang
    @logger.slog "GoogleBot: #{uri}" if @uri_slog

    # URI#open comes from open-uri; Kernel#open stopped fetching URLs in
    # Ruby 3.0, so the request goes through the parsed URI object instead.
    result = URI.parse(uri).open(@headers) do |f|
      JSON.parse(f.read)
    end
    result[1].join(", ")
  end

  # Builds (without fetching it) the code-search URL for +key+; the key is
  # CGI-escaped.
  def google_code key
    query = CGI.escape(key)
    "http://google.com/codesearch#search/&q=" + query + "&ct=os"
  end

  # encode_www_form(enum) turns key/value pairs into a query string.
  # Which implementation is defined is decided once, when this file is
  # loaded: URI.encode_www_form when it exists, otherwise a fallback that
  # joins URI.encode'd "key=value" pairs with "&".
  if defined?(URI.encode_www_form)
    def encode_www_form(enum)
      URI.encode_www_form(enum)
    end
  else
    def encode_www_form(enum)
      pairs = enum.map { |name, value| URI.encode(name) + '=' + URI.encode(value) }
      pairs.join('&')
    end
  end

  # Converts hiragana +text+ with the Google transliterate CGI API.
  # see http://www.google.com/intl/ja/ime/cgiapi.html
  #
  # text:: input sent as "text" with langpair "ja-Hira|ja".
  # d::    when true, every segment is listed as "original=cand1/cand2/..."
  #        and segments are joined with spaces; otherwise the first
  #        candidate of each segment is concatenated.
  #
  # Returns the result after show_char_code_and_erase_tag, or the first line
  # of the message of any StandardError raised on the way.
  def google_ime text, d=false
    url = 'http://www.google.com/transliterate?' +
          encode_www_form('langpair' => 'ja-Hira|ja', 'text' => text)
    # URI#open comes from open-uri; Kernel#open stopped fetching URLs in
    # Ruby 3.0, so the request goes through the parsed URI object instead.
    data = URI.parse(url).open(@headers) { |f|
      # TODO: gsub fix invalid JSON, should remove after fix response
      # see http://www.google.com/support/forum/p/ime/thread?tid=06501c8b7a16add3&hl=ja
      JSON.parse(f.read.gsub(/,(?=\n\])/,''))
    }
    result =
      if d
        data.map { |org, candidates| "#{org}=#{candidates.join('/')}" }.join(' ')
      else
        data.map { |_org, candidates| candidates[0] }.join('')
      end
    show_char_code_and_erase_tag(result)
  rescue StandardError => e
    e.to_s[/.+/] # first line
  end

  # Normalises the raw command pieces and hands the query to the back end
  # selected by the searcher name.
  #
  # cnt::      passed through to api_search.
  # lang::     raw ":lang" capture (or nil), resolved by lang_check.
  # word::     query text, converted by search_char_code.
  # searcher:: raw searcher capture (or nil), resolved by searcher_check.
  #
  # 'code', 'calc', 'ime', 'imed' and 'suggest' have dedicated handlers;
  # every other searcher goes to api_search.
  def search cnt, lang, word, searcher=nil
    language = lang_check(lang)
    engine   = searcher_check(searcher)
    query    = search_char_code(word)

    return google_code(query)              if engine == 'code'
    return google_calc(query)              if engine == 'calc'
    return google_ime(query)               if engine == 'ime'
    return google_ime(query, true)         if engine == 'imed'
    return google_suggest(query, language) if engine == 'suggest'

    api_search(query, cnt, language, engine)
  end

  # Reports the estimated hit count of every word in +word+.
  #
  # lang::     raw ":lang" capture (or nil); resolved by lang_check with
  #            @count_default_lang as the fallback.
  # word::     shell-style word list; each word is wrapped in double quotes.
  # searcher:: raw searcher capture (or nil), resolved by searcher_check.
  #
  # Returns 'too many options' when there are more than @googlec_maxwords
  # words, otherwise '"w1"(COUNT), "w2"(COUNT), ...'. A word whose response
  # has no responseData shows "error STATUS: DETAILS" in place of the count,
  # instead of failing the whole command.
  def googlec lang, word, searcher=nil
    lang = lang_check(lang, @count_default_lang)
    searcher = searcher_check(searcher)
    words = Shellwords.shellwords(word).map{|e| "\"#{e}\""}
    return 'too many options' if words.size > @googlec_maxwords

    words.map{|rw|
      w = search_char_code(rw)
      result = do_search "'#{w}'", 0, lang, searcher
      count =
        if result["responseData"].nil?
          "error #{result['responseStatus']}: #{result['responseDetails']}"
        else
          result.estimatedTotalResultsCount.to_s.gsub(/(\d)(?=\d{3}+$)/, '\\1,')
        end
      "#{rw}(#{count})"
    }.join(', ')
  end

  # Removes every "<...>" run from +str+ (shortest match, within one line)
  # and then decodes HTML entities.
  def erase_tag str
    without_tags = str.gsub(/\<.+?\>/, '')
    CGI.unescapeHTML(without_tags)
  end

  # Resolves the language to use for a query.
  #
  # lang::    raw ":xx" / ":lang_xx" capture from the command, or nil when
  #           the command carried no language part.
  # default:: language used when +lang+ is nil (defaults to
  #           @search_default_lang).
  #
  # The first character of +lang+ (the ":") and a leading "lang_" are
  # removed. Returns the language code, or nil when the outcome is empty
  # (e.g. ":" alone, or an empty default), meaning "no language".
  def lang_check lang, default = @search_default_lang
    chosen = lang ? lang[1..-1].to_s.sub(/\Alang_/, '') : default
    (chosen.nil? || chosen.empty?) ? nil : chosen
  end

  # Validates a raw searcher capture: surrounding whitespace is stripped and
  # the name is returned when it is listed in SEARCHER; nil or an unknown
  # name yields 'web'.
  def searcher_check searcher
    return 'web' unless searcher

    name = searcher.strip
    SEARCHER.include?(name) ? name : 'web'
  end

  # Converts +str+ for display: the text is converted with toutf8 when
  # strings respond to :encode, otherwise with toeuc, then passed through
  # erase_tag and decoded once more with CGI.unescapeHTML.
  #
  # NOTE(review): erase_tag already decodes entities, so they are decoded
  # twice ("&amp;amp;" ends up as "&"); kept as is -- confirm whether the
  # second pass is intended.
  def show_char_code_and_erase_tag str
    converted = str.respond_to?(:encode) ? str.toutf8 : str.toeuc
    CGI.unescapeHTML(erase_tag(converted))
  end

  # Converts a query string before it is sent to Google.
  #
  # When strings respond to :encode the result is simply str.toutf8.
  # Otherwise the conversion depends on $KCODE: 'EUC' / 'SJIS' -> toeuc,
  # 'NONE' -> toeuc followed by an Iconv EUC-JP to UTF-8 conversion,
  # 'UTF-8' -> unchanged; any other value raises.
  def search_char_code str
    return str.toutf8 if str.respond_to?(:encode)

    kcode = $KCODE
    if kcode == 'EUC' || kcode == 'SJIS'
      str.toeuc
    elsif kcode == 'NONE'
      begin
        Iconv.conv("UTF-8", "EUC-JP", str.toeuc)
      rescue => e
        raise "(char code problem: #{e.class})"
      end
    elsif kcode == 'UTF-8'
      str
    else
      raise
    end
  end

  # Shortens http://www.amazon.co.jp/ product links to
  # "http://amazon.jp/dp/..."; any other URL is returned unchanged.
  def shorten_url(url)
    product = %r!\Ahttp://www\.amazon\.co\.jp/.*(/dp/.+)\z!.match(url.to_s)
    product ? "http://amazon.jp#{product[1]}" : url
  end
end

if $0 == __FILE__
  # Ad hoc command-line driver: every argument is treated as one channel
  # message, echoed, and followed by the bot's reply. With no arguments only
  # the usage text is printed.
  if ARGV.empty?
    puts "ad hoc test usage:",
         " ruby -vd plugins/googlebot.nb 'gc>1+1'",
         " ruby -vd plugins/googlebot.nb 'gc>1ドルを円で'"
  end
  require 'logger'
  google_bot = GoogleBot.new

  # Minimal stand-in for the bot's logger: slog lines go to standard error.
  stderr_logger = Object.new
  def stderr_logger.slog(log)
    STDERR.puts "slog>#{log}"
  end
  google_bot.instance_variable_set(:@logger, stderr_logger)

  ARGV.each do |command|
    puts command, google_bot.dispatch_command(command)
  end
end