lib/atig/ofilter/escape_url.rb
# -*- mode:ruby; coding:utf-8 -*-
require 'atig/util'
require 'atig/http'
require 'atig/url_escape'
begin
require "punycode"
rescue LoadError
end
module Atig
module OFilter
class EscapeUrl
include Util
def initialize(context)
@log = context.log
@http = Atig::Http.new @log
end
def call(status)
status.merge(status: escape_http_urls(status[:status]))
end
def exist_uri?(uri, limit = 1)
ret = nil
#raise "Not supported." unless uri.is_a?(URI::HTTP)
return ret if limit.zero? or uri.nil? or not uri.is_a?(URI::HTTP)
@log.debug uri.inspect
req = @http.req :head, uri
@http.http(uri, 3, 2).request(req) do |res|
ret = case res
when Net::HTTPSuccess
true
when Net::HTTPRedirection
uri = resolve_http_redirect(uri)
exist_uri?(uri, limit - 1)
when Net::HTTPClientError
false
else
nil
end
end
ret
rescue => e
@log.error e.inspect
ret
end
def escape_http_urls(text)
original_text = text.encoding!("UTF-8").dup
if defined? ::Punycode
# TODO: Nameprep
text.gsub!(%r{(https?://)([^\x00-\x2C\x2F\x3A-\x40\x5B-\x60\x7B-\x7F]+)}) do
domain = $2
# Dots:
# * U+002E (full stop) * U+3002 (ideographic full stop)
# * U+FF0E (fullwidth full stop) * U+FF61 (halfwidth ideographic full stop)
# => /[.\u3002\uFF0E\uFF61] # Ruby 1.9 /x
$1 + domain.split(/\.|\343\200\202|\357\274\216|\357\275\241/).map do |label|
break [domain] if /\A-|[\x00-\x2C\x2E\x2F\x3A-\x40\x5B-\x60\x7B-\x7F]|-\z/ === label
next label unless /[^-A-Za-z0-9]/ === label
punycode = Punycode.encode(label)
break [domain] if punycode.size > 59
"xn--#{punycode}"
end.join(".")
end
if text != original_text
log :info, "Punycode encoded: #{text}"
original_text = text.dup
end
end
urls = []
text.split(/[\s<>]+/).each do |str|
next if /%[0-9A-Fa-f]{2}/ === str
# URI::UNSAFE + "#"
escaped_str = Atig::UrlEscape.escape(str, %r{[^-_.!~*'()a-zA-Z0-9;/?:@&=+$,\[\]#]}) #'
URI.extract(escaped_str, %w[http https]).each do |url|
uri = URI(Atig::UrlEscape.rstrip(url))
if not urls.include?(uri.to_s) and self.exist_uri?(uri)
urls << uri.to_s
end
end if escaped_str != str
end
urls.each do |url|
unescaped_url = Atig::UrlEscape.unescape(url).encoding!("UTF-8")
text.gsub!(unescaped_url, url)
end
log :info, "Percent encoded: #{text}" if text != original_text
text.encoding!("UTF-8")
rescue => e
log :error, e
text
end
end
end
end