# lib/html_sanitizer/scrubber/link.rb
# Copyright (C) 2012-2024 Zammad Foundation, https://zammad-foundation.org/
class HtmlSanitizer
module Scrubber
class Link < Base
# Lower-cased names of every URI scheme registered with Ruby's URI library,
# without 'mailto' and with 'tel' added. Passed to URI.extract by node_urls
# to decide which plain-text URLs get turned into links.
# Frozen so the shared list cannot be modified by accident at runtime.
LINKABLE_URL_SCHEMES = (URI.scheme_list.keys.map(&:downcase) - ['mailto'] + ['tel']).freeze
# Read access to the two constructor options:
# - external:            checked by href_retry_protocol? before prefixing scheme-less hrefs
# - web_app_url_prefix:  compared against lower-cased hrefs in href_set_values
attr_reader :external, :web_app_url_prefix
# Stores the scrubber options.
#
# @param web_app_url_prefix [String] prefix that href_set_values compares
#   lower-cased hrefs against; matching links do not get target="_blank"
# @param external [Boolean] when truthy, href_retry_protocol? allows
#   scheme-less hrefs to be prefixed with "http://"
def initialize(web_app_url_prefix:, external: false) # rubocop:disable Lint/MissingSuper
  @web_app_url_prefix = web_app_url_prefix
  @external           = external

  # NOTE(review): presumably read by the scrubber base class to choose the
  # traversal order - confirm against Base / Loofah.
  @direction = :top_down
end
# Scrubber entry point for a single node.
#
# 1. If node_urls finds URLs in the node, they are turned into links via
#    add_link; an empty URL list ends processing of this node right away.
# 2. Afterwards the node's own href is cleaned up, anchors without href are
#    replaced by their children, and a missing title is filled in.
#
# @return [nil] when node_urls returned an empty list or href_cleanup
#   returned a truthy value
# @return [Object] STOP when ensure_href_present replaced the node
# @return [Object] otherwise whatever update_node_title returns
def scrub(node)
  found = node_urls(node)
  return if found&.blank?

  add_link(node.content, found, node) if found

  # prepare links
  if href_cleanup(node)
    nil
  elsif ensure_href_present(node)
    STOP
  else
    update_node_title(node)
  end
end
private
# Normalizes the node's href and decorates the node when the href is kept.
#
# The href is cleaned via cleanup_target (spaces kept). If
# href_retry_protocol? approves, "http://" is prepended to the node's
# current href and the prefixed value is used from then on. Only hrefs
# accepted by href_starts_with_protocol? are written back through
# href_set_values.
#
# @return [nil]   the node has no href
# @return [true]  the href does not start with an accepted protocol
# @return [false] the href was accepted and href_set_values was applied
def href_cleanup(node)
  raw_href = node['href']
  return if !raw_href

  href    = cleanup_target(raw_href, keep_spaces: true)
  compact = href.gsub(%r{[[:space:]]}, '')

  if href_retry_protocol?(compact)
    node['href'] = "http://#{node['href']}"
    href         = node['href']
    compact      = href.gsub(%r{[[:space:]]}, '')
  end

  if href_starts_with_protocol?(compact)
    href_set_values(node, href)
    false
  else
    true
  end
end
# Decides whether an href should be retried with an "http://" prefix.
#
# @param href_without_spaces [String] href with all whitespace removed
# @return [true] for external content whose href is non-blank, is not a
#   mailto:/tel: link, is not protocol-relative ("//") and does not already
#   carry a short "<scheme>://" prefix
# @return [nil] in every other case
def href_retry_protocol?(href_without_spaces)
  return if !external
  return if href_without_spaces.blank?

  lowered = href_without_spaces.downcase
  return if lowered.start_with?('mailto:', 'tel:', '//')
  # anything that already looks like "<up to 6 chars>://rest" has a scheme
  return if %r{^.{1,6}://.+?}.match?(lowered)

  true
end
# Checks whether an href begins with an accepted prefix once it has been
# URL-decoded, re-encoded as UTF-8, stripped of whitespace and lower-cased.
#
# @param href_without_spaces [String]
# @return [Boolean] true when the normalized value starts with
#   "http", "ftp" or "//"
def href_starts_with_protocol?(href_without_spaces)
  decoded    = CGI.unescape(href_without_spaces).utf8_encode(fallback: :read_as_sanitized_binary)
  # decoding may have produced new whitespace (e.g. from %0A), so strip again
  normalized = decoded.gsub(%r{[[:space:]]}, '').downcase

  normalized.start_with?('http', 'ftp', '//')
end
# Writes the final link attributes onto the node.
#
# Always sets href and rel. target="_blank" is added only when the
# lower-cased href does not start with web_app_url_prefix.
#
# @param node  [#set_attribute] element to decorate
# @param value [String] href to store
def href_set_values(node, value)
  {
    'href' => value,
    'rel'  => 'nofollow noreferrer noopener',
  }.each { |attribute, content| node.set_attribute(attribute, content) }

  # do not "target=_blank" WebApp URLs (e.g. mentions)
  return if value.downcase.start_with?(web_app_url_prefix)

  node.set_attribute('target', '_blank')
end
# Extracts linkable URLs from a text node.
#
# @return [nil] when the node is not a Nokogiri text node, its content is
#   blank or contains no ':', or it sits inside an <a> or <pre> element
# @return [Array<String>] otherwise the URLs found (possibly empty), in
#   order of appearance
def node_urls(node)
  return if !node.is_a?(Nokogiri::XML::Text)

  text = node.content
  return if text.blank? || text.exclude?(':')
  return if node.ancestors.any? { |ancestor| %w[a pre].include?(ancestor.name) }

  candidates = URI.extract(text, LINKABLE_URL_SCHEMES)

  # URI::extract captures trailing dots/commas
  trimmed = candidates.map { |u| u.sub(%r{[,.]$}, '') }

  # URI::extract will match, e.g., 'tel:'
  trimmed.grep_v(%r{^[^:]+:$})
end
# Replaces an <a> element that has no (or a blank) href with its children.
#
# @return [true] when the node was replaced
# @return [nil]  when the node is not an <a> or already has an href
def ensure_href_present(node)
  return if node.name != 'a' || node['href'].present?

  # the anchor is swapped for the serialized markup of its own children
  node.replace(node.children.to_s)
  true
end
# Copies the href into the title attribute of an <a> element, unless the
# link text already equals the href (per url_same?) or a title is present.
#
# @return [String, nil] the assigned title, or nil when nothing was changed
def update_node_title(node)
  return if node.name != 'a'

  href = node['href']
  return if url_same?(href, node.text) || node['title'].present?

  node['title'] = href
end
# Turns the URLs found in a piece of text into <a> elements.
#
# Takes the next URL from +urls+, splits +content+ around it and inserts a
# link element after +node+. The text before the URL either becomes the new
# content of +node+ (when +node+ is the original text node) or a new text
# node placed between +node+ and the link. The text after the URL is handled
# by a recursive call that appends behind the link just created; once no
# URLs are left, add_link_blank_text appends the remaining text.
#
# The split happens at the FIRST occurrence of the URL. +urls+ is expected
# in order of appearance, so splitting anywhere else would skip repeated
# URLs, or cut a later URL apart when the current one is a prefix of it
# (e.g. "http://a.com" followed by "http://a.com/b").
#
# @param content [String] text still to be processed
# @param urls [Array<String>] URLs contained in +content+, in order of
#   appearance; consumed (shifted) while processing
# @param node [Nokogiri::XML::Node] node after which new nodes are inserted
# @return [nil]
def add_link(content, urls, node)
  return if add_link_blank_text(content, urls, node)

  url = urls.shift
  pre, matched, post = content.partition(url)

  if matched.empty?
    # URL not found. When appending behind a link we created, +content+ is
    # not in the document yet - keep it as plain text instead of dropping it.
    # The original text node still holds its content and is left untouched.
    node.add_next_sibling(Nokogiri::XML::Text.new(content, node.document)) if node.class != Nokogiri::XML::Text
    return
  end

  a_elem = add_link_build_node(node, url)

  if node.class != Nokogiri::XML::Text
    text = Nokogiri::XML::Text.new(pre, node.document)
    node.add_next_sibling(text).add_next_sibling(a_elem)
  else
    add_link_apply_to_node(node, pre, a_elem)
  end

  return if post.blank?

  add_link(post, urls, a_elem)
end
# Shrinks the text node to the text preceding the URL and places the link
# element directly behind it.
#
# @return [Object] whatever node.add_next_sibling returns
def add_link_apply_to_node(node, pre, a_elem)
  node.tap { |text_node| text_node.content = pre }.add_next_sibling(a_elem)
end
# Terminal step of add_link: once no URLs are left, the remaining content is
# appended as a plain text node behind +node+.
#
# @return [false] when URLs are still pending (nothing is inserted)
# @return [true]  when the text node was appended
def add_link_blank_text(content, urls, node)
  if urls.present?
    false
  else
    node.add_next_sibling(Nokogiri::XML::Text.new(content, node.document))
    true
  end
end
# Builds the <a> element for a detected URL in the document of +node+.
# URLs starting with "www" (any case) get an "http://" prefix; the resulting
# URL is used both as href and as link text.
#
# @return [Nokogiri::XML::Node] the new, not yet attached link element
def add_link_build_node(node, url)
  target = url.match?(%r{^www}i) ? "http://#{url}" : url

  Nokogiri::XML::Node.new('a', node.document).tap do |anchor|
    anchor['href']   = target
    anchor['rel']    = 'nofollow noreferrer noopener'
    anchor['target'] = '_blank'
    anchor.content   = target
  end
end
# Normalizes a link target before it is inspected.
#
# Steps: re-encode as UTF-8, drop all whitespace (unless keep_spaces is
# given), strip, remove tab/newline/carriage-return/NUL characters, remove
# /* ... */ and <!-- ... --> comments, and finally pass the result through
# sanitize_attachment_disposition.
#
# @param string [String] raw target
# @param options [Hash] only :keep_spaces is read
# @return [String] whatever sanitize_attachment_disposition returns
def cleanup_target(string, **options)
  target = string.utf8_encode(fallback: :read_as_sanitized_binary)
  target = target.gsub(%r{[[:space:]]}, '') if !options[:keep_spaces]

  without_control_chars = target.strip.delete("\t\n\r\u0000")
  without_comments      = without_control_chars
                          .gsub(%r{/\*.*?\*/}, '')
                          .gsub(%r{<!--.*?-->}, '')

  sanitize_attachment_disposition(without_comments)
end
# Forces "disposition=attachment" on URLs that point at this system.
#
# Only URLs whose host equals the 'fqdn' setting (looked up once per
# instance) and that carry a query string are touched: an existing
# "disposition" parameter is overwritten with "attachment" and the query is
# re-encoded. Anything that raises along the way (e.g. an unparsable URL)
# results in the input being returned unchanged.
#
# @param url [String]
# @return [String]
def sanitize_attachment_disposition(url)
  @fqdn ||= Setting.get('fqdn')

  uri = URI(url)
  own_host_with_query = uri.host == @fqdn && uri.query.present?

  if own_host_with_query
    params = CGI.parse(uri.query)
    params['disposition'] = 'attachment' if params.key?('disposition')
    uri.query = URI.encode_www_form(params)
  end

  uri.to_s
rescue
  # best effort: keep the original value when it cannot be processed
  url
end
# Compares two URLs after normalizing both with url_same_build.
#
# @return [Boolean] true when the normalized values are equal, or when one
#   equals the other prefixed with "http://" or "https://"
def url_same?(url_new, url_old)
  candidate = url_same_build(url_new)
  reference = url_same_build(url_old)
  return true if candidate == reference

  %w[http:// https://].any? do |scheme|
    reference == "#{scheme}#{candidate}" || candidate == "#{scheme}#{reference}"
  end
end
# Normalizes a URL (or link text) for comparison in url_same?.
#
# The input is converted to a string, URL-decoded, re-encoded as UTF-8,
# lower-cased, freed from one trailing "/" and from all whitespace, then
# passed through html_decode; finally the first "/?" is collapsed to "?".
#
# @param input [#to_s]
# @return [String]
def url_same_build(input)
  decoded    = CGI.unescape(input.to_s).utf8_encode(fallback: :read_as_sanitized_binary)
  normalized = decoded
               .downcase
               .delete_suffix('/')
               .gsub(%r{[[:space:]]|\t|\n|\r}, '')
               .strip

  html_decode(normalized).sub('/?', '?')
end
end
end
end