Coursemology/coursemology2

View on GitHub
app/helpers/application_html_formatters_helper.rb

Summary

Maintainability
A
25 mins
Test Coverage
# frozen_string_literal: true
# rubocop:disable Metrics/ModuleLength
module ApplicationHtmlFormattersHelper
  # Upper bounds on a code fragment, checked by +code_size_exceeds_limit?+ before highlighting.
  MAX_CODE_SIZE = 1024 * 50 # bytes (50 KB)
  MAX_CODE_LINES = 1_000

  # Replaces the Rails sanitizer with one backed by an HTML Pipeline sanitization filter.
  #
  # @param [String] text The markup to sanitise.
  # @param [Hash] _options Accepted for signature compatibility; not used.
  # @return [String]
  def sanitize(text, _options = {})
    context = { whitelist: SANITIZATION_FILTER_WHITELIST }
    sanitizer = HTML::Pipeline.new([HTML::Pipeline::SanitizationFilter], context)

    format_with_pipeline(sanitizer, text)
  end

  # Sanitises and formats a user-supplied string that is assumed to contain HTML markup.
  # The text is run through the converting HTML pipeline, so markup may be transformed
  # depending on the transformers registered in that pipeline.
  #
  # @param [String] text The text to display
  # @return [String]
  def format_html(text)
    pipeline = DEFAULT_HTML_CONVERTING_PIPELINE
    format_with_pipeline(pipeline, text)
  end

  # Processes CKEditor rich text with the converting HTML pipeline.
  #
  # @param [String] text The rich text to process.
  # @return [String]
  def format_ckeditor_rich_text(text)
    pipeline = DEFAULT_HTML_CONVERTING_PIPELINE
    process_ckeditor_rich_text_with_pipeline(pipeline, text)
  end

  # Processes CKEditor rich text with the sanitising (non-converting) HTML pipeline.
  #
  # @param [String] text The rich text to process.
  # @return [String]
  def sanitize_ckeditor_rich_text(text)
    pipeline = DEFAULT_HTML_PIPELINE
    process_ckeditor_rich_text_with_pipeline(pipeline, text)
  end

  # Syntax highlights the given code fragment and adds line numbers to it.
  #
  # All line endings are normalised to Unix format (\n) for use with the Rouge highlighter.
  # When the fragment exceeds the size or line limit, a warning alert is rendered in place of
  # the code.
  #
  # @param [String] code The code to syntax highlight.
  # @param [Coursemology::Polyglot::Language] language The language to highlight the code block
  #   with.
  # @param [Integer] start_line The line number of the first line, default is 1. This
  #   should be provided if the code fragment does not start on the first line.
  def format_code_block(code, language = nil, start_line = 1)
    return sanitize_and_format_code(code, language, start_line) unless code_size_exceeds_limit?(code)

    content_tag(:div, class: 'alert alert-warning') { I18n.t('layouts.code_formatter.size_too_big') }
  end

  # Syntax highlights the given code fragment without adding line numbers.
  #
  # All line endings are normalised to Unix format (\n) for use with the Rouge highlighter.
  #
  # @param [String] code The code to syntax highlight.
  # @param [Coursemology::Polyglot::Language] language The language to highlight the code block
  #   with.
  # @return [String, nil] The highlighted markup, or nil when the code exceeds the size/line limit.
  def highlight_code_block(code, language = nil)
    return if code_size_exceeds_limit?(code)

    # Escape only content that is not already marked as safe, then normalise line endings.
    escaped = code.html_safe? ? code : html_escape(code)
    unix_code = escaped.gsub(/\r\n|\r/, "\n").html_safe

    lexer = language.rouge_lexer if language
    markup = content_tag(:pre, lang: lexer) do
      content_tag(:code) { unix_code }
    end

    filters = DEFAULT_PIPELINE.filters + [PreformattedTextLineSplitFilter]
    format_with_pipeline(HTML::Pipeline.new(filters, DEFAULT_CODE_PIPELINE_OPTIONS), markup)
  end

  # Builds an HTML pipeline that runs the sanitization filter ahead of the default filters.
  #
  # @param [Hash] custom_options Pipeline options; merged over DEFAULT_PIPELINE_OPTIONS.
  # @return [HTML::Pipeline]
  def self.build_html_pipeline(custom_options)
    sanitizing_filters = HTML::Pipeline.new([HTML::Pipeline::SanitizationFilter], custom_options).filters
    merged_options = DEFAULT_PIPELINE_OPTIONS.merge(custom_options)

    HTML::Pipeline.new(sanitizing_filters + DEFAULT_PIPELINE.filters, merged_options)
  end

  private_class_method :build_html_pipeline

  private

  # List of video hosting site URLs to allow.
  #
  # Each pattern is anchored to the start of the URL and accepts an optional scheme
  # (http:, https: or protocol-relative) and an optional "www." prefix. Every literal dot in a
  # host name is escaped so that look-alike hosts (e.g. "mxyoutube.com") are not accepted.
  VIDEO_URL_WHITELIST = Regexp.union(
    %r{\A(?:https?:)?//(?:www\.)?(?:m\.)?youtube\.com/},
    %r{\A(?:https?:)?//(?:www\.)?youtu\.be/},
    %r{\A(?:https?:)?//(?:www\.)?(?:player\.)?vimeo\.com/},
    %r{\A(?:https?:)?//(?:www\.)?vine\.co/},
    %r{\A(?:https?:)?//(?:www\.)?instagram\.com/},
    %r{\A(?:https?:)?//(?:www\.)?(?:geo\.)?dailymotion\.com/},
    %r{\A(?:https?:)?//(?:www\.)?dai\.ly/},
    %r{\A(?:https?:)?//(?:www\.)?youku\.com/}
  ).freeze

  # Transformer to whitelist <oembed> elements whose +url+ attribute matches VIDEO_URL_WHITELIST.
  # Nodes that are already whitelisted, are not elements, or do not match are left to the
  # remaining sanitisation rules (the lambda returns nil for them).
  OEMBED_WHITELIST_TRANSFORMER = lambda do |env|
    oembed = env[:node]

    next if env[:is_whitelisted] || !oembed.element?
    next unless env[:node_name] == 'oembed' && oembed['url']&.match?(VIDEO_URL_WHITELIST)

    { node_whitelist: [oembed] }
  end.freeze

  # Transformer that whitelists <oembed> elements whose +url+ matches VIDEO_URL_WHITELIST and
  # additionally inserts the provider's embed HTML right after the element.
  #
  # The embed HTML is looked up through OEmbed::Providers, parsed as an HTML5 fragment, and only
  # the fragment's first child is inserted.
  # NOTE(review): errors raised by the provider lookup are not rescued here, so they propagate
  # to whoever runs the pipeline.
  OEMBED_WHITELIST_CONVERTER = lambda do |env|
    oembed = env[:node]

    next if env[:is_whitelisted] || !oembed.element?
    next unless env[:node_name] == 'oembed' && oembed['url']&.match?(VIDEO_URL_WHITELIST)

    embed_html = OEmbed::Providers.get(oembed['url']).html
    oembed.add_next_sibling(Nokogiri::HTML5.fragment(embed_html).children.first)

    { node_whitelist: [oembed] }
  end.freeze

  # Transformer to whitelist iframes containing embedded video content.
  #
  # Only <iframe> elements whose +src+ matches VIDEO_URL_WHITELIST qualify. A qualifying iframe
  # is first cleaned with Sanitize.node! using the attribute list below and then whitelisted.
  VIDEO_WHITELIST_TRANSFORMER = lambda do |env|
    iframe = env[:node]

    next if env[:is_whitelisted] || !iframe.element?
    next unless env[:node_name] == 'iframe' && iframe['src']&.match?(VIDEO_URL_WHITELIST)

    permitted_attributes = %w[allowfullscreen frameborder height src width]
    Sanitize.node!(iframe, elements: ['iframe'], attributes: { 'iframe' => permitted_attributes })

    { node_whitelist: [iframe] }
  end.freeze

  # Transformer to whitelist <img> elements, including base64 encoded (data:) images.
  #
  # An <img> without a +src+ attribute is unlinked from the document. Any other <img> is cleaned
  # with Sanitize.node! using the protocol, attribute and CSS lists below, then whitelisted.
  # TODO: Remove 'data' from whitelisted protocols once we disable Base64 encoding
  IMAGE_WHITELIST_TRANSFORMER = lambda do |env|
    image = env[:node]

    next if env[:is_whitelisted] || !image.element?
    next unless env[:node_name] == 'img'
    next image.unlink unless image['src']

    Sanitize.node!(
      image,
      elements: ['img'],
      protocols: ['http', 'https', 'data', :relative],
      attributes: { 'img' => %w[src style] },
      css: { properties: %w[height width] }
    )

    { node_whitelist: [image] }
  end.freeze

  # SanitizationFilter Custom Options
  # See https://github.com/gjtorikian/html-pipeline#2-how-do-i-customize-an-allowlist-for-sanitizationfilters
  #
  # Starts from a deep copy of the filter's stock allowlist and extends it with extra elements,
  # attributes, CSS properties and the video/image transformers defined above.
  SANITIZATION_FILTER_WHITELIST = HTML::Pipeline::SanitizationFilter::ALLOWLIST.deep_dup.tap do |config|
    config[:remove_contents] = ['style']
    config[:elements] |= %w[span font u colgroup col]

    attributes = config[:attributes]
    attributes[:all] |= ['style']
    attributes['font'] = ['face']
    %w[table code figure].each { |tag| attributes[tag] = ['class'] }

    config[:css] = {
      properties: %w[
        background-color color font-family margin
        margin-bottom margin-left margin-right margin-top text-align
        width list-style-type
      ]
    }
    config[:transformers] |= [VIDEO_WHITELIST_TRANSFORMER, IMAGE_WHITELIST_TRANSFORMER]
  end.freeze

  # Options shared by every pipeline built in this helper.
  DEFAULT_PIPELINE_OPTIONS = { scope: 'codehilite', replace_br: true }.freeze

  # Options for the code pipelines: the shared options plus the CSS class for code tables.
  DEFAULT_CODE_PIPELINE_OPTIONS = DEFAULT_PIPELINE_OPTIONS.merge(
    css_table_class: 'table'
  ).freeze

  # The default pipeline, used by both text and HTML pipelines.
  # It runs the autolink filter followed by the syntax highlight filter.
  DEFAULT_PIPELINE = HTML::Pipeline.new(
    [
      HTML::Pipeline::AutolinkFilter,
      HTML::Pipeline::SyntaxHighlightFilter
    ],
    DEFAULT_PIPELINE_OPTIONS
  )

  # The default HTML pipeline that sanitises HTML.
  # Uses a private copy of the sanitization whitelist with the oEmbed whitelist transformer
  # placed ahead of the other transformers.
  DEFAULT_HTML_PIPELINE = begin
    sanitizing_rules = SANITIZATION_FILTER_WHITELIST.deep_dup
    sanitizing_rules[:transformers].unshift(OEMBED_WHITELIST_TRANSFORMER)

    build_html_pipeline({ whitelist: sanitizing_rules })
  end

  # The default HTML pipeline that sanitises AND converts certain HTML markups for display/formatting purposes.
  # This pipeline is generally NOT used for saving to the database.
  # Uses a private copy of the sanitization whitelist with the oEmbed converter placed ahead of
  # the other transformers.
  DEFAULT_HTML_CONVERTING_PIPELINE = begin
    converting_rules = SANITIZATION_FILTER_WHITELIST.deep_dup
    converting_rules[:transformers].unshift(OEMBED_WHITELIST_CONVERTER)

    build_html_pipeline({ whitelist: converting_rules })
  end

  # Tests whether the given code exceeds the byte-size limit or the line-count limit.
  #
  # @param [String, nil] code The code to measure.
  # @return [Boolean, nil] nil when no code is given.
  def code_size_exceeds_limit?(code)
    return unless code

    too_many_bytes = code.bytesize > MAX_CODE_SIZE
    too_many_bytes || code.lines.size > MAX_CODE_LINES
  end

  # Escapes the code, wraps it in <pre><code> and runs it through the line-numbering code pipeline.
  #
  # @param [String] code The code to format.
  # @param [Coursemology::Polyglot::Language] language The language to highlight with, if any.
  # @param [Integer] start_line The line number of the first line.
  # @return [String]
  def sanitize_and_format_code(code, language, start_line)
    # Escape only content that is not already marked as safe, then normalise line endings.
    escaped = code.html_safe? ? code : html_escape(code)
    unix_code = escaped.gsub(/\r\n|\r/, "\n").html_safe

    lexer = language.rouge_lexer if language
    markup = content_tag(:pre, lang: lexer) { content_tag(:code) { unix_code } }

    format_with_pipeline(default_code_pipeline(start_line), markup)
  end

  # Merges adjacent code tags inside <pre>, filters the text through the pipeline, and adds the
  # bordered-table classes to every bare <table> tag in the result.
  #
  # NOTE(review): the value returned is the result of +gsub+, not the pipeline output itself;
  # confirm callers do not rely on it being marked html_safe.
  #
  # @param [HTML::Pipeline] pipeline The pipeline to filter with.
  # @param [String] text The CKEditor rich text.
  # @return [String]
  def process_ckeditor_rich_text_with_pipeline(pipeline, text)
    merged_code_text = remove_internal_adjacent_code_tags(text)
    formatted = format_with_pipeline(pipeline, merged_code_text)

    # Add lines to tables
    formatted.gsub('<table>', '<table class="table table-bordered">')
  end

  # Filters the given text through the given pipeline.
  #
  # The text is wrapped in a dummy <div> root because html-pipeline needs a root element; only
  # the inner HTML of that root is returned, marked as HTML safe.
  #
  # @param [HTML::Pipeline] pipeline The pipeline to filter with.
  # @param [String] text The text to filter.
  # @return [String]
  def format_with_pipeline(pipeline, text)
    document = pipeline.to_document("<div>#{text}</div>")
    dummy_root = document.child

    dummy_root.inner_html.html_safe
  end

  # The Code formatter pipeline: the default filters followed by the line numbers filter.
  #
  # @param [Integer] starting_line_number The line number of the first line, default is 1.
  # @return [HTML::Pipeline]
  def default_code_pipeline(starting_line_number = 1)
    filters = [*DEFAULT_PIPELINE.filters, PreformattedTextLineNumbersFilter]
    options = DEFAULT_CODE_PIPELINE_OPTIONS.merge(line_start: starting_line_number)

    HTML::Pipeline.new(filters, options)
  end

  # Removes adjacent code tags inside pre tag
  # In the past, when creating multiline codeblock using summernote,
  # it would generate <pre><code>some code </code><code> some other code</code></pre>
  # When there are multiple code tags within a pre tag, CKEditor will automatically
  # add pre tag for every code tag, which messes up the display.
  # This function will convert <pre><code></code>  <code></code></pre> into
  # <pre><code>  </code></pre>
  #
  # Whatever sits between a closing and the next opening code tag is kept, including line
  # breaks, so code tags separated by a newline are merged as well. Only attribute-less <pre>
  # tags are considered; code tags outside of them are left untouched.
  #
  # @param [String] text The text to be updated
  # @return [String, nil] The updated text, or nil when no text is given.
  def remove_internal_adjacent_code_tags(text)
    return unless text

    # Multiline mode lets `.` span line breaks, both within a <pre> block and between code tags.
    text.gsub(%r{<pre>.*?</pre>}m) do |pre_block|
      # Drop each "</code>...<code ...>" boundary but keep the text in between.
      pre_block.gsub(%r{</code>(.*?)<code[^>]*>}m, '\\1')
    end
  end
end
# rubocop:enable Metrics/ModuleLength