lib/quesadilla/extractor/markdown.rb from soffes/quesadilla

lib/quesadilla/extractor/markdown.rb
Summary

Maintainability

25 mins
Test Coverage

Issues
# encoding: UTF-8

module Quesadilla
  class Extractor
    # Extract Markdown.
    #
    # This module has no public methods.
    module Markdown
    private

      # Gruber's regex is recursive, but I can't figure out how to do it in Ruby without the `g` option.
      # Maybe I should use StringScanner instead. For now, I think it's fine. Everything appears to work
      # as expected.
      NESTED_BRACKETS_REGEX = %r{
        (?>
           [^\[\]]+
        )*
      }x.freeze

      # 2 = Text, 3 = URL, 6 = Title
      LINK_REGEX = %r{
        (
          \[
            (#{NESTED_BRACKETS_REGEX})
          \]
          \(
            [ \t]*
          <?(.*?)>?
            [ \t]*
          (
            (['"])
            (.*?)
            \5
          )?
          \)
        )
      }x.freeze

      # 1 = URL
      AUTOLINK_LINK_REGEX = /<((?:https?|ftp):[^'">\s]+)>/i.freeze

      # 1 = Email
      AUTOLINK_EMAIL_REGEX = %r{
        <
            (?:mailto:)?
        (
          [-.\w]+
          \@
          [-a-z0-9]+(?:\.[-a-z0-9]+)*\.[a-z]+
        )
        >
      }xi.freeze

      # 1 = Delimiter, 2 = Text
      BOLD_ITALIC_REGEX = %r{ (\*\*\*|___) (?=\S) (.+?[*_]*) (?<=\S) \1 }x.freeze

      # 1 = Delimiter, 2 = Text
      BOLD_REGEX = %r{ (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }x.freeze

      # 1 = Delimiter, 2 = Text
      ITALIC_REGEX = %r{ (\*|_) (?=\S) (.+?) (?<=\S) \1 }x.freeze

      # 1 = Delimiter, 2 = Text
      STRIKETHROUGH_REGEX = %r{ (~~) (?=\S) (.+?[~]*) (?<=\S) \1 }x.freeze

      # 1 = Delimiter, 2 = Text
      CODE_REGEX = %r{ (`+) (.+?) (?<!`) \1 (?!`) }x.freeze

      def extract_markdown
        extract_markdown_code if @options[:markdown_code]

        if @options[:markdown_links]
          extract_markdown_autolink_links
          extract_markdown_autolink_email
          extract_markdown_links
        end

        extract_markdown_span(BOLD_ITALIC_REGEX, ENTITY_TYPE_TRIPLE_EMPHASIS) if @options[:markdown_triple_emphasis]
        extract_markdown_span(BOLD_REGEX, ENTITY_TYPE_DOUBLE_EMPHASIS) if @options[:markdown_double_emphasis]
        extract_markdown_span(ITALIC_REGEX, ENTITY_TYPE_EMPHASIS) if @options[:markdown_emphasis]
        extract_markdown_span(STRIKETHROUGH_REGEX, ENTITY_TYPE_STRIKETHROUGH) if @options[:markdown_strikethrough]
      end

    private

      def extract_markdown_span(regex, type)
        # Match until there's no results
        while match = @working_text.match(regex)
          original = match[0]
          length = original.length

          # Find the start position of the original
          start = @working_text.index(original)

          # Create the entity
          entity = {
            type: type,
            text: original,
            display_text: match[2],
            indices: [
              start,
              start + length
            ]
          }

          # Let block modify
          entity = yield(entity, match) if block_given?

          # Add the entity
          @entities << entity

          # Remove from the working text
          @working_text.sub!(original, REPLACE_TOKEN * length)
        end
      end

      def extract_markdown_code
        extract_markdown_span(CODE_REGEX, 'code') do |entity, match|
          # Strip tabs from the display text
          display = match[2]
          display.gsub!(/^[ \t]*/, '')
          display.gsub!(/[ \t]*$/, '')
          entity[:display_text] = display
          entity
        end
      end

      def extract_markdown_autolink(regex)
        # Match until there's no results
        while match = @working_text.match(regex)
          original = match[0]
          length = original.length

          # Find the start position of the original
          start = @working_text.index(original)

          # Create the entity
          entity = {
            type: ENTITY_TYPE_LINK,
            text: original,
            indices: [
              start,
              start + length
            ]
          }

          # Let block modify
          entity = yield(entity, match) if block_given?

          # Add the entity
          @entities << entity

          # Remove from the working text
          @working_text.sub!(original, REPLACE_TOKEN * length)
        end
      end

      def extract_markdown_autolink_links
        extract_markdown_autolink AUTOLINK_LINK_REGEX do |entity, match|
          entity[:url] = match[1]
          entity[:display_text] = display_url(match[1])
          entity
        end
      end

      def extract_markdown_autolink_email
        extract_markdown_autolink AUTOLINK_EMAIL_REGEX do |entity, match|
          email = match[1]
          entity[:url] = "mailto:#{email}"
          entity[:display_text] = email
          entity
        end
      end

      def extract_markdown_links
        extract_markdown_span(LINK_REGEX, ENTITY_TYPE_LINK) do |entity, match|
          # Add the URL
          entity[:url] = match[3]

          # Add the title
          entity[:title] = match[6] if match[6]
          entity
        end
      end
    end
  end
end