lib/relaton_bib/formatted_string.rb from relaton/relaton-bib

lib/relaton_bib/formatted_string.rb
Summary

Maintainability

2 hrs
Test Coverage

Issues
# frozen_string_literal: true

require "relaton_bib/localized_string"

module RelatonBib
  # Localized string that additionally records the content type (MIME type)
  # of its content.
  class FormattedString < LocalizedString
    # Content types known to this class. The list is informational only:
    # the constructor does not validate +format+ against it.
    FORMATS = %w[text/plain text/html application/docbook+xml
                 application/tei+xml text/x-asciidoc text/markdown
                 application/x-metanorma+xml].freeze

    # Tokenizer for text/html content. Used with String#scan, every match
    # yields the array [tago, attrs, tagc, cmt, cnt] in which exactly one of
    # tago / tagc / cmt / cnt is set (attrs accompanies tago).
    # The +m+ flag lets text and comments span line breaks; without it the
    # newline characters match no alternative and are skipped by the scan.
    HTML_TOKEN = %r{
      <(?<tago>\w+)(?<attrs>[^>]*)> | # tag open
      </(?<tagc>\w+)> |               # tag close
      (?<cmt><!--.*?-->) |            # comment
      (?<cnt>.+?)(?=<|\z)             # content
    }xm.freeze

    # @return [String]
    attr_reader :format

    # @param content [String, Array<RelatonBib::LocalizedString>]
    # @param language [String, nil] language code Iso639
    # @param script [String, nil] script code Iso15924
    # @param format [String] the content type
    def initialize(content: "", language: nil, script: nil, format: "text/plain")
      # Validation against FORMATS is intentionally disabled:
      # if format && !FORMATS.include?(format)
      #   raise ArgumentError, %{Format "#{format}" is invalid.}
      # end

      # @format is assigned before calling super, so the format is already
      # known while the parent constructor runs.
      @format = format
      super(content, language, script)
    end

    # Two formatted strings are equal when the parent comparison succeeds
    # and both have the same format.
    #
    # @param other [RelatonBib::FormattedString]
    # @return [Boolean]
    def ==(other)
      super && format == other.format
    end

    # Set the +format+ attribute on the builder's current element, then let
    # the parent class render the content.
    #
    # @param builder [Nokogiri::XML::Builder]
    def to_xml(builder)
      builder.parent["format"] = format if format
      super
    end

    #
    # Encode content.
    #
    # For every format except text/html the whole string is passed to +escp+
    # (not defined in this file; presumably inherited from LocalizedString).
    # For text/html the markup is tokenized and only text nodes and
    # unbalanced tags are escaped, so balanced tags and comments survive.
    #
    # @param [String] cnt content
    #
    # @return [String] encoded content
    #
    def encode(cnt)
      return escp(cnt) unless format == "text/html"

      parts = cnt.scan(HTML_TOKEN)
      out = +""
      until parts.empty?
        out << scan_xml(parts)
        # scan_xml stops in front of a closing tag nobody opened. Escape it
        # and keep going so the text that follows it is not lost.
        _, _, stray, = parts.shift
        out << escp("</#{stray}>") if stray
      end
      out
    end

    #
    # Scan XML and escape HTML entities.
    #
    # Consumes a run of sibling nodes (text, comments, elements) from the
    # front of +parts+. Scanning stops at the end of input or in front of a
    # closing tag, which is left in +parts+ for the caller that opened the
    # element.
    #
    # @param [Array<Array<String,nil>>] parts XML parts, each one being
    #   [tago, attrs, tagc, cmt, cnt]; the array is consumed destructively
    #
    # @return [String] output string ("" when nothing could be consumed)
    #
    def scan_xml(parts)
      out = +""
      until parts.empty?
        tago, attrs, tagc, cmt, cnt = parts.first
        break if tagc # closing tag belongs to the enclosing element

        parts.shift
        out << (tago ? scan_element(tago, attrs, parts) : "#{cmt}#{escp(cnt)}")
      end
      out
    end

    # Build a hash representation, adding the "format" key when a format is
    # set. A non-Hash result of the parent implementation is wrapped as
    # { "content" => result } first.
    #
    # @return [Hash]
    def to_hash
      hash = super
      return hash unless format

      hash = { "content" => hash } unless hash.is_a? Hash
      hash["format"] = format
      hash
    end

    # @param prefix [String]
    # @param count [Integer] number of elements
    # @param has_attrs [Boolean] forwarded to the parent implementation;
    #   forced to true when the format is neither nil nor empty
    # @return [String]
    def to_asciibib(prefix = "", count = 1, has_attrs = false)
      has_attrs ||= !(format.nil? || format.empty?)
      pref = prefix.empty? ? prefix : "#{prefix}."
      # Implicit-argument super forwards the current (possibly updated)
      # value of has_attrs.
      out = super
      out += "#{pref}format:: #{format}\n" if format
      out
    end

    #
    # Remove HTML tags except <em>, <strong>, <stem>, <sup>, <sub>, <tt>,
    # <p> and self-closing <br/>.
    # Replace <i> with <em>, <b> with <strong>.
    # Namespace prefixes of attribute-less tags are dropped, whitespace in
    # front of punctuation or "<" is removed and runs of spaces are squeezed.
    #
    # @param [String] str content
    #
    # @return [String] cleaned content
    #
    def cleanup(str)
      return str unless format == "text/html"

      # The \b after the tag names keeps the whitelist from matching mere
      # prefixes, e.g. <pre> (starts with "p") or <embed> (starts with "em").
      str.gsub(/(?<=<)\w+:(?=\w+>)/, "").gsub(/(?<=<\/)\w+:(?=\w+>)/, "")
        .gsub(/<i>/, "<em>").gsub(/<\/i>/, "</em>")
        .gsub(/<b>/, "<strong>").gsub(/<\/b>/, "</strong>")
        .gsub(/<(?!\/?(?:(?:em|strong|stem|sup|sub|tt|p)\b|br\s?\/))[^\s!]\/?.*?>/, "")
        .gsub(/\s+([.,:;!?<])/, "\\1").strip.squeeze(" ")
    end

    private

    #
    # Render one element whose opening tag has just been removed from
    # +parts+. Balanced elements are emitted as markup; an element whose
    # closing tag is missing or names a different tag has its tags escaped
    # while its inner content is kept.
    #
    # @param [String] tago name of the opening tag
    # @param [String, nil] attrs raw attribute text of the opening tag
    # @param [Array<Array<String,nil>>] parts remaining XML parts
    #
    # @return [String] encoded element
    #
    def scan_element(tago, attrs, parts)
      opening = "<#{tago}#{attrs}>"
      return opening if attrs.to_s.end_with?("/") # self-closing, e.g. <br/>

      inner = scan_xml(parts)
      # scan_xml leaves either nothing or a closing tag at the front.
      _, _, tagc, = parts.shift
      closing = tagc ? "</#{tagc}>" : ""
      return "#{opening}#{inner}#{closing}" if tago == tagc

      "#{escp(opening)}#{inner}#{escp(closing)}"
    end
  end
end