# metanorma/metanorma-standoc
#
# View on GitHub
# lib/metanorma/standoc/cleanup_xref.rb
#
# Summary
#
# Maintainability
# A
# 35 mins
# Test Coverage
module Metanorma
  module Standoc
    module Cleanup
      # extending localities to cover ISO referencing

      # Connective prefix that may precede a locality ("and!", "or!", "from!",
      # "to!"); the connective word itself is captured as +conn+.
      CONN_REGEX_STR = "(?<conn>and|or|from|to)!".freeze

      # Alternation of the recognised locality types, plus the open-ended
      # "locality:<name>" form.
      LOCALITIES = "section|clause|part|paragraph|chapter|page|line|" \
        "table|annex|figure|example|note|formula|list|time|anchor|" \
        "locality:[^ \\t\\n\\r:,;=]+".freeze

      # Pattern source for one locality at the start of a text. It is compiled
      # through to_regex, which strips ALL whitespace from the source first, so
      # the layout below (including blanks inside character classes) is not
      # significant. Named captures: conn, locality, ref, to, locality2, punct
      # and text (the remainder after the locality).
      # Both +ref+ and +to+ accept either a bare value or a double-quoted value
      # of one or more characters.
      LOCALITY_REGEX_STR = <<~REGEXP.freeze
        ^((#{CONN_REGEX_STR})?
            (?<locality>#{LOCALITIES})(\\s+|=)
               (?<ref>[^"][^ \\t\\n,:;-]*|"[^"]+")
                 (-(?<to>[^"][^ \\t\\n,:;-]*|"[^"]+"))?|
          (?<locality2>whole|title|locality:[^ \\t\\n\\r:,;=]+))(?<punct>[,:;]?)\\s*
         (?<text>.*)$
      REGEXP

      # Compiles a pattern source into a Regexp after removing every whitespace
      # character from it; the result is case-insensitive and multiline.
      def to_regex(str)
        flags = Regexp::IGNORECASE | Regexp::MULTILINE
        compact_source = str.gsub(/\s/, "")
        Regexp.new(compact_source, flags)
      end

      # Pattern source for a text that starts with a connective prefix followed
      # directly by a bare value, i.e. with no locality type before it
      # (the "and!5" in "clause=3;and!5"). The value may not be "whole",
      # "title" or a "locality:" form, and must be followed by ",", ";", a
      # tab/newline or the end of the line.
      # Compiled via to_regex, so whitespace in the source is stripped.
      # Named captures: conn0 (prefix including "!"), conn, value, punct.
      LOCALITY_REGEX_VALUE_ONLY_STR = <<~REGEXP.freeze
        ^(?<conn0>(#{CONN_REGEX_STR}))
          (?!whole|title|locality:)
          (?<value>[^=,;:\\t\\n\\r]+)
          (?<punct>[,;\\t\\n\\r]|$)
      REGEXP

      # Trims quotes: drops one leading and one trailing double quote from
      # +text+, when present.
      def tq(text)
        without_opening = text.sub(/^"/, "")
        without_opening.sub(/"$/, "")
      end

      # Parses the leading text node of +elem+ into locality markup.
      # Does nothing when +elem+ has no children or its first child is not
      # text. Otherwise the first text node is detached and handed to
      # extract_localities1, and the remaining children are detached and
      # re-appended afterwards so that they follow the generated content.
      def extract_localities(elem)
        return if elem.children.empty?

        leading = elem.children.first
        return unless leading.text?

        locality_text = leading.remove.text
        remainder = elem.children.remove
        extract_localities1(elem, locality_text)
        elem << remainder if remainder
      end

      # Pattern source for a locality whose reference is made of three
      # dash-separated parts (n-n-n). Used by locality_normalise and compiled
      # via to_regex, so whitespace in the source is stripped.
      # Named captures: locality (optional connective, locality type and the
      # following separator), conn, ref (the whole n-n-n value) and text
      # (optional punctuation plus everything after it).
      LOCALITY_REGEX_STR_TRIPLEDASH = <<~REGEXP.freeze
        ^(?<locality>(#{CONN_REGEX_STR})?
            (#{LOCALITIES})(\\s+|=))
               (?<ref>[^"][^ \\t\\n,:;-]*
                 -[^ \\t\\n,:;"-]+
                 -[^ \\t\\n,:;"]+)
          (?<text>[,:;]?\\s*
         .*)$
      REGEXP

      # treat n-n-n locality as "n-n-n", do not parse as a range:
      # when +text+ starts with such a locality, its reference is wrapped in
      # double quotes; any other text is returned as is.
      def locality_normalise(text)
        found = to_regex(LOCALITY_REGEX_STR_TRIPLEDASH).match(text)
        return text unless found

        %(#{found[:locality]}"#{found[:ref]}"#{found[:text]})
      end

      # Consumes localities from the start of +text+ one at a time, adding each
      # to the current <localityStack/> under +elem+. A locality terminated by
      # ";" opens a new stack for whatever follows. Once no further locality
      # matches, missing connectives are filled in and the leftover text is
      # appended to +elem+.
      def extract_localities1(elem, text)
        locality_re = to_regex(LOCALITY_REGEX_STR)
        stack = nil
        if locality_re.match(text)
          stack = elem.add_child("<localityStack/>").first
        end
        found = locality_re.match(locality_normalise(text))
        while found
          add_locality(stack, found)
          text = extract_localities_update_text(found)
          if found[:punct] == ";"
            stack = elem.add_child("<localityStack/>").first
          end
          found = locality_re.match(locality_normalise(text))
        end
        fill_in_eref_connectives(elem)
        text and elem.add_child(text)
      end

      # clause=3;and!5 => clause=3;and!clause=5
      #
      # Returns the text remaining after +match+. When the matched locality was
      # terminated by ";" and the remainder starts with a connective followed
      # by a bare value, the locality type of +match+ is carried over to that
      # value. +match+ is read through its :text, :punct and :locality entries.
      def extract_localities_update_text(match)
        ret = match[:text]
        locality = match[:locality]
        # nothing to carry over for "whole"/"title" (no :locality capture)
        match[:punct] == ";" && locality or return ret
        to_regex(LOCALITY_REGEX_VALUE_ONLY_STR).match?(ret) or return ret
        # Re-emit the whole matched prefix ("and!"). A numbered back-reference
        # would resolve to the named group +conn+, which excludes the "!".
        # The block form also keeps backslashes in +locality+ literal.
        ret.sub(%r{^#{CONN_REGEX_STR}}o) { |prefix| "#{prefix}#{locality}=" }
      end

      # Appends a <locality> element for +match+ to +stack+. The connective of
      # the match is recorded on the stack only when the stack is still empty.
      # referenceFrom / referenceTo are emitted for the :ref / :to captures,
      # with surrounding double quotes removed.
      def add_locality(stack, match)
        if stack.children.empty? && match[:conn]
          stack["connective"] = match[:conn]
        end
        from = ""
        if match[:ref]
          from = "<referenceFrom>#{tq match[:ref]}</referenceFrom>"
        end
        upto = ""
        if match[:to]
          upto = "<referenceTo>#{tq match[:to]}</referenceTo>"
        end
        label = locality_label(match)
        stack.add_child("<locality type='#{label}'>#{from}#{upto}</locality>")
      end

      # Gives every localityStack child of +elem+ that lacks a connective one,
      # provided there are at least two stacks: "from" when the next element is
      # a localityStack with connective "to", otherwise "and".
      def fill_in_eref_connectives(elem)
        return if elem.xpath("./localityStack").size < 2

        elem.xpath("./localityStack[not(@connective)]").each do |stack|
          successor = stack.next_element
          range_start = successor && successor.name == "localityStack" &&
            successor["connective"] == "to"
          stack["connective"] = range_start ? "from" : "and"
        end
      end

      # Locality type for +match+, taken from the :locality capture or, failing
      # that, :locality2. "locality:..." labels are kept verbatim; every other
      # label is downcased. Returns nil when neither capture is set.
      def locality_label(match)
        label = match[:locality] || match[:locality2]
        return label if /^locality:/.match?(label)

        label&.downcase
      end

      # Renames +elem+ to +name+ and turns its target into a bibitemid.
      # When @anchors holds an :xref entry for the target, citeas is set to
      # that entry (or to the :id entry for the element's style, if any),
      # decoded through @c; otherwise xref_to_eref1 handles it.
      # Finally the target attribute is dropped and localities are extracted
      # from the element content.
      def xref_to_eref(elem, name)
        target = elem["target"]
        elem.name = name
        elem["bibitemid"] = target
        citation = @anchors&.dig(target, :xref)
        if citation
          styled = @anchors.dig(target, :id, elem["style"])
          elem["citeas"] = @c.decode(styled || citation)
        else
          xref_to_eref1(elem)
        end
        elem.delete("target")
        extract_localities(elem)
      end

      # Fallback for a reference with no known citation: sets an empty citeas
      # and, unless the element's type is a registered internal eref
      # namespace, logs the missing bibliography anchor.
      def xref_to_eref1(elem)
        elem["citeas"] = ""
        internal = @internal_eref_namespaces.include?(elem["type"])
        internal ||
          @log.add("Crossreferences", elem,
                   "#{elem['target']} does not have a corresponding " \
                   "anchor ID in the bibliography!")
      end

      # Runs the cross-reference cleanup passes over +xmldoc+, in this order:
      # 1. anchor_alias: collect anchor aliases from the alias table
      # 2. xref_compound_cleanup: expand ";"-separated targets into nested
      #    xref elements
      # 3. xref_cleanup1: convert xrefs to erefs, resolve aliases
      # 4. xref_compound_wrapup: rename nested xrefs to location
      # 5. eref_stack: turn erefs containing erefs into erefstack
      def xref_cleanup(xmldoc)
        anchor_alias(xmldoc)
        xref_compound_cleanup(xmldoc)
        xref_cleanup1(xmldoc)
        xref_compound_wrapup(xmldoc)
        eref_stack(xmldoc)
      end

      # Converts every eref that directly contains erefs into an erefstack:
      # the container loses bibitemid, citeas and type, and its type is copied
      # onto each directly nested eref.
      def eref_stack(xmldoc)
        xmldoc.xpath("//eref[eref]").each do |outer|
          outer.name = "erefstack"
          %w[bibitemid citeas].each { |attr| outer.delete(attr) }
          inherited = outer["type"]
          outer.xpath("./eref").each { |inner| inner["type"] = inherited }
          outer.delete("type")
        end
      end

      # Reads the anchor alias table from the metanorma-extension block, if
      # there is one. In each body row the first cell supplies the key and
      # every further cell is passed to anchor_alias1 together with that key.
      def anchor_alias(xmldoc)
        table = xmldoc.at("//metanorma-extension/table[@id = " \
                          "'_misccontainer_anchor_aliases']")
        return unless table

        key = ""
        table.xpath("./tbody/tr").each do |row|
          row.xpath("./td | ./th").each_with_index do |cell, idx|
            if idx.positive?
              anchor_alias1(key, cell)
            else
              key = cell.text
            end
          end
        end
      end

      # Registers the alias held in table cell +elem+ as pointing to +key+ in
      # @anchor_alias. The alias is the stripped cell text or, when that is
      # empty and the cell contains a link, the link's target attribute.
      # Nothing is recorded when no alias can be determined.
      def anchor_alias1(key, elem)
        id = elem.text.strip
        id.empty? && elem.at("./link") and
          id = elem.at("./link/@target")&.text
        # id is nil when the link carries no target attribute
        key && id && !id.empty? or return
        @anchor_alias[id] = key
      end

      # For every xref whose target lists several locations separated by ";",
      # keeps the first location (minus any "connective!" prefix) as the
      # target and hands the full list to xref_compound_cleanup1.
      def xref_compound_cleanup(xmldoc)
        xmldoc.xpath("//xref").each do |xref|
          target = xref["target"]
          next unless target.include?(";")

          parts = target.split(";")
          xref["target"] = parts.first.sub(/^[^!]*!/, "")
          xref_compound_cleanup1(xref, parts)
        end
      end

      # Prepends one nested <xref target connective/> per location to +xref+,
      # preserving the order of +locations+. A temporary <sentinel/> child
      # gives an empty xref something to insert before; it is removed again
      # at the end.
      def xref_compound_cleanup1(xref, locations)
        xref.children = "<sentinel/>" if xref.children.empty?
        parsed = xref_parse_compound_locations(locations)
        # insert back to front, each time before the current first child
        parsed.reverse_each do |connective, target|
          xref.children.first.previous =
            "<xref target='#{target}' connective='#{connective}'/>"
        end
        xref.at("./sentinel")&.remove
      end

      # Splits each location into a [connective, target] pair at the first "!".
      # A location without a connective gets "from" when the following
      # location starts with "to", and "and" otherwise.
      def xref_parse_compound_locations(locations)
        pairs = locations.map { |loc| loc.split("!", 2) }
        pairs.each_with_index.map do |pair, idx|
          next pair unless pair.size == 1

          following = pairs.dig(idx + 1, 0)
          pair.unshift(following == "to" ? "from" : "and")
        end
      end

      # Renames every xref nested inside another xref to "location".
      def xref_compound_wrapup(xmldoc)
        xmldoc.xpath("//xref//xref").each { |nested| nested.name = "location" }
      end

      # Classifies every xref in +xmldoc+. A target containing ":" not
      # followed by "//" is first tried as an internal eref; elements that are
      # no longer named xref after that are skipped. Of the rest, targets
      # accepted by refid? become erefs, aliased targets are resolved through
      # xref_alias, and all others just lose their type attribute.
      def xref_cleanup1(xmldoc)
        xmldoc.xpath("//xref").each do |xref|
          xref_to_internal_eref(xref) if %r{:(?!//)}.match?(xref["target"])
          next unless xref.name == "xref"

          target = xref["target"]
          if refid?(target) then xref_to_eref(xref, "eref")
          elsif @anchor_alias[target] then xref_alias(xref)
          else xref.delete("type")
          end
        end
      end

      # Converts a "namespace:id[:anchor]" target into an internal eref.
      # Does nothing unless both namespace and id are non-empty. The target
      # becomes "namespace_id"; a third component is moved into the element
      # content as an anchor="..." locality ahead of the existing text; the
      # namespace is set as the element type and registered in
      # @internal_eref_namespaces before the element is turned into an eref.
      def xref_to_internal_eref(elem)
        namespace, ident, *anchor = elem["target"].split(":", 3)
        return if [namespace, ident].any? { |part| part.nil? || part.empty? }

        elem["target"] = "#{namespace}_#{ident}"
        unless anchor.empty?
          elem.children = %{anchor="#{anchor.join}",#{elem.children&.text}}
        end
        elem["type"] = namespace
        @internal_eref_namespaces << namespace
        xref_to_eref(elem, "eref")
      end

      # Replaces an aliased target with the anchor recorded in @anchor_alias.
      # With style "id" and no text content, the alias itself is first added
      # as the element's content.
      def xref_alias(elem)
        alias_id = elem["target"]
        if elem["style"] == "id" && elem.text.strip.empty?
          elem << alias_id
        end
        elem["target"] = @anchor_alias[alias_id]
      end

      # Processes the source element of every quote and terms element through
      # xref_to_eref, keeping "source" as the element name.
      def quotesource_cleanup(xmldoc)
        sources = xmldoc.xpath("//quote/source | //terms/source")
        sources.each { |source| xref_to_eref(source, "source") }
      end

      # Tidies origin elements: a concept wrapping a termref is replaced by
      # the termref itself; then each origin gets its citeas from the :xref
      # entry for its bibitemid in @anchors (a missing entry is logged), and
      # localities are extracted from its content.
      def origin_cleanup(xmldoc)
        xmldoc.xpath("//origin/concept[termref]").each do |concept|
          concept.replace(concept.at("./termref"))
        end
        xmldoc.xpath("//origin").each do |origin|
          citation = @anchors&.dig(origin["bibitemid"], :xref)
          origin["citeas"] = citation
          citation or
            @log.add("Crossreferences", origin,
                     "#{origin['bibitemid']} does not have a corresponding " \
                     "anchor ID in the bibliography!")
          extract_localities(origin)
        end
      end
    end
  end
end