metanorma/metanorma-standoc

View on GitHub
lib/metanorma/standoc/spans_to_bibitem_preprocessing.rb

Summary

Maintainability
A
2 hrs
Test Coverage
module Metanorma
  module Standoc
    module Cleanup
      class SpansToBibitem
        # Collect the bibliographic spans found under bib's formattedref.
        #
        # Only top-level spans (those with no span ancestor) are handed to
        # extract_spans1; the markup of any nested span is already part of the
        # enclosing span's value.
        #
        # Whether a span is nested is decided up front, before anything is
        # mutated: extract_spans1 replaces a span with its children, so a
        # nested span visited afterwards would no longer have a span ancestor
        # and would wrongly be extracted as an entry of its own.
        #
        # @param bib the bibitem node to scan
        # @return [Array<Hash>] entries appended by extract_spans1
        def extract_spans(bib)
          top_level, nested = bib.xpath("./formattedref//span")
            .partition { |s| s.at("./ancestor::span").nil? }
          extracted = top_level.each_with_object([]) do |s, m|
            extract_spans1(s, m)
          end
          # Nested spans are still unwrapped, so that no span markup is left
          # behind, but they do not contribute entries.
          nested.each { |s| s.replace(s.children) }
          extracted
        end

        # Record one span as a { key:, type:, val: } entry in +acc+ and take
        # the span element out of the document.
        #
        # The span's class attribute is split on its first "." into key and
        # type ("surname.editor" gives key "surname", type "editor"); val is
        # the serialised XML of the span's children.
        #
        # A span whose class is exactly "type" is removed together with its
        # content; any other span is replaced by its children.
        #
        # A span with no class attribute carries no key: it is skipped and
        # left in place, instead of raising NoMethodError on nil.
        #
        # @param span the span element
        # @param acc [Array<Hash>] accumulator, appended to in place
        def extract_spans1(span, acc)
          klass = span["class"] or return
          key, type = klass.split(".", 2)
          acc << { key: key, type: type, val: span.children.to_xml }
          if klass == "type" then span.remove
          else span.replace(span.children)
          end
        end

        # Turn every docidentifier child of +bib+ into a "docid" entry holding
        # its type attribute and text. Each docidentifier element is removed
        # from +bib+ unless +bib+ has a title child.
        #
        # @param bib the bibitem node
        # @return [Array<Hash>] one { key:, type:, val: } hash per identifier
        def extract_docid(bib)
          bib.xpath("./docidentifier").map do |docid|
            entry = { key: "docid", type: docid["type"], val: docid.text }
            docid.remove unless bib.at("./title")
            entry
          end
        end

        # Fresh, empty accumulator: list-valued slots for contributors,
        # identifiers, URIs and dates, and hash-valued slots for extent and
        # for the "in" (host) record. A new object is built on every call.
        def empty_span_hash
          {
            contrib: [],
            docid: [],
            uri: [],
            date: [],
            extent: {},
            in: {},
          }
        end

        # Fold the extracted spans, in order, into a fresh accumulator hash
        # and then pass the result through host_rearrange.
        #
        # @param spans [Array<Hash>] entries with :key, :type and :val
        # @return [Hash] whatever host_rearrange returns
        def spans_preprocess(spans)
          collected = spans.each_with_object(empty_span_hash) do |span, acc|
            span_preprocess1(span, acc)
          end
          host_rearrange(collected)
        end

        # Fold a single span entry into the accumulator +ret+, dispatching on
        # the span's key:
        #
        # * uri, docid: appended as { type:, val: } after link_unwrap
        # * date: appended, with type defaulting to "published"
        # * pages, volume, issue: appended to a list under ret[:extent]
        # * pubplace, title, type, series, edition, version: stored as-is
        #   (a later span with the same key overwrites an earlier one)
        # * note: stored as { type:, val: }
        # * in_title: stored as the title of ret[:in]
        # * publisher: appended to the contributors as an organization
        # * name and organization keys: delegated to the spans_preprocess_*
        #   helpers; the in_-prefixed variants target ret[:in][:contrib]
        #
        # For in_-prefixed contributor keys the prefix is stripped from the
        # key string in place, so the caller's span hash is modified.
        # Any other key is reported by appending a message to @err.
        #
        # @param span [Hash] entry with :key, :type and :val
        # @param ret [Hash] accumulator, updated in place
        def span_preprocess1(span, ret)
          key = span[:key]
          type = span[:type]
          val = span[:val]
          case key
          when "uri", "docid"
            unwrapped = link_unwrap(Nokogiri::XML.fragment(val)).to_xml
            ret[key.to_sym] << { type: type, val: unwrapped }
          when "date"
            ret[:date] << { type: type || "published", val: val }
          when "pages", "volume", "issue"
            (ret[:extent][key.to_sym] ||= []) << val
          when "pubplace", "title", "type", "series", "edition", "version"
            ret[key.to_sym] = val
          when "note"
            ret[:note] = { type: type, val: val }
          when "in_title"
            ret[:in][:title] = val
          when "publisher"
            ret[:contrib] << { role: "publisher", entity: "organization",
                               name: val }
          when "surname", "initials", "givenname", "formatted-initials"
            ret[:contrib] = spans_preprocess_contrib(span, ret[:contrib])
          when "fullname"
            ret[:contrib] = spans_preprocess_fullname(span, ret[:contrib])
          when "organization"
            ret[:contrib] = spans_preprocess_org(span, ret[:contrib])
          when "in_surname", "in_initials", "in_givenname",
               "in_formatted-initials"
            host = (ret[:in][:contrib] ||= [])
            key.sub!(/^in_/, "")
            ret[:in][:contrib] = spans_preprocess_contrib(span, host)
          when "in_fullname"
            host = (ret[:in][:contrib] ||= [])
            key.sub!(/^in_/, "")
            ret[:in][:contrib] = spans_preprocess_fullname(span, host)
          when "in_organization"
            host = (ret[:in][:contrib] ||= [])
            key.sub!(/^in_/, "")
            ret[:in][:contrib] = spans_preprocess_org(span, host)
          else
            @err << { msg: "unrecognised key '#{key}' in " \
                           "`span:#{key}[#{val}]`" }
          end
        end

        # Finish the host ("in") record once all spans have been folded in.
        #
        # Does nothing unless the host record has a title. Otherwise:
        # * slots missing from the host record are filled from
        #   empty_span_hash, plus a default type of "misc" (values already
        #   present are kept);
        # * the series entry is moved from the top level into the host
        #   record (the host gets a nil series if there was none);
        # * if the top-level type starts with "in", the host type becomes
        #   that type with the leading "in" dropped.
        #
        # @param ret [Hash] accumulator, updated in place
        # @return [Hash] the same +ret+
        def host_rearrange(ret)
          host = ret[:in]
          return ret unless host[:title]

          defaults = empty_span_hash.merge(type: "misc")
          defaults.each { |slot, value| host[slot] = value unless host.key?(slot) }
          host[:series] = ret.delete(:series)
          if /^in/.match?(ret[:type])
            host[:type] = ret[:type].sub(/^in/, "")
          end
          ret
        end

        # Add one name component (surname, given name or initials) to the
        # list of contributors.
        #
        # The key "initials" is treated as "formatted-initials" (the span's
        # key is rewritten). A new person is started when
        # spans_preprocess_new_contrib? says so; otherwise the component goes
        # on the last contributor. When the person already has a given name
        # or initials, the new value is appended to a givenname list instead
        # of overwriting.
        #
        # @param span [Hash] entry with :key, :type and :val
        # @param contrib [Array<Hash>] contributors so far, updated in place
        # @return [Array<Hash>] the same +contrib+
        def spans_preprocess_contrib(span, contrib)
          span[:key] = "formatted-initials" if span[:key] == "initials"
          if spans_preprocess_new_contrib?(span, contrib)
            contrib << { role: span[:type] || "author", entity: "person" }
          end
          person = contrib[-1]
          if multiple_givennames?(span, contrib)
            person[:givenname] = [person[:givenname], span[:val]].flatten
          else
            person[span[:key].to_sym] = span[:val]
          end
          contrib
        end

        # Should this name component start a new contributor?
        #
        # Truthy when there is no contributor yet, when a surname arrives and
        # the last contributor already has one, or when the span's role
        # (defaulting to "author") differs from the last contributor's role.
        def spans_preprocess_new_contrib?(span, contrib)
          return true if contrib.empty?

          current = contrib.last
          repeated_surname = span[:key] == "surname" && current[:surname]
          repeated_surname || current[:role] != (span[:type] || "author")
        end

        # Is this span a further given name or initial for a person who
        # already has one?
        #
        # Note the side effect: when the answer is yes and the existing value
        # is stored under :"formatted-initials", it is moved to :givenname so
        # that the caller can append to a single list.
        #
        # @param span [Hash] entry with :key
        # @param contrib [Array<Hash>] contributors; the last one is inspected
        # @return [Boolean]
        def multiple_givennames?(span, contrib)
          return false unless %w(formatted-initials givenname).include?(span[:key])

          person = contrib[-1]
          initials = person[:"formatted-initials"]
          return false unless initials || person[:givenname]

          person[:givenname] = person.delete(:"formatted-initials") if initials
          true
        end

        # Break a free-form personal name into surname plus given names or
        # initials, and append it to +contrib+ as a new person.
        #
        # The last whitespace-separated token is the surname. If every
        # preceding token ends in a full stop they are stored together as
        # :"formatted-initials" ("J. R. R."); otherwise the preceding tokens
        # are stored as the :givenname list (empty for a one-word name).
        #
        # The name is split on any run of whitespace, so repeated or leading
        # spaces do not yield empty name parts and a name wrapped across
        # lines is still separated into its parts.
        #
        # @param span [Hash] entry with :val (the name) and :type (the role,
        #   defaulting to "author")
        # @param contrib [Array<Hash>] contributors so far, updated in place
        # @return [Array<Hash>] the same +contrib+
        def spans_preprocess_fullname(span, contrib)
          # Put a space after each full stop that is directly followed by a
          # letter, so that "J.R.R. Tolkien" splits into separate initials.
          name = span[:val].gsub(/\.(?=\p{Alpha})/, ". ").split
          forenames = name[0..-2]
          out = { role: span[:type] || "author", entity: "person",
                  surname: name[-1] }
          if name.size > 1 && forenames.all? { |x| x.end_with?(".") }
            out[:"formatted-initials"] = forenames.join(" ")
          else out[:givenname] = forenames
          end
          contrib << out
          contrib
        end

        # Append an organization contributor named by the span's value; the
        # role is the span's type, defaulting to "author".
        #
        # @param span [Hash] entry with :val and :type
        # @param contrib [Array<Hash>] contributors so far, updated in place
        # @return [Array<Hash>] the same +contrib+
        def spans_preprocess_org(span, contrib)
          role = span[:type] || "author"
          contrib.push({ role: role, entity: "organization", name: span[:val] })
        end
      end
    end
  end
end