next-l/enju_leaf

View on GitHub
app/models/concerns/enju_loc/enju_manifestation.rb

Summary

Maintainability
F
4 days
Test Coverage
B
85%
module EnjuLoc
  module EnjuManifestation
    extend ActiveSupport::Concern

    included do
      # Queries the Library of Congress SRU endpoint and parses the response.
      #
      # @param query [String] CQL query; it is percent-encoded before sending
      # @param options [Hash] :idx is the 1-based position of the first record
      #   to request; a missing, zero or negative value falls back to 1
      # @return the document produced by LibXML::XML::Parser#parse
      def self.loc_search(query, options = {})
        start_record = options[:idx].to_i
        start_record = 1 if start_record < 1

        # URI.escape no longer exists on Ruby 3.0+. Form-encode the query and
        # spell spaces as %20 so the value is valid in any URL query string.
        encoded_query = URI.encode_www_form_component(query).gsub('+', '%20')
        url = "#{LOC_SRU_BASEURL}?operation=searchRetrieve&version=1.1" \
              "&query=#{encoded_query}&startRecord=#{start_record}"
        LibXML::XML::Parser.string(Faraday.get(url).body).parse
      end

      # Returns the manifestation for an ISBN, importing it from LOC when no
      # manifestation with that ISBN is found locally.
      #
      # @param options [Hash] must provide :isbn
      # @return the first manifestation found by ISBN, otherwise the result of
      #   import_record_from_loc
      # @raise [EnjuLoc::InvalidIsbn] when Lisbn reports the ISBN as invalid
      # @raise [EnjuLoc::RecordNotFound] when return_xml yields nothing
      def self.import_record_from_loc_isbn(options)
        parsed_isbn = Lisbn.new(options[:isbn])
        raise EnjuLoc::InvalidIsbn unless parsed_isbn.valid?

        existing = Manifestation.find_by_isbn(parsed_isbn.isbn)
        if existing.present?
          existing.first
        else
          xml = return_xml(parsed_isbn.isbn)
          raise EnjuLoc::RecordNotFound unless xml

          import_record_from_loc(xml)
        end
      end

      # Namespace prefix mapping passed to every MODS XPath query in this concern.
      NS = { "mods" => "http://www.loc.gov/mods/v3" }.freeze

      # Builds and saves a Manifestation (with agents, identifiers, subjects and
      # series statements) from a MODS document.
      #
      # When an Identifier of type 'loc_identifier' already carries the record's
      # recordIdentifier, the manifestation it belongs to is returned and
      # nothing new is created.
      #
      # @param doc MODS document responding to +at+ and +xpath+ with a namespace
      #   hash (presumably a Nokogiri document; confirm against the caller)
      # @return the existing or the newly saved manifestation
      # @raise whatever +save!+ / +create!+ raise; the writes for the new
      #   manifestation run inside Agent.transaction
      def self.import_record_from_loc(doc)
        record_identifier = doc.at('//mods:recordInfo/mods:recordIdentifier', NS).try(:content)
        identifier_type = IdentifierType.find_by(name: 'loc_identifier')
        identifier_type ||= IdentifierType.create!(name: 'loc_identifier')
        if record_identifier.present?
          imported = Identifier.find_by(body: record_identifier, identifier_type_id: identifier_type.id)
          return imported.manifestation if imported
        end

        publishers = doc.xpath('//mods:publisher', NS).map do |publisher|
          { full_name: publisher.content }
        end
        creators = get_mods_creators(doc)
        titles = get_mods_titles(doc)
        date = get_mods_date_of_publication(doc)

        # Unknown language codes fall back to the language record with id 1.
        language = Language.find_by(iso_639_2: get_mods_language(doc))
        language_id = language ? language.id : 1

        # Text of the top-level <identifier type="..."> element, or "" when absent.
        identifier_text = lambda do |type|
          doc.at(%(/mods:mods/mods:identifier[@type="#{type}"]), NS).try(:content).to_s
        end
        isbn = Lisbn.new(identifier_text.call('isbn')).try(:isbn)
        lccn = StdNum::LCCN.normalize(identifier_text.call('lccn'))
        issn = StdNum::ISSN.normalize(identifier_text.call('issn'))
        issn_l = StdNum::ISSN.normalize(identifier_text.call('issn-l'))

        types = get_mods_carrier_and_content_types(doc)
        content_type = types[:content_type]
        carrier_type = types[:carrier_type]

        description = doc.xpath('//mods:abstract', NS).collect(&:content).join("\n")
        edition_string = doc.at('//mods:edition', NS).try(:content)
        extent = get_mods_extent(doc)
        note = get_mods_note(doc)
        frequency = get_mods_frequency(doc)
        issuance = doc.at('//mods:issuance', NS).try(:content)
        # Stays nil (not false) for non-serials, as before.
        is_serial = true if issuance == "serial"
        statement_of_responsibility = get_mods_statement_of_responsibility(doc)
        access_address = get_mods_access_address(doc)
        publication_place = get_mods_publication_place(doc)

        manifestation = nil
        Agent.transaction do
          creator_agents = Agent.import_agents(creators)
          publisher_agents = Agent.import_agents(publishers)

          manifestation = Manifestation.new(
            manifestation_identifier: record_identifier,
            original_title: titles[:original_title],
            title_alternative: titles[:title_alternative],
            language_id: language_id,
            pub_date: date,
            description: description,
            edition_string: edition_string,
            statement_of_responsibility: statement_of_responsibility,
            start_page: extent[:start_page],
            end_page: extent[:end_page],
            extent: extent[:extent],
            height: extent[:height],
            dimensions: extent[:dimensions],
            access_address: access_address,
            note: note,
            publication_place: publication_place,
            serial: is_serial
          )

          identifiers = []
          # Store the LOC record identifier so the duplicate check at the top of
          # this method can recognise the record on a later import.
          if record_identifier.present?
            identifiers << Identifier.new(
              manifestation: manifestation,
              body: record_identifier,
              identifier_type: identifier_type
            )
          end
          if issn_l
            identifiers << Identifier.new(
              manifestation: manifestation,
              body: issn_l,
              identifier_type: IdentifierType.find_by(name: 'issn_l') || IdentifierType.create!(name: 'issn_l')
            )
          end

          manifestation.carrier_type = carrier_type if carrier_type
          manifestation.manifestation_content_type = content_type if content_type
          manifestation.frequency = frequency if frequency
          manifestation.save!
          identifiers.each do |identifier|
            manifestation.identifiers << identifier if identifier.valid?
          end
          manifestation.isbn_records.create(body: isbn) if isbn.present?
          manifestation.issn_records.create(body: issn) if issn.present?
          manifestation.create_lccn_record(body: lccn) if lccn.present?
          manifestation.publishers << publisher_agents
          manifestation.creators << creator_agents
          create_loc_subject_related_elements(doc, manifestation)
          create_loc_series_statement(doc, manifestation)
          create_loc_series_master(doc, manifestation) if is_serial
        end
        manifestation
      end

      # NOTE(review): a bare `private` does not change the visibility of the
      # `def self.` singleton methods defined below, so they remain publicly
      # callable. `private_class_method` would be needed to actually hide them;
      # confirm no external caller relies on them before changing this.
      private

      # Attaches the LCSH subjects and DDC classifications parsed from +doc+ to
      # +manifestation+, reusing existing Subject / Classification records when
      # one with the same term / category is found.
      #
      # The document is always parsed, but nothing is attached unless the
      # EnjuSubject constant is defined.
      def self.create_loc_subject_related_elements(doc, manifestation)
        subject_terms = get_mods_subjects(doc)
        ddc_categories = get_mods_classifications(doc)
        return unless defined?(EnjuSubject)

        heading_type = SubjectHeadingType.find_by(name: 'lcsh') || SubjectHeadingType.create!(name: 'lcsh')
        subject_terms.each do |attrs|
          subject = Subject.find_by(term: attrs[:term])
          subject ||= Subject.new(attrs).tap do |fresh|
            fresh.subject_heading_type = heading_type
            fresh.subject_type = SubjectType.find_by(name: 'concept') || SubjectType.create!(name: 'concept')
          end
          manifestation.subjects << subject
        end
        return unless ddc_categories

        ddc_type = ClassificationType.find_by(name: 'ddc') || ClassificationType.create!(name: 'ddc')
        ddc_categories.each do |category|
          classification = Classification.find_by(category: category)
          classification ||= Classification.new(category: category).tap do |fresh|
            fresh.classification_type = ddc_type
          end
          manifestation.classifications << classification
        end
      end

      # Links +manifestation+ to a SeriesStatement for every series title found
      # in +doc+, creating the statement when no one with that title exists.
      #
      # Only the text before the first ";" is used as the series title
      # (presumably the part after it is the volume designation). Titles that
      # are empty after trimming are skipped instead of raising or creating a
      # nameless series statement.
      def self.create_loc_series_statement(doc, manifestation)
        doc.xpath('//mods:relatedItem[@type="series"]/mods:titleInfo/mods:title', NS).each do |series|
          series_title = series.try(:content).to_s.split(';').first.to_s.strip
          next if series_title.empty?

          series_statement = SeriesStatement.find_by(original_title: series_title) || SeriesStatement.create!(original_title: series_title)
          manifestation.series_statements << series_statement if series_statement.try(:save)
        end
      end
      
      # Creates a series-master SeriesStatement carrying the titles of +doc+ and,
      # when it saves, links it to +manifestation+.
      #
      # @return the result of appending to manifestation.series_statements, or
      #   nil when the statement could not be saved
      def self.create_loc_series_master(doc, manifestation)
        title_info = get_mods_titles(doc)
        master = SeriesStatement.new(
          original_title: title_info[:original_title],
          title_alternative: title_info[:title_alternative],
          series_master: true
        )
        return unless master.try(:save)

        manifestation.series_statements << master
      end

      # Extracts the main title and the alternative titles from +doc+.
      #
      # titleInfo elements typed "alternative", "translated", "abbreviated" or
      # "uniform" contribute to :title_alternative (joined with " ; "). Every
      # other titleInfo is rendered as
      # "<nonSort><title> : <subTitle>. <partNumber>: <partName>" and the
      # renderings are concatenated into :original_title. Missing sub-elements
      # are skipped instead of raising.
      #
      # @return [Hash] with String values under :original_title and :title_alternative
      def self.get_mods_titles(doc)
        main_titles = []
        alternatives = []
        doc.xpath('//mods:mods/mods:titleInfo', NS).each do |info|
          title = info.at('./mods:title', NS).try(:content)
          case info.attributes["type"].try(:content)
          when "alternative", "translated", "abbreviated", "uniform"
            alternatives << title if title
          else
            nonsort = info.at('./mods:nonSort', NS).try(:content)
            subtitle = info.at('./mods:subTitle', NS).try(:content)
            part = [
              info.at('./mods:partNumber', NS).try(:content),
              info.at('./mods:partName', NS).try(:content)
            ].compact.join(": ")

            pieces = [nonsort, title]
            pieces << " : #{subtitle}" if subtitle
            pieces << ". #{part}" if part.present?
            main_titles << pieces.compact.join
          end
        end
        { original_title: main_titles.join, title_alternative: alternatives.join(" ; ") }
      end

      # Returns the content of the first languageTerm with authority
      # "iso639-2b" in +doc+, or nil when there is none.
      def self.get_mods_language(doc)
        term = doc.at('//mods:language/mods:languageTerm[@authority="iso639-2b"]', NS)
        term.try(:content)
      end

      # Returns the content of the first location/url element whose "usage"
      # attribute is "primary display" or "primary", or nil when no url is
      # marked that way.
      #
      # Every url element is considered, so a primary link listed after other
      # links is still found.
      def self.get_mods_access_address(doc)
        primary = doc.xpath('//mods:location/mods:url', NS).find do |url|
          ["primary display", "primary"].include?(url.attributes["usage"].try(:content))
        end
        primary.try(:content)
      end

      # Returns the content of the first originInfo placeTerm of type "text"
      # in +doc+, or nil when there is none.
      def self.get_mods_publication_place(doc)
        place_term = doc.at('//mods:originInfo/mods:place/mods:placeTerm[@type="text"]', NS)
        place_term.try(:content)
      end

      # Splits the first extent element of +doc+ at ";" into a pagination part
      # and a size part.
      #
      # @return [Hash] always has :start_page, :end_page and :height. When an
      #   extent element exists it also has :extent (trimmed first part) and
      #   :dimensions (trimmed second part). :end_page is the first number
      #   followed by "p" in the first part (with :start_page set to 1), and
      #   :height is the first number followed by "cm" in the second part.
      def self.get_mods_extent(doc)
        result = { start_page: nil, end_page: nil, height: nil }
        raw = doc.at('//mods:extent', NS).try(:content)
        return result unless raw

        pages, size = raw.split(';').first(2).map(&:strip)

        result[:extent] = pages
        page_match = pages && /(\d+)\s*(p|page)/.match(pages)
        if page_match
          result[:start_page] = 1
          result[:end_page] = page_match[1].to_i
        end

        result[:dimensions] = size
        height_match = size && /(\d+)\s*cm/.match(size)
        result[:height] = height_match[1].to_i if height_match

        result
      end

      # Returns the "statement of responsibility" note of +doc+; when that note
      # is blank, returns the creators' full names joined with " ; " instead.
      def self.get_mods_statement_of_responsibility(doc)
        statement = doc.at('//mods:note[@type="statement of responsibility"]', NS).try(:content)
        return statement unless statement.blank?

        get_mods_creators(doc).map { |creator| creator[:full_name] }.join(" ; ")
      end

      # Collects the non-blank contents of every note element of +doc+ except
      # those typed "statement of responsibility".
      #
      # @return [String, nil] the notes joined with ";\n", or nil when none remain
      def self.get_mods_note(doc)
        texts = doc.xpath('//mods:note', NS).map do |node|
          next if node.attributes['type'].try(:content) == "statement of responsibility"

          node.try(:content)
        end
        kept = texts.select(&:present?)
        kept.empty? ? nil : kept.join(";\n")
      end

      # Returns the first usable publication date found in the dateIssued
      # elements of +doc+.
      #
      # A leading "c" or "p" is dropped. Values that are not digits optionally
      # followed by "-MM" and "-DD" are ignored. A value with a month is
      # returned as zero-padded "YYYY-MM" (the day, if any, is discarded); a
      # bare year is returned as written.
      #
      # @return [String, nil] nil when no element holds a usable date
      def self.get_mods_date_of_publication(doc)
        dates = doc.xpath('//mods:dateIssued', NS).map do |node|
          text = node.content.to_s.strip.sub(/\A[cp]/, '')
          # \A and \z anchor the whole value; ^ and $ would accept any matching line.
          next unless text =~ /\A\d+(-\d\d?){0,2}\z/

          year, month = text.split('-')
          # to_i is required: handing "08" or "09" straight to %d raises
          # ArgumentError because the string is read as an invalid octal number.
          month ? format('%04d-%02d', year.to_i, month.to_i) : text
        end
        dates.compact.first
      end

      # Frequency names taken from the MARC frequency value list:
      # http://www.loc.gov/standards/valuelist/marcfrequency.html
      MARCFREQUENCY = [
        "Continuously updated",
        "Daily",
        "Semiweekly",
        "Three times a week",
        "Weekly",
        "Biweekly",
        "Three times a month",
        "Semimonthly",
        "Monthly",
        "Bimonthly",
        "Quarterly",
        "Three times a year",
        "Semiannual",
        "Annual",
        "Biennial",
        "Triennial",
        "Completely irregular"
      ].freeze

      # Maps the frequency elements of +doc+ to a Frequency record.
      #
      # Each element's text is compared (case-sensitively) against the start of
      # every MARCFREQUENCY name. The matching name is lower-cased with
      # whitespace runs turned into "_" and looked up by Frequency#name. The
      # first record found wins and no further lookups are made.
      #
      # @return the Frequency record, or nil when nothing matches or exists
      def self.get_mods_frequency(doc)
        doc.xpath('//mods:frequency', NS).each do |node|
          text = node.try(:content).to_s
          label = MARCFREQUENCY.find { |name| text.start_with?(name) }
          next unless label

          frequency = Frequency.find_by(name: label.downcase.gsub(/\s+/, "_"))
          return frequency if frequency
        end
        nil
      end

      # Lists the top-level name elements of +doc+ as creator hashes.
      #
      # @return [Array<Hash>] unique { full_name: } hashes, where full_name is
      #   the element's namePart contents joined with ", "
      def self.get_mods_creators(doc)
        names = doc.xpath('/mods:mods/mods:name', NS).map do |name_node|
          parts = name_node.xpath('./mods:namePart', NS).collect(&:content)
          { full_name: parts.join(", ") }
        end
        names.uniq
      end

      # TODO: only LCSH subjects (authority="lcsh") are parsed.
      # Builds one heading string per LCSH subject element of +doc+.
      #
      # Child elements topic / geographic / genre / temporal contribute their
      # content, titleInfo contributes its title, and name contributes its
      # nameParts joined with ", "; other children are ignored. A leading name
      # followed by a titleInfo is merged as "<name>. <title>". The parts are
      # then joined with "--". Subjects without any recognised child are skipped.
      #
      # @return [Array<Hash>] hashes of the form { term: String }
      def self.get_mods_subjects(doc)
        doc.xpath('//mods:subject[@authority="lcsh"]', NS).each_with_object([]) do |subject_node, headings|
          parts = []
          subject_node.xpath('./*', NS).each do |child|
            kind = child.name
            if ["topic", "geographic", "genre", "temporal"].include?(kind)
              parts << { type: kind, term: child.try(:content) }
            elsif kind == "titleInfo"
              parts << { type: kind, term: child.at('./mods:title', NS).try(:content) }
            elsif kind == "name"
              full_name = child.xpath('./mods:namePart', NS).map { |part| part.try(:content) }.join(", ")
              parts << { type: kind, term: full_name }
            end
          end
          next if parts.empty?

          if parts.size > 1 && parts[0][:type] == "name" && parts[1][:type] == "titleInfo"
            merged = parts.first(2).map { |part| part[:term] }.join(". ")
            parts = [{ term: merged }] + parts.drop(2)
          end
          headings << { term: parts.map { |part| part[:term] }.compact.join("--") }
        end
      end

      # TODO: only DDC classifications (authority="ddc") are supported.
      # Extracts the DDC class numbers of +doc+.
      #
      # Each classification's content is cut at the first character that is
      # neither a digit nor "." (so "813/.54" yields "813"). Values that do not
      # start with a digit or "." produce nothing instead of an empty category.
      #
      # @return [Array<String>] non-empty class numbers in document order
      def self.get_mods_classifications(doc)
        numbers = doc.xpath('//mods:classification[@authority="ddc"]', NS).map do |node|
          raw = node.content
          raw && raw.split(/[^\d.]/).first
        end
        numbers.compact.reject(&:empty?)
      end

      # Derives the carrier type and content type of a MODS record.
      #
      # Three sources are consulted in order, each able to overwrite the result
      # of the previous one (including with nil when the looked-up record does
      # not exist):
      # 1. every form element, interpreted by its authority ("gmd", "marcsmd"
      #    or "marcform");
      # 2. every genre element with authority "rdacontent", whose content with
      #    non-word runs replaced by "_" is used as the ContentType name,
      #    falling back to 'other';
      # 3. the first typeOfResource element.
      #
      # Form and typeOfResource values are trimmed before matching, so values
      # such as "cartographic" match with or without surrounding whitespace.
      #
      # @return [Hash] { carrier_type:, content_type: }; either value may be nil
      def self.get_mods_carrier_and_content_types(doc)
        carrier_type = content_type = nil
        doc.xpath('//mods:form', NS).each do |e|
          authority = e.attributes['authority'].try(:content)
          form = e.content.to_s.strip
          case authority
          when "gmd"
            case form
            when "electronic resource"
              carrier_type = CarrierType.find_by(name: 'online_resource')
            when "videorecording", "motion picture", "game"
              content_type = ContentType.find_by(name: 'two_dimensional_moving_image')
            when "sound recording"
              content_type = ContentType.find_by(name: 'performed_music')
            when "graphic", "picture"
              content_type = ContentType.find_by(name: 'still_image')
            # TODO: Enju needs more specific mappings...
            when "art original",
                 "microscope slides",
                 "art reproduction",
                 "model",
                 "chart",
                 "diorama",
                 "realia",
                 "filmstrip",
                 "slide",
                 "flash card",
                 "technical drawing",
                 "toy",
                 "kit",
                 "transparency",
                 "microform"
              content_type = ContentType.find_by(name: 'other')
            end
          when "marcsmd" # cf. http://www.loc.gov/standards/valuelist/marcsmd.html
            case form
            when "text", "large print", "regular print", "text in looseleaf binder"
              carrier_type = CarrierType.find_by(name: 'volume')
              content_type = ContentType.find_by(name: 'text')
            when "braille"
              carrier_type = CarrierType.find_by(name: 'volume')
              content_type = ContentType.find_by(name: 'tactile_text')
            when "videodisc"
              carrier_type = CarrierType.find_by(name: 'videodisc')
              content_type = ContentType.find_by(name: 'two_dimensional_moving_image')
            when "videorecording", "videocartridge", "videocassette", "videoreel"
              carrier_type = CarrierType.find_by(name: 'other')
              content_type = ContentType.find_by(name: 'two_dimensional_moving_image')
            when "electronic resource"
              carrier_type = CarrierType.find_by(name: 'online_resource')
            when "chip cartridge",
                 "computer optical disc cartridge",
                 "magnetic disk",
                 "magneto-optical disc",
                 "optical disc",
                 "remote",
                 "tape cartridge",
                 "tape cassette",
                 "tape reel"
              # Recognised but deliberately left unmapped.
              #carrier_type = CarrierType.find_by(name: 'other')
            when "motion picture", "film cartridge", "film cassette", "film reel"
              content_type = ContentType.find_by(name: 'two_dimensional_moving_image')
            when "sound recording",
                 "cylinder",
                 "roll",
                 "sound cartridge",
                 "sound cassette",
                 "sound-tape reel",
                 "sound-track film",
                 "wire recording"
              content_type = ContentType.find_by(name: 'performed_music')
            when "sound disc"
              content_type = ContentType.find_by(name: 'performed_music')
              carrier_type = CarrierType.find_by(name: 'audio_disc')
            when "nonprojected graphic",
                 "chart",
                 "collage",
                 "drawing",
                 "flash card",
                 "painting",
                 "photomechanical print",
                 "photonegative",
                 "photoprint",
                 "picture",
                 "print",
                 "technical drawing",
                 "projected graphic",
                 "filmslip",
                 "filmstrip cartridge",
                 "filmstrip roll",
                 "other filmstrip type",
                 "slide",
                 "transparency"
              content_type = ContentType.find_by(name: 'still_image')
            when "tactile material", "tactile, with no writing system"
              content_type = ContentType.find_by(name: 'tactile_text')
            # TODO: Enju needs more specific mappings...
            when "globe",
                 "celestial globe",
                 "earth moon globe",
                 "planetary or lunar globe",
                 "terrestrial globe",
                 "map",
                 "atlas",
                 "diagram",
                 "model",
                 "profile",
                 "remote-sensing image",
                 "section",
                 "view",
                 "microform",
                 "aperture card",
                 "microfiche",
                 "microfiche cassette",
                 "microfilm cartridge",
                 "microfilm cassette",
                 "microfilm reel",
                 "microopaque",
                 "combination",
                 "moon"
              content_type = ContentType.find_by(name: 'other')
            end
          when "marcform" # cf. http://www.loc.gov/standards/valuelist/marcform.html
            case form
            when "print", "large print"
              carrier_type = CarrierType.find_by(name: 'volume')
              content_type = ContentType.find_by(name: 'text')
            when "electronic"
              carrier_type = CarrierType.find_by(name: 'online_resource')
            when "braille"
              content_type = ContentType.find_by(name: 'tactile_text')
            # TODO: Enju needs more specific mappings...
            when "microfiche", "microfilm"
              content_type = ContentType.find_by(name: 'other')
            end
          end
        end

        doc.xpath('//mods:genre', NS).each do |e|
          authority = e.attributes['authority'].try(:content)
          case authority
          when "rdacontent"
            content_type = ContentType.find_by(name: e.content.gsub(/\W+/, "_"))
            content_type ||= ContentType.find_by(name: 'other')
          end
        end

        type = doc.at('//mods:typeOfResource', NS).try(:content).try(:strip)
        case type
        when "text"
          content_type = ContentType.find_by(name: 'text')
        when "sound recording"
          content_type = ContentType.find_by(name: 'sounds')
        when "sound recording-musical"
          content_type = ContentType.find_by(name: 'performed_music')
        when "sound recording-nonmusical"
          content_type = ContentType.find_by(name: 'spoken_word')
        when "moving image"
          content_type = ContentType.find_by(name: 'two_dimensional_moving_image')
        when "software, multimedia"
          content_type = ContentType.find_by(name: 'other')
        when "cartographic"
          content_type = ContentType.find_by(name: 'cartographic_image')
        when "notated music"
          content_type = ContentType.find_by(name: 'notated_music')
        when "still image"
          content_type = ContentType.find_by(name: 'still_image')
        when "three dimensional object"
          content_type = ContentType.find_by(name: 'other')
        when "mixed material"
          content_type = ContentType.find_by(name: 'other')
        end
        { carrier_type: carrier_type, content_type: content_type }
      end
    end
  end

  # Raised when no record document could be obtained for a requested ISBN.
  class RecordNotFound < StandardError; end
end