app/models/concerns/enju_ndl/enju_manifestation.rb from next-l/enju_ndl

app/models/concerns/enju_ndl/enju_manifestation.rb
Summary

Maintainability

4 days
Test Coverage

96%
Issues
# frozen_string_literal: true

module EnjuNdl
  module EnjuManifestation
    extend ActiveSupport::Concern

    included do
      # One-to-one associations to the NDL-specific identifier records;
      # import_record assigns both of them when it builds a manifestation.
      has_one :jpno_record
      has_one :ndl_bib_id_record
      # Search index definition: exposes the body of the associated jpno_record
      # (nil when there is none) as a string field named :jpno.
      searchable do
        string :jpno do
          jpno_record&.body
        end
      end

      # Imports the manifestation for an ISBN from NDL Search.
      # Thin wrapper around Manifestation.import_from_ndl_search.
      #
      # @param isbn [String] ISBN to look up
      # @return whatever Manifestation.import_from_ndl_search returns
      def self.import_isbn(isbn)
        Manifestation.import_from_ndl_search(isbn: isbn)
      end

      # Downloads the RDF record for an NDL bibliographic ID and imports it.
      # ID lists: http://www.ndl.go.jp/jp/dlib/standards/opendataset/aboutIDList.txt
      #
      # @param ndl_bib_id [#to_s] ID interpolated into the record URL
      # @return the result of import_record for the downloaded document
      def self.import_ndl_bib_id(ndl_bib_id)
        rdf_url = "https://iss.ndl.go.jp/books/R100000002-I#{ndl_bib_id}-00.rdf"
        response = Faraday.get(rdf_url)
        import_record(Nokogiri::XML(response.body))
      end

      # Finds or imports the manifestation for the ISBN given in options.
      #
      # An existing manifestation with that ISBN is returned without contacting
      # NDL Search; otherwise the record is fetched via return_xml and handed to
      # import_record.
      #
      # @param options [Hash] only the :isbn key is read
      # @return the first existing manifestation, or the result of import_record
      # @raise [EnjuNdl::InvalidIsbn] when Lisbn reports the ISBN as invalid
      # @raise [EnjuNdl::RecordNotFound] when return_xml yields no document
      def self.import_from_ndl_search(options)
        parsed_isbn = Lisbn.new(options[:isbn])
        raise EnjuNdl::InvalidIsbn unless parsed_isbn.valid?

        isbn = parsed_isbn.isbn
        existing = Manifestation.find_by_isbn(isbn)
        return existing.first if existing.present?

        rdf = return_xml(isbn)
        raise EnjuNdl::RecordNotFound unless rdf

        import_record(rdf)
      end

      # Builds a Manifestation from an NDL Search RDF document and saves it.
      #
      # Returns the already-imported manifestation when one exists for the
      # record's NDL bibliographic ID or for its BibAdminResource URI. Otherwise
      # a new Manifestation is created inside a transaction together with its
      # identifier records, publishers, the data added by
      # create_additional_attributes and a series statement.
      #
      # @param doc [Nokogiri::XML::Document] RDF/XML record; it must contain a
      #   dcndl:BibAdminResource element with an rdf:about attribute, otherwise
      #   NoMethodError is raised
      # @return [Manifestation] the existing or newly built manifestation; when
      #   saving fails the unsaved object is returned
      def self.import_record(doc)
        admin_resource = doc.at('//dcndl:BibAdminResource[@rdf:about]')
        iss_itemno = URI.parse(admin_resource.values.first).path.split('/').last
        # The NDL bibliographic ID is the second '-'-separated part of the item
        # number without its leading "I".
        ndl_bib_id_body = iss_itemno.split('-')[1].gsub(/^I/, '')
        ndl_bib_id = NdlBibIdRecord.find_by(body: ndl_bib_id_body)
        return ndl_bib_id.manifestation if ndl_bib_id

        jpno = doc.at('//dcterms:identifier[@rdf:datatype="http://ndl.go.jp/dcndl/terms/JPNO"]').try(:content)

        publishers = get_publishers(doc)

        # title
        title = get_title(doc)

        # date of publication: "2010.8" and "2010-08" both become "2010-08"
        pub_date = doc.at('//dcterms:issued').try(:content).to_s.strip.tr('.', '-')
        date = nil
        if pub_date.match?(/\A\d+(-\d{0,2}){0,2}\z/)
          year, month = pub_date.split('-')
          # Convert with to_i: format('%02d', '08') would treat the String as an
          # (invalid) octal literal and raise ArgumentError.
          date = if month.present?
                   format('%04d-%02d', year.to_i, month.to_i)
                 else
                   pub_date
                 end
        end

        # Fall back to language id 1 when the ISO 639-2 code is unknown.
        language_id = Language.find_by(iso_639_2: get_language(doc))&.id || 1

        isbn = Lisbn.new(doc.at('//dcterms:identifier[@rdf:datatype="http://ndl.go.jp/dcndl/terms/ISBN"]').try(:content).to_s).try(:isbn)
        issn = StdNum::ISSN.normalize(doc.at('//dcterms:identifier[@rdf:datatype="http://ndl.go.jp/dcndl/terms/ISSN"]').try(:content))
        issn_l = StdNum::ISSN.normalize(doc.at('//dcterms:identifier[@rdf:datatype="http://ndl.go.jp/dcndl/terms/ISSNL"]').try(:content))

        # Map NDL material types to carrier/content types. A record may list
        # several material types; later matches overwrite earlier ones.
        carrier_type = content_type = nil
        is_serial = nil
        doc.xpath('//dcndl:materialType[@rdf:resource]').each do |d|
          case d.attributes['resource'].try(:content)
          when 'http://ndl.go.jp/ndltype/Book'
            carrier_type = CarrierType.find_by(name: 'print')
            content_type = ContentType.find_by(name: 'text')
          when 'http://ndl.go.jp/ndltype/Braille'
            content_type = ContentType.find_by(name: 'tactile_text')
          # when 'http://ndl.go.jp/ndltype/ComputerProgram'
          #  content_type = ContentType.find_by(name: 'computer_program')
          when 'http://ndl.go.jp/ndltype/ElectronicResource'
            carrier_type = CarrierType.find_by(name: 'file')
          when 'http://ndl.go.jp/ndltype/Journal'
            is_serial = true
          when 'http://ndl.go.jp/ndltype/Map'
            content_type = ContentType.find_by(name: 'cartographic_image')
          when 'http://ndl.go.jp/ndltype/Music'
            content_type = ContentType.find_by(name: 'performed_music')
          when 'http://ndl.go.jp/ndltype/MusicScore'
            content_type = ContentType.find_by(name: 'notated_music')
          when 'http://ndl.go.jp/ndltype/Painting',
               'http://ndl.go.jp/ndltype/Photograph',
               'http://ndl.go.jp/ndltype/PicturePostcard',
               'http://purl.org/dc/dcmitype/StillImage'
            content_type = ContentType.find_by(name: 'still_image')
          when 'http://purl.org/dc/dcmitype/MovingImage'
            content_type = ContentType.find_by(name: 'two_dimensional_moving_image')
          when 'http://purl.org/dc/dcmitype/Sound'
            content_type = ContentType.find_by(name: 'sounds')
          end
        end

        admin_identifier = admin_resource.attributes['about'].value
        description = doc.at('//dcterms:abstract')&.content
        price = doc.at('//dcndl:price')&.content
        volume_number_string = doc.at('//dcndl:volume/rdf:Description/rdf:value')&.content
        extent = get_extent(doc)
        publication_periodicity = doc.at('//dcndl:publicationPeriodicity').try(:content)
        statement_of_responsibility = doc.xpath('//dcndl:BibResource/dc:creator').map(&:content).join('; ')
        publication_place = doc.at('//dcterms:publisher/foaf:Agent/dcndl:location')&.content
        edition_string = doc.at('//dcndl:edition')&.content

        manifestation = Manifestation.find_by(manifestation_identifier: admin_identifier)
        return manifestation if manifestation

        Agent.transaction do
          publisher_agents = Agent.import_agents(publishers)

          manifestation = Manifestation.new(
            manifestation_identifier: admin_identifier,
            original_title: title[:manifestation],
            title_transcription: title[:transcription],
            title_alternative: title[:alternative],
            title_alternative_transcription: title[:alternative_transcription],
            # TODO: investigate which non-book materials are held in NDL Search
            #:carrier_type_id => CarrierType.find_by(name: 'print').id,
            language_id: language_id,
            pub_date: date,
            description: description,
            volume_number_string: volume_number_string,
            price: price,
            statement_of_responsibility: statement_of_responsibility,
            start_page: extent[:start_page],
            end_page: extent[:end_page],
            height: extent[:height],
            extent: extent[:extent],
            dimensions: extent[:dimensions],
            publication_place: publication_place,
            edition_string: edition_string
          )
          manifestation.serial = true if is_serial
          if isbn.present?
            IsbnRecordAndManifestation.create(
              isbn_record: IsbnRecord.find_or_create_by(body: isbn),
              manifestation: manifestation
            )
          end
          if issn.present?
            IssnRecordAndManifestation.create(
              issn_record: IssnRecord.find_or_create_by(body: issn),
              manifestation: manifestation
            )
          end
          if iss_itemno.present?
            manifestation.ndl_bib_id_record = NdlBibIdRecord.find_or_initialize_by(body: ndl_bib_id_body)
          end
          if jpno.present?
            manifestation.jpno_record = JpnoRecord.find_or_initialize_by(body: jpno.strip)
          end
          # The linking ISSN is attached as a generic Identifier after saving.
          issn_l_identifier = nil
          if issn_l
            issn_l_identifier = Identifier.new(body: issn_l)
            issn_l_identifier.identifier_type = IdentifierType.find_or_create_by(name: 'issn_l')
          end
          manifestation.carrier_type = carrier_type if carrier_type
          manifestation.manifestation_content_type = content_type if content_type
          if manifestation.save
            manifestation.identifiers << issn_l_identifier if issn_l_identifier&.valid?
            manifestation.publishers << publisher_agents
            create_additional_attributes(doc, manifestation)
            if is_serial
              # A serial becomes the master of a series named after itself.
              series_statement = SeriesStatement.new(
                original_title: title[:manifestation],
                title_alternative: title[:alternative],
                title_transcription: title[:transcription],
                series_master: true
              )
              if series_statement.valid?
                manifestation.series_statements << series_statement
              end
            else
              create_series_statement(doc, manifestation)
            end
          end
        end

        # manifestation.send_later(:create_frbr_instance, doc.to_s)
        manifestation
      end

      # Attaches creators and, when EnjuSubject is loaded, subjects and NDC
      # classifications taken from the RDF document to the manifestation.
      #
      # @param doc [Nokogiri::XML::Document] RDF/XML record from NDL Search
      # @param manifestation [Manifestation] record the data is appended to
      def self.create_additional_attributes(doc, manifestation)
        creators = get_creators(doc).uniq
        subjects = get_subjects(doc).uniq
        classification_urls = doc.xpath('//dcterms:subject[@rdf:resource]').map { |subject| subject.attributes['resource'].value }

        Agent.transaction do
          manifestation.creators << Agent.import_agents(creators)

          if defined?(EnjuSubject)
            subject_heading_type = SubjectHeadingType.find_or_create_by(name: 'ndlsh')
            subjects.each do |term|
              # Reuse an existing subject with the same term, else build one.
              subject = Subject.find_by(term: term[:term])
              unless subject
                subject = Subject.new(term)
                subject.subject_heading_type = subject_heading_type
                subject.subject_type = SubjectType.find_or_create_by(name: 'concept')
              end
              manifestation.subjects << subject
            end

            classification_urls.each do |url|
              ndc_url = begin
                URI.parse(url)
              rescue URI::InvalidURIError
                nil # unparsable subject URLs are skipped
              end
              next unless ndc_url

              # The last two path segments are read as <scheme>/<class number>;
              # only NDC9 and NDC10 are imported from subject URLs.
              segments = ndc_url.path.to_s.split('/')
              ndc_type = segments[-2]
              next unless %w[ndc9 ndc10].include?(ndc_type)

              classification = Classification.new(category: segments.last)
              classification.classification_type = ClassificationType.find_or_create_by(name: ndc_type)
              manifestation.classifications << classification if classification.valid?
            end

            # NDC8 is given as a literal subject rather than as a URL.
            ndc8 = doc.xpath('//dc:subject[@rdf:datatype="http://ndl.go.jp/dcndl/terms/NDC8"]').first
            if ndc8
              classification_type = ClassificationType.find_or_create_by(name: 'ndc8')
              classification = Classification.new(category: ndc8.content)
              classification.classification_type = classification_type
              manifestation.classifications << classification if classification.valid?
            end
          end
        end
      end

      # Queries the NDL Search OpenSearch API.
      #
      # @param query [#to_s] search term; encoded with format_query
      # @param options [Hash] overrides for the defaults
      #   dpid: 'iss-ndl-opac', item: 'any' (name of the query parameter),
      #   idx: 1 (start record), per_page: 10, raw: false, mediatype: 1
      # @return [String] the response body when options[:raw] is exactly true
      # @return the feed parsed by RSS::Parser otherwise
      def self.search_ndl(query, options = {})
        options = { dpid: 'iss-ndl-opac', item: 'any', idx: 1, per_page: 10, raw: false, mediatype: 1 }.merge(options)
        # A missing or non-numeric idx converts to 0; start at the first record then.
        startrecord = options[:idx].to_i
        startrecord = 1 if startrecord.zero?
        url = "https://iss.ndl.go.jp/api/opensearch?dpid=#{options[:dpid]}&#{options[:item]}=#{format_query(query)}&cnt=#{options[:per_page]}&idx=#{startrecord}&mediatype=#{options[:mediatype]}"

        # Only a literal true selects the raw body, exactly as before.
        return Faraday.get(url).body if options[:raw] == true

        # Teach the RSS parser about openSearch:totalResults so that
        # channel.totalResults is available on the parsed feed.
        RSS::Rss::Channel.install_text_element('openSearch:totalResults', 'http://a9.com/-/spec/opensearchrss/1.0/', '?', 'totalResults', :text, 'openSearch:totalResults')
        RSS::BaseListener.install_get_text_element 'http://a9.com/-/spec/opensearchrss/1.0/', 'totalResults', 'totalResults='
        RSS::Parser.parse(url, false)
      end

      # Converts an ISBN to its other form: a 10-character string is turned into
      # ISBN-13, anything else into ISBN-10. The decision is made on the raw
      # string length, so the argument is expected to be free of hyphens.
      #
      # @param isbn [String]
      # @return the value of Lisbn#isbn13 or Lisbn#isbn10
      def self.normalize_isbn(isbn)
        target_form = isbn.length == 10 ? :isbn13 : :isbn10
        Lisbn.new(isbn).public_send(target_form)
      end

      # Looks an ISBN up on NDL Search and downloads the RDF of the first hit.
      # When the first search reports zero results, the search is repeated with
      # the ISBN converted by normalize_isbn.
      #
      # @param isbn [String]
      # @return the parsed RDF document of the first item, or nil when the
      #   feed has no items
      def self.return_xml(isbn)
        feed = search_ndl(isbn, dpid: 'iss-ndl-opac', item: 'isbn')
        if feed.channel.totalResults.to_i.zero?
          feed = search_ndl(normalize_isbn(isbn), dpid: 'iss-ndl-opac', item: 'isbn')
        end

        first_item = feed.items.first
        return unless first_item

        Nokogiri::XML(Faraday.get("#{first_item.link}.rdf").body)
      end

      # NOTE(review): a bare `private` does not change the visibility of methods
      # defined with `def self.…`, so the helpers below remain publicly callable.
      # `private_class_method` would be needed to hide them — confirm that no
      # external caller relies on them before changing this.
      private

      # Extracts the title fields of a record.
      #
      # All dc:title values (and their transcriptions) are joined with a space;
      # the volume title and its transcription, when present, are appended
      # after another space.
      #
      # @param doc [Nokogiri::XML::Document]
      # @return [Hash] keys :manifestation, :transcription, :alternative and
      #   :alternative_transcription (the last two may be nil)
      def self.get_title(doc)
        main_title = doc.xpath('//dc:title/rdf:Description/rdf:value').map(&:content).join(' ')
        main_transcription = doc.xpath('//dc:title/rdf:Description/dcndl:transcription').map(&:content).join(' ')

        volume_title = doc.at('//dcndl:volumeTitle/rdf:Description/rdf:value')&.content
        volume_transcription = doc.at('//dcndl:volumeTitle/rdf:Description/dcndl:transcription')&.content
        main_title = "#{main_title} #{volume_title}" if volume_title
        main_transcription = "#{main_transcription} #{volume_transcription}" if volume_transcription

        {
          manifestation: main_title,
          transcription: main_transcription,
          alternative: doc.at('//dcndl:alternative/rdf:Description/rdf:value')&.content,
          alternative_transcription: doc.at('//dcndl:alternative/rdf:Description/dcndl:transcription')&.content
        }
      end

      # Collects the creators (dcterms:creator/foaf:Agent) of a record.
      # Every agent is expected to carry a foaf:name; NoMethodError is raised
      # when one does not.
      #
      # @param doc [Nokogiri::XML::Document]
      # @return [Array<Hash>] hashes with :full_name, :full_name_transcription
      #   and :agent_identifier (the last two may be nil)
      def self.get_creators(doc)
        doc.xpath('//dcterms:creator/foaf:Agent').map do |agent|
          {
            full_name: agent.at('./foaf:name').content,
            full_name_transcription: agent.at('./dcndl:transcription')&.content,
            agent_identifier: agent.attributes['about']&.content
          }
        end
      end

      # Collects the subject terms (dcterms:subject/rdf:Description) of a record.
      # Every description is expected to carry an rdf:value.
      #
      # @param doc [Nokogiri::XML::Document]
      # @return [Array<Hash>] one { term: String } hash per subject
      def self.get_subjects(doc)
        doc.xpath('//dcterms:subject/rdf:Description').map do |description|
          { term: description.at('./rdf:value').content }
        end
      end

      # Collects the URLs of the subjects that are given as rdf:resource
      # references.
      #
      # @param doc [Nokogiri::XML::Document]
      # @return [Array<Hash>] one { url: String } hash per reference
      def self.get_classifications(doc)
        doc.xpath('//dcterms:subject[@rdf:resource]').map do |reference|
          { url: reference.attributes['resource'].content }
        end
      end

      # Returns the lower-cased ISO 639-2 code of the first dcterms:language
      # element, or nil when the record has none.
      # TODO: handle records that list more than one language
      #
      # @param doc [Nokogiri::XML::Document]
      # @return [String, nil]
      def self.get_language(doc)
        code = doc.at('//dcterms:language[@rdf:datatype="http://purl.org/dc/terms/ISO639-2"]')&.content
        code&.downcase
      end

      # Collects the publishers (dcterms:publisher/foaf:Agent) of a record.
      # Every agent is expected to carry a foaf:name; NoMethodError is raised
      # when one does not.
      #
      # @param doc [Nokogiri::XML::Document]
      # @return [Array<Hash>] hashes with :full_name, :full_name_transcription
      #   and :agent_identifier (the last two may be nil)
      def self.get_publishers(doc)
        doc.xpath('//dcterms:publisher/foaf:Agent').map do |agent|
          {
            full_name: agent.at('./foaf:name').content,
            full_name_transcription: agent.at('./dcndl:transcription')&.content,
            agent_identifier: agent.attributes['about']&.content
          }
        end
      end

      # Parses dcterms:extent ("<pagination> ; <dimensions>") into page and
      # height information.
      #
      # @param doc [Nokogiri::XML::Document]
      # @return [Hash] always has :start_page, :end_page and :height (nil when
      #   unknown); :extent and :dimensions hold the raw, stripped parts and are
      #   only set when the record has a dcterms:extent element
      def self.get_extent(doc)
        value = { start_page: nil, end_page: nil, height: nil }
        extent = doc.at('//dcterms:extent')&.content
        return value unless extent

        page, height = extent.split(';').map(&:strip)
        value[:extent] = page
        # Use the digits that directly precede "p". String#to_i only reads a
        # leading number, so "xii, 250p" used to yield an end page of 0.
        page_count = page && page[/(\d+)p/, 1]
        if page_count
          value[:start_page] = 1
          value[:end_page] = page_count.to_i
        end
        value[:dimensions] = height
        # to_i keeps the leading number, i.e. the first dimension of "19x26cm".
        value[:height] = height.to_i if height&.match?(/\d+cm/)
        value
      end

      # Links the manifestation to the series named in dcndl:seriesTitle.
      #
      # Only the text before the first ';' of the series title (and of its
      # transcription) is used. An existing SeriesStatement with that title is
      # reused, otherwise a new one is built. Nothing is linked when the record
      # has no usable series title or when saving the statement fails.
      #
      # @param doc [Nokogiri::XML::Document]
      # @param manifestation [Manifestation]
      # @return [Manifestation] the manifestation that was passed in
      def self.create_series_statement(doc, manifestation)
        raw_title = doc.at('//dcndl:seriesTitle/rdf:Description/rdf:value')&.content
        raw_transcription = doc.at('//dcndl:seriesTitle/rdf:Description/dcndl:transcription')&.content
        creator = doc.at('//dcndl:seriesCreator')&.content

        # `first` is nil for values such as ";" that have no text before the
        # separator; such titles are skipped instead of raising NoMethodError.
        title = raw_title.to_s.split(';').first.to_s.strip
        return manifestation if title.empty?

        transcription = raw_transcription.to_s.split(';').first&.strip

        series_statement = SeriesStatement.find_by(original_title: title)
        series_statement ||= SeriesStatement.new(
          original_title: title,
          title_transcription: transcription,
          creator_string: creator
        )
        manifestation.series_statements << series_statement if series_statement.save
        manifestation
      end

      # Prepares a search term for use in the request URL: full-width spaces
      # are replaced by ASCII spaces, then the string is encoded with
      # Addressable::URI.encode.
      #
      # @param query [#to_s]
      # @return the value returned by Addressable::URI.encode
      def self.format_query(query)
        half_width = query.to_s.tr('　', ' ')
        Addressable::URI.encode(half_width)
      end
    end

    # Error class for records that have already been imported. Nothing in this
    # file raises it.
    class AlreadyImported < StandardError; end
  end
end