GlobalNamesArchitecture/dwca-hunter

View on GitHub
lib/dwca_hunter/resources/freebase.rb

Summary

Maintainability
A
3 hrs
Test Coverage
# frozen_string_literal: true

module DwcaHunter
  class ResourceFreebase < DwcaHunter::Resource
    def initialize(opts = {})
      @command = "freebase"
      @title = "Freebase"
      @uuid = "bacd21f0-44e0-43e2-914c-70929916f257"
      @download_path = File.join(Dir.tmpdir,
                                 "dwca_hunter",
                                 "freebase",
                                 "data.json")
      @data = []
      @all_taxa = {}
      @cleaned_taxa = {}
      @extensions = []
      super
    end

    def needs_unpack?
      false
    end

    def make_dwca
      organize_data
      generate_dwca
    end

    def download
      DwcaHunter.logger_write(object_id,
                              "Querying freebase for species information...")
      q = {
        query: [{
          type: "/biology/organism_classification",
          id: nil,
          guid: nil,
          name: nil,
          scientific_name: nil,
          synonym_scientific_name: [],
          higher_classification: {
            id: nil,
            guid: nil,
            scientific_name: nil,
            optional: true
          }
        }],
        cursor: true
      }

      run_query(q)

      data = JSON.pretty_generate @data
      f = open(@download_path, "w:utf-8")
      f.write(data)
      f.close
    end

    private

    def run_query(q)
      count = 0
      requests_num = 0
      loop do
        freebase_url = "http://api.freebase.com/api/service/mqlread?query=%s" %
                       URI.encode(q.to_json)
        res = JSON.load RestClient.get(freebase_url)
        requests_num += 1
        break if res["result"].nil? || res["result"].empty?

        if requests_num % 10 == 0
          DwcaHunter.logger_write(object_id,
                                  "Received %s names" % count)
        end
        count += res["result"].size
        res["result"].each { |d| @data << d }
        q[:cursor] = res["cursor"]
      end
    end

    def organize_data
      @data = JSON.load(open(@download_path, "r:utf-8").read)
      @data.each do |d|
        scientific_name = d["scientific_name"].to_s
        id = d["id"]
        parent_id = d["higher_classification"] ?
                    d["higher_classification"]["id"] :
                    nil
        synonyms = d["synonym_scientific_name"]
        @all_taxa[id] = { id: id,
                          parent_id: parent_id,
                          scientific_name: scientific_name,
                          synonyms: synonyms }
      end

      @all_taxa.each do |k, v|
        next unless v[:scientific_name] && v[:scientific_name].strip != ""

        parent_id = v[:parent_id]
        until (@all_taxa[parent_id] &&
                @all_taxa[parent_id][:scientific_name]) || parent_id.nil?
          puts "did not find parent %s" % parent_id
          parent_id = @all_taxa[parent_id]
        end
        parent_id = nil if v[:id] == parent_id
        v[:parent_id] = parent_id
        @cleaned_taxa[k] = v
      end
    end

    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              "Creating DarwinCore Archive file")
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/parentNameUsageID"]]

      @extensions << { data: [[
        "http://rs.tdwg.org/dwc/terms/TaxonID",
        "http://rs.tdwg.org/dwc/terms/scientificName"
      ]], file_name: "synonyms.txt" }
      DwcaHunter.logger_write(object_id,
                              "Creating synonyms extension for DarwinCore Archive file")
      count = 0
      @cleaned_taxa.each do |_key, taxon|
        count += 1
        @core << [taxon[:id], taxon[:scientific_name], taxon[:parent_id]]
        if count % BATCH_SIZE == 0
          DwcaHunter.logger_write(object_id,
                                  "Traversing %s extension data record" % count)
        end
        taxon[:synonyms].each do |name|
          @extensions[-1][:data] << [taxon[:id], name]
        end
      end
      @eml = {
        id: @uuid,
        title: @title,
        license: "http://creativecommons.org/licenses/by-sa/3.0/",
        authors: [
          { url: "http://www.freebase.com/home" }
        ],
        abstract: "An entity graph of people, places and things, " \
                  "built by a community that loves open data.",
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@mbl.edu" }
        ],
        url: "http://www.freebase.com/home"
      }
      super
    end
  end
end