linguisticexplorer/Linguistic-Explorer

View on GitHub
lib/group_data/importer.rb

Summary

Maintainability
D
2 days
Test Coverage
# GroupDataImporter
#
# ==> Category.csv <==
# id,name,depth,group_id,creator_id,description
#
# ==> Example.csv <==
# id,name,ling_id,group_id,creator_id
#
# ==> ExampleLingsProperty.csv <==
# id,example_id,lings_property_id,group_id,creator_id
#
# ==> Group.csv <==
# id, name, privacy, depth_maximum, ling0_name, ling1_name, property_name, category_name, lings_property_name, example_name, examples_lings_property_name, example_fields
#
# ==> Ling.csv <==
# id,name,parent_id,depth,group_id,creator_id
#
# ==> LingsProperty.csv <==
# id,ling_id,property_id,value,group_id,creator_id
#
# ==> Membership.csv <==
# id,member_id,group_id,level,creator_id
#
# ==> Property.csv <==
# id,name,description,category_id,group_id,creator_id
#
# ==> StoredValue.csv <==
# id, storable_id, storable_type, key, value, group_id
#
# ==> User.csv <==
# id,name,email,access_level,password
#

require 'csv'
require 'iconv'
# require 'ruby-prof'
require 'progressbar'

module GroupData
  class Importer

    # Convenience entry point: builds an importer for +config+, runs the
    # import straight away and hands the importer back to the caller.
    #
    # config  - passed through to the constructor unchanged.
    # verbose - passed through to the constructor; defaults to true.
    #
    # Returns the importer instance (not the result of import!).
    def self.import(config, verbose = true)
      new(config, verbose).tap(&:import!)
    end

    # Import settings as a Hash. The entries read in this file (:user, :group,
    # :membership, :ling, :category, :property, :example, :lings_property,
    # :examples_lings_property, :stored_value) are handed to CSV as file paths.
    attr_accessor :config

    # Class-level macro: for every name given, defines an instance reader of
    # that name backed by the instance variable "@<name>". The first call of a
    # reader stores an empty Hash in the variable; later calls return that same
    # Hash.
    def self.lazy_init_cache(*caches)
      caches.each do |cache|
        ivar = "@#{cache}"
        define_method(cache.to_s) do
          instance_variable_get(ivar) || instance_variable_set(ivar, {})
        end
      end
    end

    # Readers for the lookup tables filled while importing. `groups` maps a CSV
    # group id to the Group record; each *_ids table maps a CSV id to the id of
    # the record saved for it. The stored-value step picks a table by name
    # ("#{storable_type.downcase}_ids"), so these names have to line up with the
    # storable types that appear in the CSV.
    lazy_init_cache :groups, :user_ids, :ling_ids, :category_ids, :property_ids, :example_ids, :lings_property_ids

    # Builds an importer.
    #
    # config  - Hash of import settings whose values are used as CSV file
    #           paths (e.g. :user, :group). String keys are accepted: the keys
    #           are symbolized in place, so the caller's hash is modified.
    # verbose - when true, console output from print_to_console is shown.
    #           Defaults to true, the same default Importer.import uses.
    def initialize(config, verbose = true)
      @config = config
      @config.symbolize_keys!
      @verbose = verbose
    end

    # Imports every CSV named in the config, one step per file, in this order:
    # users, groups, memberships, lings, ling parent links, categories,
    # properties, examples, lings properties, examples-lings properties and
    # stored values.
    #
    # Each step stores the ids it produced in the lookup tables (user_ids,
    # groups, ling_ids, ...) and later steps use those tables to translate the
    # ids found in their own rows, so the order of the steps must not change.
    #
    # Prints the elapsed time through print_to_console when done.
    def import!
      start = Time.now

      import_users
      import_groups
      import_memberships
      import_lings
      import_ling_associations
      import_categories
      import_properties
      import_examples
      import_lings_properties
      import_examples_lings_properties
      import_stored_values

      elapsed = seconds_fraction_to_time(Time.now - start)
      print_to_console "Time for import: #{elapsed[0]} : #{elapsed[1]} : #{elapsed[2]}\n"
    end

    # Logs how many rows the CSV registered under +key+ holds, yields a progress
    # bar sized to that count and finishes the bar once the block returns. The
    # file is counted once and the number is shared by the log line and the bar.
    def import_step(key, label, title)
      total = csv_size(key)
      logger.info "processing #{total} #{label}"
      bar = ProgressBar.new(title, total)
      yield bar
      bar.finish
    end

    # True when the row's "id" column is nil or an empty string.
    def missing_id?(row)
      row["id"].nil? || row["id"].empty?
    end

    # Sets +record+'s creator to the imported user referenced by the row's
    # creator_id; does nothing when the row has no creator_id.
    def assign_creator(record, row)
      record.creator = User.find(user_ids[row["creator_id"]]) if row["creator_id"].present?
    end

    # Users are matched by email; only users that do not exist yet are saved.
    def import_users
      import_step(:user, "users", "Users...") do |bar|
        csv_for_each :user do |row|
          next if missing_id?(row)
          user = User.find_or_initialize_by_email(row["email"])
          if user.new_record?
            user.password_confirmation = row["password"]
            # Needed for CAPTCHA
            user.bypass_humanizer = true

            save_model_with_attributes(user, row)
          end

          # remember which record this CSV id maps to
          user_ids[row["id"]] = user.id
          bar.inc
        end
      end
    end

    # Groups are matched by name. Rows are not filtered on "id" in this step.
    def import_groups
      import_step(:group, "groups", "Groups...") do |bar|
        # Rewrites the group CSV so a misspelled header is repaired before the
        # rows are read.
        fix_csv_elp_name

        csv_for_each :group do |row|
          group = Group.find_or_initialize_by_name(row["name"])
          save_model_with_attributes(group, row)

          # the Group record itself is cached, not just its id
          groups[row["id"]] = group
          bar.inc
        end
      end
    end

    # Memberships are matched by member within their group.
    def import_memberships
      import_step(:membership, "memberships", "Memberships...") do |bar|
        csv_for_each :membership do |row|
          next if missing_id?(row)
          group      = groups[row["group_id"]]
          member_id  = user_ids[row["member_id"]]
          membership = group.memberships.find_or_initialize_by_member_id(member_id) do |record|
            assign_creator(record, row)
          end
          save_model_with_attributes(membership, row)
          bar.inc
        end
      end
    end

    # Lings are matched by name within their group.
    def import_lings
      import_step(:ling, "lings", "Lings...") do |bar|
        csv_for_each :ling do |row|
          next if missing_id?(row)
          group = groups[row["group_id"]]
          ling  = group.lings.find_or_initialize_by_name(row["name"]) do |record|
            assign_creator(record, row)
          end
          save_model_with_attributes(ling, row)

          ling_ids[row["id"]] = ling.id
          bar.inc
        end
      end
    end

    # Second pass over the ling CSV: links each ling that has a parent_id to
    # its parent, once every ling id is known.
    def import_ling_associations
      import_step(:ling, "parent/child ling associations", "Ling Associations...") do |bar|
        csv_for_each :ling do |row|
          bar.inc
          next if row["parent_id"].blank?
          child        = Ling.find(ling_ids[row["id"]])
          child.parent = Ling.find(ling_ids[row["parent_id"]])
          child.save!
        end
      end
    end

    # Categories are matched by name within their group.
    def import_categories
      import_step(:category, "categories", "Categories...") do |bar|
        csv_for_each :category do |row|
          next if missing_id?(row)
          group    = groups[row["group_id"]]
          category = group.categories.find_or_initialize_by_name(row["name"]) do |record|
            assign_creator(record, row)
          end
          save_model_with_attributes category, row

          category_ids[row["id"]] = category.id
          bar.inc
        end
      end
    end

    # Properties are matched by name within their group; a new property gets
    # the category referenced by the row.
    def import_properties
      import_step(:property, "properties", "Properties...") do |bar|
        csv_for_each :property do |row|
          next if missing_id?(row)
          group    = groups[row["group_id"]]
          category = group.categories.find(category_ids[row["category_id"]])
          property = group.properties.find_or_initialize_by_name(row["name"]) do |record|
            record.category = category
            assign_creator(record, row)
          end
          save_model_with_attributes property, row

          property_ids[row["id"]] = property.id
          bar.inc
        end
      end
    end

    # Examples are matched by name and ling within their group; the whole CSV
    # is processed inside one Example transaction.
    def import_examples
      import_step(:example, "examples", "Examples...") do |bar|
        Example.transaction do
          csv_for_each :example do |row|
            next if missing_id?(row)
            group   = groups[row["group_id"]]
            example = group.examples.find_or_initialize_by_name_and_ling_id(row["name"], ling_ids[row["ling_id"]]) do |record|
              assign_creator(record, row)
            end
            save_model_with_attributes example, row

            example_ids[row["id"]] = example.id
            bar.inc
          end
        end
      end
    end

    # Lings properties are matched on group, value, ling and property; the
    # whole CSV is processed inside one LingsProperty transaction.
    def import_lings_properties
      import_step(:lings_property, "lings_property", "Lings Property...") do |bar|
        LingsProperty.transaction do
          csv_for_each :lings_property do |row|
            next if missing_id?(row)
            group       = groups[row["group_id"]]
            ling_id     = ling_ids[row["ling_id"]]
            value       = row["value"]
            property_id = property_ids[row["property_id"]]

            lp = group.lings_properties.find_or_initialize_by_group_id_and_value_and_ling_id_and_property_id(group.id, value, ling_id, property_id) do |record|
              assign_creator(record, row)
            end

            save_model_with_attributes lp, row

            lings_property_ids[row["id"]] = lp.id
            bar.inc
          end
        end
      end
    end

    # Examples-lings properties are matched on group, example and lings
    # property; the whole CSV is processed inside one transaction.
    def import_examples_lings_properties
      import_step(:examples_lings_property, "examples_lings_property", "Examples Lings Properties...") do |bar|
        ExamplesLingsProperty.transaction do
          csv_for_each :examples_lings_property do |row|
            next if missing_id?(row)
            group             = groups[row["group_id"]]
            example_id        = example_ids[row["example_id"]]
            lings_property_id = lings_property_ids[row["lings_property_id"]]

            elp = group.examples_lings_properties.find_or_initialize_by_group_id_and_example_id_and_lings_property_id(group.id, example_id, lings_property_id) do |record|
              assign_creator(record, row)
            end

            save_model_with_attributes elp, row

            bar.inc
          end
        end
      end
    end

    # Stored values are matched on group, storable, key and value; the whole
    # CSV is processed inside one StoredValue transaction.
    def import_stored_values
      import_step(:stored_value, "stored value", "Stored Values...") do |bar|
        StoredValue.transaction do
          csv_for_each :stored_value do |row|
            next if missing_id?(row)

            # drop every "<key>:" occurrence from the raw value
            value         = row["value"].gsub("#{row["key"]}:", '')
            group         = groups[row["group_id"]]
            storable_type = row['storable_type']
            # NOTE(review): the lookup table is chosen by a name built from CSV
            # data and invoked with send; only import files you trust.
            storable_id   = self.send("#{storable_type.downcase}_ids")[row["storable_id"]]

            stored = group.stored_values.find_or_initialize_by_group_id_and_storable_id_and_storable_type_and_key_and_value(group.id, storable_id, storable_type, row["key"], value)
            StoredValue.skip_callback(:create)
            stored.save!(:validate => false)
            StoredValue.set_callback(:create)

            bar.inc
          end
        end
      end
    end

    # The step helpers above are implementation details of import!.
    private :import_step, :missing_id?, :assign_creator,
            :import_users, :import_groups, :import_memberships,
            :import_lings, :import_ling_associations, :import_categories,
            :import_properties, :import_examples, :import_lings_properties,
            :import_examples_lings_properties, :import_stored_values

    private

    # Writes +string+ to standard output (no newline added), but only when the
    # importer was created in verbose mode. Always returns nil.
    def print_to_console(string)
      return unless @verbose

      print string
    end

    # Splits a duration in seconds into whole hours, minutes and seconds.
    #
    # time_difference - Numeric number of seconds (Integer or Float); any
    #                   fractional part is discarded. Expected to be
    #                   non-negative.
    #
    # Returns [hours, minutes, seconds]. Hours are not capped at 24.
    #
    # The split is done on whole seconds with divmod, so minutes are exact and
    # Integer input works (dividing an Integer by 3600 before subtracting the
    # whole hours always yielded 0 minutes).
    def seconds_fraction_to_time(time_difference)
      hours, remainder = time_difference.to_i.divmod(3600)
      mins, seconds    = remainder.divmod(60)
      [hours, mins, seconds]
    end


    # Fills +conditions+ (modified in place) with creator information for the
    # row: the creator's created_at and updated_at timestamps, the group's id
    # and the creator's id. The creator is the User looked up through the
    # user_ids table with the row's "creator_id".
    #
    # Returns the creator's id.
    def set_creator(conditions, group, row)
      author = User.find(user_ids[row["creator_id"]])
      [:created_at, :updated_at].each do |stamp|
        conditions[stamp] = author.public_send(stamp)
      end
      conditions[:group_id] = group.id
      conditions[:creator_id] = author.id
    end

    # Copies every attribute listed by the model class's importable_attributes
    # from +row+ onto +model+ through the matching setter, then saves the model
    # with validations disabled. The save is bracketed by skip_callback(:create)
    # and set_callback(:create) calls on the model's class.
    #
    # Raises whatever save! raises; in that case set_callback is not reached.
    def save_model_with_attributes(model, row)
      klass = model.class

      klass.importable_attributes.each do |name|
        model.send("#{name}=", row[name])
      end

      klass.skip_callback(:create)
      model.save!(:validate => false)
      klass.set_callback(:create)
    end

    # Streams the CSV file configured under +key+ and yields each data row to
    # the caller's block. The first line is treated as the header, so rows can
    # be indexed by column name.
    def csv_for_each(key)
      path = @config[key]
      CSV.foreach(path, :headers => true) { |record| yield record }
    end

    # Number of data rows (every row except the header line) in the CSV file
    # configured under +key+.
    #
    # Rows are counted while streaming, so the file is never held in memory as
    # a whole, and an empty file counts as 0 rows rather than -1.
    def csv_size(key)
      lines = 0
      CSV.foreach(@config[key]) { lines += 1 }
      [lines - 1, 0].max
    end

    # Repairs a known typo in the header of the group CSV: the column
    # "example_lings_property_name" is renamed to "examples_lings_property_name".
    #
    # Only the first line of the file is touched, the column separators are
    # left alone, the typo is also fixed when it is the last column, and the
    # file is rewritten only when the header actually changed.
    def fix_csv_elp_name
      path         = @config[:group]
      string_fixed = "examples_lings_property_name"
      bad_string   = "example_lings_property_name"

      text = File.read(path)
      # Split off the header so data rows are never rewritten.
      header, line_break, rows = text.partition(/\r?\n/)
      fixed_header = header.gsub(/\b#{bad_string}\b/, string_fixed)
      return if fixed_header == header

      File.open(path, "w") { |io| io.write(fixed_header + line_break + rows) }
    end

    # Logger used for the progress messages: a Logger writing to STDOUT when
    # Rails runs in production, the Rails logger otherwise. The choice is made
    # on first use and kept for later calls.
    def logger
      return @logger if @logger

      @logger = Rails.env.production? ? Logger.new(STDOUT) : Rails.logger
    end
  end

end