# linguisticexplorer/Linguistic-Explorer
#
# View on GitHub
# lib/sswl_data/converter.rb
#
# Summary
#
# Maintainability
# D
# 2 days
# Test Coverage
# SswlData::Converter
#
#
require 'csv'
require 'securerandom'

module SswlData
  class Converter

    class << self
      # Builds a converter for the given configuration.
      #
      # @param config [Hash] forwarded unchanged to +new+
      # @return [Converter] the freshly created instance
      def load(config)
        new(config)
      end
    end

    attr_accessor :config

    # Defines one reader method per given name. Each reader returns the Hash
    # stored in the instance variable of the same name, creating and storing
    # an empty Hash the first time it is called.
    #
    # @param caches [Array<Symbol, String>] names of the readers to define
    # @return [Array] the names that were passed in
    def self.lazy_init_cache(*caches)
      caches.each do |cache|
        ivar = "@#{cache}"
        define_method(cache.to_s) do
          instance_variable_get(ivar) || instance_variable_set(ivar, {})
        end
      end
    end

    ##puts "Loading lazy_cache"
    lazy_init_cache :user_ids, :ling_ids, :property_ids, :example_ids, :lings_property_ids,
                    :examples_lings_property_ids, :stored_value_ids, :member_ids

    # Builds a converter from a configuration hash.
    #
    # @param config [Hash] maps each model name to the path of its source CSV;
    #   its keys are symbolized in place, so the caller's hash is modified
    def initialize(config)
      @sanitized = {}
      @config = config.tap(&:symbolize_keys!)
      @headers = load_headers
    end

    # Runs the whole SSWL -> Terraling conversion.
    #
    # Each stage reads one source CSV through +csv_for_each+, collects the
    # converted rows in an in-memory table and writes that table with
    # +write_csv+. The group and category files hold a single fixed row and are
    # written directly. Finally a YAML file listing every generated CSV is
    # written and the elapsed time is printed. Progress is reported on
    # standard output.
    #
    # @return [nil]
    def convert!
      reset = "\r\e[0K"
      start = Time.now

      print "converting users..."

      # SSWL
      #
      # ===> Users.csv <============
      # id, first_name, last_name, username, hashed_password, affiliation, user_type, email, website, role, language, salt
      #
      # Terraling
      #
      # ==> User.csv <==
      # id,name,email,access_level,password
      user_ids = {}
      csv_for_each :user do |row|
        Converter.convert_user_in(row, user_ids)
      end

      write_csv :user, user_ids

      print "#{reset}converting users...[OK]"

      print "\nconverting groups..."

      # Terraling
      #
      # ==> Group.csv <==
      # id, name, privacy, depth_maximum, ling0_name, ling1_name, property_name, category_name, lings_property_name, example_name, examples_lings_property_name, example_fields

      # A single, fixed group receives all the migrated data.
      CSV.open(new_path_for_csv(:group), "wb") do |csv|
        csv << @headers[:group]
        csv << ["0", "SSWL_Data", "public", "0", "Language", "not-present", "Property",
                "Category", "Value", "Example", "Example Value", "gloss, words, translation, comment"]
      end

      print "#{reset}converting groups...[OK]"

      print "\nconverting memberships..."

      # SSWL
      #
      # ==> Users.csv <==
      # id, first_name, last_name, username, hashed_password, affiliation, user_type, email, website, role, language, salt
      #
      # Terraling
      #
      # ==> Membership.csv <==
      # id,member_id,group_id,level,creator_id
      member_ids = {}
      csv_for_each :user do |row|
        # cache member id
        Converter.convert_membership_in(row, member_ids)
      end

      write_csv :membership, member_ids

      print "#{reset}converting memberships...[OK]"

      print "\nconverting lings..."

      # SSWL
      #
      # ===> Languages.csv <=====
      # id, value, property, language
      #
      # Terraling
      #
      # ==> Ling.csv <==
      # id,name,parent_id,depth,group_id,creator_id
      ling_ids = {}
      csv_for_each :ling do |row|
        # cache ling id
        Converter.convert_ling_in(row, ling_ids)
      end

      write_csv :ling, ling_ids

      print "#{reset}converting lings...[OK]"

      print "\nconverting categories..."

      # Terraling
      #
      # ==> Category.csv <==
      # id,name,depth,group_id,creator_id,description

      # A single, fixed category; the values follow the order of @headers[:category].
      CSV.open(new_path_for_csv(:category), "wb") do |csv|
        csv << @headers[:category]
        csv << ["0", "Category 0", "0", "0", "Category created from SSWL Migration", nil]
      end

      print "#{reset}converting categories...[OK]"

      print "\nconverting examples..."

      # SSWL
      #
      # ===> ExampleObjects.csv <===
      # id, language, sentence_number
      #
      # Terraling
      # ==> Example.csv <==
      # id,name,ling_id,group_id,creator_id
      #
      counter = 0
      example_ids = {}
      csv_for_each :example do |row|
        # Examples whose language was not converted above are dropped.
        next if ling_ids[Converter.decode(row["language"]).to_s].nil?

        # cache example id
        counter = Converter.convert_example_in(row, example_ids, ling_ids, counter)
      end

      write_csv :example, example_ids

      print "#{reset}converting examples...[OK]"

      print "\nconverting properties..."

      # SSWL
      # ===> Properties.csv <====
      # id, property, description
      #
      # Terraling
      # ==> Property.csv <==
      # id,name,description,category_id,group_id,creator_id
      max_id = 0
      property_ids = {}
      csv_for_each :property do |row|
        max_id = Converter.convert_property_in(row, property_ids, max_id)
      end

      print "#{reset}converting properties...[OK]"

      print "\nconverting lings_property..."

      # SSWL
      #
      # ===> Languages.csv <=====
      # id, value, property, language
      #
      # Terraling
      #
      # ==> LingsProperty.csv <==
      # id,ling_id,property_id,value,group_id,creator_id
      #
      # Share the table with the instance-level cache reader (emptied first):
      # show_error/search_alternatives look entries up through that reader, so
      # a purely local hash would leave them searching an empty cache.
      lings_property_ids = self.lings_property_ids.clear
      csv_for_each :lings_property do |row|
        # Properties that only appear in this file get a fresh id here.
        max_id = Converter.update_property_in(row, property_ids, max_id)

        Converter.convert_ling_prop_in(row, lings_property_ids, ling_ids, property_ids)
      end

      # The property table is written only now because the loop above may
      # have added properties to it.
      write_csv :property, property_ids

      write_csv :lings_property, lings_property_ids

      print "#{reset}converting lings_property...[OK]"

      # SSWL
      #
      # ===> Examples.csv <==========
      # id, language, value, sentencenumber, property, example_object_id
      #
      # Terraling
      #
      # ==> ExampleLingsProperty.csv <===
      # id,example_id,lings_property_id,group_id,creator_id
      #
      print "\nconverting examples_lings_property..."

      cache_properties = {}

      # First pass: rows whose "property" is a known property identify the
      # language/property/value an example object refers to.
      csv_for_each :examples_lings_property do |row|
        next if property_ids[row["property"]].nil?

        cache_properties[row["example_object_id"]] ||= {
            "lang" => Converter.decode(row["language"]).to_s,
            "value" => row["value"],
            "name" => row["property"]
        }
      end

      # Second pass: the remaining rows are linked to the lings_property
      # found for their example object in the first pass.
      csv_for_each :examples_lings_property do |row|
        next unless property_ids[row["property"]].nil?

        # Retrieve from cache to build reference
        property = cache_properties[row["example_object_id"]]
        next if property.nil?

        lings_prop_entry = lings_property_ids["#{property["lang"]}:#{property["name"]}:#{property["value"]}"]

        if lings_prop_entry.nil?
          show_error property
          next
        end

        lings_prop_id = lings_prop_entry["id"]

        # cache examples_lings_property id
        examples_lings_property_ids[row["example_object_id"]] ||= {
            "id" => "#{row["id"]}",
            "value" => "#{row["value"]}",
            "group_id" => "0",
            "example_id" => "#{row["example_object_id"]}",
            "lings_property_id" => "#{lings_prop_id}",
            "ling_id" => "#{ling_ids[Converter.decode(row["language"])]["id"]}"
        }
      end

      write_csv :examples_lings_property, examples_lings_property_ids

      print "#{reset}converting examples_lings_property...[OK]"

      # SSWL
      #
      # ===> Examples.csv <==========
      # id, language, value, sentencenumber, property, example_object_id
      #
      # Terraling
      #
      # ===> StoredValue.csv <=====
      # id, storable_id, storable_type, key, value, group_id
      #
      print "\nconverting stored_values..."
      stored_value_ids = {}
      csv_for_each :stored_value do |row|
        # Only rows whose "property" is not a known property become stored values.
        next unless property_ids[row["property"]].nil?

        Converter.convert_stored_value_in(row, stored_value_ids)
      end

      write_csv :stored_value, stored_value_ids

      print "#{reset}converting stored_values...[OK]"

      print "\nCreating YAML configuration file for importing..."

      # One entry per generated CSV, keyed by model name.
      import_paths = {}
      @headers.keys.each do |model|
        import_paths[model.to_s] = new_path_for_csv(model)
      end
      File.open(get_yaml_path, "wb") { |f| f.write import_paths.to_yaml }

      print "#{reset}Creating YAML configuration file for importing...[OK]\n"

      elapsed = seconds_fraction_to_time(Time.now - start)
      puts "Time for converting: #{elapsed[0]} : #{elapsed[1]} : #{elapsed[2]}"
    end

    # Registers the stored value described by +row+ unless an entry for its id
    # is already present.
    #
    # @param row [#[]] source record; "id", "property", "value" and
    #   "example_object_id" are read
    # @param stored_value_ids [Hash] table keyed by row id, updated in place
    # @return [Hash] the table entry for the row id
    def self.convert_stored_value_in(row, stored_value_ids)
      record_id = row["id"]
      return stored_value_ids[record_id] if stored_value_ids[record_id]

      property = row["property"]
      stored_value_ids[record_id] = {
          "id" => record_id.to_s,
          "key" => property.to_s,
          "value" => "#{property}:#{row["value"]}",
          "group_id" => "0",
          "storable_type" => "Example",
          "storable_id" => row["example_object_id"].to_s
      }
    end

    # Registers the language/property/value combination described by +row+
    # unless that combination is already present.
    #
    # @param row [#[]] source record; "id", "language", "property" and
    #   "value" are read
    # @param lings_property_ids [Hash] table keyed by "language:name:value",
    #   updated in place
    # @param ling_ids [Hash] converted languages keyed by decoded name
    # @param property_ids [Hash] converted properties keyed by name; must
    #   already contain the row's property
    # @return [Hash] the table entry for the combination
    def self.convert_ling_prop_in(row, lings_property_ids, ling_ids, property_ids)
      language = decode(row["language"])
      property = property_ids[row["property"]]
      value = row["value"]
      cache_key = "#{language}:#{property["name"]}:#{value}"

      # cache lings_property id
      lings_property_ids[cache_key] ||= {
          "id" => row["id"].to_s,
          "value" => value.to_s,
          "group_id" => "0",
          "category_id" => "0",
          "property_id" => property["id"].to_s,
          "ling_id" => ling_ids[language]["id"].to_s
      }
    end

    # Makes sure the property named in +row+ exists in +property_ids+,
    # allocating the next id for it when it is missing.
    #
    # @param row [#[]] source record; "property" is read
    # @param property_ids [Hash] table keyed by property name, updated in place
    # @param max_id [Integer] highest property id handed out so far
    # @return [Integer] the highest id after this call (+max_id+ + 1 when a
    #   property was added, otherwise +max_id+)
    def self.update_property_in(row, property_ids, max_id)
      name = row["property"]
      next_id = if property_ids[name].nil?
                  max_id + 1
                else
                  max_id
                end

      # Some properties are split across several files
      property_ids[name] ||= {
          "id" => next_id.to_s,
          "name" => name.to_s,
          "group_id" => "0",
          "category_id" => "0"
      }
      next_id
    end

    # Registers the property described by +row+ unless its name is already
    # present, and tracks the highest property id seen.
    #
    # @param row [#[]] source record; "id", "property" and "description" are read
    # @param property_ids [Hash] table keyed by property name, updated in place
    # @param max_id [Integer] highest property id seen so far
    # @return [Integer] the larger of +max_id+ and the row's id
    def self.convert_property_in(row, property_ids, max_id)
      name = row["property"]

      # cache property id; the description is stored wrapped in double quotes
      property_ids[name] ||= {
          "id" => row["id"].to_s,
          "name" => name.to_s,
          "group_id" => "0",
          "category_id" => "0",
          "description" => "\"#{row["description"]}\""
      }

      row_id = row["id"].to_i
      max_id > row_id ? max_id : row_id
    end

    # Registers the example described by +row+ unless its id is already
    # present, naming it after the running counter.
    #
    # @param row [#[]] source record; "id" and "language" are read
    # @param example_ids [Hash] table keyed by row id, updated in place
    # @param ling_ids [Hash] converted languages keyed by decoded name; must
    #   contain the row's language when a new entry is created
    # @param counter [Integer] number used in the generated example name
    # @return [Integer] +counter+ + 1, whether or not an entry was added
    def self.convert_example_in(row, example_ids, ling_ids, counter)
      example_id = row["id"]

      unless example_ids[example_id]
        language = decode(row["language"]).to_s
        example_ids[example_id] = {
            "id" => example_id.to_s,
            "name" => "Example_#{counter}",
            "group_id" => "0",
            "ling_id" => ling_ids[language]["id"].to_s
        }
      end

      counter + 1
    end

    # Registers the language named in +row+ unless it is already present.
    #
    # @param row [#[]] source record; "id" and "language" are read
    # @param ling_ids [Hash] table keyed by decoded language name, updated in place
    # @return [Hash] the table entry for the language
    def self.convert_ling_in(row, ling_ids)
      language = decode(row["language"])

      ling_ids[language] ||= {
          "id" => row["id"].to_s,
          "name" => language.to_s,
          "group_id" => "0",
          "depth" => "0"
      }
    end

    # Registers a membership of the migrated group for the user described by
    # +row+ unless one is already present for that user id.
    #
    # TODO: improve converting handling Language Experts and Property Author
    #
    # @param row [#[]] source user record; "id" and "user_type" are read
    # @param member_ids [Hash] table keyed by user id, updated in place
    # @return [Hash] the table entry for the user id
    def self.convert_membership_in(row, member_ids)
      user_id = row["id"]
      level = row["user_type"] == "admin" ? "admin" : "member"

      member_ids[user_id] ||= {
          "id" => user_id.to_s,
          "member_id" => user_id.to_s,
          "group_id" => "0",
          "level" => level
      }
    end

    # Converts the SSWL user described by +row+ and stores it under its id,
    # replacing any entry already stored for that id.
    #
    # A random 9-letter password is generated for every user. When the row has
    # no usable email, a placeholder is built from the whitespace-stripped
    # first and last name.
    #
    # @param row [#[]] source user record; "id", "first_name", "last_name",
    #   "email" and "user_type" are read
    # @param user_ids [Hash] table keyed by user id, updated in place
    # @return [Hash] the entry stored for the user
    def self.convert_user_in(row, user_ids)
      # The password protects a real account, so draw it from the secure
      # generator rather than from Kernel#rand.
      letters = ("a".."z").to_a + ("A".."Z").to_a
      password = Array.new(9) { letters[SecureRandom.random_number(letters.size)] }.join

      first_name = row["first_name"].to_s.gsub(/\s/, "")
      last_name = row["last_name"].to_s.gsub(/\s/, "")

      # A missing or whitespace-only email counts as absent.
      email = row["email"]
      email = "#{first_name}@#{last_name}.com" if email.nil? || email.to_s =~ /\A[[:space:]]*\z/

      # cache user id
      user_ids[row["id"]] = {
          "id" => row["id"].to_s,
          "name" => "#{row["first_name"]} #{row["last_name"]}",
          "email" => email.to_s,
          "access_level" => row["user_type"] == "admin" ? "admin" : "user",
          "password" => password
      }
    end

    private

    # Rewrites the source CSV configured for +key+ in place so that the parser
    # accepts it. Each file is processed at most once per converter instance.
    #
    # Substitutions, applied in this order:
    # * every double quote becomes a backslash followed by a single quote
    # * a backslash followed by a semicolon becomes "."
    # * the marker END becomes a newline
    #
    # @param key [Symbol] configuration key of the file to sanitize
    # @return [void]
    def sanitize_csv(key)
      return if @sanitized[key]

      @sanitized[key] = true
      path = @config[key]

      text = File.read(path)
      # Give the text its final newline before substituting, so a file that
      # ends in END without one still comes out ending in two newlines.
      text += "\n" unless text.end_with?("\n")

      replacements = [
          [/"/, "\\\\'"],
          [/\\;/, "."],
          [/END/, "\n"]
      ]
      cleaned = replacements.inject(text) { |current, (bad, fixed)| current.gsub(bad, fixed) }

      # Read once, write once, instead of one round trip per substitution.
      File.open(path, "w") { |out| out.puts cleaned }
    end

    # Re-encodes +string+ as Windows-1252 bytes, reads those bytes back as
    # ISO-8859-1 and returns the result as UTF-8.
    #
    # Uses String#encode instead of Iconv, which is no longer part of the
    # standard library.
    #
    # @param string [String, nil] text to convert
    # @return [String, nil] the converted text, or nil when given nil
    # @raise [Encoding::UndefinedConversionError] when +string+ contains a
    #   character that has no Windows-1252 representation
    def self.decode(string)
      return string if string.nil?

      string.encode("cp1252").force_encoding("ISO-8859-1").encode("UTF-8")
    end

    # Sanitizes the CSV configured for +key+, then yields each of its rows.
    # The file is parsed with a header row and "###" as column separator.
    #
    # A malformed file does not raise: the parser message is printed in red,
    # followed by the last row that was read successfully (if any).
    #
    # @param key [Symbol] configuration key of the file to read
    # @yieldparam row [CSV::Row] one parsed record
    def csv_for_each(key)
      sanitize_csv(key)
      last_row = ""
      CSV.foreach(@config[key], headers: true, col_sep: "###") do |row|
        yield row
        last_row = row.to_s
      end
    rescue CSV::MalformedCSVError => error
      print "\n#{red error.message}"
      print "\nCheck the entry next to this one of #{@config[key]}:\n #{red last_row}" unless last_row.empty?
    end

    # Writes a table to the output CSV for +key+: first the header row from
    # @headers, then one line per entry holding the entry's values in header
    # order (missing attributes become empty fields).
    #
    # @param key [Symbol] model name selecting both the headers and the output path
    # @param data_ids [Hash] table of entries; only the values are written
    def write_csv(key, data_ids)
      CSV.open(new_path_for_csv(key), "wb") do |csv|
        columns = @headers[key]
        csv << columns
        data_ids.each do |_id, record|
          csv << columns.map { |column| record[column] }
        end
      end
    end

    # Returns the path of the output CSV for +key+ and makes sure its
    # directory exists.
    #
    # The file is named after the camelized key and placed in a "terraling"
    # directory next to the source CSV configured for +key+. Keys without a
    # configured source use the directory of the :user source instead.
    #
    # @param key [Symbol] model name
    # @return [String] path of the output file
    # @raise [ArgumentError] when neither +key+ nor :user has a configured path
    def new_path_for_csv(key)
      source = @config[key] || @config[:user]
      # Without this guard a missing :user entry would recurse without end.
      if source.nil?
        raise ArgumentError, "no CSV path configured for #{key.inspect} and no :user path to fall back on"
      end

      target_dir = File.join(File.dirname(source), "terraling")
      FileUtils.mkdir_p(target_dir)
      File.join(target_dir, "#{key.to_s.camelize}.csv")
    end

    # Returns the path of the YAML import file: the user output CSV path with
    # its "<name>.csv" part replaced by "import.yml".
    #
    # @return [String]
    def get_yaml_path
      user_csv = new_path_for_csv(:user)
      user_csv.gsub(/\w*\.csv/, "import.yml")
    end

    # Wraps +string+ in the ANSI escape sequences for red text.
    #
    # @param string [#to_s] text to colour
    # @return [String]
    def red(string)
      "\e[31m" + string.to_s + "\e[0m"
    end

    # Prints a report for a language/property/value reference that could not
    # be resolved, followed by the alternatives returned by
    # +search_alternatives+ (or a notice that there are none).
    #
    # @param ling_property [Hash] the unresolved reference; "lang", "name"
    #   and "value" are read
    # @return [nil]
    def show_error(ling_property)
      reference = "#{ling_property["lang"]}:#{ling_property["name"]}:#{ling_property["value"]}"

      puts "\n#{red "ERROR: Cannot find reference for"}"
      puts "\t#{reference}\n"

      alternatives = search_alternatives(ling_property)
      unless alternatives.any?
        puts red("I haven't found any entry in languages file!!!").to_s
      else
        puts red("What I found in your languages file is:").to_s
        alternatives.each { |candidate| puts candidate.to_s }
      end

      puts "\n"
    end

    # Lists the keys present in +lings_property_ids+ for the same language and
    # property as +ling_property+, trying each of the values "Yes", "No", "NA"
    # and "Not Yet Set".
    #
    # @param ling_property [Hash] reference to look up; "lang" and "name" are read
    # @return [Array<String>] the "lang:name:value" keys that exist, in the
    #   order the values are tried
    def search_alternatives(ling_property)
      prefix = "#{ling_property["lang"]}:#{ling_property["name"]}"
      candidates = ["Yes", "No", "NA", "Not Yet Set"].map { |value| "#{prefix}:#{value}" }
      candidates.reject { |index| lings_property_ids[index].nil? }
    end

    # Rewrites the source file configured for +key+ in place, replacing every
    # carriage return + line feed + double quote sequence with "#".
    #
    # @param key [Symbol] configuration key of the file to rewrite
    def sanitize_text_in_fields(key)
      path = @config[key]
      cleaned = File.read(path).gsub(/\r\n"/, "#")
      File.open(path, "w") { |out| out.puts cleaned }
    end

    # Splits a duration in seconds into whole hours, minutes and seconds.
    # Fractions of a second are dropped.
    #
    # Works on the truncated number of seconds with integer arithmetic, so an
    # Integer argument gives the same result as the equivalent Float.
    #
    # @param time_difference [Numeric] duration in seconds
    # @return [Array<Integer>] [hours, minutes, seconds]
    def seconds_fraction_to_time(time_difference)
      hours, remainder = time_difference.to_i.divmod(3600)
      mins, seconds = remainder.divmod(60)
      [hours, mins, seconds]
    end

    # Returns the column names of every generated CSV, keyed by model name.
    # The order of each list is the column order used when writing.
    #
    # @return [Hash{Symbol => Array<String>}]
    def load_headers
      {
        user: %w[name id email access_level password],
        group: %w[id name privacy depth_maximum ling0_name ling1_name property_name category_name
                  lings_property_name example_name examples_lings_property_name example_fields],
        membership: %w[id member_id group_id level creator_id],
        ling: %w[id name parent_id depth group_id creator_id],
        category: %w[id name depth group_id description creator_id],
        property: %w[id name description category_id group_id creator_id],
        example: %w[id ling_id group_id name creator_id],
        lings_property: %w[id ling_id property_id value group_id creator_id],
        examples_lings_property: %w[id example_id lings_property_id group_id creator_id],
        stored_value: %w[id storable_id storable_type key value group_id]
      }
    end

  end

end