thomas-mcdonald/qa

View on GitHub
lib/import/stack_exchange.rb

Summary

Maintainability
B
4 hrs
Test Coverage
require 'import/stack_exchange/comment'
require 'import/stack_exchange/edit'
require 'vote_creator'

module QA
  module Import
    class StackExchange
      # Runs the whole import as a side effect of constructing the object.
      #
      # @param dir [String] directory containing the data dump XML files that
      #   the individual steps read (users.xml, posts.xml, posthistory.xml,
      #   comments.xml, votes.xml)
      def initialize(dir)
        @dir = dir
        # Indexed by Stack Exchange post id; each filled slot is a hash
        # { id:, type: } describing the record created locally for that post.
        @posts = []
        # Raw driver connection, used by the COPY ... FROM STDIN bulk loads.
        @conn = ActiveRecord::Base.connection.raw_connection
        # Raise the log threshold to WARN (it is not restored afterwards).
        Rails.logger.level = Logger::WARN

        output_intro
        # Later steps only import posts whose author is in this id list.
        @user_ids = create_users
        create_posts
        create_comments
        create_votes
        create_reputation
        update_counters

        ActiveRecord::Base.clear_active_connections!
      end

      # Prints the two-line welcome banner to standard output.
      def output_intro
        puts " Welcome to QA.", " Importing from a Stack Exchange Data Dump"
      end

      # Bulk-loads users from "#{@dir}/users.xml" into the users table using
      # COPY ... FROM STDIN WITH CSV on the raw connection.
      #
      # Rows with a negative Id are skipped. The email column is filled with a
      # Faker-generated address rather than anything taken from the dump.
      #
      # @return [Array<Integer>] ids of the users that were sent to the database
      def create_users
        users_doc = Nokogiri::XML::Document.parse(File.read("#{@dir}/users.xml")).css('users row')
        puts " Creating Users"
        bar = progress_bar('Users', users_doc.length)
        user_ids = []
        @conn.exec('COPY users (id, name, email, created_at, updated_at) FROM STDIN WITH CSV')
        begin
          users_doc.each do |u|
            bar.increment
            id = u["Id"].to_i
            next if id < 0
            # CSV escapes a double quote inside a quoted field by doubling it;
            # without this a display name containing " breaks the COPY stream.
            name = u["DisplayName"].to_s.gsub('"', '""')
            time = DateTime.now
            @conn.put_copy_data(%(#{id},"#{name}","#{Faker::Internet.safe_email}", #{time}, #{time}\n))
            user_ids << id
          end
        ensure
          # Always leave COPY mode, even if a row raised.
          @conn.put_copy_end
        end
        user_ids
      end

      # Imports questions and answers from "#{@dir}/posts.xml", using the
      # revisions in "#{@dir}/posthistory.xml" as the source of each post's
      # content, then records every question's accepted answer.
      #
      # Rows with PostTypeId "1" are treated as questions and "2" as answers;
      # any other post type is ignored.
      #
      # @return [nil]
      def create_posts
        posts = Nokogiri::XML::Document.parse(File.read("#{@dir}/posts.xml")).css('posts row')
        post_histories = Nokogiri::XML::Document.parse(File.read("#{@dir}/posthistory.xml")).css('posthistory row')
        questions = posts.select { |p| p["PostTypeId"] == "1" }
        answers = posts.select { |p| p["PostTypeId"] == "2" }

        grouped_edits = build_edits(post_histories)
        # Drop the reference to the full node set; only the question/answer
        # subsets are used from here on.
        posts = nil

        # create posts
        create_questions(questions, grouped_edits)
        create_answers(answers, grouped_edits)

        puts " Updating accepted answer IDs"
        bar = progress_bar('Accepting', questions.length)
        questions.each do |q|
          bar.increment
          info = @posts[q['Id'].to_i]
          # A missing AcceptedAnswerId becomes index 0 via nil.to_i.
          answer_info = @posts[q['AcceptedAnswerId'].to_i]
          # Skip unless both the question and its accepted answer were
          # actually imported; either mapping may be absent.
          next unless info && answer_info
          Question.update(info[:id], accepted_answer_id: answer_info[:id])
        end
        nil
      end

      # Imports comments from "#{@dir}/comments.xml".
      #
      # Each row is wrapped in a StackExchange::Comment; comments whose post
      # has no entry in @posts are skipped. The result of `save` is not
      # checked.
      def create_comments
        xml = File.read("#{@dir}/comments.xml")
        rows = Nokogiri::XML::Document.parse(xml).css('comments row')
        puts " Creating Comments"
        progress = progress_bar('Comments', rows.length)
        rows.each do |row|
          progress.increment
          imported = StackExchange::Comment.new(row)
          target = @posts[imported.post_id]
          imported.build_object(target).save if target
        end
      end

      # Bulk-loads votes from "#{@dir}/votes.xml" into the votes table using
      # COPY ... FROM STDIN WITH CSV on the raw connection.
      #
      # Only VoteTypeId 2 (stored as 'upvote') and 3 (stored as 'downvote') are
      # imported, and only for posts that have an entry in @posts. Each vote is
      # attributed to a user picked at random from the existing users
      # (presumably because the dump does not say who voted — confirm).
      #
      # @return [Object] whatever the connection's put_copy_end returns
      # @raise [ArgumentError] if a row's CreationDate cannot be parsed
      def create_votes
        voterow = Nokogiri::XML::Document.parse(File.read("#{@dir}/votes.xml")).css('votes row')
        puts " Creating votes"

        user_ids = User.pluck(:id)
        bar = progress_bar('Votes', voterow.length)
        @conn.exec('COPY votes (user_id, post_id, post_type, vote_type, created_at, updated_at) FROM STDIN WITH CSV')

        begin
          voterow.each do |row|
            bar.increment
            se_vote_type = row['VoteTypeId'].to_i
            next unless [2, 3].include?(se_vote_type)

            # @posts only ever holds nil or a { id:, type: } hash, so a plain
            # truthiness check is enough here.
            post = @posts[row['PostId'].to_i]
            next unless post

            vote_type = Vote.vote_types[se_vote_type == 2 ? 'upvote' : 'downvote']
            user_id = user_ids.sample
            # created_at and updated_at both take the dump's creation date.
            timestamp = DateTime.parse(row['CreationDate'])
            @conn.put_copy_data(
              %(#{user_id},#{post[:id]},"#{post[:type]}",#{vote_type},#{timestamp},#{timestamp}\n)
            )
          end
        ensure
          # Always leave COPY mode, even if a row raised; otherwise the
          # connection is unusable for the steps that follow.
          copy_ended = @conn.put_copy_end
        end
        copy_ended
      end

      # Derives reputation events from every stored vote, bulk-loads them into
      # reputation_events via COPY ... FROM STDIN WITH CSV, and then asks every
      # user to recalculate their reputation.
      #
      # Each vote yields two events: "give_<type>" for the voter and
      # "receive_<type>" for the author of the voted post. Events whose type
      # is not a key of ReputationEvent.event_types are printed and skipped.
      #
      # NOTE(review): the events are built with `new` and never saved here, so
      # created_at/updated_at are probably blank in the CSV rows — confirm the
      # table accepts that.
      def create_reputation
        puts "Building reputation events"
        re = []
        bar = progress_bar('Reputation', Vote.count)
        Vote.all.each do |v|
          re << ReputationEvent.new(
            action: v,
            event_type: %(give_#{v.event_type}),
            user: v.user
          )
          re << ReputationEvent.new(
            action: v,
            event_type: %(receive_#{v.event_type}),
            user: v.post.user
          )
          bar.increment
        end

        puts "Inserting reputation events"
        @conn.exec('COPY reputation_events (user_id, event_type, action_type, action_id, created_at, updated_at) FROM STDIN WITH CSV')
        bar = progress_bar('RepEvents', re.count)
        begin
          re.each do |r|
            # Advance first so skipped events still count towards the total.
            bar.increment
            event_id = ReputationEvent.event_types[r.event_type]
            if event_id.nil?
              # Written as two statements on purpose: `puts r && next` parses
              # as `puts(r && next)` and skips without ever printing.
              puts r.inspect
              next
            end
            @conn.put_copy_data(
              %(#{r.user_id},#{event_id},"#{r.action_type}",#{r.action_id},#{r.created_at},#{r.updated_at}\n)
            )
          end
        ensure
          # Always leave COPY mode, even if a row raised.
          @conn.put_copy_end
        end

        puts "Calculating reputation"
        bar = progress_bar('Recounting', User.count)
        User.all.each do |u|
          bar.increment
          u.calculate_reputation!
        end
      end

      # Refreshes the stats for every question and answer and hands badge
      # checks to Jobs::Badge.perform_async.
      #
      # The stats jobs are run by calling `perform` directly on a new job
      # instance. Questions get the :question_vote and :question_view badge
      # jobs; answers get :answer_vote.
      def update_counters
        print " Updating cache counters & creating background jobs for badges"
        Question.all.each do |question|
          Jobs::QuestionStats.new.perform(question.id)
          %i[question_vote question_view].each do |badge|
            Jobs::Badge.perform_async(badge, question.to_global_id)
          end
        end
        Answer.all.each do |answer|
          Jobs::AnswerStats.new.perform(answer.id)
          Jobs::Badge.perform_async(:answer_vote, answer.to_global_id)
        end
        puts " - done!"
      end

      private

      # Collapses post-history rows into StackExchange::Edit objects, grouped
      # by the post they belong to.
      #
      # Rows sharing a RevisionGUID are combined into a single Edit; the edits
      # are then bucketed under the Edit's post_id.
      #
      # @param post_histories [Enumerable] post-history rows, each readable
      #   with row['RevisionGUID']
      # @return [Hash] post_id => Array of StackExchange::Edit. Looking up an
      #   unknown key stores and returns a new empty array.
      def build_edits(post_histories)
        puts " Sorting histories by GUID"
        sort_bar = progress_bar('Sorting', post_histories.length)
        empty_groups = Hash.new { |groups, guid| groups[guid] = [] }
        rows_by_guid = post_histories.each_with_object(empty_groups) do |row, groups|
          sort_bar.increment
          groups[row['RevisionGUID']] << row
        end

        puts " Grouping GUIDs into single edits"
        group_bar = progress_bar('Grouping', rows_by_guid.length)
        edits_by_post = Hash.new { |buckets, post_id| buckets[post_id] = [] }
        rows_by_guid.each_value do |revision_rows|
          edit = StackExchange::Edit.new(revision_rows)
          group_bar.increment
          edits_by_post[edit.post_id] << edit
        end
        edits_by_post
      end

      # Builds a progress bar with the layout shared by all import steps.
      #
      # @param title [String] label shown in front of the bar
      # @param length [Integer] total number of increments expected
      # @return [Object] whatever ProgressBar.create returns
      def progress_bar(title, length)
        settings = { title: title, total: length, format: '%t: |%B| %E' }
        ProgressBar.create(**settings)
      end

      # Creates a Question for every question row and records the mapping from
      # Stack Exchange id to local id in @posts.
      #
      # The question is built from its creation revision, i.e. the edit whose
      # `new_record` is truthy. That edit is removed from the post's list in
      # grouped_edits. Questions are skipped when they have no creation
      # revision, when their author is not in @user_ids, or when the created
      # record was not saved.
      #
      # @param questions [Enumerable] question rows, read via 'Id' and 'ViewCount'
      # @param grouped_edits [Hash] post id => Array of edits, as returned by
      #   build_edits
      def create_questions(questions, grouped_edits)
        puts " Creating and inserting questions"
        bar = progress_bar('Questions', questions.length)
        questions.each do |q|
          bar.increment
          edits = grouped_edits[q['Id']]
          originator = edits.find(&:new_record)
          # No creation revision in the history: nothing to build the post
          # from, and originator[...] below would raise on nil.
          next if originator.nil?
          edits.delete originator

          # we can't handle anonymous users right now
          next unless @user_ids.include? originator[:user_id].to_i

          qc = QuestionCreator.new(User.find(originator[:user_id]), originator.simple_hash)
          qu = qc.create
          # Only record posts that were actually saved.
          next if qu.new_record?

          @posts[q['Id'].to_i] = { id: qu.id, type: 'Question' }

          # add dummy view count data: ViewCount distinct values, so the
          # stored set's cardinality approximates the original view count
          redis_values = (1..q['ViewCount'].to_i).to_a
          $view.pfadd("question-#{qu.id}", redis_values)
        end
      end

      # Creates an Answer for every answer row whose parent question was
      # imported, and records the mapping from Stack Exchange id to local id
      # in @posts.
      #
      # The answer is built from its creation revision, i.e. the edit whose
      # `new_record` is truthy. That edit is removed from the post's list in
      # grouped_edits. Answers are skipped when the parent is not in @posts,
      # when they have no creation revision, when their author is not in
      # @user_ids, when the parent question can no longer be found, or when
      # the created record was not saved.
      #
      # @param answers [Enumerable] answer rows, read via 'Id' and 'ParentId'
      # @param grouped_edits [Hash] post id => Array of edits, as returned by
      #   build_edits
      def create_answers(answers, grouped_edits)
        puts " Creating and inserting answers"
        bar = progress_bar('Answers', answers.length)
        answers.each do |a|
          bar.increment
          # @posts only ever holds nil or a { id:, type: } hash, so a plain
          # truthiness check is enough here.
          parent = @posts[a['ParentId'].to_i]
          next unless parent

          edits = grouped_edits[a['Id']]
          originator = edits.find(&:new_record)
          # No creation revision in the history: nothing to build the post
          # from, and originator[...] below would raise on nil.
          next if originator.nil?
          edits.delete originator
          next unless @user_ids.include? originator[:user_id].to_i

          question = Question.find_by(id: parent[:id])
          next if question.nil?
          ac = AnswerCreator.new(question, User.find(originator[:user_id]), originator.simple_hash)
          an = ac.create

          @posts[a['Id'].to_i] = { id: an.id, type: 'Answer' } unless an.new_record?
        end
      end
    end
  end
end