ArtOfCode-/qpixel

View on GitHub
scripts/import/stack_import.rb

Summary

Maintainability
A
3 hrs
Test Coverage
require 'csv'
require 'net/http'
require 'open-uri'
require 'optparse'
require 'ostruct'
require 'set'

require_relative 'api_import'
require_relative 'dump_import'
require_relative 'database_import'

# Script-wide logger writing to standard output. It starts at :info; the
# --quiet and --verbose options below change the level.
$logger = ::Logger.new(STDOUT).tap { |log| log.level = :info }

# Turns a log payload into the text that ends up in the log line.
#
# * Strings are returned unchanged.
# * Exceptions become "<message> (<class>)" followed by a newline and the
#   backtrace, one frame per line (nothing after the newline when the
#   exception has no backtrace).
# * Anything else is rendered with #inspect.
#
# @param msg [Object] the object handed to the logger
# @return [String]
def msg2str(msg)
  return msg if msg.is_a?(::String)
  return msg.inspect unless msg.is_a?(::Exception)

  frames = msg.backtrace || []
  "#{msg.message} (#{msg.class})\n" + frames.join("\n")
end

# Log line layout:
#   <severity initial>, [<timestamp> #<pid>] <colored, right-aligned severity> -- <progname>: <message>
# The severity name is wrapped in an ANSI color escape and followed by a reset
# sequence; the message is rendered through msg2str.
$logger.formatter = proc do |severity, time, progname, msg|
  palette = {
    'DEBUG' => "\033[0;37m",
    'INFO' => "\033[1;36m",
    'WARN' => "\033[1;33m",
    'ERROR' => "\033[1;31m",
    'FATAL' => "\033[0;31m"
  }
  reset = "\033[0m"
  timestamp = time.strftime('%Y-%m-%d %H:%M:%S')

  format("%s, [%s #%d] %s%5s%s -- %s: %s\n",
         severity[0..0], timestamp, $$, palette[severity], severity, reset, progname, msg2str(msg))
end

# Maps a Stack Exchange API site parameter to that site's domain name.
#
# Sites listed in the lookup table get their own top-level domain appended;
# every other parameter is treated as a subdomain of stackexchange.com.
#
# @param api_param [String, Symbol] API site parameter, e.g. "stackoverflow" or "cooking"
# @return [String] the domain, e.g. "stackoverflow.com" or "cooking.stackexchange.com"
def domain_from_api_param(api_param)
  own_domain_suffixes = {
    stackoverflow: '.com',
    superuser: '.com',
    serverfault: '.com', # Server Fault lives at serverfault.com, not .net
    askubuntu: '.com',
    mathoverflow: '.net',
    stackapps: '.com'
  }
  suffix = own_domain_suffixes.fetch(api_param.to_sym, '.stackexchange.com')
  "#{api_param}#{suffix}"
end

# Command-line options are collected into @options; the --quiet/--verbose
# switches adjust the global logger level directly.
@options = OpenStruct.new
opt_parser = OptionParser.new do |opts|
  opts.banner = "Usage: rails r stack_import.rb [options]"

  opts.on('-s', '--site=SITE', "Stack Exchange site API parameter to operate on") do |site|
    @options.site = site
  end

  opts.on('-k', '--key=KEY', 'Stack Exchange API key') do |key|
    @options.key = key
  end

  opts.on('-q', '--query=REVISION_ID', 'Import posts whose IDs are returned by the SEDE query provided') do |query|
    @options.query = query
  end

  opts.on('-d', '--dump=FILE', 'Specify the path to the decompressed data dump directory') do |path|
    @options.path = path
  end

  opts.on('-i', '--quiet', 'Produce less output') do
    $logger.level = :warn
  end

  opts.on('-v', '--verbose', 'Produce more output') do
    $logger.level = :debug
  end

  opts.on('-c', '--community=ID', Integer, 'Specify the community ID to add imported content to') do |community|
    @options.community = community
  end

  opts.on('-t', '--category=ID', Integer, 'Specify the category ID which imported posts should be added') do |category|
    @options.category = category
  end

  # The argument is mandatory, so no default is applied here; a missing --mode
  # is reported by the validation that follows option parsing.
  opts.on('-m', '--mode=MODE', 'Specify the mode to work in (full, process, import, or tags)') do |mode|
    @options.mode = mode
  end

  opts.on('-a', '--tag-set=ID', 'Specify the tag set into which to add new tags') do |tag_set|
    @options.tag_set = tag_set
  end

  opts.on('--skip-tags', "Skip updating tag associations if you don't care about them for some reason.") do
    @options.skip_tags = true
  end

  opts.on('--zero-scores', 'Set all scores to zero on import.') do
    @options.zero_scores = true
  end

  opts.on('--json', 'Expect JSON in the downloaded query CSV, generated by FOR JSON AUTO') do
    @options.query_is_json = true
  end

  opts.on_tail('-h', '--help', 'Show this message') do
    puts opts
    exit
  end
end

# Report malformed command lines (unknown switch, missing or non-integer
# argument) through the logger instead of an uncaught exception.
begin
  opt_parser.parse!
rescue OptionParser::ParseError => e
  $logger.fatal "#{e.message}. Use --help for a list of parameters."
  exit 1
end

# Validate the parsed options before any work is done.
unless @options.mode.present?
  $logger.fatal "--mode must be provided. Use --help for a list of parameters."
  exit 1
end

# Options that must be supplied for each supported mode.
required_options = {
  full: [:site, :query, :path, :community, :category, :tag_set],
  process: [:site, :query, :path, :community, :category, :tag_set],
  import: [:site, :community, :category, :tag_set],
  tags: [:site, :tag_set]
}

# An unrecognised mode has no entry in the table; report it instead of
# failing with a NoMethodError when iterating over nil.
mode_requirements = required_options[@options.mode.to_sym]
if mode_requirements.nil?
  $logger.fatal "Unknown mode '#{@options.mode}'. Valid modes are: #{required_options.keys.join(', ')}."
  exit 1
end

mode_requirements.each do |r|
  next if @options[r].present?

  $logger.fatal "#{r} must be provided. Use --help for a list of parameters."
  exit 1
end

unless @options.key.present?
  $logger.warn 'No key supplied. Can run without for a limited import, but larger datasets will need a key.'
end

# NOTE(review): this lookup runs in every mode, but 'tags' mode does not
# require --community; with no community given, Community.find presumably
# raises. Confirm whether :community should be required for 'tags' as well.
RequestContext.community = Community.find(@options.community)

# ==================================================================================================================== #

# 'full' and 'process' modes: download the list of post IDs produced by the
# SEDE query, reconcile it with the Posts and Users rows of the data dump
# (rows missing from the dump are requested from the API importer, rows not
# wanted are dropped), and generate the tags file. 'process' mode then
# archives the generated files and exits; 'full' mode carries on below.
if @options.mode == 'full' || @options.mode == 'process'
  Dir.chdir Rails.root
  import_dir = Rails.root.join('import-data')
  Dir.mkdir(import_dir) unless Dir.exist?(import_dir)

  domain = domain_from_api_param(@options.site)

  query_uri = URI("https://data.stackexchange.com/#{@options.site}/csv/#{@options.query}")
  query_response = Net::HTTP.get_response(query_uri)
  # Do not try to parse an error page or redirect body as CSV.
  unless query_response.is_a?(Net::HTTPSuccess)
    $logger.fatal "SEDE query download failed: #{query_response.code} #{query_response.message} (#{query_uri})"
    exit 1
  end

  query_results = CSV.parse(query_response.body)
  if @options.query_is_json
    # Skip the header row and stitch the remaining cells back into one JSON document.
    json_string = query_results.drop(1).flatten.join('')
    data = JSON.parse(json_string)
    required_ids = data.map { |i| i['Id'].to_s }
    $logger.debug "Parsed query input as CSV-wrapped JSON, got #{required_ids.size} IDs"
  else
    # First column of every row after the header row.
    required_ids = query_results.map { |r| r[0].to_s }.drop(1)
  end

  api_importer = APIImport.new @options

  # Sets keep the membership tests below linear rather than quadratic in the
  # number of dump rows.
  required_id_set = Set.new(required_ids)
  posts, posts_file = DumpImport.do_xml_transform(domain, 'Posts', @options) do |rows|
    ids = rows.map { |r| r['id'].to_s }
    dump_id_set = Set.new(ids)
    missing = required_ids.reject { |e| dump_id_set.include?(e) }
    excess = ids.reject { |e| required_id_set.include?(e) }
    $logger.info "#{ids.size} post rows in dump, #{missing.size} to get from API, #{excess.size} excess"

    # Keep only the wanted rows, then append whatever the API returns for the missing IDs.
    rows = rows.select { |r| required_id_set.include?(r['id'].to_s) }
    rows.concat(api_importer.posts(missing) || [])
  end

  # NOTE(review): unlike the post IDs above, these values are not passed
  # through to_s before being compared with the stringified dump IDs —
  # confirm that owner_user_id is always a String here.
  required_user_ids = posts.map { |p| p['owner_user_id'] }.uniq
  required_user_id_set = Set.new(required_user_ids)
  users, users_file = DumpImport.do_xml_transform(domain, 'Users', @options) do |rows|
    ids = rows.map { |r| r['id'].to_s }
    dump_id_set = Set.new(ids)
    missing = required_user_ids.reject { |e| dump_id_set.include?(e) }
    excess = ids.reject { |e| required_user_id_set.include?(e) }
    $logger.info "#{ids.size} user rows in dump, #{missing.size} to get from API, #{excess.size} excess"

    rows = rows.select { |r| required_user_id_set.include?(r['id'].to_s) }
    rows.concat(api_importer.users(missing) || [])
  end

  tags_file = DumpImport.generate_tags(posts, @options)

  if @options.mode == 'process'
    # Archive paths relative to the Rails root (the current directory).
    files = [users_file, posts_file, tags_file].map { |s| s.to_s.gsub("#{Rails.root}/", '') }
    # Argument-list form avoids shell interpolation of the paths; the exit
    # status is checked so a failed archive is not reported as written.
    unless system('tar', '-czf', 'qpixel-import.tar.gz', *files)
      $logger.fatal 'Failed to write qpixel-import.tar.gz.'
      exit 1
    end
    $logger.info 'Written qpixel-import.tar.gz.'
    exit 0
  end
end

# 'import' mode: unpack the archive written by 'process' mode into the Rails root.
if @options.mode == 'import'
  Dir.chdir Rails.root
  # Check tar's exit status so a missing or corrupt archive stops the run
  # instead of being logged as a success.
  unless system('tar', '-xzf', 'qpixel-import.tar.gz')
    $logger.fatal 'Failed to decompress & unarchive qpixel-import.tar.gz.'
    exit 1
  end
  $logger.info 'Decompressed & unarchived qpixel-import.tar.gz.'
  # Now we have all the files in import-data/ and can continue with the same process for either
  # full or import-only modes
end

# 'import' and 'full' modes: hand the formatted XML files to DatabaseImport
# and run it.
if %w[import full].include?(@options.mode)
  @importer = DatabaseImport.new(@options, domain_from_api_param(@options.site))

  # Arguments for each load_data call, in call order: file path, a name, and
  # a list of field names (presumably target table and columns — confirm
  # against DatabaseImport).
  load_plan = [
    ['import-data/Users_Formatted.xml', 'users',
     %w[id created_at username website profile profile_markdown se_acct_id]],
    ['import-data/Posts_Formatted.xml', 'posts',
     %w[id post_type_id created_at score body body_markdown user_id last_activity
        title tags_cache answer_count parent_id att_source att_license_name
        att_license_link category_id community_id]],
    ['import-data/Tags_Formatted.xml', 'tags',
     %w[community_id tag_set_id name created_at updated_at]]
  ]
  load_plan.each do |file, name, fields|
    @importer.load_data(file, name, fields)
  end

  @importer.run
end

# 'tags' mode: build the importer for the selected site and only run its
# tag-association step.
if @options.mode == 'tags'
  site_domain = domain_from_api_param(@options.site)
  @importer = DatabaseImport.new(@options, site_domain)
  @importer.associate_tags
end