holderdeord/hdo-transcript-search

View on GitHub
indexer/bin/hdo-transcript-indexer

Summary

Maintainability
Test Coverage
#!/usr/bin/env ruby

require 'optparse'

# Elasticsearch endpoint: taken from the ELASTICSEARCH_URL environment variable,
# falling back to a local instance when the variable is not set.
# (`ENV[...] or '...'` would not work here: `or` binds more loosely than `=`,
# so the variable would be assigned nil and the fallback thrown away.)
elasticsearch_url = ENV.fetch('ELASTICSEARCH_URL', 'http://localhost:9200')

# Default settings; the command-line flags parsed below overwrite these in place.
options = {
  elasticsearch_url: elasticsearch_url,
  # NOTE(review): with a single argument, File.expand_path resolves against the
  # current working directory, not this script's location - confirm intended.
  data_dir: File.expand_path('../data'),
  sessions: ['all'],
  index_name: 'hdo-transcripts',
  create_index: false,
  force: false,
  ner: false,
  mail: false,
  lix: false
}

# Command-line interface. Each handler overwrites the matching entry in
# `options`; the help text shows the default that is in effect before parsing.
parser = OptionParser.new do |cli|
  cli.on('-e', '--elasticsearch-url URL', "URL to elasticsearch server, default: #{options[:elasticsearch_url]}") do |value|
    options[:elasticsearch_url] = value
  end

  cli.on('-i', '--index INDEX_NAME', "Index to use, default: #{options[:index_name]}") do |value|
    options[:index_name] = value
  end

  cli.on('-c', '--create-index', "Delete and re-create the index") do
    options[:create_index] = true
  end

  cli.on('-d', '--data-dir PATH', "Data dir to cache downloads, default: #{options[:data_dir]}") do |value|
    options[:data_dir] = value
  end

  cli.on('-f', '--force', "Force re-download of transcripts") do
    options[:force] = true
  end

  # The argument is a single comma-separated string; it is stored as an array.
  cli.on('-s', '--sessions SESSIONS', "Comma-separated list of sessions, default: #{options[:sessions].join(',')}") do |value|
    options[:sessions] = value.split(',')
  end

  cli.on('-n', '--ner', 'Enable named entity extraction using Polyglot.') do
    options[:ner] = true
  end

  cli.on('-l', '--lix', 'Calculate LIX readability scores.') do
    options[:lix] = true
  end

  cli.on('-m', '--mail', 'Enable mail notifications when new transcripts are added.') do
    options[:mail] = true
  end

  # Print the generated usage text and stop successfully.
  cli.on('-h', '--help', "You're looking at it.") do
    puts cli
    exit 0
  end
end

# parse! consumes the recognised switches from ARGV in place.
parser.parse!(ARGV)

# The library is required only after option parsing, so `--help` exits
# before it is loaded.
require 'hdo-transcript-indexer'

indexer = Hdo::Transcript::Indexer.new(options)

# A falsy result from #execute is reported on stderr and becomes a
# non-zero exit status.
unless indexer.execute
  STDERR.puts "indexer failed"
  exit 1
end