wurmlab/GeneValidator

View on GitHub
bin/genevalidator

Summary

Maintainability
Test Coverage
#!/usr/bin/env ruby
require 'optparse'
require 'English'

# Options collected from the command line. Defaults are written into this hash
# while the parsers are being built; switch handlers overwrite them during
# parsing.
opt = {}

# Parser for the switches of the top-level `genevalidator` command.
global = OptionParser.new do |opts|
  opts.banner = <<BANNER
SUMMARY:
  GeneValidator - Identify problems with predicted genes

USAGE:
  genevalidator [OPTIONAL ARGUMENTS] INPUT_FILE

- To run as a web application:

    genevalidator app [OPTIONAL ARGUMENTS]

    See 'genevalidator app --help' for more information

- To download pre-formatted BLAST databases from NCBI

    genevalidator ncbi-blast-dbs [OPTIONAL ARGUMENTS]

    See 'genevalidator ncbi-blast-dbs --help' for more information
BANNER

  opts.separator 'OPTIONAL ARGUMENTS'
  opts.separator ' '

  opt[:validations] = ['all']
  # No short switch here: '-v' is registered for --version further down, and
  # OptionParser resolves a short switch to whichever definition came last.
  # Advertising '-v' for this option would therefore only print the version.
  opts.on('--validations [VALIDATIONS]', Array,
          'The Validations to be applied.',
          'Validation Options Available (separated by comma):',
          '  all   = All validations (default),',
          '  lenc  = Length validation by clusterization,',
          '  lenr  = Length validation by ranking,',
          '  merge = Analyse gene merge,',
          '  dup   = Check for duplications,',
          '  frame = Open reading frame (ORF) validation,',
          '  orf   = Main ORF validation,',
          '  align = Validating based on multiple alignment') do |val|
    opt[:validations] = val
  end

  opt[:db] = 'swissprot -remote'
  opts.on('-d', '--db [PATH]',
          'Path to the BLAST database',
          'e.g.   genevalidator -d /path/to/database.fa Input_File',
          'GeneValidator also supports remote databases:',
          'e.g.   genevalidator -d "swissprot -remote" Input_File') do |db|
    opt[:db] = db
  end

  opts.on('-s', '--select_single_best',
          'Writes the fasta sequence of the best scoring gene to STDOUT.') do
    opt[:select_single_best] = true
  end

  opts.separator ' '
  opts.separator '# OUTPUT ARGUMENTS'
  opts.separator ' '

  opts.on('-o', '--output_dir [PATH]',
          'Path to the output folder.',
          'By default the output folder is in the same directory as the input',
          'file and is named as input filename, followed by the time of',
          'analysis') do |d|
    opt[:output_dir] = d
  end

  opts.on('-f', '--force_rewrite', 'Rewrites over existing output.') do
    opt[:force_rewrite] = true
  end

  opt[:output_formats] = %w[html csv json stdout summary]
  opts.on('--output_formats [STRING]',
          'Output Formats to generate. This can be either: "all", "html",',
          '"csv", "json", "summary" or "stdout". Multiple formats can be',
          'separated by a colon e.g. "csv:json".',
          'By default, all output formats are generated.') do |f|
    # The argument is optional, so `f` is nil when the switch is given bare;
    # keep the default (all formats) in that case, as for "all".
    opt[:output_formats] = f.split(':') unless f.nil? || f == 'all'
  end

  opts.separator ' '
  opts.separator '# BLAST ARGUMENTS'
  opts.separator ' '

  opt[:min_blast_hits] = 5
  opts.on('--min_blast_hits_required [NUM]',
          'The minimum number of BLAST hits required by GeneValidator in order',
          'to carry out validations. Note: certain validations have their own',
          'set minimum (such as the multiple alignment validation, which',
          'requires a minimum of 10 BLAST hits)') do |min|
    opt[:min_blast_hits] = min.to_i
  end

  opts.on('-b', '--blast_options [STRING]',
          'A string that is to be passed to BLAST') do |blast_options|
    opt[:blast_options] = blast_options
  end

  opts.on('-x', '--blast_xml_file [PATH]',
          'Provide GeneValidator with a pre-computed BLAST XML output',
          'file (BLAST -outfmt option 5).') do |blast_xml_file|
    opt[:blast_xml_file] = blast_xml_file
  end

  opts.on('-t', '--blast_tabular_file [PATH]',
          'Provide GeneValidator with a pre-computed BLAST tabular output',
          'file. (BLAST -outfmt option 6).') do |blast_tabular_file|
    opt[:blast_tabular_file] = blast_tabular_file
  end

  # Custom BLAST tabular columns; no default is set here.
  opts.on('--blast_tabular_options [STRING]',
          'Custom format used in BLAST -outfmt argument',
          'See BLAST+ manual pages for more details') do |blast_table_options|
    opt[:blast_tabular_options] = blast_table_options
  end

  opts.on('--raw_sequences [PATH]',
          'Supply a fasta file of the raw sequences of all BLAST hits present',
          'in the supplied BLAST XML or BLAST tabular file.') do |raw_seq|
    opt[:raw_sequences] = raw_seq
  end

  opts.separator ' '
  opts.separator '# EXTRACT RAW SEQUENCES ARGUMENTS'
  opts.separator ' '

  opts.on('-e', '--extract_raw_seqs',
          'Extract a fasta file of the raw sequences of BLAST hits in the',
          'supplied BLAST output file. This fasta file can then be provided to',
          'GeneValidator with the "--raw_sequences" argument') do
    opt[:extract_raw_seqs] = true
  end

  opts.separator ' '
  opts.separator '# REPROCESS JSON ARGUMENTS'
  opts.separator ' '

  opts.on('-j', '--json_file [JSON_FILE]',
          'Path to json file. Re-generate the HTML report from a (filtered)',
          'JSON file that was previously produced by GeneValidator') do |json|
    opt[:json_file] = json
  end

  opts.separator ' '
  opts.separator '# GENERAL ARGUMENTS'
  opts.separator ' '

  opt[:num_threads] = 1
  opts.on('-n', '--num_threads [THREADS]',
          'Specify the number of processor threads to use when running',
          'BLAST and GeneValidator.') do |num_threads|
    opt[:num_threads] = num_threads
  end

  opt[:mafft_threads] = 1
  opts.on('-m', '--mafft_threads [THREADS]',
          'Specify the number of processor threads to use when running',
          'Mafft. Note Mafft is run independently in each of the threads',
          'specified in --num_threads.') do |mafft_threads|
    opt[:mafft_threads] = mafft_threads
  end

  opts.on('-r', '--resume [DIR]',
          'Resumes an analysis. This works by using previously generated',
          'temporary files instead of recomputing the analysis where possible.',
          'A new output directory is created where the output files are',
          'generated. This assumes that the input file is the same as that',
          'used in the analysis you are resuming from.') do |dir|
    opt[:resumable] = dir
  end

  opts.on('--bin [DIR]', Array,
          'Path to BLAST and MAFFT bin folders (is added to $PATH variable)',
          'To be provided as follows:',
          'e.g.   genevalidator --bin /blast/bin/ --bin /mafft/bin/') do |bin|
    # Array() turns the nil of a bare `--bin` into [], which concat accepts.
    (opt[:bin] ||= []).concat(Array(bin))
  end

  opts.on('-h', '--help', 'Show this screen.') do
    puts global
    exit
  end

  opts.on('-v', '--version',
          'The version of GeneValidator that you are running.') do
    require 'genevalidator/version'
    puts GeneValidator::VERSION
    exit
  end
end

# Parsers for the subcommands, keyed by subcommand name. The defaults assigned
# to `opt` inside these blocks are written when the parsers are built, i.e.
# regardless of which (if any) subcommand is later given on the command line.
subcommands = {
  'app' => OptionParser.new do |opts|
    opts.banner = <<BANNER
SUMMARY:
  Launch GeneValidator as a web application

USAGE:
  $ genevalidator app [options]

Examples:
  # Launch GeneValidatorApp with the given config file
  $ genevalidator app --config ~/.genevalidatorapp.conf

  # Launch GeneValidatorApp with 8 threads at port 8888
  $ genevalidator app --num_threads 8 --port 8888

  # Create a config file with the other arguments
  $ genevalidator app -s -d ~/database_dir

BANNER

    opts.separator 'MANDATORY ARGUMENTS'
    opts.separator ' '

    opts.on('-d', '--database_dir [PATH]',
            'Path to the directory containing BLAST database') do |path|
      opt[:database_dir] = path
    end

    opts.separator ' '
    opts.separator 'OPTIONAL ARGUMENTS'
    opts.separator ' '

    opts.on('-g', '--serve_public_dir [PATH]',
            'Web Accessible Directory - this is where GV files will',
            'be stored so that they are accessible to the app.',
            'Default location: $HOME/.genevalidatorapp') do |path|
      opt[:serve_public_dir] = path
    end

    opts.on('-H', '--host [HOST]',
            'Host to run GeneValidatorApp on') do |host|
      opt[:host] = host
    end

    opts.on('-p', '--port [PORT]',
            'Port to run GeneValidatorApp on') do |port|
      opt[:port] = port
    end

    opt[:ssl] = false
    opts.on('--ssl',
            'Use HTTPS:// instead of HTTP:// links ') do
      opt[:ssl] = true
    end

    opt[:max_characters] = 'undefined'
    opts.on('--max_char_input [NUM]',
            'Limit the number of characters (i.e. input sequence) inputted',
            'into the app') do |num|
      opt[:max_characters] = num
    end

    opts.separator ' '
    opts.separator '# BLAST DATABASES ARGUMENTS'
    opts.separator ' '

    opts.on('-f', '--default_database_path  [PATH]',
            'Path to the default BLAST database') do |path|
      opt[:default_database_path] = path
    end

    opts.on('-l', '--list_databases',
            'List found BLAST databases') do
      opt[:list_databases] = true
    end

    opts.separator ' '
    opts.separator '# CONFIG FILE ARGUMENTS'
    opts.separator ' '

    opts.on('-c', '--config_file [PATH]',
            'Path to the config file',
            'Default location: $HOME/.genevalidatorapp.conf') do |path|
      opt[:config_file] = path
    end

    opts.on('-s', '--set',
            'Set configuration value in default or given config file',
            '(i.e. save the current options to the config file)') do
      opt[:set] = true
    end

    opts.separator ' '
    opts.separator '# GENERAL ARGUMENTS'
    opts.separator ' '

    opt[:num_threads] = 1
    opts.on('-n', '--num_threads [THREADS]',
            'Specify the number of processor threads to use when running',
            'BLAST and GeneValidator.') do |num_threads|
      opt[:num_threads] = num_threads
    end

    opt[:mafft_threads] = 1
    opts.on('-m', '--mafft_threads [THREADS]',
            'Specify the number of processor threads to use when running',
            'Mafft. Note Mafft is run independently in each of the threads',
            'specified in --num_threads.') do |mafft_threads|
      opt[:mafft_threads] = mafft_threads
    end

    opts.on('-b', '--bin [PATH]', Array,
            'Path to BLAST and MAFFT bin folders (is added to $PATH variable)',
            'To be provided as follows:',
            'e.g.   genevalidator -b /blast/bin/path/ -b /mafft/bin/path/') do |bin|
      # Array() turns the nil of a bare `--bin` into [], which concat accepts.
      (opt[:bin] ||= []).concat(Array(bin))
    end

    opts.on('-D', '--devel',
            'Start GeneValidatorApp in development mode') do
      opt[:devel] = true
    end

    opts.on('-h', '--help', 'Show this screen.') do
      puts subcommands['app']
      exit
    end
  end,
  'serve' => OptionParser.new do |opts|
    opts.banner = <<BANNER
SUMMARY:
  Create a web server for viewing results

USAGE:
  $ genevalidator serve [options] OUTPUT_DIR

ARGUMENTS:
BANNER
    opt[:file_server_port] = 6789
    opts.on('-p', '--port [PORT]',
            'Port to run the server on') do |port|
      opt[:file_server_port] = port
    end

    opts.on('-h', '--help', 'Show this screen.') do
      puts subcommands['serve']
      exit
    end
  end,
  'ncbi-blast-dbs' => OptionParser.new do |opts|
    opts.banner = <<BANNER
SUMMARY:
  Fast download BLAST databases from NCBI

  - Database files (volumes) are downloaded in parallel.
  - Number of threads to use is determined automatically.
  - MD5 checksum is verified and the database volume extracted upon download.
  - Database volumes are not downloaded in a particular order.
  - Volumes are updated if a newer version is available on the server, or
    re-downloaded if corrupt.
  - Aborted downloads are safely resumed.

USAGE:

  # List available BLAST databases
  $ genevalidator ncbi-blast-dbs

  # Download all volumes of a BLAST database (in the current directory)
  $ genevalidator ncbi-blast-dbs nt nr

OPTIONS:
BANNER

    opts.on('-h', '--help', 'Show this screen.') do
      puts subcommands['ncbi-blast-dbs']
      exit
    end
  end
}

# Parse the global switches, then treat the first remaining argument as a
# subcommand name when it is one (its own switches are parsed next). Otherwise
# `command` stays nil and `first_arg` is left for the code below to interpret.
# Any parse error is reported on stderr and ends the run with status 1.
begin
  global.order!
  first_arg = ARGV.shift
  command = first_arg if subcommands.key?(first_arg)
  unless command.nil?
    subcommand_parser = subcommands[command]
    subcommand_parser.order! unless subcommand_parser.nil?
  end
rescue OptionParser::ParseError => e
  $stderr.print "Error: #{e}\n"
  exit 1
end

if command.nil? && opt[:extract_raw_seqs] && opt[:raw_sequences].nil?
  require 'genevalidator'
  require 'genevalidator/get_raw_sequences'
  blast_file = opt[:blast_xml_file] if opt[:blast_xml_file]
  blast_file = opt[:blast_tabular_file] if opt[:blast_tabular_file]
  fname = File.basename(blast_file, File.extname(blast_file))
  GeneValidator.opt    = opt
  GeneValidator.config = {}
  output_dir = GeneValidator.make_output_dir(fname)
  GeneValidator.dirs   = { tmp_dir: output_dir }
  GeneValidator::RawSequences.run
  warn '==> The BLAST HSP raw sequence FASTA files have been saved to:'
  warn "     #{output_dir}"
elsif command.nil? && opt[:json_file]
  start = Time.now
  require 'genevalidator/json_to_gv_results'
  GeneValidator::JsonToGVResults.init(opt)
  GeneValidator::JsonToGVResults.run
  warn "Total running time: #{(Time.now - start).round(3)}s"
  warn '' # a blank line
elsif command.nil?
  start = Time.now
  opt[:input_fasta_file] = first_arg
  if opt[:input_fasta_file].nil?
    warn '' # a blank line
    warn '==> Error: Input Fasta file required.'
    warn '' # a blank line
    puts global
    exit 1
  end
  require 'genevalidator'
  opt[:output_formats] = 'fasta' if opt[:select_single_best]
  GeneValidator.init(opt)
  GeneValidator.run
  warn "Total running time: #{(Time.now - start).round(3)}s"
  warn '' # a blank line
elsif command == 'serve'
  require 'rack'

  # Opens server_url in the user's default browser: via `xdg-open` on Linux
  # (only when xdg? holds) or via `open` on Mac. Does nothing over SSH or on
  # any other platform. Errors, if any, are silenced.
  def open_in_browser(server_url)
    return if using_ssh?

    launcher =
      if RUBY_PLATFORM =~ /linux/ && xdg?
        'xdg-open'
      elsif RUBY_PLATFORM =~ /darwin/
        'open'
      end
    system "#{launcher} #{server_url}" if launcher
  rescue StandardError
    # fail silently
    nil
  end

  # Returns true when any of the environment variables SSH_CLIENT, SSH_TTY or
  # SSH_CONNECTION is set, and nil otherwise.
  def using_ssh?
    ssh_vars = %w[SSH_CLIENT SSH_TTY SSH_CONNECTION]
    true if ssh_vars.any? { |name| ENV[name] }
  end

  # Returns true when the DISPLAY environment variable is set and an
  # executable file named `xdg-open` exists in one of the PATH directories,
  # and nil otherwise.
  #
  # PATH is searched here directly because this script defines no `command?`
  # helper; calling one raised a NoMethodError on every invocation.
  def xdg?
    return unless ENV['DISPLAY']

    path_dirs = ENV.fetch('PATH', '').split(File::PATH_SEPARATOR)
    found = path_dirs.reject(&:empty?).any? do |path_dir|
      candidate = File.join(path_dir, 'xdg-open')
      File.file?(candidate) && File.executable?(candidate)
    end
    true if found
  end

  dir = ARGV[0]
  html_files = Dir[File.join(dir, '*.html')]
  if html_files.empty?
    warn '' # a blank line
    warn '==> Error: GV HTML files not found at this location.'
    warn '    Was the output folder moved after carrying out the analysis?'
    warn '' # a blank line
    puts subcommands['serve']
    exit 1
  end
  index_file = File.basename(html_files.min)
  app = Rack::Static.new nil, urls: [''], root: dir, index: index_file
  Rack::Server.start app: app, Port: opt[:file_server_port], StartCallback: proc {
    open_in_browser("http://localhost:#{opt[:file_server_port]}")
  }
elsif command == 'ncbi-blast-dbs'
  require 'rake'
  load 'ncbi-blast-dbs.rake'

  Rake.application.init 'ncbi-blast-dbs'
  Rake.application.load_imports
  Rake.application.top_level

elsif command == 'app'
  ENV['RACK_ENV'] = 'development' if opt[:devel]

  # Exit gracefully on SIGINT.
  stty = `stty -g`.chomp
  trap('INT') do
    warn ''
    warn 'Aborted.'
    system('stty', stty)
    exit
  end

  require 'genevalidatorapp'

  begin
    GeneValidatorApp.init opt

    # The aim of following error recovery scenarios is to guide user to a
    # working GeneValidatorApp installation. We expect to land following
    # error scenarios either when creating a new GeneValidatorApp (first
    # time or later), or updating config values using -s CLI option.
  rescue GeneValidatorApp::CONFIG_FILE_ERROR => e
    warn e
    exit!
  rescue GeneValidatorApp::NUM_THREADS_INCORRECT => e
    warn e

    if opt[:num_threads].nil?
      warn 'You can set the correct value by running:'
      warn
      warn '  genevalidator app -s -n <value>'
      warn
    end

    exit!
  rescue GeneValidatorApp::BIN_DIR_NOT_FOUND => e
    warn e

    if opt[:bin].nil?
      warn 'You can set the correct value by running:'
      warn
      warn '  genevalidator app -s -b <value>'
      warn
    end

    exit!
  rescue GeneValidatorApp::DATABASE_DIR_NOT_FOUND => e
    warn e

    if opt[:database_dir].nil?
      warn 'You can set the correct value by running:'
      warn
      warn '  genevalidator app -s -d <value>'
      warn
    end

    exit!
  rescue GeneValidatorApp::BLAST_DATABASE_ERROR => e
    warn e
    exit!
  rescue StandardError => e
    # This will catch any unhandled error and some very special errors.
    # Ideally we will never hit this block. If we do, there's a bug in
    # GeneValidatorApp or something really weird going on. If we hit this
    # error block we show the stacktrace to the user requesting them to
    # post the same to our Google Group.
    warn <<MSG
Something went wonky

Looks like you have encountered a bug in GeneValidatorApp. Please could you
report this incident here -
https://github.com/wurmlab/genevalidator/issues

Error:
#{e.backtrace.unshift(e.message).join("\n")}
MSG
    exit!
  end

  if opt[:list_databases]
    puts GeneValidatorApp::Database.all
    exit
  end

  if opt[:set]
    GeneValidatorApp.config.write_config_file
    exit
  end

  GeneValidatorApp.run
end