bin/genevalidator
#!/usr/bin/env ruby
require 'optparse'
require 'English'
# Options gathered from the command line. The global parser below and every
# subcommand parser write their defaults and parsed values into this hash.
opt = {}

# Parser for the top-level `genevalidator` command. Defaults are stored in
# +opt+ while the parser is being built; each switch overwrites its own key
# when it is seen on the command line.
global = OptionParser.new do |opts|
  opts.banner = <<~BANNER
    SUMMARY:
    GeneValidator - Identify problems with predicted genes
    USAGE:
    genevalidator [OPTIONAL ARGUMENTS] INPUT_FILE
    - To run as a web application:
    genevalidator app [OPTIONAL ARGUMENTS]
    See 'genevalidator app --help' for more information
    - To download pre-formatted BLAST databases from NCBI
    genevalidator ncbi-blast-dbs [OPTIONAL ARGUMENTS]
    See 'genevalidator ncbi-blast-dbs --help' for more information
  BANNER

  opts.separator 'OPTIONAL ARGUMENTS'
  opts.separator ' '

  # No short switch here: `-v` is registered for --version further down, and
  # OptionParser resolves a duplicated short switch to the last definition, so
  # `-v` never reached this option. Advertising it in the help was misleading.
  opt[:validations] = ['all']
  opts.on('--validations [VALIDATIONS]', Array,
          'The Validations to be applied.',
          'Validation Options Available (separated by comma):',
          ' all = All validations (default),',
          ' lenc = Length validation by clusterization,',
          ' lenr = Length validation by ranking,',
          ' merge = Analyse gene merge,',
          ' dup = Check for duplications,',
          ' frame = Open reading frame (ORF) validation,',
          ' orf = Main ORF validation,',
          ' align = Validating based on multiple alignment') do |val|
    opt[:validations] = val
  end

  opt[:db] = 'swissprot -remote'
  opts.on('-d', '--db [PATH]',
          'Path to the BLAST database',
          'e.g. genevalidator -d /path/to/database.fa Input_File',
          'GeneValidator also supports remote databases:',
          'e.g. genevalidator -d "swissprot -remote" Input_File') do |db|
    opt[:db] = db
  end

  opts.on('-s', '--select_single_best',
          'Writes the fasta sequence of the best scoring gene to STDOUT.') do
    opt[:select_single_best] = true
  end

  opts.separator ' '
  opts.separator '# OUTPUT ARGUMENTS'
  opts.separator ' '

  opts.on('-o', '--output_dir [PATH]',
          'Path to the output folder.',
          'By default the output folder is in the same directory as the input',
          'file and is named as input filename, followed by the time of',
          'analysis') do |d|
    opt[:output_dir] = d
  end

  opts.on('-f', '--force_rewrite', 'Rewrites over existing output.') do
    opt[:force_rewrite] = true
  end

  opt[:output_formats] = %w[html csv json stdout summary]
  opts.on('--output_formats [STRING]',
          'Output Formats to generate. This can be either: "all", "html",',
          '"csv", "json", "summary" or "stdout". Multiple formats can be',
          'separated by a colon e.g. "csv:json".',
          'By default, all output formats are generated.') do |f|
    # The argument is optional, so +f+ is nil when the switch is given without
    # a value; keep the current setting in that case and for "all".
    opt[:output_formats] = f.split(':') unless f.nil? || f == 'all'
  end

  opts.separator ' '
  opts.separator '# BLAST ARGUMENTS'
  opts.separator ' '

  opt[:min_blast_hits] = 5
  opts.on('--min_blast_hits_required [NUM]',
          'The minimum number of BLAST hits required by GeneValidator in order',
          'to carry out validations. Note: certain validations have their own',
          'set minimum (such as the multiple alignment validation, which',
          ' requires a minimum of 10 BLAST hits)') do |min|
    opt[:min_blast_hits] = min.to_i
  end

  opts.on('-b', '--blast_options [STRING]',
          'A string that is to passed to BLAST') do |blast_options|
    opt[:blast_options] = blast_options
  end

  opts.on('-x', '--blast_xml_file [PATH]',
          'Provide GeneValidator with a pre-computed BLAST XML output',
          'file (BLAST -outfmt option 5).') do |blast_xml_file|
    opt[:blast_xml_file] = blast_xml_file
  end

  opts.on('-t', '--blast_tabular_file [PATH]',
          'Provide GeneValidator with a pre-computed BLAST tabular output',
          'file. (BLAST -outfmt option 6).') do |blast_tabular_file|
    opt[:blast_tabular_file] = blast_tabular_file
  end

  # Column layout of the tabular file; no default is set in this script.
  opts.on('--blast_tabular_options [STRING]',
          'Custom format used in BLAST -outfmt argument',
          'See BLAST+ manual pages for more details') do |blast_table_options|
    opt[:blast_tabular_options] = blast_table_options
  end

  opts.on('--raw_sequences [PATH]',
          'Supply a fasta file of the raw sequences of all BLAST hits present',
          'in the supplied BLAST XML or BLAST tabular file.') do |raw_seq|
    opt[:raw_sequences] = raw_seq
  end

  opts.separator ' '
  opts.separator '# EXTRACT RAW SEQUENCES ARGUMENTS'
  opts.separator ' '

  opts.on('-e', '--extract_raw_seqs',
          'Extract a fasta file of the raw sequences of BLAST hits in the',
          'supplied BLAST output file. This fasta file can then be provided to',
          'GeneValidator with the "--raw_sequences" argument') do
    opt[:extract_raw_seqs] = true
  end

  opts.separator ' '
  opts.separator '# REPROCESS JSON ARGUMENTS'
  opts.separator ' '

  opts.on('-j', '--json_file [JSON_FILE]',
          'Path to json file. Re-generate the HTML report from a (filtered)',
          'JSON file that was previously produced by GeneValidator') do |json|
    opt[:json_file] = json
  end

  opts.separator ' '
  opts.separator '# GENERAL ARGUMENTS'
  opts.separator ' '

  opt[:num_threads] = 1
  opts.on('-n', '--num_threads [THREADS]',
          'Specify the number of processor threads to use when running',
          'BLAST and GeneValidator.') do |num_threads|
    opt[:num_threads] = num_threads
  end

  opt[:mafft_threads] = 1
  opts.on('-m', '--mafft_threads [THREADS]',
          'Specify the number of processor threads to use when running',
          'Mafft. Note Mafft is run independently in each of the threads',
          'specified in --num_threads.') do |mafft_threads|
    opt[:mafft_threads] = mafft_threads
  end

  opts.on('-r', '--resume [DIR]',
          'Resumes an analysis. This works by using previously generated',
          'temporary files instead of recomputing the analysis where possible.',
          'A new output directory is created where the output files are',
          'generated. This assumes that the input file is the same as that',
          'used in the analysis you are resuming from.') do |dir|
    opt[:resumable] = dir
  end

  opts.on('--bin [DIR]', Array,
          'Path to BLAST and MAFFT bin folders (is added to $PATH variable)',
          'To be provided as follows:',
          'e.g. genevalidator --bin /blast/bin/ --bin /mafft/bin/') do |bin|
    # Array() turns the nil of a value-less `--bin` into an empty list.
    (opt[:bin] ||= []).concat(Array(bin))
  end

  opts.on('-h', '--help', 'Show this screen.') do
    puts global
    exit
  end

  opts.on('-v', '--version',
          'The version of GeneValidator that you are running.') do
    # Loaded lazily so that only this switch pays for the require.
    require 'genevalidator/version'
    puts GeneValidator::VERSION
    exit
  end
end
# One OptionParser per subcommand, keyed by the subcommand name as typed on
# the command line. Each parser stores its defaults and parsed values in the
# shared +opt+ hash; the defaults are written while this hash is being built,
# i.e. regardless of which subcommand ends up being run.
subcommands = {
  # `genevalidator app`: run GeneValidator as a web application.
  'app' => OptionParser.new do |opts|
    opts.banner = <<~BANNER
      SUMMARY:
      Launch GeneValidator as a web application
      USAGE:
      $ genevalidator app [options]
      Examples:
      # Launch GeneValidatorApp with the given config file
      $ genevalidator app --config ~/.genevalidatorapp.conf
      # Launch GeneValidatorApp with 8 threads at port 8888
      $ genevalidator app --num_threads 8 --port 8888
      # Create a config file with the other arguments
      $ genevalidator app -s -d ~/database_dir
    BANNER

    opts.separator 'MANDATORY ARGUMENTS'
    opts.separator ' '

    opts.on('-d', '--database_dir [PATH]',
            'Path to the directory containing BLAST database') do |path|
      opt[:database_dir] = path
    end

    opts.separator ' '
    opts.separator 'OPTIONAL ARGUMENTS'
    opts.separator ' '

    opts.on('-g', '--serve_public_dir [PATH]',
            'Web Accessible Directory - this is where GV files will',
            'be stored so that they are accessible to the app.',
            'Default location: $HOME/.genevalidatorapp') do |path|
      opt[:serve_public_dir] = path
    end

    opts.on('-H', '--host [HOST]',
            'Host to run GeneValidatorApp on') do |host|
      opt[:host] = host
    end

    opts.on('-p', '--port [PORT]',
            'Port to run GeneValidatorApp on') do |port|
      opt[:port] = port
    end

    opt[:ssl] = false
    opts.on('--ssl',
            'Use HTTPS:// instead of HTTP:// links ') do
      opt[:ssl] = true
    end

    opt[:max_characters] = 'undefined'
    opts.on('--max_char_input [NUM]',
            'Limit the number of characters (i.e. input sequence) inputted',
            'into the app') do |num|
      opt[:max_characters] = num
    end

    opts.separator ' '
    opts.separator '# BLAST DATABASES ARGUMENTS'
    opts.separator ' '

    opts.on('-f', '--default_database_path [PATH]',
            'Path to the default BLAST database') do |path|
      opt[:default_database_path] = path
    end

    opts.on('-l', '--list_databases',
            'List found BLAST databases') do
      opt[:list_databases] = true
    end

    opts.separator ' '
    opts.separator '# CONFIG FILE ARGUMENTS'
    opts.separator ' '

    opts.on('-c', '--config_file [PATH]',
            'Path to the config file',
            'Default location: $HOME/.genevalidatorapp.conf') do |path|
      opt[:config_file] = path
    end

    opts.on('-s', '--set',
            'Set configuration value in default or given config file',
            '(i.e. save current options to config file)') do
      opt[:set] = true
    end

    opts.separator ' '
    opts.separator '# GENERAL ARGUMENTS'
    opts.separator ' '

    opt[:num_threads] = 1
    opts.on('-n', '--num_threads [THREADS]',
            'Specify the number of processor threads to use when running',
            'BLAST and GeneValidator.') do |num_threads|
      opt[:num_threads] = num_threads
    end

    opt[:mafft_threads] = 1
    opts.on('-m', '--mafft_threads [THREADS]',
            'Specify the number of processor threads to use when running',
            'Mafft. Note Mafft is run independently in each of the threads',
            'specified in --num_threads.') do |mafft_threads|
      opt[:mafft_threads] = mafft_threads
    end

    opts.on('-b', '--bin [PATH]', Array,
            'Path to BLAST and MAFFT bin folders (is added to $PATH variable)',
            'To be provided as follows:',
            'e.g. genevalidator -b /blast/bin/path/ -b /mafft/bin/path/') do |bin|
      # Array() turns the nil of a value-less `--bin` into an empty list.
      (opt[:bin] ||= []).concat(Array(bin))
    end

    opts.on('-D', '--devel',
            'Start GeneValidatorApp in development mode') do
      opt[:devel] = true
    end

    opts.on('-h', '--help', 'Show this screen.') do
      puts subcommands['app']
      exit
    end
  end,

  # `genevalidator serve`: web server for browsing an existing output folder.
  'serve' => OptionParser.new do |opts|
    opts.banner = <<~BANNER
      SUMMARY:
      Create a web server for viewing results
      USAGE:
      $ genevalidator serve [options] OUTPUT_DIR
      ARGUMENTS:
    BANNER

    opt[:file_server_port] = 6789
    opts.on('-p', '--port [PORT]',
            'Port to run the server on') do |port|
      opt[:file_server_port] = port
    end

    opts.on('-h', '--help', 'Show this screen.') do
      puts subcommands['serve']
      exit
    end
  end,

  # `genevalidator ncbi-blast-dbs`: only --help is parsed here; any database
  # names are left in ARGV.
  'ncbi-blast-dbs' => OptionParser.new do |opts|
    opts.banner = <<~BANNER
      SUMMARY:
      Fast download BLAST databases from NCBI
      - Database files (volumes) are downloaded in parallel.
      - Number of threads to use is determined automatically.
      - MD5 checksum is verified and the database volume extracted upon download.
      - Database volumes are not downloaded in a particular order.
      - Volumes are updated if a newer version is available on the server, or
      re-downloaded if corrupt.
      - Aborted downloads are safely resumed.
      USAGE:
      # List available BLAST databases
      $ genevalidator ncbi-blast-dbs
      # Download all volumes of a BLAST database (in the current directory)
      $ genevalidator ncbi-blast-dbs nt nr
      OPTIONS:
    BANNER

    opts.on('-h', '--help', 'Show this screen.') do
      puts subcommands['ncbi-blast-dbs']
      exit
    end
  end
}
# Parse the global switches first; parsing stops at the first non-option
# argument, which is either a subcommand name or the input file. When it names
# a subcommand, that subcommand's parser consumes the switches that follow.
begin
  global.order!
  first_arg = ARGV.shift
  subcommand_parser = subcommands[first_arg]
  if subcommand_parser
    command = first_arg
    subcommand_parser.order!
  end
rescue OptionParser::ParseError => e
  $stderr.print "Error: #{e}\n"
  exit 1
end
if command.nil? && opt[:extract_raw_seqs] && opt[:raw_sequences].nil?
# Extract the raw sequences of the hits listed in a pre-computed BLAST output
# file (--extract_raw_seqs without --raw_sequences).
require 'genevalidator'
require 'genevalidator/get_raw_sequences'

# Same precedence as before: the tabular file wins when both are supplied.
blast_file = opt[:blast_tabular_file] || opt[:blast_xml_file]
if blast_file.nil?
  # Without this guard File.basename(nil) below raises a bare TypeError.
  warn '' # a blank line
  warn '==> Error: A BLAST XML (-x) or BLAST tabular (-t) file is required.'
  warn '' # a blank line
  puts global
  exit 1
end

# The output directory is named after the BLAST file, minus its extension.
fname = File.basename(blast_file, File.extname(blast_file))
GeneValidator.opt = opt
GeneValidator.config = {}
output_dir = GeneValidator.make_output_dir(fname)
GeneValidator.dirs = { tmp_dir: output_dir }
GeneValidator::RawSequences.run
warn '==> The BLAST HSP raw sequence FASTA files have been saved to:'
warn " #{output_dir}"
elsif command.nil? && opt[:json_file]
# Reprocess a previously produced JSON file (--json_file) and report how long
# it took. The clock starts before the require, as it always has.
started_at = Time.now
require 'genevalidator/json_to_gv_results'
json_runner = GeneValidator::JsonToGVResults
json_runner.init(opt)
json_runner.run
elapsed = (Time.now - started_at).round(3)
warn "Total running time: #{elapsed}s"
warn '' # a blank line
elsif command.nil?
# Default command: run GeneValidator on the input file named by the first
# non-option argument.
started_at = Time.now
opt[:input_fasta_file] = first_arg
unless opt[:input_fasta_file]
  # first_arg is nil when nothing was left in ARGV after the switches.
  ['', '==> Error: Input Fasta file required.', ''].each { |line| warn line }
  puts global
  exit 1
end
require 'genevalidator'
# --select_single_best replaces whatever output formats were requested.
opt[:output_formats] = 'fasta' if opt[:select_single_best]
GeneValidator.init(opt)
GeneValidator.run
elapsed = (Time.now - started_at).round(3)
warn "Total running time: #{elapsed}s"
warn '' # a blank line
elsif command == 'serve'
require 'rack'
# Opens +server_url+ in the user's default browser, using `xdg-open` on Linux
# and `open` on Mac. Nothing is opened over an SSH session. Errors, if any,
# are silenced.
def open_in_browser(server_url)
  return if using_ssh?

  # The argv form of `system` hands the URL to the command without a shell.
  if RUBY_PLATFORM =~ /linux/ && xdg?
    system 'xdg-open', server_url
  elsif RUBY_PLATFORM =~ /darwin/
    system 'open', server_url
  end
rescue StandardError
  # fail silently
end

# Returns true when any of the SSH_* environment variables is set.
def using_ssh?
  %w[SSH_CLIENT SSH_TTY SSH_CONNECTION].any? { |name| ENV[name] }
end

# Returns true when DISPLAY is set and an executable `xdg-open` is found in
# one of the PATH directories.
#
# The lookup is done here because this script defines no `command?` helper:
# calling one raised NoMethodError, which open_in_browser swallowed, so the
# browser was never opened on Linux.
def xdg?
  return false unless ENV['DISPLAY']

  search_path = ENV.fetch('PATH', '').split(File::PATH_SEPARATOR)
  search_path.reject(&:empty?).any? do |dir|
    candidate = File.join(dir, 'xdg-open')
    File.file?(candidate) && File.executable?(candidate)
  end
end
# Serve the HTML files found in OUTPUT_DIR (the first remaining argument)
# through a local web server.
dir = ARGV[0]
if dir.nil?
  # Without this guard File.join(nil, ...) below raises a bare TypeError.
  warn '' # a blank line
  warn '==> Error: Output directory required.'
  warn '' # a blank line
  puts subcommands['serve']
  exit 1
end

html_files = Dir[File.join(dir, '*.html')]
if html_files.empty?
  warn '' # a blank line
  warn '==> Error: GV HTML files not found at this location.'
  warn ' Was the output folder moved after carrying out the analysis?'
  warn '' # a blank line
  puts subcommands['serve']
  exit 1
end

# The HTML file whose path sorts first is used as the index page.
index_file = File.basename(html_files.min)
app = Rack::Static.new nil, urls: [''], root: dir, index: index_file
port = opt[:file_server_port]
Rack::Server.start app: app, Port: port, StartCallback: proc {
  open_in_browser("http://localhost:#{port}")
}
elsif command == 'ncbi-blast-dbs'
# Hand over to the rake application defined by 'ncbi-blast-dbs.rake'; the
# arguments still left in ARGV are picked up by Rake itself.
require 'rake'
load 'ncbi-blast-dbs.rake'
rake_app = Rake.application
rake_app.init 'ncbi-blast-dbs'
rake_app.load_imports
rake_app.top_level
elsif command == 'app'
# Launch GeneValidator as a web application.
ENV['RACK_ENV'] = 'development' if opt[:devel]

# Exit gracefully on SIGINT, restoring the terminal settings captured here.
stty = `stty -g`.chomp
trap('INT') do
  warn ''
  warn 'Aborted.'
  system('stty', stty)
  exit
end

require 'genevalidatorapp'

# Prints the command that stores a corrected value for +switch+ in the config
# file. The blank lines are written with `warn ''` because `warn` called
# without arguments writes nothing at all.
suggest_fix = lambda do |switch|
  warn 'You can set the correct value by running:'
  warn ''
  warn " genevalidator app -s #{switch} <value>"
  warn ''
end

begin
  GeneValidatorApp.init opt
# The aim of following error recovery scenarios is to guide user to a
# working GeneValidatorApp installation. We expect to land following
# error scenarios either when creating a new GeneValidatorApp (first
# time or later), or updating config values using -s CLI option.
rescue GeneValidatorApp::CONFIG_FILE_ERROR => e
  warn e
  exit!
rescue GeneValidatorApp::NUM_THREADS_INCORRECT => e
  warn e
  suggest_fix.call('-n') if opt[:num_threads].nil?
  exit!
rescue GeneValidatorApp::BIN_DIR_NOT_FOUND => e
  warn e
  suggest_fix.call('-b') if opt[:bin].nil?
  exit!
rescue GeneValidatorApp::DATABASE_DIR_NOT_FOUND => e
  warn e
  suggest_fix.call('-d') if opt[:database_dir].nil?
  exit!
rescue GeneValidatorApp::BLAST_DATABASE_ERROR => e
  warn e
  exit!
rescue StandardError => e
  # This will catch any unhandled error and some very special errors.
  # Ideally we will never hit this block. If we do, there's a bug in
  # GeneValidatorApp or something really weird going on. If we hit this
  # error block we show the stacktrace to the user requesting them to
  # report it on the GitHub issue tracker.
  warn <<~MSG
    Something went wonky
    Looks like you have encountered a bug in GeneValidatorApp. Please could you
    report this incident here -
    https://github.com/wurmlab/genevalidator/issues
    Error:
    #{e.backtrace.unshift(e.message).join("\n")}
  MSG
  exit!
end

if opt[:list_databases]
  puts GeneValidatorApp::Database.all
  exit
end

if opt[:set]
  GeneValidatorApp.config.write_config_file
  exit
end

GeneValidatorApp.run
end